diff --git a/.gitattributes b/.gitattributes index f10c3ccf16a03dedf0da7d515eb3fb4d231e306c..b095778923416b46d0dedc8d146882cc6992fa3e 100644 --- a/.gitattributes +++ b/.gitattributes @@ -98,3 +98,6 @@ mosesdecoder/misc/bin/gcc-9/release/link-static/threading-multi/queryPhraseTable mosesdecoder/moses/bin/gcc-9/release/link-static/threading-multi/libmoses.a filter=lfs diff=lfs merge=lfs -text mosesdecoder/moses/bin/gcc-9/release/link-static/threading-multi/moses_test filter=lfs diff=lfs merge=lfs -text mosesdecoder/moses/LM/bin/BackwardTest.test/gcc-9/release/link-static/threading-multi/BackwardTest filter=lfs diff=lfs merge=lfs -text +mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/libmoses2_lib.a filter=lfs diff=lfs merge=lfs -text +mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/libmoses2decoder.a filter=lfs diff=lfs merge=lfs -text +mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/moses2 filter=lfs diff=lfs merge=lfs -text diff --git a/mosesdecoder/moses2/AlignmentInfo.cpp b/mosesdecoder/moses2/AlignmentInfo.cpp new file mode 100644 index 0000000000000000000000000000000000000000..2e19fa481e7b3862e0f75bb233b187b1bd0d00f4 --- /dev/null +++ b/mosesdecoder/moses2/AlignmentInfo.cpp @@ -0,0 +1,176 @@ +/*********************************************************************** + Moses - statistical machine translation system + Copyright (C) 2006-2011 University of Edinburgh + + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. 
+ + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +***********************************************************************/ +#include +#include +#include +#include "AlignmentInfo.h" +#include "legacy/Util2.h" +#include "util/exception.hh" + +namespace Moses2 +{ + +AlignmentInfo::AlignmentInfo(const std::set > &pairs) + : m_collection(pairs) +{ + BuildNonTermIndexMaps(); +} + +AlignmentInfo::AlignmentInfo(const std::vector &aln) +{ + assert(aln.size()%2==0); + for (size_t i = 0; i < aln.size(); i+= 2) + m_collection.insert(std::make_pair(size_t(aln[i]),size_t(aln[i+1]))); + BuildNonTermIndexMaps(); +} + +AlignmentInfo::AlignmentInfo(const std::string &str) +{ + std::vector points = Tokenize(str, " "); + std::vector::const_iterator iter; + for (iter = points.begin(); iter != points.end(); iter++) { + std::vector point = Tokenize(*iter, "-"); + UTIL_THROW_IF2(point.size() != 2, "Bad format of word alignment point: " << *iter); + Add(point[0], point[1]); + } +} + +void AlignmentInfo::BuildNonTermIndexMaps() +{ + if (m_collection.empty()) { + return; + } + const_iterator p = begin(); + size_t maxIndex = p->second; + for (++p; p != end(); ++p) { + if (p->second > maxIndex) { + maxIndex = p->second; + } + } + m_nonTermIndexMap.resize(maxIndex+1, NOT_FOUND); + m_nonTermIndexMap2.resize(maxIndex+1, NOT_FOUND); + size_t i = 0; + for (p = begin(); p != end(); ++p) { + if (m_nonTermIndexMap[p->second] != NOT_FOUND) { + // 1-to-many. Definitely a set of terminals. 
/**
 * Ordering predicate over pointers to alignment points.
 *
 * Sorts primarily by target position (pair::second); points with the same
 * target position are ordered by source position (pair::first).
 * Used as the comparator for the TargetOrder sort in GetSortedAlignments().
 *
 * @param a  left-hand alignment point (must not be null)
 * @param b  right-hand alignment point (must not be null)
 * @return true when *a must come strictly before *b
 */
bool
compare_target(std::pair<size_t,size_t> const* a,
               std::pair<size_t,size_t> const* b)
{
  if (a->second != b->second) {
    return a->second < b->second;
  }
  // same target position: fall back to the source position
  return a->first < b->first;
}
std::stringstream out; + out << *this; + return out.str(); +} + +std::ostream& operator<<(std::ostream& out, const AlignmentInfo& obj) +{ + AlignmentInfo::const_iterator iter; + for (iter = obj.begin(); iter != obj.end(); ++iter) { + out << iter->first << "-" << iter->second << " "; + } + return out; +} + +} diff --git a/mosesdecoder/moses2/AlignmentInfo.h b/mosesdecoder/moses2/AlignmentInfo.h new file mode 100644 index 0000000000000000000000000000000000000000..89b31a1fc44c160baed53909b06ba9b06f21399e --- /dev/null +++ b/mosesdecoder/moses2/AlignmentInfo.h @@ -0,0 +1,148 @@ +/*********************************************************************** + Moses - statistical machine translation system + Copyright (C) 2006-2011 University of Edinburgh + + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +***********************************************************************/ + +#pragma once + +#include +#include +#include +#include +#include + +#include +#include "TypeDef.h" + +namespace Moses2 +{ + +class AlignmentInfoCollection; +class System; + +/** Collection of non-terminal alignment pairs, ordered by source index. 
+ * Usually held by a TargetPhrase to map non-terms in hierarchical/syntax models + */ +class AlignmentInfo +{ + friend struct AlignmentInfoOrderer; + friend struct AlignmentInfoHasher; + friend class AlignmentInfoCollection; + friend class VW; + + friend std::ostream& operator<<(std::ostream& out, const AlignmentInfo& obj); + +public: + typedef std::set > CollType; + typedef std::vector NonTermIndexMap; + typedef CollType::const_iterator const_iterator; + + const_iterator begin() const { + return m_collection.begin(); + } + const_iterator end() const { + return m_collection.end(); + } + + void Add(size_t sourcePos, size_t targetPos) { + m_collection.insert(std::pair(sourcePos, targetPos)); + } + /** Provides a map from target-side to source-side non-terminal indices. + * The target-side index should be the rule symbol index (COUNTING terminals). + * The index returned is the rule non-terminal index (IGNORING terminals). + */ + const NonTermIndexMap &GetNonTermIndexMap() const { + return m_nonTermIndexMap; + } + + /** Like GetNonTermIndexMap but the return value is the symbol index (i.e. + * the index counting both terminals and non-terminals) */ + const NonTermIndexMap &GetNonTermIndexMap2() const { + return m_nonTermIndexMap2; + } + + const CollType &GetAlignments() const { + return m_collection; + } + + std::set GetAlignmentsForSource(size_t sourcePos) const; + std::set GetAlignmentsForTarget(size_t targetPos) const; + + size_t GetSize() const { + return m_collection.size(); + } + + std::vector< const std::pair* > + GetSortedAlignments(Moses2::WordAlignmentSort SortOrder) const; + + std::vector GetSourceIndex2PosMap() const; + + bool operator==(const AlignmentInfo& rhs) const { + return m_collection == rhs.m_collection && + m_nonTermIndexMap == rhs.m_nonTermIndexMap; + } + + std::string Debug(const System &system) const; + +private: + //! 
AlignmentInfo objects should only be created by an AlignmentInfoCollection + explicit AlignmentInfo(const std::set > &pairs); + explicit AlignmentInfo(const std::vector &aln); + + // used only by VW to load word alignment between sentences + explicit AlignmentInfo(const std::string &str); + + void BuildNonTermIndexMaps(); + + CollType m_collection; + NonTermIndexMap m_nonTermIndexMap; + NonTermIndexMap m_nonTermIndexMap2; +}; + +/** Define an arbitrary strict weak ordering between AlignmentInfo objects + * for use by AlignmentInfoCollection. + */ +struct AlignmentInfoOrderer { + bool operator()(const AlignmentInfo &a, const AlignmentInfo &b) const { + if (a.m_collection == b.m_collection) { + return a.m_nonTermIndexMap < b.m_nonTermIndexMap; + } else { + return a.m_collection < b.m_collection; + } + } +}; + +/** + * Hashing functoid + **/ +struct AlignmentInfoHasher { + size_t operator()(const AlignmentInfo& a) const { + size_t seed = 0; + boost::hash_combine(seed,a.m_collection); + boost::hash_combine(seed,a.m_nonTermIndexMap); + return seed; + } + +}; + +inline size_t hash_value(const AlignmentInfo& a) +{ + static AlignmentInfoHasher hasher; + return hasher(a); +} + +} diff --git a/mosesdecoder/moses2/AlignmentInfoCollection.cpp b/mosesdecoder/moses2/AlignmentInfoCollection.cpp new file mode 100644 index 0000000000000000000000000000000000000000..a6116400c417b6fd51204b03a42b6552e6b401c6 --- /dev/null +++ b/mosesdecoder/moses2/AlignmentInfoCollection.cpp @@ -0,0 +1,62 @@ +/*********************************************************************** + Moses - statistical machine translation system + Copyright (C) 2006-2011 University of Edinburgh + + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. 
+ + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +***********************************************************************/ + +#include "AlignmentInfoCollection.h" + +using namespace std; + +namespace Moses2 +{ + +AlignmentInfoCollection AlignmentInfoCollection::s_instance; + +AlignmentInfoCollection::AlignmentInfoCollection() +{ + std::set > pairs; + m_emptyAlignmentInfo = Add(pairs); +} + +AlignmentInfoCollection::~AlignmentInfoCollection() +{} + +const AlignmentInfo &AlignmentInfoCollection::GetEmptyAlignmentInfo() const +{ + return *m_emptyAlignmentInfo; +} + +AlignmentInfo const * +AlignmentInfoCollection:: +Add(AlignmentInfo const& ainfo) +{ +#ifdef WITH_THREADS + { + boost::shared_lock read_lock(m_accessLock); + AlignmentInfoSet::const_iterator i = m_collection.find(ainfo); + if (i != m_collection.end()) + return &*i; + } + boost::unique_lock lock(m_accessLock); +#endif + std::pair ret = m_collection.insert(ainfo); + return &(*ret.first); +} + + + +} diff --git a/mosesdecoder/moses2/AlignmentInfoCollection.h b/mosesdecoder/moses2/AlignmentInfoCollection.h new file mode 100644 index 0000000000000000000000000000000000000000..0d409430d76cc87fb60bab7f660d21a919e58b1a --- /dev/null +++ b/mosesdecoder/moses2/AlignmentInfoCollection.h @@ -0,0 +1,81 @@ +/*********************************************************************** + Moses - statistical machine translation system + Copyright (C) 2006-2011 University of Edinburgh + + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License 
as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +***********************************************************************/ + +#pragma once + +#include "AlignmentInfo.h" + +#include + +#ifdef WITH_THREADS +#include +#include +#endif + +namespace Moses2 +{ + +/** Singleton collection of all AlignmentInfo objects. + * Used as a cache of all alignment info to save space. + */ +class AlignmentInfoCollection +{ +public: + static AlignmentInfoCollection &Instance() { + return s_instance; + } + + /** Returns a pointer to an AlignmentInfo object with the same source-target + * alignment pairs as given in the argument. If the collection already + * contains such an object then returns a pointer to it; otherwise a new + * one is inserted. + */ +private: + const AlignmentInfo* Add(AlignmentInfo const& ainfo); + +public: + template + AlignmentInfo const * + Add(ALNREP const & aln) { + return this->Add(AlignmentInfo(aln)); + } + + //! Returns a pointer to an empty AlignmentInfo object. + const AlignmentInfo &GetEmptyAlignmentInfo() const; + +private: + typedef std::set AlignmentInfoSet; + + + //! Only a single static variable should be created. 
+ AlignmentInfoCollection(); + ~AlignmentInfoCollection(); + + static AlignmentInfoCollection s_instance; + +#ifdef WITH_THREADS + //reader-writer lock + mutable boost::shared_mutex m_accessLock; +#endif + + AlignmentInfoSet m_collection; + const AlignmentInfo *m_emptyAlignmentInfo; +}; + +} diff --git a/mosesdecoder/moses2/ArcLists.cpp b/mosesdecoder/moses2/ArcLists.cpp new file mode 100644 index 0000000000000000000000000000000000000000..1143024c0bcb9aee43c4ae39bb54710d1229273b --- /dev/null +++ b/mosesdecoder/moses2/ArcLists.cpp @@ -0,0 +1,127 @@ +/* + * ArcList.cpp + * + * Created on: 26 Oct 2015 + * Author: hieu + */ +#include +#include +#include +#include +#include "ArcLists.h" +#include "HypothesisBase.h" +#include "util/exception.hh" + +using namespace std; + +namespace Moses2 +{ + +ArcLists::ArcLists() +{ + // TODO Auto-generated constructor stub + +} + +ArcLists::~ArcLists() +{ + BOOST_FOREACH(const Coll::value_type &collPair, m_coll) { + const ArcList *arcList = collPair.second; + delete arcList; + } +} + +void ArcLists::AddArc(bool added, const HypothesisBase *currHypo, + const HypothesisBase *otherHypo) +{ + //cerr << added << " " << currHypo << " " << otherHypo << endl; + ArcList *arcList; + if (added) { + // we're winners! + if (otherHypo) { + // there was a existing losing hypo + arcList = &GetAndDetachArcList(otherHypo); + } else { + // there was no existing hypo + arcList = new ArcList; + } + m_coll[currHypo] = arcList; + } else { + // we're losers! 
+ // there should be a winner, we're not doing beam pruning + UTIL_THROW_IF2(otherHypo == NULL, "There must have been a winning hypo"); + arcList = &GetArcList(otherHypo); + } + + // in any case, add the curr hypo + arcList->push_back(currHypo); +} + +ArcList &ArcLists::GetArcList(const HypothesisBase *hypo) +{ + Coll::iterator iter = m_coll.find(hypo); + UTIL_THROW_IF2(iter == m_coll.end(), "Can't find arc list"); + ArcList &arcList = *iter->second; + return arcList; +} + +const ArcList &ArcLists::GetArcList(const HypothesisBase *hypo) const +{ + Coll::const_iterator iter = m_coll.find(hypo); + + if (iter == m_coll.end()) { + cerr << "looking for:" << hypo << " have " << m_coll.size() << " :"; + BOOST_FOREACH(const Coll::value_type &collPair, m_coll) { + const HypothesisBase *hypo = collPair.first; + cerr << hypo << " "; + } + } + + UTIL_THROW_IF2(iter == m_coll.end(), "Can't find arc list for " << hypo); + ArcList &arcList = *iter->second; + return arcList; +} + +ArcList &ArcLists::GetAndDetachArcList(const HypothesisBase *hypo) +{ + Coll::iterator iter = m_coll.find(hypo); + UTIL_THROW_IF2(iter == m_coll.end(), "Can't find arc list"); + ArcList &arcList = *iter->second; + + m_coll.erase(iter); + + return arcList; +} + +void ArcLists::Sort() +{ + BOOST_FOREACH(Coll::value_type &collPair, m_coll) { + ArcList &list = *collPair.second; + std::sort(list.begin(), list.end(), HypothesisFutureScoreOrderer() ); + } +} + +void ArcLists::Delete(const HypothesisBase *hypo) +{ + //cerr << "hypo=" << hypo->Debug() << endl; + //cerr << "m_coll=" << m_coll.size() << endl; + Coll::iterator iter = m_coll.find(hypo); + UTIL_THROW_IF2(iter == m_coll.end(), "Can't find arc list"); + ArcList *arcList = iter->second; + + m_coll.erase(iter); + delete arcList; +} + +std::string ArcLists::Debug(const System &system) const +{ + stringstream strm; + BOOST_FOREACH(const Coll::value_type &collPair, m_coll) { + const ArcList *arcList = collPair.second; + strm << arcList << "(" << 
arcList->size() << ") "; + } + return strm.str(); +} + +} + diff --git a/mosesdecoder/moses2/ArcLists.h b/mosesdecoder/moses2/ArcLists.h new file mode 100644 index 0000000000000000000000000000000000000000..742c9d9e25617e4ecfb5d0a6afd02b72fce269fe --- /dev/null +++ b/mosesdecoder/moses2/ArcLists.h @@ -0,0 +1,43 @@ +/* + * ArcList.h + * + * Created on: 26 Oct 2015 + * Author: hieu + */ +#pragma once +#include +#include + +namespace Moses2 +{ +class System; + +class HypothesisBase; + +typedef std::vector ArcList; + +class ArcLists +{ +public: + ArcLists(); + virtual ~ArcLists(); + + void AddArc(bool added, const HypothesisBase *currHypo, + const HypothesisBase *otherHypo); + void Sort(); + void Delete(const HypothesisBase *hypo); + + const ArcList &GetArcList(const HypothesisBase *hypo) const; + + std::string Debug(const System &system) const; +protected: + typedef boost::unordered_map Coll; + Coll m_coll; + + ArcList &GetArcList(const HypothesisBase *hypo); + ArcList &GetAndDetachArcList(const HypothesisBase *hypo); + +}; + +} + diff --git a/mosesdecoder/moses2/Array.h b/mosesdecoder/moses2/Array.h new file mode 100644 index 0000000000000000000000000000000000000000..fa6db557ea84b33fc5ee2e00f884f3bc2882a900 --- /dev/null +++ b/mosesdecoder/moses2/Array.h @@ -0,0 +1,83 @@ +#pragma once +#include +#include +#include "MemPool.h" + +namespace Moses2 +{ + +template +class Array +{ +public: + typedef T* iterator; + typedef const T* const_iterator; + //! 
iterators + const_iterator begin() const { + return m_arr; + } + const_iterator end() const { + return m_arr + m_size; + } + + iterator begin() { + return m_arr; + } + iterator end() { + return m_arr + m_size; + } + + Array(MemPool &pool, size_t size = 0, const T &val = T()) { + m_size = size; + m_maxSize = size; + m_arr = pool.Allocate(size); + for (size_t i = 0; i < size; ++i) { + m_arr[i] = val; + } + } + + size_t size() const { + return m_size; + } + + const T& operator[](size_t ind) const { + return m_arr[ind]; + } + + T& operator[](size_t ind) { + return m_arr[ind]; + } + + T *GetArray() { + return m_arr; + } + + size_t hash() const { + size_t seed = 0; + for (size_t i = 0; i < m_size; ++i) { + boost::hash_combine(seed, m_arr[i]); + } + return seed; + } + + int Compare(const Array &compare) const { + + int cmp = memcmp(m_arr, compare.m_arr, sizeof(T) * m_size); + return cmp; + } + + bool operator==(const Array &compare) const { + int cmp = Compare(compare); + return cmp == 0; + } + + void resize(size_t newSize) { + assert(m_size <= m_maxSize); + m_size = newSize; + } +protected: + size_t m_size, m_maxSize; + T *m_arr; +}; + +} diff --git a/mosesdecoder/moses2/DLLEntryApi.cpp b/mosesdecoder/moses2/DLLEntryApi.cpp new file mode 100644 index 0000000000000000000000000000000000000000..060bea67828323945a520f410a3df99ea5107045 --- /dev/null +++ b/mosesdecoder/moses2/DLLEntryApi.cpp @@ -0,0 +1,74 @@ +#include "Moses2Wrapper.h" +#include +#include + + +// Generic helper definitions for shared library support +#if defined _WIN32 +#define IMPORT __declspec(dllimport) +#define EXPORT __declspec(dllexport) +#else // !(defined _WIN32 || defined __CYGWIN__) -- i.e., not Windows +#define __stdcall +#if __GNUC__ >= 4 +#define IMPORT __attribute__ ((visibility ("default"))) +#define EXPORT __attribute__ ((visibility ("default"))) +#else // __GNUC__ < 4, which does not support the __attribute__ tag +#define IMPORT +#define EXPORT +#endif // __GNUC__ >= 4 +#endif + + +using 
namespace std; +using namespace Moses2; + +extern "C" EXPORT MosesApiErrorCode __stdcall GetMosesSystem(const char* filePath, Moses2::Moses2Wrapper * *pObject) { + + if (*pObject == NULL) { + *pObject = new Moses2::Moses2Wrapper(filePath); + return MS_API_OK; + } + else { + return MS_API_E_FAILURE; + } +} + +extern "C" EXPORT MosesApiErrorCode __stdcall Translate(Moses2::Moses2Wrapper * pObject, long id, const char* input, char** output) { + if (pObject != NULL) + { + std::string tr = pObject->Translate(input, id); + *output = Moses2Wrapper::CopyString(tr.c_str()); + return MS_API_OK; + } + else { + return MS_API_E_FAILURE; + } +} + +extern "C" EXPORT MosesApiErrorCode __stdcall FreeMemory(char* output) { + if (output != nullptr) { + Moses2Wrapper::Free(output); + return MS_API_OK; + } + else { + return MS_API_E_FAILURE; + } +} + +extern "C" EXPORT MosesApiErrorCode __stdcall ReleaseSystem(Moses2::Moses2Wrapper **pObject) { + if (*pObject != NULL) + { + delete* pObject; + *pObject = NULL; + return MS_API_OK; + } + else { + return MS_API_E_FAILURE; + } +} + +extern "C" EXPORT MosesApiErrorCode __stdcall EngineVersion() { + //std::cout << "windows build on v1142/ msvc 14.27.29110"<< std::endl; + std::cout << "0.0.1" << std::endl; + return MS_API_OK; +} \ No newline at end of file diff --git a/mosesdecoder/moses2/EstimatedScores.cpp b/mosesdecoder/moses2/EstimatedScores.cpp new file mode 100644 index 0000000000000000000000000000000000000000..e71647ce529df13cd503675af1be2c02c72eacdd --- /dev/null +++ b/mosesdecoder/moses2/EstimatedScores.cpp @@ -0,0 +1,117 @@ +// $Id$ +// vim:tabstop=2 + +/*********************************************************************** + Moses - factored phrase-based language decoder + Copyright (C) 2006 University of Edinburgh + + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, 
or (at your option) any later version. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + ***********************************************************************/ + +#include +#include +#include "EstimatedScores.h" + +using namespace std; + +namespace Moses2 +{ +/** + * Calculate future score estimate for a given coverage bitmap + * + * /param bitmap coverage bitmap + */ + +float EstimatedScores::CalcEstimatedScore(Bitmap const &bitmap) const +{ + const size_t notInGap = numeric_limits::max(); + size_t startGap = notInGap; + float estimatedScore = 0.0f; + for (size_t currPos = 0; currPos < bitmap.GetSize(); currPos++) { + // start of a new gap? + if (bitmap.GetValue(currPos) == false && startGap == notInGap) { + startGap = currPos; + } + // end of a gap? + else if (bitmap.GetValue(currPos) == true && startGap != notInGap) { + estimatedScore += GetValue(startGap, currPos - 1); + startGap = notInGap; + } + } + // coverage ending with gap? + if (startGap != notInGap) { + estimatedScore += GetValue(startGap, bitmap.GetSize() - 1); + } + + return estimatedScore; +} + +/** + * Calculare future score estimate for a given coverage bitmap + * and an additional span that is also covered. This function is used + * to compute future score estimates for hypotheses that we may want + * build, but first want to check. + * + * Note: this function is implemented a bit more complex than + * the basic one (w/o additional phrase) for speed reasons, + * which is probably overkill. 
+ * + * /param bitmap coverage bitmap + * /param startPos start of the span that is added to the coverage + * /param endPos end of the span that is added to the coverage + */ + +float EstimatedScores::CalcEstimatedScore(Bitmap const &bitmap, size_t startPos, + size_t endPos) const +{ + const size_t notInGap = numeric_limits::max(); + float estimatedScore = 0.0f; + size_t startGap = bitmap.GetFirstGapPos(); + if (startGap == NOT_FOUND) return estimatedScore; // everything filled + + // start loop at first gap + size_t startLoop = startGap + 1; + if (startPos == startGap) { // unless covered by phrase + startGap = notInGap; + startLoop = endPos + 1; // -> postpone start + } + + size_t lastCovered = bitmap.GetLastPos(); + if (endPos > lastCovered || lastCovered == NOT_FOUND) lastCovered = endPos; + + for (size_t currPos = startLoop; currPos <= lastCovered; currPos++) { + // start of a new gap? + if (startGap == notInGap && bitmap.GetValue(currPos) == false + && (currPos < startPos || currPos > endPos)) { + startGap = currPos; + } + // end of a gap? + else if (startGap != notInGap + && (bitmap.GetValue(currPos) == true + || (startPos <= currPos && currPos <= endPos))) { + estimatedScore += GetValue(startGap, currPos - 1); + startGap = notInGap; + } + } + // coverage ending with gap? 
+ if (lastCovered != bitmap.GetSize() - 1) { + estimatedScore += GetValue(lastCovered + 1, bitmap.GetSize() - 1); + } + + return estimatedScore; +} + +} + diff --git a/mosesdecoder/moses2/EstimatedScores.h b/mosesdecoder/moses2/EstimatedScores.h new file mode 100644 index 0000000000000000000000000000000000000000..f854707839b6df1c4e44412b4f456711029eb15a --- /dev/null +++ b/mosesdecoder/moses2/EstimatedScores.h @@ -0,0 +1,59 @@ +// $Id$ + +/*********************************************************************** + Moses - factored phrase-based language decoder + Copyright (C) 2006 University of Edinburgh + + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + ***********************************************************************/ + +#pragma once + +#include +#include "legacy/Util2.h" +#include "legacy/Bitmap.h" +#include "legacy/Matrix.h" + +namespace Moses2 +{ +class MemPool; +class System; + +//! 
A square array of floats to store future costs in the phrase-based decoder +class EstimatedScores: public Matrix +{ +public: + EstimatedScores(MemPool &pool, size_t size) : + Matrix(pool, size, size) { + } + + ~EstimatedScores(); // not implemented + + float CalcEstimatedScore(Bitmap const&) const; + float CalcEstimatedScore(Bitmap const&, size_t startPos, size_t endPos) const; + + std::ostream &Debug(std::ostream &out, const System &system) const { + for (size_t endPos = 0; endPos < GetSize(); endPos++) { + for (size_t startPos = 0; startPos < GetSize(); startPos++) + out << GetValue(startPos, endPos) << " "; + out << std::endl; + } + return out; + } + +}; + +} + diff --git a/mosesdecoder/moses2/FF/Distortion.cpp b/mosesdecoder/moses2/FF/Distortion.cpp new file mode 100644 index 0000000000000000000000000000000000000000..3c0cd8ceee4416eb271c3647e5990a73599c5bec --- /dev/null +++ b/mosesdecoder/moses2/FF/Distortion.cpp @@ -0,0 +1,182 @@ +/* + * Distortion.cpp + * + * Created on: 28 Oct 2015 + * Author: hieu + */ +#include +#include "Distortion.h" +#include "../PhraseBased/Hypothesis.h" +#include "../PhraseBased/Manager.h" +#include "../legacy/Range.h" +#include "../legacy/Bitmap.h" + +using namespace std; + +namespace Moses2 +{ + +struct DistortionState_traditional: public FFState { + Range range; + int first_gap; + + DistortionState_traditional() : + range() { + // uninitialised + } + + void Set(const Range& wr, int fg) { + range = wr; + first_gap = fg; + } + + size_t hash() const { + return range.GetEndPos(); + } + virtual bool operator==(const FFState& other) const { + const DistortionState_traditional& o = + static_cast(other); + return range.GetEndPos() == o.range.GetEndPos(); + } + + virtual std::string ToString() const { + stringstream sb; + sb << first_gap << " " << range; + return sb.str(); + } + +}; + +/////////////////////////////////////////////////////////////////////// +Distortion::Distortion(size_t startInd, const std::string &line) : + 
StatefulFeatureFunction(startInd, line) +{ + ReadParameters(); +} + +Distortion::~Distortion() +{ + // TODO Auto-generated destructor stub +} + +FFState* Distortion::BlankState(MemPool &pool, const System &sys) const +{ + return new (pool.Allocate()) DistortionState_traditional(); +} + +void Distortion::EmptyHypothesisState(FFState &state, const ManagerBase &mgr, + const InputType &input, const Hypothesis &hypo) const +{ + DistortionState_traditional &stateCast = + static_cast(state); + + // fake previous translated phrase start and end + size_t start = NOT_FOUND; + size_t end = NOT_FOUND; + /* + if (input.m_frontSpanCoveredLength > 0) { + // can happen with --continue-partial-translation + start = 0; + end = input.m_frontSpanCoveredLength -1; + } + */ + + stateCast.range = Range(start, end); + stateCast.first_gap = NOT_FOUND; +} + +void Distortion::EvaluateInIsolation(MemPool &pool, const System &system, + const Phrase &source, const TargetPhraseImpl &targetPhrase, Scores &scores, + SCORE &estimatedScore) const +{ +} + +void Distortion::EvaluateInIsolation(MemPool &pool, const System &system, const Phrase &source, + const TargetPhrase &targetPhrase, Scores &scores, + SCORE &estimatedScore) const +{ +} + +void Distortion::EvaluateWhenApplied(const ManagerBase &mgr, + const Hypothesis &hypo, const FFState &prevState, Scores &scores, + FFState &state) const +{ + const DistortionState_traditional &prev = + static_cast(prevState); + SCORE distortionScore = CalculateDistortionScore(prev.range, + hypo.GetInputPath().range, prev.first_gap); + //cerr << "distortionScore=" << distortionScore << endl; + + scores.PlusEquals(mgr.system, *this, distortionScore); + + DistortionState_traditional &stateCast = + static_cast(state); + stateCast.Set(hypo.GetInputPath().range, hypo.GetBitmap().GetFirstGapPos()); + + //cerr << "hypo=" << hypo.Debug(mgr.system) << endl; +} + +SCORE Distortion::CalculateDistortionScore(const Range &prev, const Range &curr, + const int FirstGap) const +{ 
+ bool useEarlyDistortionCost = false; + if (!useEarlyDistortionCost) { + return -(SCORE) ComputeDistortionDistance(prev, curr); + } else { + /* Pay distortion score as soon as possible, from Moore and Quirk MT Summit 2007 + Definitions: + S : current source range + S' : last translated source phrase range + S'' : longest fully-translated initial segment + */ + + int prefixEndPos = (int) FirstGap - 1; + if ((int) FirstGap == -1) prefixEndPos = -1; + + // case1: S is adjacent to S'' => return 0 + if ((int) curr.GetStartPos() == prefixEndPos + 1) { + //IFVERBOSE(4) std::cerr<< "MQ07disto:case1" << std::endl; + return 0; + } + + // case2: S is to the left of S' => return 2(length(S)) + if ((int) curr.GetEndPos() < (int) prev.GetEndPos()) { + //IFVERBOSE(4) std::cerr<< "MQ07disto:case2" << std::endl; + return (float) -2 * (int) curr.GetNumWordsCovered(); + } + + // case3: S' is a subsequence of S'' => return 2(nbWordBetween(S,S'')+length(S)) + if ((int) prev.GetEndPos() <= prefixEndPos) { + //IFVERBOSE(4) std::cerr<< "MQ07disto:case3" << std::endl; + int z = (int) curr.GetStartPos() - prefixEndPos - 1; + return (float) -2 * (z + (int) curr.GetNumWordsCovered()); + } + + // case4: otherwise => return 2(nbWordBetween(S,S')+length(S)) + //IFVERBOSE(4) std::cerr<< "MQ07disto:case4" << std::endl; + return (float) -2 + * ((int) curr.GetNumWordsBetween(prev) + (int) curr.GetNumWordsCovered()); + + } +} + +int Distortion::ComputeDistortionDistance(const Range& prev, + const Range& current) const +{ + int dist = 0; + if (prev.GetNumWordsCovered() == 0) { + dist = current.GetStartPos(); + } else { + dist = (int) prev.GetEndPos() - (int) current.GetStartPos() + 1; + } + return abs(dist); +} + +void Distortion::EvaluateWhenApplied(const SCFG::Manager &mgr, + const SCFG::Hypothesis &hypo, int featureID, Scores &scores, + FFState &state) const +{ + UTIL_THROW2("Not implemented"); +} + +} diff --git a/mosesdecoder/moses2/FF/Distortion.h b/mosesdecoder/moses2/FF/Distortion.h new file 
mode 100644 index 0000000000000000000000000000000000000000..685aa1445b387e49085a622d316ad7fb7be3985f --- /dev/null +++ b/mosesdecoder/moses2/FF/Distortion.h @@ -0,0 +1,59 @@ +/* + * Distortion.h + * + * Created on: 28 Oct 2015 + * Author: hieu + */ + +#ifndef DISTORTION_H_ +#define DISTORTION_H_ + +#include "StatefulFeatureFunction.h" +#include "../legacy/Range.h" +#include "../TypeDef.h" + +namespace Moses2 +{ + +class Distortion: public StatefulFeatureFunction +{ +public: + Distortion(size_t startInd, const std::string &line); + virtual ~Distortion(); + + virtual FFState* BlankState(MemPool &pool, const System &sys) const; + virtual void EmptyHypothesisState(FFState &state, const ManagerBase &mgr, + const InputType &input, const Hypothesis &hypo) const; + + virtual void + EvaluateInIsolation(MemPool &pool, const System &system, const Phrase &source, + const TargetPhraseImpl &targetPhrase, Scores &scores, + SCORE &estimatedScore) const; + + virtual void + EvaluateInIsolation(MemPool &pool, const System &system, const Phrase &source, + const TargetPhrase &targetPhrase, Scores &scores, + SCORE &estimatedScore) const; + + virtual void EvaluateWhenApplied(const std::deque &hypos) const { + } + + virtual void EvaluateWhenApplied(const ManagerBase &mgr, + const Hypothesis &hypo, const FFState &prevState, Scores &scores, + FFState &state) const; + + virtual void EvaluateWhenApplied(const SCFG::Manager &mgr, + const SCFG::Hypothesis &hypo, int featureID, Scores &scores, + FFState &state) const; + +protected: + SCORE CalculateDistortionScore(const Range &prev, const Range &curr, + const int FirstGap) const; + + int ComputeDistortionDistance(const Range& prev, const Range& current) const; + +}; + +} + +#endif /* DISTORTION_H_ */ diff --git a/mosesdecoder/moses2/FF/ExampleStatefulFF.cpp b/mosesdecoder/moses2/FF/ExampleStatefulFF.cpp new file mode 100644 index 0000000000000000000000000000000000000000..86b364f5320a392c4a23a70416f5a0370696be92 --- /dev/null +++ 
b/mosesdecoder/moses2/FF/ExampleStatefulFF.cpp @@ -0,0 +1,96 @@ +/* + * ExampleStatefulFF.cpp + * + * Created on: 27 Oct 2015 + * Author: hieu + */ +#include +#include "ExampleStatefulFF.h" +#include "../PhraseBased/Manager.h" +#include "../PhraseBased/Hypothesis.h" + +using namespace std; + +namespace Moses2 +{ + +class ExampleState: public FFState +{ +public: + int targetLen; + + ExampleState() { + // uninitialised + } + + virtual size_t hash() const { + return (size_t) targetLen; + } + virtual bool operator==(const FFState& o) const { + const ExampleState& other = static_cast(o); + return targetLen == other.targetLen; + } + + virtual std::string ToString() const { + stringstream sb; + sb << targetLen; + return sb.str(); + } + +}; + +//////////////////////////////////////////////////////////////////////////////////////// +ExampleStatefulFF::ExampleStatefulFF(size_t startInd, const std::string &line) : + StatefulFeatureFunction(startInd, line) +{ + ReadParameters(); +} + +ExampleStatefulFF::~ExampleStatefulFF() +{ + // TODO Auto-generated destructor stub +} + +FFState* ExampleStatefulFF::BlankState(MemPool &pool, const System &sys) const +{ + return new (pool.Allocate()) ExampleState(); +} + +void ExampleStatefulFF::EmptyHypothesisState(FFState &state, + const ManagerBase &mgr, const InputType &input, + const Hypothesis &hypo) const +{ + ExampleState &stateCast = static_cast(state); + stateCast.targetLen = 0; +} + +void ExampleStatefulFF::EvaluateInIsolation(MemPool &pool, + const System &system, const Phrase &source, + const TargetPhraseImpl &targetPhrase, Scores &scores, + SCORE &estimatedScore) const +{ +} + +void ExampleStatefulFF::EvaluateInIsolation(MemPool &pool, const System &system, const Phrase &source, + const TargetPhrase &targetPhrase, Scores &scores, + SCORE &estimatedScore) const +{ +} + +void ExampleStatefulFF::EvaluateWhenApplied(const ManagerBase &mgr, + const Hypothesis &hypo, const FFState &prevState, Scores &scores, + FFState &state) const +{ 
+ ExampleState &stateCast = static_cast(state); + stateCast.targetLen = hypo.GetTargetPhrase().GetSize(); +} + +void ExampleStatefulFF::EvaluateWhenApplied(const SCFG::Manager &mgr, + const SCFG::Hypothesis &hypo, int featureID, Scores &scores, + FFState &state) const +{ + UTIL_THROW2("Not implemented"); +} + +} + diff --git a/mosesdecoder/moses2/FF/ExampleStatefulFF.h b/mosesdecoder/moses2/FF/ExampleStatefulFF.h new file mode 100644 index 0000000000000000000000000000000000000000..437f545154f92fcd7543e480a66dd9cd7bb7e562 --- /dev/null +++ b/mosesdecoder/moses2/FF/ExampleStatefulFF.h @@ -0,0 +1,46 @@ +/* + * ExampleStatefulFF.h + * + * Created on: 27 Oct 2015 + * Author: hieu + */ + +#pragma once + +#include "StatefulFeatureFunction.h" + +namespace Moses2 +{ + +class ExampleStatefulFF: public StatefulFeatureFunction +{ +public: + ExampleStatefulFF(size_t startInd, const std::string &line); + virtual ~ExampleStatefulFF(); + + virtual FFState* BlankState(MemPool &pool, const System &sys) const; + virtual void EmptyHypothesisState(FFState &state, const ManagerBase &mgr, + const InputType &input, const Hypothesis &hypo) const; + + virtual void + EvaluateInIsolation(MemPool &pool, const System &system, const Phrase &source, + const TargetPhraseImpl &targetPhrase, Scores &scores, + SCORE &estimatedScore) const; + + virtual void + EvaluateInIsolation(MemPool &pool, const System &system, const Phrase &source, + const TargetPhrase &targetPhrase, Scores &scores, + SCORE &estimatedScore) const; + + virtual void EvaluateWhenApplied(const ManagerBase &mgr, + const Hypothesis &hypo, const FFState &prevState, Scores &scores, + FFState &state) const; + + virtual void EvaluateWhenApplied(const SCFG::Manager &mgr, + const SCFG::Hypothesis &hypo, int featureID, Scores &scores, + FFState &state) const; + +}; + +} + diff --git a/mosesdecoder/moses2/FF/ExampleStatelessFF.cpp b/mosesdecoder/moses2/FF/ExampleStatelessFF.cpp new file mode 100644 index 
0000000000000000000000000000000000000000..29716aaf80e7838cb37fa8c26cfd1658ab4dd2fa --- /dev/null +++ b/mosesdecoder/moses2/FF/ExampleStatelessFF.cpp @@ -0,0 +1,40 @@ +/* + * SkeletonStatefulFF.cpp + * + * Created on: 27 Oct 2015 + * Author: hieu + */ +#include "../Scores.h" + +#include "ExampleStatelessFF.h" + +namespace Moses2 +{ + +ExampleStatelessFF::ExampleStatelessFF(size_t startInd, + const std::string &line) : + StatelessFeatureFunction(startInd, line) +{ + ReadParameters(); +} + +ExampleStatelessFF::~ExampleStatelessFF() +{ + // TODO Auto-generated destructor stub +} + +void ExampleStatelessFF::EvaluateInIsolation(MemPool &pool, + const System &system, const Phrase &source, + const TargetPhraseImpl &targetPhrase, Scores &scores, + SCORE &estimatedScore) const +{ +} + +void ExampleStatelessFF::EvaluateInIsolation(MemPool &pool, const System &system, const Phrase &source, + const TargetPhrase &targetPhrase, Scores &scores, + SCORE &estimatedScore) const +{ +} + +} + diff --git a/mosesdecoder/moses2/FF/ExampleStatelessFF.h b/mosesdecoder/moses2/FF/ExampleStatelessFF.h new file mode 100644 index 0000000000000000000000000000000000000000..20b1acaaf00690c2844943ed61935106d9e5da33 --- /dev/null +++ b/mosesdecoder/moses2/FF/ExampleStatelessFF.h @@ -0,0 +1,34 @@ +/* + * SkeletonStatefulFF.h + * + * Created on: 27 Oct 2015 + * Author: hieu + */ + +#pragma once + +#include "StatelessFeatureFunction.h" + +namespace Moses2 +{ + +class ExampleStatelessFF: public StatelessFeatureFunction +{ +public: + ExampleStatelessFF(size_t startInd, const std::string &line); + virtual ~ExampleStatelessFF(); + + virtual void + EvaluateInIsolation(MemPool &pool, const System &system, const Phrase &source, + const TargetPhraseImpl &targetPhrase, Scores &scores, + SCORE &estimatedScore) const; + + virtual void + EvaluateInIsolation(MemPool &pool, const System &system, const Phrase &source, + const TargetPhrase &targetPhrase, Scores &scores, + SCORE &estimatedScore) const; + +}; + +} + diff 
--git a/mosesdecoder/moses2/FF/FFState.cpp b/mosesdecoder/moses2/FF/FFState.cpp new file mode 100644 index 0000000000000000000000000000000000000000..c92b213fac671241036c2dcea40535fd28ba8c4f --- /dev/null +++ b/mosesdecoder/moses2/FF/FFState.cpp @@ -0,0 +1 @@ +#include "FFState.h" diff --git a/mosesdecoder/moses2/FF/FFState.h b/mosesdecoder/moses2/FF/FFState.h new file mode 100644 index 0000000000000000000000000000000000000000..41789b7dc906320d6ad8b5facb6ef2d260633575 --- /dev/null +++ b/mosesdecoder/moses2/FF/FFState.h @@ -0,0 +1,50 @@ +#pragma once + +#include +#include +#include "util/exception.hh" + +namespace Moses2 +{ + +class FFState +{ +public: + virtual ~FFState() { + } + virtual size_t hash() const = 0; + virtual bool operator==(const FFState& other) const = 0; + + virtual bool operator!=(const FFState& other) const { + return !(*this == other); + } + + virtual std::string ToString() const = 0; +}; + +//////////////////////////////////////////////////////////////////////////////////////// +inline std::ostream& operator<<(std::ostream& out, const FFState& obj) +{ + out << obj.ToString(); + return out; +} + +//////////////////////////////////////////////////////////////////////////////////////// +class DummyState: public FFState +{ +public: + DummyState() { + } + + virtual size_t hash() const { + return 0; + } + + virtual bool operator==(const FFState& other) const { + return true; + } + +}; + +} + diff --git a/mosesdecoder/moses2/FF/FeatureFunction.cpp b/mosesdecoder/moses2/FF/FeatureFunction.cpp new file mode 100644 index 0000000000000000000000000000000000000000..6b4617dc565d1af2fc56e93510415f77a445622e --- /dev/null +++ b/mosesdecoder/moses2/FF/FeatureFunction.cpp @@ -0,0 +1,82 @@ +/* + * FeatureFunction.cpp + * + * Created on: 23 Oct 2015 + * Author: hieu + */ +#include +#include +#include "FeatureFunction.h" +#include "../System.h" +#include "../legacy/Util2.h" +#include "util/exception.hh" + +using namespace std; + +namespace Moses2 +{ + 
+FeatureFunction::FeatureFunction(size_t startInd, const std::string &line) + :m_startInd(startInd) + ,m_numScores(1) + ,m_PhraseTableInd(NOT_FOUND) + ,m_tuneable(true) +{ + ParseLine(line); + //cerr << GetName() << " " << m_startInd << "-" << (m_startInd + m_numScores - 1) << endl; +} + +FeatureFunction::~FeatureFunction() +{ + // TODO Auto-generated destructor stub +} + +void FeatureFunction::ParseLine(const std::string &line) +{ + vector toks = Tokenize(line); + UTIL_THROW_IF2(toks.empty(), "Empty line"); + + string nameStub = toks[0]; + + set keys; + + for (size_t i = 1; i < toks.size(); ++i) { + vector args = TokenizeFirstOnly(toks[i], "="); + UTIL_THROW_IF2(args.size() != 2, + "Incorrect format for feature function arg: " << toks[i]); + + pair::iterator, bool> ret = keys.insert(args[0]); + UTIL_THROW_IF2(!ret.second, "Duplicate key in line " << line); + + if (args[0] == "num-features") { + m_numScores = Scan(args[1]); + } else if (args[0] == "name") { + m_name = args[1]; + } else { + m_args.push_back(args); + } + } +} + +void FeatureFunction::ReadParameters() +{ + while (!m_args.empty()) { + const vector &args = m_args[0]; + SetParameter(args[0], args[1]); + + m_args.erase(m_args.begin()); + } +} + +void FeatureFunction::SetParameter(const std::string& key, + const std::string& value) +{ + if (key == "tuneable") { + m_tuneable = Scan(value); + } else { + UTIL_THROW2(GetName() << ": Unknown argument " << key << "=" << value); + } +} + +} + diff --git a/mosesdecoder/moses2/FF/FeatureFunction.h b/mosesdecoder/moses2/FF/FeatureFunction.h new file mode 100644 index 0000000000000000000000000000000000000000..4fa2ee7c33b9ab43f800b8bbb4f52bdb7be2eaaa --- /dev/null +++ b/mosesdecoder/moses2/FF/FeatureFunction.h @@ -0,0 +1,118 @@ +/* + * FeatureFunction.h + * + * Created on: 23 Oct 2015 + * Author: hieu + */ + +#pragma once + +#include +#include +#include +#include "../TypeDef.h" +#include "../Phrase.h" + +namespace Moses2 +{ +template +class TargetPhrase; + +class 
System; +class PhraseImpl; +class TargetPhrases; +class TargetPhraseImpl; +class Scores; +class ManagerBase; +class MemPool; +class InputType; + +namespace SCFG +{ +class TargetPhrase; +class TargetPhrases; +class Word; +} + +class FeatureFunction +{ +public: + + FeatureFunction(size_t startInd, const std::string &line); + virtual ~FeatureFunction(); + virtual void Load(System &system) { + } + + size_t GetStartInd() const { + return m_startInd; + } + size_t GetNumScores() const { + return m_numScores; + } + const std::string &GetName() const { + return m_name; + } + void SetName(const std::string &val) { + m_name = val; + } + + virtual size_t HasPhraseTableInd() const { + return false; + } + void SetPhraseTableInd(size_t ind) { + m_PhraseTableInd = ind; + } + size_t GetPhraseTableInd() const { + return m_PhraseTableInd; + } + + //! if false, then this feature is not displayed in the n-best list. + // use with care + virtual bool IsTuneable() const { + return m_tuneable; + } + + virtual void SetParameter(const std::string& key, const std::string& value); + + // may have more factors than actually need, but not guaranteed. 
+ virtual void + EvaluateInIsolation(MemPool &pool, const System &system, const Phrase &source, + const TargetPhraseImpl &targetPhrase, Scores &scores, + SCORE &estimatedScore) const = 0; + + // For SCFG decoding, the source can contain non-terminals, NOT the raw + // source from the input sentence + virtual void + EvaluateInIsolation(MemPool &pool, const System &system, const Phrase &source, + const TargetPhrase &targetPhrase, Scores &scores, + SCORE &estimatedScore) const = 0; + + // used by lexicalised reordering model to add scores to tp data structures + virtual void EvaluateAfterTablePruning(MemPool &pool, + const TargetPhrases &tps, const Phrase &sourcePhrase) const { + } + + virtual void EvaluateAfterTablePruning(MemPool &pool, + const SCFG::TargetPhrases &tps, const Phrase &sourcePhrase) const { + } + + virtual void InitializeForInput(const ManagerBase &mgr, const InputType &input) { }; + + // clean up temporary memory, called after processing each sentence + virtual void CleanUpAfterSentenceProcessing(const System &system, const InputType &input) const { + } + +protected: + size_t m_startInd; + size_t m_numScores; + size_t m_PhraseTableInd; + std::string m_name; + std::vector > m_args; + bool m_tuneable; + + virtual void ReadParameters(); + void ParseLine(const std::string &line); +}; + +} + diff --git a/mosesdecoder/moses2/FF/FeatureFunctions.cpp b/mosesdecoder/moses2/FF/FeatureFunctions.cpp new file mode 100644 index 0000000000000000000000000000000000000000..3ea75b21da2ac203cbf04874b9d3f0415c267248 --- /dev/null +++ b/mosesdecoder/moses2/FF/FeatureFunctions.cpp @@ -0,0 +1,291 @@ +/* + * FeatureFunctions.cpp + * + * Created on: 27 Oct 2015 + * Author: hieu + */ + +#include +#include "FeatureRegistry.h" +#include "FeatureFunctions.h" +#include "StatefulFeatureFunction.h" +#include "../System.h" +#include "../Scores.h" +#include "../MemPool.h" + +#include "../TranslationModel/PhraseTable.h" +#include "../TranslationModel/UnknownWordPenalty.h" +#include 
"../SCFG/TargetPhraseImpl.h" +#include "../SCFG/Word.h" +#include "../PhraseBased/TargetPhraseImpl.h" +#include "util/exception.hh" + +using namespace std; + +namespace Moses2 +{ +FeatureFunctions::FeatureFunctions(System &system) : + m_system(system), m_ffStartInd(0) +{ +} + +FeatureFunctions::~FeatureFunctions() +{ + RemoveAllInColl(m_featureFunctions); +} + +void FeatureFunctions::Load() +{ + // load, everything but pts + BOOST_FOREACH(const FeatureFunction *ff, m_featureFunctions) { + FeatureFunction *nonConstFF = const_cast(ff); + PhraseTable *pt = dynamic_cast(nonConstFF); + + if (pt) { + // do nothing. load pt last + } else { + cerr << "Loading " << nonConstFF->GetName() << endl; + nonConstFF->Load(m_system); + cerr << "Finished loading " << nonConstFF->GetName() << endl; + } + } + +// load pt + BOOST_FOREACH(const PhraseTable *pt, phraseTables) { + PhraseTable *nonConstPT = const_cast(pt); + cerr << "Loading " << nonConstPT->GetName() << endl; + nonConstPT->Load(m_system); + cerr << "Finished loading " << nonConstPT->GetName() << endl; + } +} + +void FeatureFunctions::Create() +{ + const Parameter ¶ms = m_system.params; + + const PARAM_VEC *ffParams = params.GetParam("feature"); + UTIL_THROW_IF2(ffParams == NULL, "Must have [feature] section"); + + BOOST_FOREACH(const std::string &line, *ffParams) { + FeatureFunction *ff = Create(line); + + m_featureFunctions.push_back(ff); + + StatefulFeatureFunction *sfff = dynamic_cast(ff); + if (sfff) { + sfff->SetStatefulInd(m_statefulFeatureFunctions.size()); + m_statefulFeatureFunctions.push_back(sfff); + } + + if (ff->HasPhraseTableInd()) { + ff->SetPhraseTableInd(m_withPhraseTableInd.size()); + m_withPhraseTableInd.push_back(ff); + } + + PhraseTable *pt = dynamic_cast(ff); + if (pt) { + pt->SetPtInd(phraseTables.size()); + phraseTables.push_back(pt); + } + + UnknownWordPenalty *unkWP = dynamic_cast(pt); + if (unkWP) { + m_unkWP = unkWP; + + // legacy support + if (m_system.options.unk.drop) { + 
unkWP->SetParameter("drop", "true"); + } + if (m_system.options.unk.mark) { + unkWP->SetParameter("prefix", m_system.options.unk.prefix); + unkWP->SetParameter("suffix", m_system.options.unk.suffix); + } + } + } + + OverrideFeatures(); +} + +FeatureFunction *FeatureFunctions::Create(const std::string &line) +{ + vector toks = Tokenize(line); + + FeatureFunction *ff = FeatureRegistry::Instance().Construct(m_ffStartInd, toks[0], line); + UTIL_THROW_IF2(ff == NULL, "Feature function not created"); + + // name + if (ff->GetName() == "") { + ff->SetName(GetDefaultName(toks[0])); + } + + m_ffStartInd += ff->GetNumScores(); + + return ff; +} + +std::string FeatureFunctions::GetDefaultName(const std::string &stub) +{ + size_t ind; + boost::unordered_map::iterator iter = + m_defaultNames.find(stub); + if (iter == m_defaultNames.end()) { + m_defaultNames[stub] = 0; + ind = 0; + } else { + ind = ++(iter->second); + } + return stub + SPrint(ind); +} + +const FeatureFunction *FeatureFunctions::FindFeatureFunction( + const std::string &name) const +{ + BOOST_FOREACH(const FeatureFunction *ff, m_featureFunctions) { + if (ff->GetName() == name) { + return ff; + } + } + return NULL; +} + +FeatureFunction *FeatureFunctions::FindFeatureFunction( + const std::string &name) +{ + BOOST_FOREACH(const FeatureFunction *ff, m_featureFunctions) { + if (ff->GetName() == name) { + return const_cast(ff); + } + } + return NULL; +} + +const PhraseTable *FeatureFunctions::GetPhraseTableExcludeUnknownWordPenalty(size_t ptInd) +{ + // assume only 1 unk wp + std::vector tmpVec(phraseTables); + std::vector::iterator iter; + for (iter = tmpVec.begin(); iter != tmpVec.end(); ++iter) { + const PhraseTable *pt = *iter; + if (pt == m_unkWP) { + tmpVec.erase(iter); + break; + } + } + + const PhraseTable *pt = tmpVec[ptInd]; + return pt; +} + +void FeatureFunctions::EvaluateInIsolation(MemPool &pool, const System &system, + const Phrase &source, TargetPhraseImpl &targetPhrase) const +{ + SCORE estimatedScore 
= 0; + + BOOST_FOREACH(const FeatureFunction *ff, m_featureFunctions) { + Scores& scores = targetPhrase.GetScores(); + ff->EvaluateInIsolation(pool, system, source, targetPhrase, scores, estimatedScore); + } + + targetPhrase.SetEstimatedScore(estimatedScore); +} + +void FeatureFunctions::EvaluateInIsolation( + MemPool &pool, + const System &system, + const Phrase &source, + SCFG::TargetPhraseImpl &targetPhrase) const +{ + SCORE estimatedScore = 0; + + BOOST_FOREACH(const FeatureFunction *ff, m_featureFunctions) { + Scores& scores = targetPhrase.GetScores(); + ff->EvaluateInIsolation(pool, system, source, targetPhrase, scores, estimatedScore); + } + + targetPhrase.SetEstimatedScore(estimatedScore); +} + +void FeatureFunctions::EvaluateAfterTablePruning(MemPool &pool, + const TargetPhrases &tps, const Phrase &sourcePhrase) const +{ + BOOST_FOREACH(const FeatureFunction *ff, m_featureFunctions) { + ff->EvaluateAfterTablePruning(pool, tps, sourcePhrase); + } +} + +void FeatureFunctions::EvaluateAfterTablePruning(MemPool &pool, const SCFG::TargetPhrases &tps, + const Phrase &sourcePhrase) const +{ + BOOST_FOREACH(const FeatureFunction *ff, m_featureFunctions) { + ff->EvaluateAfterTablePruning(pool, tps, sourcePhrase); + } +} + +void FeatureFunctions::EvaluateWhenAppliedBatch(const Batch &batch) const +{ + BOOST_FOREACH(const StatefulFeatureFunction *ff, m_statefulFeatureFunctions) { + ff->EvaluateWhenAppliedBatch(m_system, batch); + } +} + +void FeatureFunctions::InitializeForInput(const ManagerBase &mgr, const InputType &input) +{ + BOOST_FOREACH(FeatureFunction *ff, m_featureFunctions) { + ff->InitializeForInput(mgr, input); + } +} + +void FeatureFunctions::CleanUpAfterSentenceProcessing(const InputType &input) const +{ + BOOST_FOREACH(const FeatureFunction *ff, m_featureFunctions) { + ff->CleanUpAfterSentenceProcessing(m_system, input); + } +} + +void FeatureFunctions::ShowWeights(const Weights &allWeights) +{ + BOOST_FOREACH(const FeatureFunction *ff, 
m_featureFunctions) { + cout << ff->GetName(); + if (ff->IsTuneable()) { + cout << "="; + vector weights = allWeights.GetWeights(*ff); + for (size_t i = 0; i < weights.size(); ++i) { + cout << " " << weights[i]; + } + cout << endl; + } else { + cout << " UNTUNEABLE" << endl; + } + } +} + +void FeatureFunctions::OverrideFeatures() +{ + const Parameter ¶meter = m_system.params; + + const PARAM_VEC *params = parameter.GetParam("feature-overwrite"); + for (size_t i = 0; params && i < params->size(); ++i) { + const string &str = params->at(i); + vector toks = Tokenize(str); + UTIL_THROW_IF2(toks.size() <= 1, "Incorrect format for feature override: " << str); + + FeatureFunction *ff = FindFeatureFunction(toks[0]); + UTIL_THROW_IF2(ff == NULL, "Feature function not found: " << toks[0]); + + for (size_t j = 1; j < toks.size(); ++j) { + const string &keyValStr = toks[j]; + vector keyVal = Tokenize(keyValStr, "="); + UTIL_THROW_IF2(keyVal.size() != 2, "Incorrect format for parameter override: " << keyValStr); + + cerr << "Override " << ff->GetName() << " " + << keyVal[0] << "=" << keyVal[1] << endl; + + ff->SetParameter(keyVal[0], keyVal[1]); + + } + } + +} + +} + diff --git a/mosesdecoder/moses2/FF/FeatureFunctions.h b/mosesdecoder/moses2/FF/FeatureFunctions.h new file mode 100644 index 0000000000000000000000000000000000000000..43a5793c4d3bb4f8a5e71ea6d0113368bd3b582e --- /dev/null +++ b/mosesdecoder/moses2/FF/FeatureFunctions.h @@ -0,0 +1,113 @@ +/* + * FeatureFunctions.h + * + * Created on: 27 Oct 2015 + * Author: hieu + */ + +#pragma once + +#include +#include +#include +#include "../legacy/Parameter.h" +#include "../Phrase.h" + +namespace Moses2 +{ +template +class TargetPhrase; + +class System; +class FeatureFunction; +class StatefulFeatureFunction; +class PhraseTable; +class Manager; +class MemPool; +class PhraseImpl; +class TargetPhrases; +class TargetPhraseImpl; +class Scores; +class Hypothesis; +class UnknownWordPenalty; +class Weights; +class InputType; + 
+namespace SCFG +{ +class TargetPhraseImpl; +class TargetPhrases; +class Word; +} + +class FeatureFunctions +{ +public: + std::vector phraseTables; + + FeatureFunctions(System &system); + virtual ~FeatureFunctions(); + + const std::vector &GetFeatureFunctions() const { + return m_featureFunctions; + } + + const std::vector &GetStatefulFeatureFunctions() const { + return m_statefulFeatureFunctions; + } + + const std::vector &GetWithPhraseTableInd() const { + return m_withPhraseTableInd; + } + + size_t GetNumScores() const { + return m_ffStartInd; + } + + void Create(); + void Load(); + + const FeatureFunction *FindFeatureFunction(const std::string &name) const; + + const PhraseTable *GetPhraseTableExcludeUnknownWordPenalty(size_t ptInd); + const UnknownWordPenalty *GetUnknownWordPenalty() const { + return m_unkWP; + } + + // the pool here must be the system pool if the rule was loaded during load, or the mgr pool if it was loaded on demand + void EvaluateInIsolation(MemPool &pool, const System &system, + const Phrase &source, TargetPhraseImpl &targetPhrase) const; + void EvaluateInIsolation(MemPool &pool, const System &system, + const Phrase &source, SCFG::TargetPhraseImpl &targetPhrase) const; + + void EvaluateAfterTablePruning(MemPool &pool, const TargetPhrases &tps, + const Phrase &sourcePhrase) const; + void EvaluateAfterTablePruning(MemPool &pool, const SCFG::TargetPhrases &tps, + const Phrase &sourcePhrase) const; + + void EvaluateWhenAppliedBatch(const Batch &batch) const; + + void InitializeForInput(const ManagerBase &mgr, const InputType &input); + void CleanUpAfterSentenceProcessing(const InputType &input) const; + + void ShowWeights(const Weights &allWeights); + +protected: + std::vector m_featureFunctions; + std::vector m_statefulFeatureFunctions; + std::vector m_withPhraseTableInd; + const UnknownWordPenalty *m_unkWP; + + boost::unordered_map m_defaultNames; + System &m_system; + size_t m_ffStartInd; + + FeatureFunction *Create(const std::string &line); 
+ std::string GetDefaultName(const std::string &stub); + void OverrideFeatures(); + FeatureFunction *FindFeatureFunction(const std::string &name); + +}; + +} + diff --git a/mosesdecoder/moses2/FF/FeatureRegistry.cpp b/mosesdecoder/moses2/FF/FeatureRegistry.cpp new file mode 100644 index 0000000000000000000000000000000000000000..8e457d37169c2b6eeee44d443de675fc07b0e332 --- /dev/null +++ b/mosesdecoder/moses2/FF/FeatureRegistry.cpp @@ -0,0 +1,128 @@ +#include "FeatureRegistry.h" + +#include "../TranslationModel/Memory/PhraseTableMemory.h" +#include "../TranslationModel/ProbingPT.h" +#include "../TranslationModel/UnknownWordPenalty.h" +#include "../TranslationModel/Transliteration.h" +#include "../TranslationModel/Dynamic/DynamicPhraseTable.h" + +#include "../LM/KENLM.h" +#include "../LM/KENLMBatch.h" +#include "../LM/LanguageModel.h" +#include "../LM/GPULM.h" + +#include "Distortion.h" +#include "LexicalReordering/LexicalReordering.h" +#include "PhrasePenalty.h" +#include "WordPenalty.h" +#include "OSM/OpSequenceModel.h" + +#include "ExampleStatefulFF.h" +#include "ExampleStatelessFF.h" + +using namespace std; + + +namespace Moses2 +{ +FeatureRegistry FeatureRegistry::s_instance; + +template +class DefaultFeatureFactory: public FeatureFactory +{ +public: + FeatureFunction *Create(size_t startInd, const std::string &line) const { + return new F(startInd, line); + } +}; + +//////////////////////////////////////////////////////////////////// +class KenFactory: public FeatureFactory +{ +public: + FeatureFunction *Create(size_t startInd, const std::string &line) const { + return ConstructKenLM(startInd, line); + } +}; + +//////////////////////////////////////////////////////////////////// +FeatureRegistry::FeatureRegistry() +{ + // Feature with same name as class +#define MOSES_FNAME(name) Add(#name, new DefaultFeatureFactory< name >()); + // Feature with different name than class. 
+#define MOSES_FNAME2(name, type) Add(name, new DefaultFeatureFactory< type >()); + + MOSES_FNAME2("PhraseDictionaryMemory", PhraseTableMemory); + MOSES_FNAME(ProbingPT); + MOSES_FNAME2("PhraseDictionaryTransliteration", Transliteration); + MOSES_FNAME(UnknownWordPenalty); + MOSES_FNAME(DynamicPhraseTable); + + Add("KENLM", new KenFactory()); + + MOSES_FNAME(KENLMBatch); + MOSES_FNAME(GPULM); + + MOSES_FNAME(LanguageModel); + + MOSES_FNAME(Distortion); + MOSES_FNAME(LexicalReordering); + MOSES_FNAME(PhrasePenalty); + MOSES_FNAME(WordPenalty); + MOSES_FNAME(OpSequenceModel); + + MOSES_FNAME(ExampleStatefulFF); + MOSES_FNAME(ExampleStatelessFF); +} + +FeatureRegistry::~FeatureRegistry() +{ + +} + +void FeatureRegistry::Add(const std::string &name, FeatureFactory *factory) +{ + std::pair > to_ins(name, + boost::shared_ptr(factory)); + if (!registry_.insert(to_ins).second) { + cerr << "Duplicate feature name " << name << endl; + abort(); + } +} + +FeatureFunction *FeatureRegistry::Construct(size_t startInd, + const std::string &name, const std::string &line) const +{ + Map::const_iterator i = registry_.find(name); + if (i == registry_.end()) { + cerr << "Feature name " << name << " is not registered."; + abort(); + } + FeatureFactory *fact = i->second.get(); + FeatureFunction *ff = fact->Create(startInd, line); + return ff; +} + +void FeatureRegistry::PrintFF() const +{ + std::vector ffs; + std::cerr << "Available feature functions:" << std::endl; + Map::const_iterator iter; + for (iter = registry_.begin(); iter != registry_.end(); ++iter) { + const std::string &ffName = iter->first; + ffs.push_back(ffName); + } + + std::vector::const_iterator iterVec; + std::sort(ffs.begin(), ffs.end()); + for (iterVec = ffs.begin(); iterVec != ffs.end(); ++iterVec) { + const std::string &ffName = *iterVec; + std::cerr << ffName << " "; + } + + std::cerr << std::endl; +} + +} + diff --git a/mosesdecoder/moses2/FF/FeatureRegistry.h b/mosesdecoder/moses2/FF/FeatureRegistry.h new file 
mode 100644 index 0000000000000000000000000000000000000000..1e6fd399d99ae2f748abd653c8d98e61d36ef276 --- /dev/null +++ b/mosesdecoder/moses2/FF/FeatureRegistry.h @@ -0,0 +1,52 @@ +#pragma once +#include +#include + +namespace Moses2 +{ +class FeatureFunction; + +//////////////////////////////////////////////////////////////////// +class FeatureFactory +{ +public: + virtual ~FeatureFactory() { + } + + virtual FeatureFunction *Create(size_t startInd, const std::string &line) const = 0; + +protected: + FeatureFactory() { + } +}; + +//////////////////////////////////////////////////////////////////// +class FeatureRegistry +{ +public: + static const FeatureRegistry &Instance() { + return s_instance; + } + + ~FeatureRegistry(); + + FeatureFunction *Construct(size_t startInd, const std::string &name, + const std::string &line) const; + void PrintFF() const; + +private: + static FeatureRegistry s_instance; + + typedef boost::unordered_map > Map; + Map registry_; + + FeatureRegistry(); + + void Add(const std::string &name, FeatureFactory *factory); + +}; + +//////////////////////////////////////////////////////////////////// + +} + diff --git a/mosesdecoder/moses2/FF/LexicalReordering/BidirectionalReorderingState.cpp b/mosesdecoder/moses2/FF/LexicalReordering/BidirectionalReorderingState.cpp new file mode 100644 index 0000000000000000000000000000000000000000..36e232f914b0aca15269a908a8af220beef29818 --- /dev/null +++ b/mosesdecoder/moses2/FF/LexicalReordering/BidirectionalReorderingState.cpp @@ -0,0 +1,79 @@ +/* + * BidirectionalReorderingState.cpp + * + * Created on: 22 Mar 2016 + * Author: hieu + */ +#include +#include "BidirectionalReorderingState.h" +#include "../../legacy/Util2.h" +#include "../../PhraseBased/Manager.h" + +using namespace std; + +namespace Moses2 +{ + +BidirectionalReorderingState::BidirectionalReorderingState( + const LRModel &config, LRState *bw, LRState *fw, size_t offset) : + LRState(config, LRModel::Bidirectional, offset), m_backward(bw), 
m_forward( + fw) +{ +} + +BidirectionalReorderingState::~BidirectionalReorderingState() +{ + // TODO Auto-generated destructor stub +} + +void BidirectionalReorderingState::Init(const LRState *prev, + const TargetPhrase &topt, const InputPathBase &path, bool first, + const Bitmap *coverage) +{ + if (m_backward) { + m_backward->Init(prev, topt, path, first, coverage); + } + if (m_forward) { + m_forward->Init(prev, topt, path, first, coverage); + } +} + +std::string BidirectionalReorderingState::ToString() const +{ + return "BidirectionalReorderingState " + SPrint(this) + " " + + SPrint(m_backward) + " " + SPrint(m_forward); +} + +size_t BidirectionalReorderingState::hash() const +{ + size_t ret = m_backward->hash(); + boost::hash_combine(ret, m_forward->hash()); + + return ret; +} + +bool BidirectionalReorderingState::operator==(const FFState& o) const +{ + if (&o == this) return true; + + BidirectionalReorderingState const &other = + static_cast(o); + + bool ret = (*m_backward == *other.m_backward) + && (*m_forward == *other.m_forward); + return ret; +} + +void BidirectionalReorderingState::Expand(const ManagerBase &mgr, + const LexicalReordering &ff, const Hypothesis &hypo, size_t phraseTableInd, + Scores &scores, FFState &state) const +{ + BidirectionalReorderingState &stateCast = + static_cast(state); + m_backward->Expand(mgr, ff, hypo, phraseTableInd, scores, + *stateCast.m_backward); + m_forward->Expand(mgr, ff, hypo, phraseTableInd, scores, + *stateCast.m_forward); +} + +} /* namespace Moses2 */ diff --git a/mosesdecoder/moses2/FF/LexicalReordering/BidirectionalReorderingState.h b/mosesdecoder/moses2/FF/LexicalReordering/BidirectionalReorderingState.h new file mode 100644 index 0000000000000000000000000000000000000000..289809798db75a5e7197cbe84fc3a96f12413db2 --- /dev/null +++ b/mosesdecoder/moses2/FF/LexicalReordering/BidirectionalReorderingState.h @@ -0,0 +1,40 @@ +/* + * BidirectionalReorderingState.h + * + * Created on: 22 Mar 2016 + * Author: hieu + */ 
+#pragma once +#include "LRState.h" + +namespace Moses2 +{ + +class BidirectionalReorderingState: public LRState +{ +public: + BidirectionalReorderingState(const LRModel &config, LRState *bw, LRState *fw, + size_t offset); + + virtual ~BidirectionalReorderingState(); + + void Init(const LRState *prev, const TargetPhrase &topt, + const InputPathBase &path, bool first, const Bitmap *coverage); + + size_t hash() const; + virtual bool operator==(const FFState& other) const; + + virtual std::string ToString() const; + + void Expand(const ManagerBase &mgr, const LexicalReordering &ff, + const Hypothesis &hypo, size_t phraseTableInd, Scores &scores, + FFState &state) const; + +protected: + LRState *m_backward; + LRState *m_forward; + +}; + +} /* namespace Moses2 */ + diff --git a/mosesdecoder/moses2/FF/LexicalReordering/HReorderingBackwardState.cpp b/mosesdecoder/moses2/FF/LexicalReordering/HReorderingBackwardState.cpp new file mode 100644 index 0000000000000000000000000000000000000000..a54cd7fcfda60506916c1c799debc102ed0b8409 --- /dev/null +++ b/mosesdecoder/moses2/FF/LexicalReordering/HReorderingBackwardState.cpp @@ -0,0 +1,71 @@ +/* + * HReorderingBackwardState.cpp + * + * Created on: 22 Mar 2016 + * Author: hieu + */ + +#include "HReorderingBackwardState.h" +#include "../../PhraseBased/Hypothesis.h" +#include "../../PhraseBased/Manager.h" + +namespace Moses2 +{ + +HReorderingBackwardState::HReorderingBackwardState(MemPool &pool, + const LRModel &config, size_t offset) : + LRState(config, LRModel::Backward, offset), reoStack(pool) +{ + // TODO Auto-generated constructor stub + +} + +HReorderingBackwardState::~HReorderingBackwardState() +{ + // TODO Auto-generated destructor stub +} + +void HReorderingBackwardState::Init(const LRState *prev, + const TargetPhrase &topt, const InputPathBase &path, bool first, + const Bitmap *coverage) +{ + prevTP = &topt; + reoStack.Init(); +} + +size_t HReorderingBackwardState::hash() const +{ + size_t ret = reoStack.hash(); + return 
ret; +} + +bool HReorderingBackwardState::operator==(const FFState& o) const +{ + const HReorderingBackwardState& other = + static_cast(o); + bool ret = reoStack == other.reoStack; + return ret; +} + +std::string HReorderingBackwardState::ToString() const +{ + return "HReorderingBackwardState " + SPrint(m_offset); +} + +void HReorderingBackwardState::Expand(const ManagerBase &mgr, + const LexicalReordering &ff, const Hypothesis &hypo, size_t phraseTableInd, + Scores &scores, FFState &state) const +{ + HReorderingBackwardState &nextState = + static_cast(state); + nextState.Init(this, hypo.GetTargetPhrase(), hypo.GetInputPath(), false, + NULL); + nextState.reoStack = reoStack; + + const Range &swrange = hypo.GetInputPath().range; + int reoDistance = nextState.reoStack.ShiftReduce(swrange); + ReorderingType reoType = m_configuration.GetOrientation(reoDistance); + CopyScores(mgr.system, scores, hypo.GetTargetPhrase(), reoType); +} + +} /* namespace Moses2 */ diff --git a/mosesdecoder/moses2/FF/LexicalReordering/HReorderingBackwardState.h b/mosesdecoder/moses2/FF/LexicalReordering/HReorderingBackwardState.h new file mode 100644 index 0000000000000000000000000000000000000000..8cdea5a440eef667e55e5fe9f0fcbdd37aec2d74 --- /dev/null +++ b/mosesdecoder/moses2/FF/LexicalReordering/HReorderingBackwardState.h @@ -0,0 +1,37 @@ +/* + * HReorderingBackwardState.h + * + * Created on: 22 Mar 2016 + * Author: hieu + */ +#pragma once +#include "LRState.h" +#include "ReorderingStack.h" + +namespace Moses2 +{ + +class HReorderingBackwardState: public LRState +{ +private: + ReorderingStack reoStack; + +public: + HReorderingBackwardState(MemPool &pool, const LRModel &config, size_t offset); + + virtual void Init(const LRState *prev, const TargetPhrase &topt, + const InputPathBase &path, bool first, const Bitmap *coverage); + + virtual ~HReorderingBackwardState(); + + size_t hash() const; + virtual bool operator==(const FFState& other) const; + virtual std::string ToString() const; + void 
Expand(const ManagerBase &mgr, const LexicalReordering &ff, + const Hypothesis &hypo, size_t phraseTableInd, Scores &scores, + FFState &state) const; + +}; + +} /* namespace Moses2 */ + diff --git a/mosesdecoder/moses2/FF/LexicalReordering/HReorderingForwardState.cpp b/mosesdecoder/moses2/FF/LexicalReordering/HReorderingForwardState.cpp new file mode 100644 index 0000000000000000000000000000000000000000..1041115f7d29a158f795d3de9a43ef945e6a585d --- /dev/null +++ b/mosesdecoder/moses2/FF/LexicalReordering/HReorderingForwardState.cpp @@ -0,0 +1,87 @@ +/* + * HReorderingForwardState.cpp + * + * Created on: 22 Mar 2016 + * Author: hieu + */ + +#include "HReorderingForwardState.h" +#include "../../InputPathBase.h" +#include "../../PhraseBased/Manager.h" +#include "../../PhraseBased/Hypothesis.h" + +namespace Moses2 +{ + +HReorderingForwardState::HReorderingForwardState(const LRModel &config, + size_t offset) : + LRState(config, LRModel::Forward, offset), m_first(true) +{ + prevPath = NULL; + m_coverage = NULL; +} + +HReorderingForwardState::~HReorderingForwardState() +{ + // TODO Auto-generated destructor stub +} + +void HReorderingForwardState::Init(const LRState *prev, + const TargetPhrase &topt, const InputPathBase &path, bool first, + const Bitmap *coverage) +{ + prevTP = &topt; + prevPath = &path; + m_first = first; + m_coverage = coverage; +} + +size_t HReorderingForwardState::hash() const +{ + size_t ret; + ret = hash_value(prevPath->range); + return ret; +} + +bool HReorderingForwardState::operator==(const FFState& o) const +{ + if (&o == this) return true; + + HReorderingForwardState const& other = + static_cast(o); + + int compareScores = ( + (prevPath->range == other.prevPath->range) ? + ComparePrevScores(other.prevTP) : + (prevPath->range < other.prevPath->range) ? 
-1 : 1); + return compareScores == 0; +} + +std::string HReorderingForwardState::ToString() const +{ + return "HReorderingForwardState " + SPrint(m_offset); +} + +void HReorderingForwardState::Expand(const ManagerBase &mgr, + const LexicalReordering &ff, const Hypothesis &hypo, size_t phraseTableInd, + Scores &scores, FFState &state) const +{ + const Range &cur = hypo.GetInputPath().range; + // keep track of the current coverage ourselves so we don't need the hypothesis + Manager &mgrCast = const_cast(static_cast(mgr)); + Bitmaps &bms = mgrCast.GetBitmaps(); + const Bitmap &cov = bms.GetBitmap(*m_coverage, cur); + + if (!m_first) { + LRModel::ReorderingType reoType; + reoType = m_configuration.GetOrientation(prevPath->range, cur, cov); + CopyScores(mgr.system, scores, hypo.GetTargetPhrase(), reoType); + } + + HReorderingForwardState &stateCast = + static_cast(state); + stateCast.Init(this, hypo.GetTargetPhrase(), hypo.GetInputPath(), false, + &cov); +} + +} /* namespace Moses2 */ diff --git a/mosesdecoder/moses2/FF/LexicalReordering/HReorderingForwardState.h b/mosesdecoder/moses2/FF/LexicalReordering/HReorderingForwardState.h new file mode 100644 index 0000000000000000000000000000000000000000..51358daa34908ee85545ee43a79b65460d7412e1 --- /dev/null +++ b/mosesdecoder/moses2/FF/LexicalReordering/HReorderingForwardState.h @@ -0,0 +1,41 @@ +/* + * HReorderingForwardState.h + * + * Created on: 22 Mar 2016 + * Author: hieu + */ +#pragma once +#include "LRState.h" + +namespace Moses2 +{ +class Range; +class Bitmap; +class InputPathBase; + +class HReorderingForwardState: public LRState +{ +public: + HReorderingForwardState(const LRModel &config, size_t offset); + virtual ~HReorderingForwardState(); + + void Init(const LRState *prev, const TargetPhrase &topt, + const InputPathBase &path, bool first, const Bitmap *coverage); + + size_t hash() const; + virtual bool operator==(const FFState& other) const; + virtual std::string ToString() const; + void Expand(const ManagerBase 
&mgr, const LexicalReordering &ff, + const Hypothesis &hypo, size_t phraseTableInd, Scores &scores, + FFState &state) const; + +protected: + bool m_first; + //const Range &m_prevRange; + const InputPathBase *prevPath; + const Bitmap *m_coverage; + +}; + +} /* namespace Moses2 */ + diff --git a/mosesdecoder/moses2/FF/LexicalReordering/LRModel.cpp b/mosesdecoder/moses2/FF/LexicalReordering/LRModel.cpp new file mode 100644 index 0000000000000000000000000000000000000000..c2a9140096c046e46dd0ee34773f630b70f52b30 --- /dev/null +++ b/mosesdecoder/moses2/FF/LexicalReordering/LRModel.cpp @@ -0,0 +1,196 @@ +/* + * LRModel.cpp + * + * Created on: 23 Mar 2016 + * Author: hieu + */ + +#include "LRModel.h" +#include "../../legacy/Util2.h" +#include "../../legacy/Range.h" +#include "../../legacy/Bitmap.h" +#include "../../MemPool.h" +#include "util/exception.hh" +#include "PhraseBasedReorderingState.h" +#include "BidirectionalReorderingState.h" +#include "HReorderingBackwardState.h" +#include "HReorderingForwardState.h" + +using namespace std; + +namespace Moses2 +{ + +bool IsMonotonicStep(Range const& prev, // words range of last source phrase + Range const& cur, // words range of current source phrase + Bitmap const& cov) // coverage bitmap +{ + size_t e = prev.GetEndPos() + 1; + size_t s = cur.GetStartPos(); + return (s == e || (s >= e && !cov.GetValue(e))); +} + +bool IsSwap(Range const& prev, Range const& cur, Bitmap const& cov) +{ + size_t s = prev.GetStartPos(); + size_t e = cur.GetEndPos(); + return (e + 1 == s || (e < s && !cov.GetValue(s - 1))); +} + +LRModel::LRModel(const std::string &modelType, LexicalReordering &ff) : + m_modelType(None), m_phraseBased(true), m_collapseScores(false), m_direction( + Backward), m_scoreProducer(&ff) +{ + std::vector config = Tokenize(modelType, "-"); + + for (size_t i = 0; i < config.size(); ++i) { + if (config[i] == "hier") { + m_phraseBased = false; + } else if (config[i] == "phrase") { + m_phraseBased = true; + } else if (config[i] 
== "wbe") { + m_phraseBased = true; + } + // no word-based decoding available, fall-back to phrase-based + // This is the old lexical reordering model combination of moses + + else if (config[i] == "msd") { + m_modelType = MSD; + } else if (config[i] == "mslr") { + m_modelType = MSLR; + } else if (config[i] == "monotonicity") { + m_modelType = Monotonic; + } else if (config[i] == "leftright") { + m_modelType = LeftRight; + } + + // unidirectional is deprecated, use backward instead + else if (config[i] == "unidirectional") { + m_direction = Backward; + } else if (config[i] == "backward") { + m_direction = Backward; + } else if (config[i] == "forward") { + m_direction = Forward; + } else if (config[i] == "bidirectional") { + m_direction = Bidirectional; + } + + else if (config[i] == "f") { + m_condition = F; + } else if (config[i] == "fe") { + m_condition = FE; + } + + else if (config[i] == "collapseff") { + m_collapseScores = true; + } else if (config[i] == "allff") { + m_collapseScores = false; + } else { + std::cerr + << "Illegal part in the lexical reordering configuration string: " + << config[i] << std::endl; + exit(1); + } + } + + if (m_modelType == None) { + std::cerr << "You need to specify the type of the reordering model " + << "(msd, monotonicity,...)" << std::endl; + exit(1); + } + +} + +LRModel::~LRModel() +{ + // TODO Auto-generated destructor stub +} + +size_t LRModel::GetNumberOfTypes() const +{ + return ((m_modelType == MSD) ? 3 : (m_modelType == MSLR) ? 4 : 2); +} + +/// return orientation for the first phrase +LRModel::ReorderingType LRModel::GetOrientation(Range const& cur) const +{ + UTIL_THROW_IF2(m_modelType == None, "Reordering Model Type is None"); + return ((m_modelType == LeftRight) ? R : (cur.GetStartPos() == 0) ? M : + (m_modelType == MSD) ? D : (m_modelType == MSLR) ? 
DR : NM); +} + +LRModel::ReorderingType LRModel::GetOrientation(Range const& prev, + Range const& cur) const +{ + UTIL_THROW_IF2(m_modelType == None, "No reordering model type specified"); + return ( + (m_modelType == LeftRight) ? prev.GetEndPos() <= cur.GetStartPos() ? R : L + : (cur.GetStartPos() == prev.GetEndPos() + 1) ? M : + (m_modelType == Monotonic) ? NM : + (prev.GetStartPos() == cur.GetEndPos() + 1) ? S : + (m_modelType == MSD) ? D : + (cur.GetStartPos() > prev.GetEndPos()) ? DR : DL); +} + +LRModel::ReorderingType LRModel::GetOrientation(int const reoDistance) const +{ + // this one is for HierarchicalReorderingBackwardState + return ((m_modelType == LeftRight) ? (reoDistance >= 1) ? R : L + : (reoDistance == 1) ? M : (m_modelType == Monotonic) ? NM : + (reoDistance == -1) ? S : (m_modelType == MSD) ? D : + (reoDistance > 1) ? DR : DL); +} + +LRState *LRModel::CreateLRState(MemPool &pool) const +{ + LRState *bwd = NULL, *fwd = NULL; + size_t offset = 0; + + switch (m_direction) { + case Backward: + case Bidirectional: + if (m_phraseBased) { + bwd = + new (pool.Allocate()) PhraseBasedReorderingState( + *this, Backward, offset); + //cerr << "bwd=" << bwd << bwd->ToString() << endl; + } else { + bwd = + new (pool.Allocate()) HReorderingBackwardState( + pool, *this, offset); + } + offset += m_collapseScores ? 1 : GetNumberOfTypes(); + if (m_direction == Backward) return bwd; // else fall through + case Forward: + if (m_phraseBased) { + fwd = + new (pool.Allocate()) PhraseBasedReorderingState( + *this, Forward, offset); + //cerr << "fwd=" << fwd << fwd->ToString() << endl; + } else { + fwd = + new (pool.Allocate()) HReorderingForwardState( + *this, offset); + } + offset += m_collapseScores ? 
1 : GetNumberOfTypes(); + if (m_direction == Forward) return fwd; + } + + //cerr << "LRStates:" << *bwd << endl << *fwd << endl; + BidirectionalReorderingState *ret = + new (pool.Allocate()) BidirectionalReorderingState( + *this, bwd, fwd, 0); + return ret; +} + +LRModel::ReorderingType LRModel::GetOrientation(Range const& prev, + Range const& cur, Bitmap const& cov) const +{ + return ( + (m_modelType == LeftRight) ? cur.GetStartPos() > prev.GetEndPos() ? R : L + : IsMonotonicStep(prev, cur, cov) ? M : (m_modelType == Monotonic) ? NM : + IsSwap(prev, cur, cov) ? S : (m_modelType == MSD) ? D : + cur.GetStartPos() > prev.GetEndPos() ? DR : DL); +} + +} /* namespace Moses2 */ diff --git a/mosesdecoder/moses2/FF/LexicalReordering/LRModel.h b/mosesdecoder/moses2/FF/LexicalReordering/LRModel.h new file mode 100644 index 0000000000000000000000000000000000000000..0309d53869a9a1070641a6b332e9a2280bbb4a5b --- /dev/null +++ b/mosesdecoder/moses2/FF/LexicalReordering/LRModel.h @@ -0,0 +1,99 @@ +/* + * LRModel.h + * + * Created on: 23 Mar 2016 + * Author: hieu + */ +#pragma once +#include + +namespace Moses2 +{ + +class MemPool; +class Range; +class Bitmap; +class LRState; +class LexicalReordering; + +class LRModel +{ +public: + enum ModelType { + Monotonic, MSD, MSLR, LeftRight, None + }; + enum Direction { + Forward, Backward, Bidirectional + }; + enum Condition { + F, E, FE + }; + + enum ReorderingType { + M = 0, // monotonic + NM = 1, // non-monotonic + S = 1, // swap + D = 2, // discontinuous + DL = 2, // discontinuous, left + DR = 3, // discontinuous, right + R = 0, // right + L = 1, // left + MAX = 3, // largest possible + NONE = 4 // largest possible + }; + + LRModel(const std::string &modelType, LexicalReordering &ff); + virtual ~LRModel(); + + ModelType GetModelType() const { + return m_modelType; + } + Direction GetDirection() const { + return m_direction; + } + Condition GetCondition() const { + return m_condition; + } + + bool IsPhraseBased() const { + return 
m_phraseBased; + } + + bool CollapseScores() const { + return m_collapseScores; + } + + size_t GetNumberOfTypes() const; + + LexicalReordering* + GetScoreProducer() const { + return m_scoreProducer; + } + + LRState *CreateLRState(MemPool &pool) const; + + ReorderingType // for first phrase in phrase-based + GetOrientation(Range const& cur) const; + + ReorderingType // for non-first phrases in phrase-based + GetOrientation(Range const& prev, Range const& cur) const; + + ReorderingType // for HReorderingForwardState + GetOrientation(Range const& prev, Range const& cur, Bitmap const& cov) const; + + ReorderingType // for HReorderingBackwarddState + GetOrientation(int const reoDistance) const; + +protected: + + ModelType m_modelType; + bool m_phraseBased; + bool m_collapseScores; + Direction m_direction; + Condition m_condition; + LexicalReordering *m_scoreProducer; + +}; + +} /* namespace Moses2 */ + diff --git a/mosesdecoder/moses2/FF/LexicalReordering/LRState.cpp b/mosesdecoder/moses2/FF/LexicalReordering/LRState.cpp new file mode 100644 index 0000000000000000000000000000000000000000..a8a3bf6d04f1e84d90bb86f1e0e9ce5551addbc3 --- /dev/null +++ b/mosesdecoder/moses2/FF/LexicalReordering/LRState.cpp @@ -0,0 +1,93 @@ +/* + * LRState.cpp + * + * Created on: 22 Mar 2016 + * Author: hieu + */ +#include "LRState.h" +#include "LexicalReordering.h" +#include "../../Scores.h" +#include "../../TargetPhrase.h" + +using namespace std; + +namespace Moses2 +{ + +class InputType; + +LRState::LRState(const LRModel &config, LRModel::Direction dir, size_t offset) : + m_configuration(config), m_direction(dir), m_offset(offset) +{ +} + +int LRState::ComparePrevScores(const TargetPhrase *other) const +{ + LexicalReordering* producer = m_configuration.GetScoreProducer(); + size_t phraseTableInd = producer->GetPhraseTableInd(); + const SCORE *myScores = (const SCORE*) prevTP->ffData[phraseTableInd]; //producer-> + const SCORE *yrScores = (const SCORE*) other->ffData[phraseTableInd]; 
//producer-> + + if (myScores == yrScores) return 0; + + // The pointers are NULL if a phrase pair isn't found in the reordering table. + if (yrScores == NULL) return -1; + if (myScores == NULL) return 1; + + size_t stop = m_offset + m_configuration.GetNumberOfTypes(); + for (size_t i = m_offset; i < stop; i++) { + if ((myScores)[i] < (yrScores)[i]) return -1; + if ((myScores)[i] > (yrScores)[i]) return 1; + } + return 0; +} + +void LRState::CopyScores(const System &system, Scores &accum, + const TargetPhrase &topt, ReorderingType reoType) const +{ + // don't call this on a bidirectional object + UTIL_THROW_IF2( + m_direction != LRModel::Backward && m_direction != LRModel::Forward, + "Unknown direction: " << m_direction); + + TargetPhrase const* relevantOpt = ( + (m_direction == LRModel::Backward) ? &topt : prevTP); + + LexicalReordering* producer = m_configuration.GetScoreProducer(); + size_t phraseTableInd = producer->GetPhraseTableInd(); + const SCORE *cached = (const SCORE*) relevantOpt->ffData[phraseTableInd]; //producer-> + + if (cached == NULL) { + return; + } + + size_t off_remote = m_offset + reoType; + size_t off_local = m_configuration.CollapseScores() ? 
m_offset : off_remote; + + UTIL_THROW_IF2(off_local >= producer->GetNumScores(), + "offset out of vector bounds!"); + + // look up applicable score from vector of scores + //UTIL_THROW_IF2(off_remote >= cached->size(), "offset out of vector bounds!"); + //Scores scores(producer->GetNumScoreComponents(),0); + SCORE score = cached[off_remote]; + accum.PlusEquals(system, *producer, score, off_local); + + // else: use default scores (if specified) + /* + else if (producer->GetHaveDefaultScores()) { + Scores scores(producer->GetNumScoreComponents(),0); + scores[off_local] = producer->GetDefaultScore(off_remote); + accum->PlusEquals(m_configuration.GetScoreProducer(), scores); + } + */ + // note: if no default score, no cost + /* + const SparseReordering* sparse = m_configuration.GetSparseReordering(); + if (sparse) sparse->CopyScores(*relevantOpt, m_prevOption, input, reoType, + m_direction, accum); + */ +} + +} + diff --git a/mosesdecoder/moses2/FF/LexicalReordering/LRState.h b/mosesdecoder/moses2/FF/LexicalReordering/LRState.h new file mode 100644 index 0000000000000000000000000000000000000000..c53b9de7894fe91a6ce8189fc666c489bf8b2e8a --- /dev/null +++ b/mosesdecoder/moses2/FF/LexicalReordering/LRState.h @@ -0,0 +1,48 @@ +#pragma once +#include "../FFState.h" +#include "LRModel.h" + +namespace Moses2 +{ +template +class TargetPhrase; + +class LexicalReordering; +class Hypothesis; +class System; +class Scores; +class Bitmap; +class ManagerBase; +class InputType; +class InputPathBase; +class Word; + +class LRState: public FFState +{ +public: + typedef LRModel::ReorderingType ReorderingType; + const TargetPhrase *prevTP; + + LRState(const LRModel &config, LRModel::Direction dir, size_t offset); + + virtual void Init(const LRState *prev, const TargetPhrase &topt, + const InputPathBase &path, bool first, const Bitmap *coverage) = 0; + + virtual void Expand(const ManagerBase &mgr, const LexicalReordering &ff, + const Hypothesis &hypo, size_t phraseTableInd, Scores &scores, 
+ FFState &state) const = 0; + + void CopyScores(const System &system, Scores &accum, const TargetPhrase &topt, + ReorderingType reoType) const; + +protected: + const LRModel& m_configuration; + LRModel::Direction m_direction; + size_t m_offset; + + int + ComparePrevScores(const TargetPhrase *other) const; + +}; + +} diff --git a/mosesdecoder/moses2/FF/LexicalReordering/LexicalReordering.cpp b/mosesdecoder/moses2/FF/LexicalReordering/LexicalReordering.cpp new file mode 100644 index 0000000000000000000000000000000000000000..6f510574c26ca64b316d7447bedb0df5b1a9db38 --- /dev/null +++ b/mosesdecoder/moses2/FF/LexicalReordering/LexicalReordering.cpp @@ -0,0 +1,226 @@ +/* + * LexicalReordering.cpp + * + * Created on: 15 Dec 2015 + * Author: hieu + */ + +#include +#include "util/exception.hh" +#include "LexicalReordering.h" +#include "LRModel.h" +#include "PhraseBasedReorderingState.h" +#include "BidirectionalReorderingState.h" +#include "../../TranslationModel/PhraseTable.h" +#include "../../System.h" +#include "../../PhraseBased/PhraseImpl.h" +#include "../../PhraseBased/Manager.h" +#include "../../PhraseBased/Hypothesis.h" +#include "../../PhraseBased/TargetPhrases.h" +#include "../../PhraseBased/TargetPhraseImpl.h" +#include "../../legacy/InputFileStream.h" +#include "../../legacy/Util2.h" + +#ifdef HAVE_CMPH +#include "../../TranslationModel/CompactPT/LexicalReorderingTableCompact.h" +#endif + + +using namespace std; + +namespace Moses2 +{ + +/////////////////////////////////////////////////////////////////////// + +LexicalReordering::LexicalReordering(size_t startInd, const std::string &line) + : StatefulFeatureFunction(startInd, line) + , m_blank(NULL) + , m_propertyInd(-1) + , m_coll(NULL) + , m_configuration(NULL) +#ifdef HAVE_CMPH + , m_compactModel(NULL) +#endif +{ + ReadParameters(); + assert(m_configuration); + //assert(m_numScores == 6); +} + +LexicalReordering::~LexicalReordering() +{ + delete m_coll; + delete m_configuration; +#ifdef HAVE_CMPH + delete 
m_compactModel; +#endif +} + +void LexicalReordering::Load(System &system) +{ /* Loads the reordering scores. Nothing is read when m_propertyInd >= 0; otherwise the compact ".minlexr" model is used when built with HAVE_CMPH and that file exists, else the text table at m_path is read into m_coll. */ + MemPool &pool = system.GetSystemPool(); + + if (m_propertyInd >= 0) { + // Using integrate Lex RO. No loading needed +#ifdef HAVE_CMPH + } else if (FileExists(m_path + ".minlexr")) { + m_compactModel = new LexicalReorderingTableCompact(m_path + ".minlexr", + m_FactorsF, m_FactorsE, m_FactorsC); + m_blank = new (pool.Allocate()) PhraseImpl(pool, 0); +#endif + } else { + m_coll = new Coll(); + InputFileStream file(m_path); + string line; + size_t lineNum = 0; + + while (getline(file, line)) { + if (++lineNum % 1000000 == 0) { + cerr << lineNum << " "; + } + + std::vector toks = TokenizeMultiCharSeparator(line, "|||"); + UTIL_THROW_IF2(toks.size() != 3, "Bad format in lexical reordering table " << m_path << " at line " << lineNum << ": " << line); /* checked in release builds too, because toks[0..2] are indexed below */ + PhraseImpl *source = PhraseImpl::CreateFromString(pool, system.GetVocab(), + system, toks[0]); + PhraseImpl *target = PhraseImpl::CreateFromString(pool, system.GetVocab(), + system, toks[1]); + std::vector scores = Tokenize(toks[2]); + std::transform(scores.begin(), scores.end(), scores.begin(), + TransformScore); + std::transform(scores.begin(), scores.end(), scores.begin(), FloorScore); + + Key key(source, target); + (*m_coll)[key] = scores; + } + } +} + +void LexicalReordering::SetParameter(const std::string& key, + const std::string& value) +{ + if (key == "path") { + m_path = value; + } else if (key == "type") { + m_configuration = new LRModel(value, *this); + } else if (key == "input-factor") { + m_FactorsF = Tokenize(value); + } else if (key == "output-factor") { + m_FactorsE = Tokenize(value); + } else if (key == "property-index") { + m_propertyInd = Scan(value); + } else { + StatefulFeatureFunction::SetParameter(key, value); + } +} + +FFState* LexicalReordering::BlankState(MemPool &pool, const System &sys) const +{ + FFState *ret = m_configuration->CreateLRState(pool); + return ret; +} + +void LexicalReordering::EmptyHypothesisState(FFState &state, + const ManagerBase &mgr, const InputType &input, + const Hypothesis &hypo) const +{ + 
BidirectionalReorderingState &stateCast = + static_cast(state); + stateCast.Init(NULL, hypo.GetTargetPhrase(), hypo.GetInputPath(), true, + &hypo.GetBitmap()); +} + +void LexicalReordering::EvaluateInIsolation(MemPool &pool, const System &system, + const Phrase &source, const TargetPhraseImpl &targetPhrase, Scores &scores, + SCORE &estimatedScore) const +{ +} + +void LexicalReordering::EvaluateInIsolation(MemPool &pool, const System &system, const Phrase &source, + const TargetPhrase &targetPhrase, Scores &scores, + SCORE &estimatedScore) const +{ + UTIL_THROW2("Don't use with SCFG models"); +} + + +void LexicalReordering::EvaluateAfterTablePruning(MemPool &pool, + const TargetPhrases &tps, const Phrase &sourcePhrase) const +{ + BOOST_FOREACH(const TargetPhraseImpl *tp, tps) { + EvaluateAfterTablePruning(pool, *tp, sourcePhrase); + } +} + +void LexicalReordering::EvaluateAfterTablePruning(MemPool &pool, + const TargetPhraseImpl &targetPhrase, const Phrase &sourcePhrase) const +{ /* Caches this phrase pair's reordering scores in targetPhrase.ffData[m_PhraseTableInd]; the external-model branches store NULL when they have no entry for the pair. */ + if (m_propertyInd >= 0) { + SCORE *scoreArr = targetPhrase.GetScoresProperty(m_propertyInd); + targetPhrase.ffData[m_PhraseTableInd] = scoreArr; +#ifdef HAVE_CMPH + } else if (m_compactModel) { + // using external compact binary model + const Values values = m_compactModel->GetScore(sourcePhrase, targetPhrase, + *m_blank); + if (values.size()) { + assert(values.size() == m_numScores); + + SCORE *scoreArr = pool.Allocate(m_numScores); + for (size_t i = 0; i < m_numScores; ++i) { + scoreArr[i] = values[i]; + } + targetPhrase.ffData[m_PhraseTableInd] = scoreArr; + } else { + targetPhrase.ffData[m_PhraseTableInd] = NULL; + } +#endif + } else if (m_coll) { + // using external memory model + + // cache data in target phrase + const Values *values = GetValues(sourcePhrase, targetPhrase); + + if (values) { + assert(values->size() == m_numScores); /* validated only after the NULL test: GetValues() returns NULL for pairs missing from m_coll */ + SCORE *scoreArr = pool.Allocate(m_numScores); + for (size_t i = 0; i < m_numScores; ++i) { + scoreArr[i] = (*values)[i]; + } + 
targetPhrase.ffData[m_PhraseTableInd] = scoreArr; + } else { + targetPhrase.ffData[m_PhraseTableInd] = NULL; + } + } +} + +void LexicalReordering::EvaluateWhenApplied(const ManagerBase &mgr, + const Hypothesis &hypo, const FFState &prevState, Scores &scores, + FFState &state) const +{ + const LRState &prevStateCast = static_cast(prevState); + prevStateCast.Expand(mgr, *this, hypo, m_PhraseTableInd, scores, state); +} + +const LexicalReordering::Values *LexicalReordering::GetValues( + const Phrase &source, const Phrase &target) const +{ + Key key(&source, &target); + Coll::const_iterator iter; + iter = m_coll->find(key); + if (iter == m_coll->end()) { + return NULL; + } else { + return &iter->second; + } +} + +void LexicalReordering::EvaluateWhenApplied(const SCFG::Manager &mgr, + const SCFG::Hypothesis &hypo, int featureID, Scores &scores, + FFState &state) const +{ + UTIL_THROW2("Not implemented"); +} + +} /* namespace Moses2 */ diff --git a/mosesdecoder/moses2/FF/LexicalReordering/LexicalReordering.h b/mosesdecoder/moses2/FF/LexicalReordering/LexicalReordering.h new file mode 100644 index 0000000000000000000000000000000000000000..59f63eba264fc0e5315a0f998370aaa7277e2faf --- /dev/null +++ b/mosesdecoder/moses2/FF/LexicalReordering/LexicalReordering.h @@ -0,0 +1,115 @@ +/* + * LexicalReordering.h + * + * Created on: 15 Dec 2015 + * Author: hieu + */ + +#pragma once +#include +#include +#include "../StatefulFeatureFunction.h" +#include "../../TypeDef.h" +#include "../../Phrase.h" +#include "../../legacy/Range.h" + +namespace Moses2 +{ + +class LexicalReorderingTableCompact; +class LRModel; +class TargetPhraseImpl; + +class LexicalReordering: public StatefulFeatureFunction +{ +public: + LexicalReordering(size_t startInd, const std::string &line); + virtual ~LexicalReordering(); + + virtual void Load(System &system); + + virtual void SetParameter(const std::string& key, const std::string& value); + + virtual size_t HasPhraseTableInd() const { + return true; + } + + 
virtual FFState* BlankState(MemPool &pool, const System &sys) const; + virtual void EmptyHypothesisState(FFState &state, const ManagerBase &mgr, + const InputType &input, const Hypothesis &hypo) const; + + virtual void + EvaluateInIsolation(MemPool &pool, const System &system, const Phrase &source, + const TargetPhraseImpl &targetPhrase, Scores &scores, + SCORE &estimatedScore) const; + + virtual void + EvaluateInIsolation(MemPool &pool, const System &system, const Phrase &source, + const TargetPhrase &targetPhrase, Scores &scores, + SCORE &estimatedScore) const; + + virtual void + EvaluateAfterTablePruning(MemPool &pool, const TargetPhrases &tps, + const Phrase &sourcePhrase) const; + + virtual void EvaluateWhenApplied(const ManagerBase &mgr, + const Hypothesis &hypo, const FFState &prevState, Scores &scores, + FFState &state) const; + + virtual void EvaluateWhenApplied(const SCFG::Manager &mgr, + const SCFG::Hypothesis &hypo, int featureID, Scores &scores, + FFState &state) const; + +protected: + std::string m_path; + FactorList m_FactorsF; + FactorList m_FactorsE; + FactorList m_FactorsC; + + LRModel *m_configuration; + + virtual void + EvaluateAfterTablePruning(MemPool &pool, const TargetPhraseImpl &targetPhrase, + const Phrase &sourcePhrase) const; + + // PROPERTY IN PT + int m_propertyInd; + + // COMPACT MODEL +#ifdef HAVE_CMPH + LexicalReorderingTableCompact *m_compactModel; +#endif + + Phrase *m_blank; + + // MEMORY MODEL + typedef std::pair*, const Phrase* > Key; + typedef std::vector Values; + + struct KeyComparer { + size_t operator()(const Key &obj) const { + size_t seed = obj.first->hash(); + boost::hash_combine(seed, obj.second->hash()); + return seed; + } + + bool operator()(const Key& a, const Key& b) const { + if ((*a.first) != (*b.first)) { + return false; + } + if ((*a.second) != (*b.second)) { + return false; + } + return true; + } + + }; + + typedef boost::unordered_map Coll; + Coll *m_coll; + + const Values *GetValues(const Phrase &source, 
const Phrase &target) const; +}; + +} /* namespace Moses2 */ + diff --git a/mosesdecoder/moses2/FF/LexicalReordering/PhraseBasedReorderingState.cpp b/mosesdecoder/moses2/FF/LexicalReordering/PhraseBasedReorderingState.cpp new file mode 100644 index 0000000000000000000000000000000000000000..6b80600212512e39b287359b92c0fa5699314d2a --- /dev/null +++ b/mosesdecoder/moses2/FF/LexicalReordering/PhraseBasedReorderingState.cpp @@ -0,0 +1,84 @@ +/* + * PhraseLR.cpp + * + * Created on: 22 Mar 2016 + * Author: hieu + */ + +#include "PhraseBasedReorderingState.h" +#include "LexicalReordering.h" +#include "../../PhraseBased/Hypothesis.h" +#include "../../InputPathBase.h" +#include "../../PhraseBased/Manager.h" + +using namespace std; + +namespace Moses2 +{ + +PhraseBasedReorderingState::PhraseBasedReorderingState(const LRModel &config, + LRModel::Direction dir, size_t offset) : + LRState(config, dir, offset) +{ + // uninitialised + prevPath = NULL; + prevTP = NULL; +} + +void PhraseBasedReorderingState::Init(const LRState *prev, + const TargetPhrase &topt, const InputPathBase &path, bool first, + const Bitmap *coverage) +{ + prevTP = &topt; + prevPath = &path; + m_first = first; +} + +size_t PhraseBasedReorderingState::hash() const +{ + size_t ret; + ret = (size_t) &prevPath->range; + boost::hash_combine(ret, m_direction); + + return ret; +} + +bool PhraseBasedReorderingState::operator==(const FFState& o) const +{ + if (&o == this) return true; + + const PhraseBasedReorderingState &other = + static_cast(o); + if (&prevPath->range == &other.prevPath->range) { + if (m_direction == LRModel::Forward) { + int compareScore = ComparePrevScores(other.prevTP); + return compareScore == 0; + } else { + return true; + } + } else { + return false; + } +} + +void PhraseBasedReorderingState::Expand(const ManagerBase &mgr, + const LexicalReordering &ff, const Hypothesis &hypo, size_t phraseTableInd, + Scores &scores, FFState &state) const +{ + if ((m_direction != LRModel::Forward) || 
!m_first) { + LRModel const& lrmodel = m_configuration; + Range const &cur = hypo.GetInputPath().range; + LRModel::ReorderingType reoType = ( + m_first ? + lrmodel.GetOrientation(cur) : + lrmodel.GetOrientation(prevPath->range, cur)); + CopyScores(mgr.system, scores, hypo.GetTargetPhrase(), reoType); + } + + PhraseBasedReorderingState &stateCast = + static_cast(state); + stateCast.Init(this, hypo.GetTargetPhrase(), hypo.GetInputPath(), false, + NULL); +} + +} /* namespace Moses2 */ diff --git a/mosesdecoder/moses2/FF/LexicalReordering/PhraseBasedReorderingState.h b/mosesdecoder/moses2/FF/LexicalReordering/PhraseBasedReorderingState.h new file mode 100644 index 0000000000000000000000000000000000000000..77994e4771f0076c7517660cf6b27c81910c9e83 --- /dev/null +++ b/mosesdecoder/moses2/FF/LexicalReordering/PhraseBasedReorderingState.h @@ -0,0 +1,44 @@ +/* + * PhraseLR.h + * + * Created on: 22 Mar 2016 + * Author: hieu + */ + +#pragma once +#include "LRState.h" + +namespace Moses2 +{ + +class InputPathBase; + +class PhraseBasedReorderingState: public LRState +{ +public: + const InputPathBase *prevPath; + bool m_first; + + PhraseBasedReorderingState(const LRModel &config, LRModel::Direction dir, + size_t offset); + + void Init(const LRState *prev, const TargetPhrase &topt, + const InputPathBase &path, bool first, const Bitmap *coverage); + + size_t hash() const; + virtual bool operator==(const FFState& other) const; + + virtual std::string ToString() const { + return "PhraseBasedReorderingState"; + } + + void Expand(const ManagerBase &mgr, const LexicalReordering &ff, + const Hypothesis &hypo, size_t phraseTableInd, Scores &scores, + FFState &state) const; + +protected: + +}; + +} /* namespace Moses2 */ + diff --git a/mosesdecoder/moses2/FF/LexicalReordering/ReorderingStack.cpp b/mosesdecoder/moses2/FF/LexicalReordering/ReorderingStack.cpp new file mode 100644 index 0000000000000000000000000000000000000000..6a4bf3c3359aac988d68792074b20931b50105db --- /dev/null +++ 
b/mosesdecoder/moses2/FF/LexicalReordering/ReorderingStack.cpp @@ -0,0 +1,99 @@ +/* + * ReorderingStack.cpp + ** Author: Ankit K. Srivastava + ** Date: Jan 26, 2010 + */ + +#include +#include "ReorderingStack.h" +#include "../../MemPool.h" + +namespace Moses2 +{ +/* Builds an empty stack; the pool is handed to the constructor of m_stack. */ +ReorderingStack::ReorderingStack(MemPool &pool) : + m_stack(pool) +{ + +} + +/* Empties the stack by clearing m_stack. */ +void ReorderingStack::Init() +{ + m_stack.clear(); +} + +/* Hash over every span currently on the stack, computed with boost::hash_range. */ +size_t ReorderingStack::hash() const +{ + std::size_t ret = boost::hash_range(m_stack.begin(), m_stack.end()); + return ret; +} + +/* Two stacks are equal when their m_stack members compare equal. NOTE(review): `o` is already a ReorderingStack reference, so the cast below looks redundant. */ +bool ReorderingStack::operator==(const ReorderingStack& o) const +{ + const ReorderingStack& other = static_cast(o); + return m_stack == other.m_stack; +} + +// Method to push (shift element into the stack and reduce if reqd) +/* Returns input_span.GetStartPos() + 1 when the stack was empty, otherwise the distance computed against the previous top span. A distance of 1 or -1 pops the top and merges through Reduce(); any other distance pushes input_span unchanged. */ +int ReorderingStack::ShiftReduce(const Range &input_span) +{ + int distance; // value to return: the initial distance between this and previous span + + // stack is empty + if (m_stack.empty()) { + m_stack.push_back(input_span); + return input_span.GetStartPos() + 1; // - (-1) + } + + // stack is non-empty + Range prev_span = m_stack.back(); //access last element added + + //calculate the distance we are returning + if (input_span.GetStartPos() > prev_span.GetStartPos()) { + distance = input_span.GetStartPos() - prev_span.GetEndPos(); + } else { + distance = input_span.GetEndPos() - prev_span.GetStartPos(); + } + + if (distance == 1) { //monotone + m_stack.pop_back(); + Range new_span(prev_span.GetStartPos(), input_span.GetEndPos()); + Reduce(new_span); + } else if (distance == -1) { //swap + m_stack.pop_back(); + Range new_span(input_span.GetStartPos(), prev_span.GetEndPos()); + Reduce(new_span); + } else { // discontinuous + m_stack.push_back(input_span); + } + + return distance; +} + +// Method to reduce, if possible the spans +/* Keeps merging `current` with the top of the stack while the two spans are directly adjacent (in either order), then pushes the merged span. */ +void ReorderingStack::Reduce(Range current) +{ + bool cont_loop = true; + + while (cont_loop && m_stack.size() > 0) { + + Range previous = m_stack.back(); + + if (current.GetStartPos() - 
previous.GetEndPos() == 1) { //mono&merge + m_stack.pop_back(); + Range t(previous.GetStartPos(), current.GetEndPos()); + current = t; + } else if (previous.GetStartPos() - current.GetEndPos() == 1) { //swap&merge + m_stack.pop_back(); + Range t(current.GetStartPos(), previous.GetEndPos()); + current = t; + } else { // discontinuous, no more merging + cont_loop = false; + } + } // finished reducing, exit + + // add to stack + m_stack.push_back(current); +} + +} + diff --git a/mosesdecoder/moses2/FF/LexicalReordering/ReorderingStack.h b/mosesdecoder/moses2/FF/LexicalReordering/ReorderingStack.h new file mode 100644 index 0000000000000000000000000000000000000000..fab986bc02870819f947a1df94e423854f9235b4 --- /dev/null +++ b/mosesdecoder/moses2/FF/LexicalReordering/ReorderingStack.h @@ -0,0 +1,41 @@ +/* + * ReorderingStack.h + ** Author: Ankit K. Srivastava + ** Date: Jan 26, 2010 + */ + +#pragma once + +//#include +#include +//#include "Factor.h" +//#include "Phrase.h" +//#include "TypeDef.h" +//#include "Util.h" +#include "../../legacy/Range.h" +#include "../../Vector.h" + +namespace Moses2 +{ +class MemPool; + +/* Stack of source spans (Range values) with shift-reduce merging; ShiftReduce() is the public entry point and Reduce() the private merge step. The stack supports hash() and operator==. */ +class ReorderingStack +{ +private: + + Vector m_stack; + +public: + ReorderingStack(MemPool &pool); + + size_t hash() const; + bool operator==(const ReorderingStack& other) const; + + void Init(); + int ShiftReduce(const Range &input_span); + +private: + void Reduce(Range input_span); +}; + +} diff --git a/mosesdecoder/moses2/FF/OSM/KenOSM.cpp b/mosesdecoder/moses2/FF/OSM/KenOSM.cpp new file mode 100644 index 0000000000000000000000000000000000000000..6b410fc9e6b2b45a9ec695b5b832efefcef3b806 --- /dev/null +++ b/mosesdecoder/moses2/FF/OSM/KenOSM.cpp @@ -0,0 +1,33 @@ +#include "KenOSM.h" + +namespace Moses2 +{ + +/* Creates the language-model wrapper for `file`. config.load_method is set from the argument, lm::ngram::RecognizeBinary reports the model type, and a KenOSM is returned for each recognised type; a file that is not recognised as binary also gets a KenOSM. An unknown model type throws through UTIL_THROW2. The returned object is allocated with `new`. NOTE(review): the template argument of every `new KenOSM` below is missing in this copy. */ +OSMLM* ConstructOSMLM(const char *file, util::LoadMethod load_method) +{ + lm::ngram::ModelType model_type; + lm::ngram::Config config; + config.load_method = load_method; + if (lm::ngram::RecognizeBinary(file, model_type)) { + switch(model_type) { + 
case lm::ngram::PROBING: + return new KenOSM(file, config); + case lm::ngram::REST_PROBING: + return new KenOSM(file, config); + case lm::ngram::TRIE: + return new KenOSM(file, config); + case lm::ngram::QUANT_TRIE: + return new KenOSM(file, config); + case lm::ngram::ARRAY_TRIE: + return new KenOSM(file, config); + case lm::ngram::QUANT_ARRAY_TRIE: + return new KenOSM(file, config); + default: + UTIL_THROW2("Unrecognized kenlm model type " << model_type); + } + } else { + return new KenOSM(file, config); + } +} + +} // namespace diff --git a/mosesdecoder/moses2/FF/OSM/KenOSM.h b/mosesdecoder/moses2/FF/OSM/KenOSM.h new file mode 100644 index 0000000000000000000000000000000000000000..f1275232f0ea274f9acb4db9568ea7d2fea75ff0 --- /dev/null +++ b/mosesdecoder/moses2/FF/OSM/KenOSM.h @@ -0,0 +1,53 @@ +#pragma once + +#include +#include "lm/model.hh" + +namespace Moses2 +{ + +/* Abstract interface to the language model: Score() takes an input state and a word and fills an output state; the other two members expose the begin-of-sentence and null-context states. */ +class KenOSMBase +{ +public: + virtual ~KenOSMBase() {} + + virtual float Score(const lm::ngram::State&, StringPiece, + lm::ngram::State&) const = 0; + + virtual const lm::ngram::State &BeginSentenceState() const = 0; + + virtual const lm::ngram::State &NullContextState() const = 0; +}; + +/* KenOSMBase implementation that forwards each call to its m_kenlm member of type KenModel; Score() first maps the word to a vocabulary index. NOTE(review): the parameter list after `template` is missing in this copy - presumably it declares KenModel. */ +template +class KenOSM : public KenOSMBase +{ +public: + KenOSM(const char *file, const lm::ngram::Config &config) + : m_kenlm(file, config) {} + + float Score(const lm::ngram::State &in_state, + StringPiece word, + lm::ngram::State &out_state) const { + return m_kenlm.Score(in_state, m_kenlm.GetVocabulary().Index(word), + out_state); + } + + const lm::ngram::State &BeginSentenceState() const { + return m_kenlm.BeginSentenceState(); + } + + const lm::ngram::State &NullContextState() const { + return m_kenlm.NullContextState(); + } + +private: + KenModel m_kenlm; +}; + +/* OSMLM is an alias for the abstract base class. */ +typedef KenOSMBase OSMLM; + +OSMLM* ConstructOSMLM(const char *file, util::LoadMethod load_method); + + +} // namespace diff --git a/mosesdecoder/moses2/FF/OSM/OpSequenceModel.cpp b/mosesdecoder/moses2/FF/OSM/OpSequenceModel.cpp new file mode 100644 
index 0000000000000000000000000000000000000000..093e5d81949ef91a69260a943219d3241d65d075 --- /dev/null +++ b/mosesdecoder/moses2/FF/OSM/OpSequenceModel.cpp @@ -0,0 +1,248 @@ +#include +#include "OpSequenceModel.h" +#include "osmHyp.h" +#include "lm/state.hh" +#include "../../PhraseBased/Manager.h" +#include "../../PhraseBased/Hypothesis.h" +#include "../../PhraseBased/TargetPhraseImpl.h" +#include "../../PhraseBased/Sentence.h" +#include "../../TranslationModel/UnknownWordPenalty.h" +#include "../../System.h" + +using namespace std; + +namespace Moses2 +{ + +//////////////////////////////////////////////////////////////////////////////////////// + +/* Sets the defaults (sFactor = 0, tFactor = 0, numFeatures = 5, load_method = util::READ) and then calls ReadParameters(). NOTE(review): the OSM pointer is not initialised here; it is first assigned in readLanguageModel(). */ +OpSequenceModel::OpSequenceModel(size_t startInd, const std::string &line) : + StatefulFeatureFunction(startInd, line) +{ + sFactor = 0; + tFactor = 0; + numFeatures = 5; + load_method = util::READ; + + ReadParameters(); +} + +/* NOTE(review): empty destructor - the object that readLanguageModel() stores in OSM (obtained from ConstructOSMLM) is not deleted here. */ +OpSequenceModel::~OpSequenceModel() +{ + // TODO Auto-generated destructor stub +} + +/* Loads the language model; the path comes from m_lmPath. */ +void OpSequenceModel::Load(System &system) +{ + readLanguageModel(m_lmPath.c_str()); +} + +/* Placement-constructs an osmState in memory obtained from the pool. */ +FFState* OpSequenceModel::BlankState(MemPool &pool, const System &sys) const +{ + return new (pool.Allocate()) osmState(); +} + +/* Initialises `state` with the language model's begin-of-sentence state; mgr, input and hypo are not used. */ +void OpSequenceModel::EmptyHypothesisState(FFState &state, + const ManagerBase &mgr, const InputType &input, + const Hypothesis &hypo) const +{ + lm::ngram::State startState = OSM->BeginSentenceState(); + + osmState &stateCast = static_cast(state); + stateCast.setState(startState); +} + +/* Computes the feature values for one phrase pair in isolation, starting from the null-context LM state, and adds their weighted sum to estimatedScore. `scores` is not modified in this function. Target words are replaced by "_TRANS_SLF_" when the phrase comes from the unknown-word penalty table and both factors are 0. */ +void OpSequenceModel::EvaluateInIsolation(MemPool &pool, + const System &system, const Phrase &source, + const TargetPhraseImpl &targetPhrase, Scores &scores, + SCORE &estimatedScore) const +{ + osmHypothesis obj; + obj.setState(OSM->NullContextState()); + + Bitmap myBitmap (pool, source.GetSize()); + myBitmap.Init(std::vector()); + + vector mySourcePhrase; + vector myTargetPhrase; + vector scoresVec; + vector alignments; + int startIndex = 0; + int endIndex = source.GetSize(); + + const AlignmentInfo &align = 
targetPhrase.GetAlignTerm(); + AlignmentInfo::const_iterator iter; + + for (iter = align.begin(); iter != align.end(); ++iter) { + alignments.push_back(iter->first); + alignments.push_back(iter->second); + } + + for (size_t i = 0; i < targetPhrase.GetSize(); i++) { + if (&targetPhrase.pt == system.featureFunctions.GetUnknownWordPenalty() && sFactor == 0 && tFactor == 0) + myTargetPhrase.push_back("_TRANS_SLF_"); + else + myTargetPhrase.push_back(targetPhrase[i][tFactor]->GetString().as_string()); + } + + for (size_t i = 0; i < source.GetSize(); i++) { + mySourcePhrase.push_back(source[i][sFactor]->GetString().as_string()); + } + + obj.setPhrases(mySourcePhrase , myTargetPhrase); + obj.constructCepts(alignments,startIndex,endIndex-1,targetPhrase.GetSize()); + obj.computeOSMFeature(startIndex,myBitmap); + obj.calculateOSMProb(*OSM); + obj.populateScores(scoresVec,numFeatures); + + SCORE weightedScore = Scores::CalcWeightedScore(system, *this, + scoresVec.data()); + estimatedScore += weightedScore; + +} + +/* TargetPhrase overload: not implemented, always throws through UTIL_THROW2. */ +void OpSequenceModel::EvaluateInIsolation(MemPool &pool, const System &system, const Phrase &source, + const TargetPhrase &targetPhrase, Scores &scores, + SCORE &estimatedScore) const +{ + UTIL_THROW2("Not implemented"); +} + +/* Evaluates the hypothesis in decoding context and, as its last step, saves the resulting state into `state`. NOTE(review): part of this function body appears to be missing from this copy, so the scoring steps cannot be described from here. */ +void OpSequenceModel::EvaluateWhenApplied(const ManagerBase &mgr, + const Hypothesis &hypo, const FFState &prevState, Scores &scores, + FFState &state) const +{ + const TargetPhrase &target = hypo.GetTargetPhrase(); + const Bitmap &bitmap = hypo.GetBitmap(); + Bitmap myBitmap(bitmap); + const ManagerBase &manager = hypo.GetManager(); + const InputType &source = manager.GetInput(); + const Sentence &sourceSentence = static_cast(source); + + osmHypothesis obj; + vector mySourcePhrase; + vector myTargetPhrase; + vector scoresVec; + + + //target.GetWord(0) + + //cerr << target <<" --- "<(curr_hypo.GetManager().GetSource()); + + + const Range & sourceRange = hypo.GetInputPath().range; + int startIndex = sourceRange.GetStartPos(); + int endIndex = 
sourceRange.GetEndPos(); + const AlignmentInfo &align = hypo.GetTargetPhrase().GetAlignTerm(); + // osmState * statePtr; + + vector alignments; + + + + AlignmentInfo::const_iterator iter; + + for (iter = align.begin(); iter != align.end(); ++iter) { + //cerr << iter->first << "----" << iter->second << " "; + alignments.push_back(iter->first); + alignments.push_back(iter->second); + } + + + //cerr<GetString().as_string()); + // cerr<GetString().as_string()); + + } + + + //cerr<(state); + obj.saveState(stateCast); +} + +/* SCFG overload: not implemented, always throws through UTIL_THROW2. */ +void OpSequenceModel::EvaluateWhenApplied(const SCFG::Manager &mgr, + const SCFG::Hypothesis &hypo, int featureID, Scores &scores, + FFState &state) const +{ + UTIL_THROW2("Not implemented"); +} + +/* Handles the keys "path", "support-features", "input-factor", "output-factor" and "load"; any other key is forwarded to StatefulFeatureFunction::SetParameter. For "support-features" the value "no" selects 1 feature and every other value 5. An unrecognised "load" value throws through UTIL_THROW2. */ +void OpSequenceModel::SetParameter(const std::string& key, const std::string& value) +{ + + if (key == "path") { + m_lmPath = value; + } else if (key == "support-features") { + if(value == "no") + numFeatures = 1; + else + numFeatures = 5; + } else if (key == "input-factor") { + sFactor = Scan(value); + } else if (key == "output-factor") { + tFactor = Scan(value); + } else if (key == "load") { + if (value == "lazy") { + load_method = util::LAZY; + } else if (value == "populate_or_lazy") { + load_method = util::POPULATE_OR_LAZY; + } else if (value == "populate_or_read" || value == "populate") { + load_method = util::POPULATE_OR_READ; + } else if (value == "read") { + load_method = util::READ; + } else if (value == "parallel_read") { + load_method = util::PARALLEL_READ; + } else { + UTIL_THROW2("Unknown KenLM load method " << value); + } + } else { + StatefulFeatureFunction::SetParameter(key, value); + } +} + +/* Creates the language model with ConstructOSMLM and stores in unkOpProb the score of the "_TRANS_SLF_" operation from the null-context state. NOTE(review): the lmFile argument is not used; the path is taken from m_lmPath instead. */ +void OpSequenceModel :: readLanguageModel(const char *lmFile) +{ + string unkOp = "_TRANS_SLF_"; + OSM = ConstructOSMLM(m_lmPath.c_str(), load_method); + + lm::ngram::State startState = OSM->NullContextState(); + lm::ngram::State endState; + unkOpProb = OSM->Score(startState,unkOp,endState); +} + +} diff --git 
a/mosesdecoder/moses2/FF/OSM/OpSequenceModel.h b/mosesdecoder/moses2/FF/OSM/OpSequenceModel.h
new file mode 100644
index 0000000000000000000000000000000000000000..f8b99e95c19bf3971d274b2d7771dbdca287b575
--- /dev/null
+++ b/mosesdecoder/moses2/FF/OSM/OpSequenceModel.h
@@ -0,0 +1,82 @@
+#pragma once
+
+#include "../StatefulFeatureFunction.h"
+#include "util/mmap.hh"
+#include "KenOSM.h"
+
+namespace Moses2
+{
+
+/**
+ * Operation Sequence Model (OSM) feature function.
+ *
+ * A stateful feature function that scores hypotheses with a language model
+ * accessed through the OSMLM interface declared in KenOSM.h.
+ *
+ * NOTE(review): OSM is a raw pointer that readLanguageModel() assigns from
+ * ConstructOSMLM(), which allocates with `new`; the destructor defined in
+ * OpSequenceModel.cpp is an empty stub, so the model is never deleted.
+ */
+class OpSequenceModel : public StatefulFeatureFunction
+{
+public:
+  OSMLM* OSM; // language model; assigned in readLanguageModel()
+  float unkOpProb; // score of the "_TRANS_SLF_" operation, set by readLanguageModel()
+  int numFeatures; // Number of features used ...
+  int sFactor; // Source Factor ...
+  int tFactor; // Target Factor ...
+  util::LoadMethod load_method; // method to load model
+
+  // Sets the defaults (both factors 0, 5 features, util::READ) and then
+  // reads the parameters.
+  OpSequenceModel(size_t startInd, const std::string &line);
+  virtual ~OpSequenceModel();
+
+  // Loads the language model named by the "path" parameter.
+  virtual void Load(System &system);
+
+  // Constructs a blank osmState in memory taken from `pool`.
+  virtual FFState* BlankState(MemPool &pool, const System &sys) const;
+  // Initialises `state` with the model's begin-of-sentence state.
+  virtual void EmptyHypothesisState(FFState &state, const ManagerBase &mgr,
+                                    const InputType &input, const Hypothesis &hypo) const;
+
+  // Scores one phrase pair in isolation; the weighted result is added to
+  // estimatedScore.
+  virtual void
+  EvaluateInIsolation(MemPool &pool, const System &system, const Phrase &source,
+                      const TargetPhraseImpl &targetPhrase, Scores &scores,
+                      SCORE &estimatedScore) const;
+
+  // Not implemented for this phrase type; the definition throws.
+  virtual void
+  EvaluateInIsolation(MemPool &pool, const System &system, const Phrase &source,
+                      const TargetPhrase &targetPhrase, Scores &scores,
+                      SCORE &estimatedScore) const;
+
+  // Evaluates `hypo` in decoding context and writes the new state into `state`.
+  virtual void EvaluateWhenApplied(const ManagerBase &mgr,
+                                   const Hypothesis &hypo, const FFState &prevState, Scores &scores,
+                                   FFState &state) const;
+
+  // SCFG decoding is not implemented; the definition throws.
+  virtual void EvaluateWhenApplied(const SCFG::Manager &mgr,
+                                   const SCFG::Hypothesis &hypo, int featureID, Scores &scores,
+                                   FFState &state) const;
+
+  // Recognised keys: path, support-features, input-factor, output-factor,
+  // load; anything else is passed to StatefulFeatureFunction::SetParameter.
+  void SetParameter(const std::string& key, const std::string& value);
+
+protected:
+  std::string m_lmPath; // value of the "path" parameter
+
+  // Creates OSM and fills unkOpProb. NOTE(review): the definition ignores its
+  // argument and uses m_lmPath.
+  void readLanguageModel(const char *);
+
+};
+
+}
+
+
diff --git a/mosesdecoder/moses2/FF/OSM/osmHyp.cpp b/mosesdecoder/moses2/FF/OSM/osmHyp.cpp
new file mode 100644
index 
0000000000000000000000000000000000000000..ede841a80d0284788c9acb1ff2fb123b9e3391c6 --- /dev/null +++ b/mosesdecoder/moses2/FF/OSM/osmHyp.cpp @@ -0,0 +1,601 @@ +#include "osmHyp.h" +#include + +using namespace std; +using namespace lm::ngram; + +namespace Moses2 +{ +void osmState::setState(const lm::ngram::State & val) +{ + j = 0; + E = 0; + lmState = val; +} + +void osmState::saveState(int jVal, int eVal, map & gapVal) +{ + gap.clear(); + gap = gapVal; + j = jVal; + E = eVal; +} + +size_t osmState::hash() const +{ + size_t ret = j; + + boost::hash_combine(ret, E); + boost::hash_combine(ret, gap); + boost::hash_combine(ret, lmState.length); + + return ret; +} + +bool osmState::operator==(const FFState& otherBase) const +{ + const osmState &other = static_cast(otherBase); + if (j != other.j) + return false; + if (E != other.E) + return false; + if (gap != other.gap) + return false; + if (lmState.length != other.lmState.length) + return false; + + return true; +} + +std::string osmState :: getName() const +{ + + return "done"; +} + +////////////////////////////////////////////////// + +osmHypothesis :: osmHypothesis() +{ + opProb = 0; + gapWidth = 0; + gapCount = 0; + openGapCount = 0; + deletionCount = 0; + gapCount = 0; + j = 0; + E = 0; + gap.clear(); +} + +void osmHypothesis :: setState(const FFState* prev_state) +{ + + if(prev_state != NULL) { + + j = static_cast (prev_state)->getJ(); + E = static_cast (prev_state)->getE(); + gap = static_cast (prev_state)->getGap(); + lmState = static_cast (prev_state)->getLMState(); + } +} + +void osmHypothesis :: saveState(osmState &state) +{ + state.setState(lmState); + state.saveState(j,E,gap); +} + +int osmHypothesis :: isTranslationOperation(int x) +{ + if (operations[x].find("_JMP_BCK_") != -1) + return 0; + + if (operations[x].find("_JMP_FWD_") != -1) + return 0; + + if (operations[x].find("_CONT_CEPT_") != -1) + return 0; + + if (operations[x].find("_INS_GAP_") != -1) + return 0; + + return 1; + +} + +void osmHypothesis 
:: removeReorderingOperations() +{ + gapCount = 0; + deletionCount = 0; + openGapCount = 0; + gapWidth = 0; + + std::vector tupleSequence; + + for (int x = 0; x < operations.size(); x++) { + // cout< & coverageVector) +{ + + int firstOG =-1; + + for(int nd = 0; nd < coverageVector.size(); nd++) { + if(coverageVector[nd]==0) { + firstOG = nd; + return firstOG; + } + } + + return firstOG; + +} + +string osmHypothesis :: intToString(int num) +{ + return SPrint(num); + +} + +void osmHypothesis :: generateOperations(int & startIndex , int j1 , int contFlag , Bitmap & coverageVector , string english , string german , set & targetNullWords , vector & currF) +{ + + int gFlag = 0; + int gp = 0; + int ans; + + + if ( j < j1) { // j1 is the index of the source word we are about to generate ... + //if(coverageVector[j]==0) // if source word at j is not generated yet ... + if(coverageVector.GetValue(j)==0) { // if source word at j is not generated yet ... + operations.push_back("_INS_GAP_"); + gFlag++; + gap[j]="Unfilled"; + } + if (j == E) { + j = j1; + } else { + operations.push_back("_JMP_FWD_"); + j=E; + } + } + + if (j1 < j) { + // if(j < E && coverageVector[j]==0) + if(j < E && coverageVector.GetValue(j)==0) { + operations.push_back("_INS_GAP_"); + gFlag++; + gap[j]="Unfilled"; + } + + j=closestGap(gap,j1,gp); + operations.push_back("_JMP_BCK_"+ intToString(gp)); + + //cout<<"I am j "< 0) + gapCount++; + + openGapCount += getOpenGaps(); + + //if (coverageVector[j] == 0 && targetNullWords.find(j) != targetNullWords.end()) + if (j < coverageVector.GetSize()) { + if (coverageVector.GetValue(j) == 0 && targetNullWords.find(j) != targetNullWords.end()) { + j1 = j; + german = currF[j1-startIndex]; + english = "_INS_"; + generateOperations(startIndex, j1, 2 , coverageVector , english , german , targetNullWords , currF); + } + } + +} + +void osmHypothesis :: print() +{ + for (int i = 0; i< operations.size(); i++) { + cerr< gap, int j1, int & gp) +{ + + int dist=1172; + int 
value=-1; + int temp=0; + gp=0; + int opGap=0; + + map :: iterator iter; + + iter=gap.end(); + + do { + iter--; + //cout<<"Trapped "<first<first==j1 && iter->second== "Unfilled") { + opGap++; + gp = opGap; + return j1; + + } + + if(iter->second =="Unfilled") { + opGap++; + temp = iter->first - j1; + + if(temp<0) + temp=temp * -1; + + if(dist>temp && iter->first < j1) { + dist=temp; + value=iter->first; + gp=opGap; + } + } + + + } while(iter!=gap.begin()); + + return value; +} + + + +int osmHypothesis :: getOpenGaps() +{ + map :: iterator iter; + + int nd = 0; + for (iter = gap.begin(); iter!=gap.end(); iter++) { + if(iter->second == "Unfilled") + nd++; + } + + return nd; + +} + +void osmHypothesis :: generateDeleteOperations(std::string english, int currTargetIndex, std::set doneTargetIndexes) +{ + + operations.push_back("_DEL_" + english); + currTargetIndex++; + + while(doneTargetIndexes.find(currTargetIndex) != doneTargetIndexes.end()) { + currTargetIndex++; + } + + if (sourceNullWords.find(currTargetIndex) != sourceNullWords.end()) { + english = currE[currTargetIndex]; + generateDeleteOperations(english,currTargetIndex,doneTargetIndexes); + } + +} + +void osmHypothesis :: computeOSMFeature(int startIndex , Bitmap & coverageVector) +{ + + set doneTargetIndexes; + set eSide; + set fSide; + set :: iterator iter; + string english; + string source; + int j1; + int targetIndex = 0; + doneTargetIndexes.clear(); + + + if (targetNullWords.size() != 0) { // Source words to be deleted in the start of this phrase ... + iter = targetNullWords.begin(); + + if (*iter == startIndex) { + + j1 = startIndex; + source = currF[j1-startIndex]; + english = "_INS_"; + generateOperations(startIndex, j1, 2 , coverageVector , english , source , targetNullWords , currF); + } + } + + if (sourceNullWords.find(targetIndex) != sourceNullWords.end()) { // first word has to be deleted ... 
+ english = currE[targetIndex]; + generateDeleteOperations(english,targetIndex, doneTargetIndexes); + } + + + for (size_t i = 0; i < ceptsInPhrase.size(); i++) { + source = ""; + english = ""; + + fSide = ceptsInPhrase[i].first; + eSide = ceptsInPhrase[i].second; + + iter = eSide.begin(); + targetIndex = *iter; + english += currE[*iter]; + iter++; + + for (; iter != eSide.end(); iter++) { + if(*iter == targetIndex+1) + targetIndex++; + else + doneTargetIndexes.insert(*iter); + + english += "^_^"; + english += currE[*iter]; + } + + iter = fSide.begin(); + source += currF[*iter]; + iter++; + + for (; iter != fSide.end(); iter++) { + source += "^_^"; + source += currF[*iter]; + } + + iter = fSide.begin(); + j1 = *iter + startIndex; + iter++; + + generateOperations(startIndex, j1, 0 , coverageVector , english , source , targetNullWords , currF); + + + for (; iter != fSide.end(); iter++) { + j1 = *iter + startIndex; + generateOperations(startIndex, j1, 1 , coverageVector , english , source , targetNullWords , currF); + } + + targetIndex++; // Check whether the next target word is unaligned ... 
+ + while(doneTargetIndexes.find(targetIndex) != doneTargetIndexes.end()) { + targetIndex++; + } + + if(sourceNullWords.find(targetIndex) != sourceNullWords.end()) { + english = currE[targetIndex]; + generateDeleteOperations(english,targetIndex, doneTargetIndexes); + } + } + + //removeReorderingOperations(); + + //print(); + +} + +void osmHypothesis :: getMeCepts ( set & eSide , set & fSide , map > & tS , map > & sT) +{ + set :: iterator iter; + + int sz = eSide.size(); + vector t; + + for (iter = eSide.begin(); iter != eSide.end(); iter++) { + t = tS[*iter]; + + for (size_t i = 0; i < t.size(); i++) { + fSide.insert(t[i]); + } + + } + + for (iter = fSide.begin(); iter != fSide.end(); iter++) { + + t = sT[*iter]; + + for (size_t i = 0 ; i sz) { + getMeCepts(eSide,fSide,tS,sT); + } + +} + +void osmHypothesis :: constructCepts(vector & align , int startIndex , int endIndex, int targetPhraseLength) +{ + + std::map > sT; + std::map > tS; + std::set eSide; + std::set fSide; + std::set :: iterator iter; + std :: map > :: iterator iter2; + std :: pair < set , set > cept; + int src; + int tgt; + + + for (size_t i = 0; i < align.size(); i+=2) { + src = align[i]; + tgt = align[i+1]; + tS[tgt].push_back(src); + sT[src].push_back(tgt); + } + + for (int i = startIndex; i<= endIndex; i++) { // What are unaligned source words in this phrase ... + if (sT.find(i-startIndex) == sT.end()) { + targetNullWords.insert(i); + } + } + + for (int i = 0; i < targetPhraseLength; i++) { // What are unaligned target words in this phrase ... 
+ if (tS.find(i) == tS.end()) { + sourceNullWords.insert(i); + } + } + + + while (tS.size() != 0 && sT.size() != 0) { + + iter2 = tS.begin(); + + eSide.clear(); + fSide.clear(); + eSide.insert (iter2->first); + + getMeCepts(eSide, fSide, tS , sT); + + for (iter = eSide.begin(); iter != eSide.end(); iter++) { + iter2 = tS.find(*iter); + tS.erase(iter2); + } + + for (iter = fSide.begin(); iter != fSide.end(); iter++) { + iter2 = sT.find(*iter); + sT.erase(iter2); + } + + cept = make_pair (fSide , eSide); + ceptsInPhrase.push_back(cept); + } + + + + /* + + cerr<<"Extracted Cepts "< "; + + for (iter = fSide.begin(); iter != fSide.end(); iter++) + { + cerr<<*iter<<" "; + } + + cerr<"<"< & scores , const int numFeatures) +{ + scores.clear(); + scores.push_back(opProb); + + if (numFeatures == 1) + return; + + scores.push_back(gapWidth); + scores.push_back(gapCount); + scores.push_back(openGapCount); + scores.push_back(deletionCount); +} + + +} // namespace + diff --git a/mosesdecoder/moses2/FF/OSM/osmHyp.h b/mosesdecoder/moses2/FF/OSM/osmHyp.h new file mode 100644 index 0000000000000000000000000000000000000000..338b73ec2ddc73857f3ab7daa7a4a8024ca836d6 --- /dev/null +++ b/mosesdecoder/moses2/FF/OSM/osmHyp.h @@ -0,0 +1,112 @@ +#pragma once + +# include +# include +# include +# include +#include "KenOSM.h" +# include "../FFState.h" +# include "../../legacy/Bitmap.h" + +namespace Moses2 +{ + +class osmState : public FFState +{ +public: + osmState() + {} + + void setState(const lm::ngram::State & val); + + virtual size_t hash() const; + virtual bool operator==(const FFState& other) const; + + virtual std::string ToString() const { + return "osmState"; + } + + void saveState(int jVal, int eVal, std::map & gapVal); + int getJ()const { + return j; + } + int getE()const { + return E; + } + std::map getGap() const { + return gap; + } + + lm::ngram::State getLMState() const { + return lmState; + } + + void print() const; + std::string getName() const; + +protected: + int j, E; + 
std::map gap; + lm::ngram::State lmState; +}; + +class osmHypothesis +{ + +private: + + + std::vector operations; // List of operations required to generated this hyp ... + std::map gap; // Maintains gap history ... + int j; // Position after the last source word generated ... + int E; // Position after the right most source word so far generated ... + lm::ngram::State lmState; // KenLM's Model State ... + + int gapCount; // Number of gaps inserted ... + int deletionCount; + int openGapCount; + int gapWidth; + double opProb; + + std::vector currE; + std::vector currF; + std::vector < std::pair < std::set , std::set > > ceptsInPhrase; + std::set targetNullWords; + std::set sourceNullWords; + + int closestGap(std::map gap,int j1, int & gp); + int firstOpenGap(std::vector & coverageVector); + std::string intToString(int); + int getOpenGaps(); + int isTranslationOperation(int j); + void removeReorderingOperations(); + + void getMeCepts ( std::set & eSide , std::set & fSide , std::map > & tS , std::map > & sT); + +public: + + osmHypothesis(); + ~osmHypothesis() {}; + void generateOperations(int & startIndex, int j1 , int contFlag , Bitmap & coverageVector , std::string english , std::string german , std::set & targetNullWords , std::vector & currF); + void generateDeleteOperations(std::string english, int currTargetIndex, std::set doneTargetIndexes); + void calculateOSMProb(OSMLM& ptrOp); + void computeOSMFeature(int startIndex , Bitmap & coverageVector); + void constructCepts(std::vector & align , int startIndex , int endIndex, int targetPhraseLength); + void setPhrases(std::vector & val1 , std::vector & val2) { + currF = val1; + currE = val2; + } + void setState(const FFState* prev_state); + void saveState(osmState &state); + void print(); + void populateScores(std::vector & scores , const int numFeatures); + void setState(const lm::ngram::State & val) { + lmState = val; + } + +}; + +} // namespace + + + diff --git a/mosesdecoder/moses2/FF/PhrasePenalty.cpp 
b/mosesdecoder/moses2/FF/PhrasePenalty.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..84087740da6de710d459ac557cce50a7c067dce9
--- /dev/null
+++ b/mosesdecoder/moses2/FF/PhrasePenalty.cpp
@@ -0,0 +1,44 @@
+/*
+ * PhrasePenalty.cpp
+ *
+ * Created on: 27 Oct 2015
+ * Author: hieu
+ */
+
+#include "PhrasePenalty.h"
+#include "../Scores.h"
+
+namespace Moses2
+{
+
+// Forwards to the base class and then reads the feature's parameters.
+PhrasePenalty::PhrasePenalty(size_t startInd, const std::string &line) :
+  StatelessFeatureFunction(startInd, line)
+{
+  ReadParameters();
+}
+
+PhrasePenalty::~PhrasePenalty()
+{
+  // TODO Auto-generated destructor stub
+}
+
+// Adds a constant 1 to this feature's score for every target phrase; pool,
+// source, targetPhrase and estimatedScore are not used.
+void PhrasePenalty::EvaluateInIsolation(MemPool &pool, const System &system,
+                                        const Phrase &source, const TargetPhraseImpl &targetPhrase, Scores &scores,
+                                        SCORE &estimatedScore) const
+{
+  scores.PlusEquals(system, *this, 1);
+}
+
+// Same constant score for the TargetPhrase overload.
+void PhrasePenalty::EvaluateInIsolation(MemPool &pool, const System &system, const Phrase &source,
+                                        const TargetPhrase &targetPhrase, Scores &scores,
+                                        SCORE &estimatedScore) const
+{
+  scores.PlusEquals(system, *this, 1);
+}
+
+}
+
diff --git a/mosesdecoder/moses2/FF/PhrasePenalty.h b/mosesdecoder/moses2/FF/PhrasePenalty.h
new file mode 100644
index 0000000000000000000000000000000000000000..855bdbf09c61dcb8b82208d3df2e6546c711d877
--- /dev/null
+++ b/mosesdecoder/moses2/FF/PhrasePenalty.h
@@ -0,0 +1,35 @@
+/*
+ * PhrasePenalty.h
+ *
+ * Created on: 27 Oct 2015
+ * Author: hieu
+ */
+
+#pragma once
+
+#include "StatelessFeatureFunction.h"
+
+namespace Moses2
+{
+
+// Stateless feature that contributes a constant score of 1 per target phrase.
+class PhrasePenalty: public StatelessFeatureFunction
+{
+public:
+  PhrasePenalty(size_t startInd, const std::string &line);
+  virtual ~PhrasePenalty();
+
+  virtual void
+  EvaluateInIsolation(MemPool &pool, const System &system, const Phrase &source,
+                      const TargetPhraseImpl &targetPhrase, Scores &scores,
+                      SCORE &estimatedScore) const;
+
+  virtual void
+  EvaluateInIsolation(MemPool &pool, const System &system, const Phrase &source,
+                      const TargetPhrase &targetPhrase, Scores &scores,
+                      SCORE &estimatedScore) const;
+
+};
+
+}
+
diff --git a/mosesdecoder/moses2/FF/PointerState.cpp b/mosesdecoder/moses2/FF/PointerState.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..facb0a2f9cd750b9fa71496773a7a61a870d81a1
--- /dev/null
+++ b/mosesdecoder/moses2/FF/PointerState.cpp
@@ -0,0 +1,6 @@
+#include "PointerState.h"
+
+namespace Moses2
+{
+
+}
diff --git a/mosesdecoder/moses2/FF/PointerState.h b/mosesdecoder/moses2/FF/PointerState.h
new file mode 100644
index 0000000000000000000000000000000000000000..a73b576500f8c2ea3deb7600787691a5a746bf62
--- /dev/null
+++ b/mosesdecoder/moses2/FF/PointerState.h
@@ -0,0 +1,42 @@
+#pragma once
+
+#include <cstddef>
+#include <sstream>
+#include "FFState.h"
+
+namespace Moses2
+{
+
+// Feature-function state that wraps a single opaque pointer; hashing and
+// equality are by pointer value.
+struct PointerState: public FFState {
+  const void* lmstate;
+
+  // Starts out as NULL so that hash(), operator== and ToString() never read
+  // an indeterminate pointer.
+  explicit PointerState() : lmstate(NULL) {
+  }
+
+  PointerState(const void* lms) {
+    lmstate = lms;
+  }
+  virtual size_t hash() const {
+    return (size_t) lmstate;
+  }
+  virtual bool operator==(const FFState& other) const {
+    // assumes `other` is a PointerState (unchecked static_cast)
+    const PointerState& o = static_cast<const PointerState&>(other);
+    return lmstate == o.lmstate;
+  }
+
+  // Prints the pointer value.
+  virtual std::string ToString() const {
+    std::stringstream sb;
+    sb << lmstate;
+    return sb.str();
+  }
+
+};
+
+}
+
diff --git a/mosesdecoder/moses2/FF/StatefulFeatureFunction.cpp b/mosesdecoder/moses2/FF/StatefulFeatureFunction.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..6d8045dd3cc7c48956df1676ea6f5fc9e0a5b6b0
--- /dev/null
+++ b/mosesdecoder/moses2/FF/StatefulFeatureFunction.cpp
@@ -0,0 +1,69 @@
+/*
+ * StatefulFeatureFunction.cpp
+ *
+ * Created on: 24 Oct 2015
+ * Author: hieu
+ */
+#ifdef __linux
+#include
+#endif
+#include
+#include
+#include
+
+#include
+#include "StatefulFeatureFunction.h"
+#include "../PhraseBased/Hypothesis.h"
+
+using namespace std;
+
+namespace Moses2
+{
+
+StatefulFeatureFunction::StatefulFeatureFunction(size_t startInd,
+    const std::string &line) :
+  FeatureFunction(startInd, line)
+{
+}
+
+StatefulFeatureFunction::~StatefulFeatureFunction() +{ + // TODO Auto-generated destructor stub +} + +void StatefulFeatureFunction::EvaluateWhenAppliedBatch( + const System &system, + const Batch &batch) const +{ + //cerr << "EvaluateWhenAppliedBatch:" << m_name << endl; +#ifdef __linux + /* + pthread_t handle; + handle = pthread_self(); + + int s; + cpu_set_t cpusetOrig, cpuset; + s = pthread_getaffinity_np(handle, sizeof(cpu_set_t), &cpusetOrig); + + CPU_ZERO(&cpuset); + + int core = handle % 8; + core += 24; + CPU_SET(core, &cpuset); + + s = pthread_setaffinity_np(handle, sizeof(cpu_set_t), &cpuset); + */ +#endif + + for (size_t i = 0; i < batch.size(); ++i) { + Hypothesis *hypo = batch[i]; + hypo->EvaluateWhenApplied(*this); + } + +#ifdef __linux + // s = pthread_setaffinity_np(handle, sizeof(cpu_set_t), &cpusetOrig); +#endif +} + +} + diff --git a/mosesdecoder/moses2/FF/StatefulFeatureFunction.h b/mosesdecoder/moses2/FF/StatefulFeatureFunction.h new file mode 100644 index 0000000000000000000000000000000000000000..7cb3eaae97c23373c4236b9842fd7d8ec58205f7 --- /dev/null +++ b/mosesdecoder/moses2/FF/StatefulFeatureFunction.h @@ -0,0 +1,66 @@ +/* + * StatefulFeatureFunction.h + * + * Created on: 24 Oct 2015 + * Author: hieu + */ + +#ifndef STATEFULFEATUREFUNCTION_H_ +#define STATEFULFEATUREFUNCTION_H_ + +#include "FeatureFunction.h" +#include "FFState.h" +#include "../MemPool.h" + +namespace Moses2 +{ + +class Hypothesis; +class InputType; + +namespace SCFG +{ +class Hypothesis; +class Manager; +} + +class StatefulFeatureFunction: public FeatureFunction +{ +public: + StatefulFeatureFunction(size_t startInd, const std::string &line); + virtual ~StatefulFeatureFunction(); + + void SetStatefulInd(size_t ind) { + m_statefulInd = ind; + } + size_t GetStatefulInd() const { + return m_statefulInd; + } + + //! return uninitialise state + virtual FFState* BlankState(MemPool &pool, const System &sys) const = 0; + + //! 
return the state associated with the empty hypothesis for a given sentence + virtual void EmptyHypothesisState(FFState &state, const ManagerBase &mgr, + const InputType &input, const Hypothesis &hypo) const = 0; + + virtual void EvaluateWhenApplied(const ManagerBase &mgr, + const Hypothesis &hypo, const FFState &prevState, Scores &scores, + FFState &state) const = 0; + + virtual void EvaluateWhenApplied(const SCFG::Manager &mgr, + const SCFG::Hypothesis &hypo, int featureID, Scores &scores, + FFState &state) const = 0; + + virtual void EvaluateWhenAppliedBatch( + const System &system, + const Batch &batch) const; + +protected: + size_t m_statefulInd; + +}; + +} + +#endif /* STATEFULFEATUREFUNCTION_H_ */ diff --git a/mosesdecoder/moses2/FF/StatelessFeatureFunction.cpp b/mosesdecoder/moses2/FF/StatelessFeatureFunction.cpp new file mode 100644 index 0000000000000000000000000000000000000000..c73d8907c2fb61a8afda8098b9ad0f8c77ac7b8a --- /dev/null +++ b/mosesdecoder/moses2/FF/StatelessFeatureFunction.cpp @@ -0,0 +1,27 @@ +/* + * StatelessFeatureFunction.cpp + * + * Created on: 24 Oct 2015 + * Author: hieu + */ + +#include "StatelessFeatureFunction.h" + +namespace Moses2 +{ + +StatelessFeatureFunction::StatelessFeatureFunction(size_t startInd, + const std::string &line) : + FeatureFunction(startInd, line) +{ + // TODO Auto-generated constructor stub + +} + +StatelessFeatureFunction::~StatelessFeatureFunction() +{ + // TODO Auto-generated destructor stub +} + +} + diff --git a/mosesdecoder/moses2/FF/StatelessFeatureFunction.h b/mosesdecoder/moses2/FF/StatelessFeatureFunction.h new file mode 100644 index 0000000000000000000000000000000000000000..249e4fdfe4424b66c3ba2f17c5230b49885fb02f --- /dev/null +++ b/mosesdecoder/moses2/FF/StatelessFeatureFunction.h @@ -0,0 +1,25 @@ +/* + * StatelessFeatureFunction.h + * + * Created on: 24 Oct 2015 + * Author: hieu + */ + +#ifndef STATELESSFEATUREFUNCTION_H_ +#define STATELESSFEATUREFUNCTION_H_ + +#include "FeatureFunction.h" + 
+namespace Moses2 +{ + +class StatelessFeatureFunction: public FeatureFunction +{ +public: + StatelessFeatureFunction(size_t startInd, const std::string &line); + virtual ~StatelessFeatureFunction(); +}; + +} + +#endif /* STATELESSFEATUREFUNCTION_H_ */ diff --git a/mosesdecoder/moses2/FF/WordPenalty.cpp b/mosesdecoder/moses2/FF/WordPenalty.cpp new file mode 100644 index 0000000000000000000000000000000000000000..57682053927b51aae3a7546173f576a38f4e47b0 --- /dev/null +++ b/mosesdecoder/moses2/FF/WordPenalty.cpp @@ -0,0 +1,53 @@ +/* + * WordPenalty.cpp + * + * Created on: 28 Oct 2015 + * Author: hieu + */ + +#include "WordPenalty.h" +#include "../TypeDef.h" +#include "../Scores.h" +#include "../Phrase.h" +#include "../TargetPhrase.h" +#include "../SCFG/Word.h" +#include "../PhraseBased/TargetPhraseImpl.h" + +namespace Moses2 +{ + +WordPenalty::WordPenalty(size_t startInd, const std::string &line) : + StatelessFeatureFunction(startInd, line) +{ + ReadParameters(); +} + +WordPenalty::~WordPenalty() +{ + // TODO Auto-generated destructor stub +} + +void WordPenalty::EvaluateInIsolation(MemPool &pool, const System &system, + const Phrase &source, const TargetPhraseImpl &targetPhrase, Scores &scores, + SCORE &estimatedScore) const +{ + SCORE score = -(SCORE) targetPhrase.GetSize(); + scores.PlusEquals(system, *this, score); +} + +void WordPenalty::EvaluateInIsolation(MemPool &pool, const System &system, const Phrase &source, + const TargetPhrase &targetPhrase, Scores &scores, + SCORE &estimatedScore) const +{ + size_t count = 0; + for (size_t i = 0; i < targetPhrase.GetSize(); ++i) { + const SCFG::Word &word = targetPhrase[i]; + if (!word.isNonTerminal) { + ++count; + } + } + scores.PlusEquals(system, *this, -(SCORE) count); +} + +} + diff --git a/mosesdecoder/moses2/FF/WordPenalty.h b/mosesdecoder/moses2/FF/WordPenalty.h new file mode 100644 index 0000000000000000000000000000000000000000..acd1bb8739cbf6831281ca2fde435d0eeb38e403 --- /dev/null +++ 
b/mosesdecoder/moses2/FF/WordPenalty.h @@ -0,0 +1,37 @@ +/* + * WordPenalty.h + * + * Created on: 28 Oct 2015 + * Author: hieu + */ + +#ifndef WORDPENALTY_H_ +#define WORDPENALTY_H_ + +#include "StatelessFeatureFunction.h" + +namespace Moses2 +{ + +class WordPenalty: public StatelessFeatureFunction +{ +public: + WordPenalty(size_t startInd, const std::string &line); + virtual ~WordPenalty(); + + virtual void + EvaluateInIsolation(MemPool &pool, const System &system, const Phrase &source, + const TargetPhraseImpl &targetPhrase, Scores &scores, + SCORE &estimatedScore) const; + + virtual void + EvaluateInIsolation(MemPool &pool, const System &system, const Phrase &source, + const TargetPhrase &targetPhrase, Scores &scores, + SCORE &estimatedScore) const; + +}; + +} + +#endif /* WORDPENALTY_H_ */ + diff --git a/mosesdecoder/moses2/HypothesisBase.cpp b/mosesdecoder/moses2/HypothesisBase.cpp new file mode 100644 index 0000000000000000000000000000000000000000..c124866d1036d62bdd7bd98c047b94195f1080cd --- /dev/null +++ b/mosesdecoder/moses2/HypothesisBase.cpp @@ -0,0 +1,81 @@ +/* + * Hypothesis.cpp + * + * Created on: 24 Oct 2015 + * Author: hieu + */ + +#include +#include +#include +#include "HypothesisBase.h" +#include "System.h" +#include "Scores.h" +#include "ManagerBase.h" +#include "MemPool.h" +#include "FF/StatefulFeatureFunction.h" + +using namespace std; + +namespace Moses2 +{ + +//size_t g_numHypos = 0; + +HypothesisBase::HypothesisBase(MemPool &pool, const System &system) +{ + m_scores = new (pool.Allocate()) Scores(system, pool, + system.featureFunctions.GetNumScores()); + + // FF states + const std::vector &sfffs = + system.featureFunctions.GetStatefulFeatureFunctions(); + size_t numStatefulFFs = sfffs.size(); + m_ffStates = (FFState **) pool.Allocate(sizeof(FFState*) * numStatefulFFs); + + BOOST_FOREACH(const StatefulFeatureFunction *sfff, sfffs) { + size_t statefulInd = sfff->GetStatefulInd(); + FFState *state = sfff->BlankState(pool, system); + 
m_ffStates[statefulInd] = state; + } +} + +size_t HypothesisBase::hash() const +{ + return hash(0); +} + +size_t HypothesisBase::hash(size_t seed) const +{ + size_t numStatefulFFs = + GetManager().system.featureFunctions.GetStatefulFeatureFunctions().size(); + + // states + for (size_t i = 0; i < numStatefulFFs; ++i) { + const FFState *state = m_ffStates[i]; + size_t hash = state->hash(); + boost::hash_combine(seed, hash); + } + return seed; + +} + +bool HypothesisBase::operator==(const HypothesisBase &other) const +{ + size_t numStatefulFFs = + GetManager().system.featureFunctions.GetStatefulFeatureFunctions().size(); + + // states + for (size_t i = 0; i < numStatefulFFs; ++i) { + const FFState &thisState = *m_ffStates[i]; + const FFState &otherState = *other.m_ffStates[i]; + if (thisState != otherState) { + return false; + } + } + return true; + +} + +} + diff --git a/mosesdecoder/moses2/HypothesisBase.h b/mosesdecoder/moses2/HypothesisBase.h new file mode 100644 index 0000000000000000000000000000000000000000..55747990667b886913227c27cfee933b945e6c54 --- /dev/null +++ b/mosesdecoder/moses2/HypothesisBase.h @@ -0,0 +1,76 @@ +/* + * Hypothesis.h + * + * Created on: 24 Oct 2015 + * Author: hieu + */ +#pragma once + +#include +#include +#include "FF/FFState.h" +#include "Scores.h" + +namespace Moses2 +{ + +class ManagerBase; +class Scores; + +class HypothesisBase +{ +public: + virtual ~HypothesisBase() { + } + + inline ManagerBase &GetManager() const { + return *m_mgr; + } + + template + const T &Cast() const { + return static_cast(*this); + } + + const Scores &GetScores() const { + return *m_scores; + } + Scores &GetScores() { + return *m_scores; + } + + const FFState *GetState(size_t ind) const { + return m_ffStates[ind]; + } + FFState *GetState(size_t ind) { + return m_ffStates[ind]; + } + + virtual size_t hash() const; + virtual size_t hash(size_t seed) const; + virtual bool operator==(const HypothesisBase &other) const; + + virtual SCORE GetFutureScore() const = 
0;
+  virtual void EvaluateWhenApplied() = 0;
+
+  virtual std::string Debug(const System &system) const = 0;
+
+protected:
+  ManagerBase *m_mgr;
+  Scores *m_scores;
+  FFState **m_ffStates;
+
+  HypothesisBase(MemPool &pool, const System &system);
+};
+
+////////////////////////////////////////////////////////////////////////////////////
+class HypothesisFutureScoreOrderer
+{
+public:
+  bool operator()(const HypothesisBase* a, const HypothesisBase* b) const {
+    return a->GetFutureScore() > b->GetFutureScore();
+  }
+};
+
+}
+
diff --git a/mosesdecoder/moses2/HypothesisColl.cpp b/mosesdecoder/moses2/HypothesisColl.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..77587dc1476ffe0db10995c5b42994a597624a87
--- /dev/null
+++ b/mosesdecoder/moses2/HypothesisColl.cpp
@@ -0,0 +1,305 @@
+/*
+ * HypothesisColl.cpp
+ *
+ * Created on: 26 Feb 2016
+ * Author: hieu
+ */
+#include
+#include
+#include
+#include
+#include "HypothesisColl.h"
+#include "ManagerBase.h"
+#include "System.h"
+#include "MemPoolAllocator.h"
+
+using namespace std;
+
+namespace Moses2
+{
+
+HypothesisColl::HypothesisColl(const ManagerBase &mgr)
+  :m_coll(MemPoolAllocator(mgr.GetPool()))
+  ,m_sortedHypos(NULL)
+{
+  m_bestScore = -std::numeric_limits<SCORE>::infinity();
+  m_worstScore = std::numeric_limits<SCORE>::infinity();
+}
+
+/**
+ * Returns the hypothesis with the highest future score, or NULL when the
+ * collection is empty. If the sorted array has already been built, its first
+ * element is returned directly; otherwise m_coll is scanned linearly.
+ */
+const HypothesisBase *HypothesisColl::GetBestHypo() const
+{
+  if (GetSize() == 0) {
+    return NULL;
+  }
+  if (m_sortedHypos) {
+    return (*m_sortedHypos)[0];
+  }
+
+  SCORE bestScore = -std::numeric_limits<SCORE>::infinity();
+  // Start from NULL and accept the first hypothesis unconditionally: with the
+  // strict '>' test alone, a collection whose scores are all -inf (or NaN)
+  // would never assign bestHypo and an indeterminate pointer would be returned.
+  const HypothesisBase *bestHypo = NULL;
+  BOOST_FOREACH(const HypothesisBase *hypo, m_coll) {
+    if (bestHypo == NULL || hypo->GetFutureScore() > bestScore) {
+      bestScore = hypo->GetFutureScore();
+      bestHypo = hypo;
+    }
+  }
+  return bestHypo;
+}
+
+void HypothesisColl::Add(
+  const ManagerBase &mgr,
+  HypothesisBase *hypo,
+  Recycler &hypoRecycle,
+  ArcLists &arcLists)
+{
+  size_t maxStackSize = mgr.system.options.search.stack_size;
+
+  if (GetSize() > maxStackSize * 2) {
+    //cerr <<
"maxStackSize=" << maxStackSize << " " << GetSize() << endl; + PruneHypos(mgr, mgr.arcLists); + } + + SCORE futureScore = hypo->GetFutureScore(); + + /* + cerr << "scores:" + << futureScore << " " + << m_bestScore << " " + << GetSize() << " " + << endl; + */ + if (GetSize() >= maxStackSize && futureScore < m_worstScore) { + // beam threshold or really bad hypo that won't make the pruning cut + // as more hypos are added, the m_worstScore stat gets out of date and isn't the optimum cut-off point + //cerr << "Discard, really bad score:" << hypo->Debug(mgr.system) << endl; + hypoRecycle.Recycle(hypo); + return; + } + + StackAdd added = Add(hypo); + + size_t nbestSize = mgr.system.options.nbest.nbest_size; + if (nbestSize) { + arcLists.AddArc(added.added, hypo, added.other); + } else { + if (added.added) { + if (added.other) { + hypoRecycle.Recycle(added.other); + } + } else { + hypoRecycle.Recycle(hypo); + } + } + + // update beam variables + if (added.added) { + if (futureScore > m_bestScore) { + m_bestScore = futureScore; + float beamWidth = mgr.system.options.search.beam_width; + if ( m_bestScore + beamWidth > m_worstScore ) { + m_worstScore = m_bestScore + beamWidth; + } + } else if (GetSize() <= maxStackSize && futureScore < m_worstScore) { + m_worstScore = futureScore; + } + } +} + +StackAdd HypothesisColl::Add(const HypothesisBase *hypo) +{ + std::pair<_HCType::iterator, bool> addRet = m_coll.insert(hypo); + //cerr << endl << "new=" << hypo->Debug(hypo->GetManager().system) << endl; + + // CHECK RECOMBINATION + if (addRet.second) { + // equiv hypo doesn't exists + //cerr << "Added " << hypo << endl; + return StackAdd(true, NULL); + } else { + HypothesisBase *hypoExisting = const_cast(*addRet.first); + //cerr << "hypoExisting=" << hypoExisting->Debug(hypo->GetManager().system) << endl; + + if (hypo->GetFutureScore() > hypoExisting->GetFutureScore()) { + // incoming hypo is better than the one we have + //cerr << "Add " << hypo << "(" << hypo->hash() << ")" + // 
<< " discard existing " << hypoExisting << "(" << hypoExisting->hash() << ")" + // << endl; + + const HypothesisBase * const &hypoExisting1 = *addRet.first; + const HypothesisBase *&hypoExisting2 = + const_cast(hypoExisting1); + hypoExisting2 = hypo; + + /* + Delete(hypoExisting); + addRet = m_coll.insert(hypo); + UTIL_THROW_IF2(!addRet.second, "couldn't insert hypo " + << hypo << "(" << hypo->hash() << ")"); + */ + /* + if (!addRet.second) { + cerr << "couldn't insert hypo " << hypo << "(" << hypo->hash() << ")" << endl; + cerr << "m_coll="; + for (_HCType::const_iterator iter = m_coll.begin(); iter != m_coll.end(); ++iter) { + const HypothesisBase *h = *iter; + cerr << h << "(" << h->hash() << ") "; + } + cerr << endl; + abort(); + } + */ + + return StackAdd(true, hypoExisting); + } else { + // already storing the best hypo. discard incoming hypo + //cerr << "Keep existing " << hypoExisting << "(" << hypoExisting->hash() << ")" + // << " discard new " << hypo << "(" << hypo->hash() << ")" + // << endl; + return StackAdd(false, hypoExisting); + } + } + + //assert(false); +} + +const Hypotheses &HypothesisColl::GetSortedAndPrunedHypos( + const ManagerBase &mgr, + ArcLists &arcLists) const +{ + if (m_sortedHypos == NULL) { + // create sortedHypos first + MemPool &pool = mgr.GetPool(); + m_sortedHypos = new (pool.Allocate()) Hypotheses(pool, + m_coll.size()); + + SortHypos(mgr, m_sortedHypos->GetArray()); + + // prune + Recycler &recycler = mgr.GetHypoRecycle(); + + size_t maxStackSize = mgr.system.options.search.stack_size; + if (maxStackSize && m_sortedHypos->size() > maxStackSize) { + for (size_t i = maxStackSize; i < m_sortedHypos->size(); ++i) { + HypothesisBase *hypo = const_cast((*m_sortedHypos)[i]); + recycler.Recycle(hypo); + + // delete from arclist + if (mgr.system.options.nbest.nbest_size) { + arcLists.Delete(hypo); + } + } + m_sortedHypos->resize(maxStackSize); + } + + } + + return *m_sortedHypos; +} + +void HypothesisColl::PruneHypos(const ManagerBase 
&mgr, ArcLists &arcLists)
+{
+  // Keeps the maxStackSize best hypotheses (by future score) and removes the
+  // rest from the arc lists (when n-best output is on), from m_coll, and hands
+  // them to the manager's recycler. m_worstScore becomes the score of the last
+  // hypothesis that survives.
+  const size_t maxStackSize = mgr.system.options.search.stack_size;
+
+  // Snapshot the size before pruning: Delete() shrinks m_coll, so re-reading
+  // GetSize() in the loop condition would stop after roughly half of the
+  // surplus hypotheses had been removed.
+  const size_t numHypos = GetSize();
+
+  // stack_size == 0 is treated as "unlimited" by SortHypos, and a collection
+  // that already fits needs no pruning. Either case would otherwise read
+  // sortedHypos[maxStackSize - 1] out of range.
+  if (maxStackSize == 0 || numHypos <= maxStackSize) {
+    return;
+  }
+
+  Recycler &recycler = mgr.GetHypoRecycle();
+
+  const HypothesisBase **sortedHypos = (const HypothesisBase **) alloca(numHypos * sizeof(const HypothesisBase *));
+  SortHypos(mgr, sortedHypos);
+
+  // update worse score
+  m_worstScore = sortedHypos[maxStackSize - 1]->GetFutureScore();
+
+  // prune everything after the first maxStackSize entries; SortHypos only
+  // orders that leading part, the tail is unordered but entirely worse
+  for (size_t i = maxStackSize; i < numHypos; ++i) {
+    HypothesisBase *hypo = const_cast<HypothesisBase*>(sortedHypos[i]);
+
+    // delete from arclist
+    if (mgr.system.options.nbest.nbest_size) {
+      arcLists.Delete(hypo);
+    }
+
+    // delete from collection
+    Delete(hypo);
+
+    recycler.Recycle(hypo);
+  }
+
+}
+
+void HypothesisColl::SortHypos(const ManagerBase &mgr, const HypothesisBase **sortedHypos) const
+{
+  size_t maxStackSize = mgr.system.options.search.stack_size;
+  //assert(maxStackSize); // can't do stack=0 - unlimited stack size. No-one ever uses that
+  //assert(GetSize() > maxStackSize);
+  //assert(sortedHypos.size() == GetSize());
+
+  /*
+  cerr << "UNSORTED hypos: ";
+  BOOST_FOREACH(const HypothesisBase *hypo, m_coll) {
+    cerr << hypo << "(" << hypo->GetFutureScore() << ")" << " ";
+  }
+  cerr << endl;
+  */
+  size_t ind = 0;
+  BOOST_FOREACH(const HypothesisBase *hypo, m_coll) {
+    sortedHypos[ind] = hypo;
+    ++ind;
+  }
+
+  size_t indMiddle;
+  if (maxStackSize == 0) {
+    indMiddle = GetSize();
+  } else if (GetSize() > maxStackSize) {
+    indMiddle = maxStackSize;
+  } else {
+    // GetSize() <= maxStackSize
+    indMiddle = GetSize();
+  }
+
+  const HypothesisBase **iterMiddle = sortedHypos + indMiddle;
+
+  std::partial_sort(
+    sortedHypos,
+    iterMiddle,
+    sortedHypos + GetSize(),
+    HypothesisFutureScoreOrderer());
+
+  /*
+  cerr << "sorted hypos: ";
+  for (size_t i = 0; i < sortedHypos.size(); ++i) {
+    const HypothesisBase *hypo = sortedHypos[i];
+    cerr << hypo << " ";
+  }
+  cerr << endl;
+  */
+}
+
+void HypothesisColl::Delete(const HypothesisBase *hypo)
+{
+  //cerr << " Delete hypo=" << hypo << "("
<< hypo->hash() << ")" + // << " m_coll=" << m_coll.size() << endl; + + size_t erased = m_coll.erase(hypo); + UTIL_THROW_IF2(erased != 1, "couldn't erase hypo " << hypo); +} + +void HypothesisColl::Clear() +{ + m_sortedHypos = NULL; + m_coll.clear(); + + m_bestScore = -std::numeric_limits::infinity(); + m_worstScore = std::numeric_limits::infinity(); +} + +std::string HypothesisColl::Debug(const System &system) const +{ + stringstream out; + BOOST_FOREACH (const HypothesisBase *hypo, m_coll) { + out << hypo->Debug(system); + out << std::endl << std::endl; + } + + return out.str(); +} + +} /* namespace Moses2 */ diff --git a/mosesdecoder/moses2/HypothesisColl.h b/mosesdecoder/moses2/HypothesisColl.h new file mode 100644 index 0000000000000000000000000000000000000000..9c17fc9e712eb5946879aea8dfd6c4956668f6de --- /dev/null +++ b/mosesdecoder/moses2/HypothesisColl.h @@ -0,0 +1,75 @@ +/* + * HypothesisColl.h + * + * Created on: 26 Feb 2016 + * Author: hieu + */ +#pragma once +#include +#include "HypothesisBase.h" +#include "MemPoolAllocator.h" +#include "Recycler.h" +#include "Array.h" +#include "legacy/Util2.h" + +namespace Moses2 +{ + +class ManagerBase; +class ArcLists; + +typedef Array Hypotheses; + +//////////////////////////////////////////////////// +class HypothesisColl +{ +public: + HypothesisColl(const ManagerBase &mgr); + + void Add(const ManagerBase &mgr, + HypothesisBase *hypo, + Recycler &hypoRecycle, + ArcLists &arcLists); + + size_t GetSize() const { + return m_coll.size(); + } + + void Clear(); + + const Hypotheses &GetSortedAndPrunedHypos( + const ManagerBase &mgr, + ArcLists &arcLists) const; + + const HypothesisBase *GetBestHypo() const; + + template + const T *GetBestHypo() const { + const HypothesisBase *hypo = GetBestHypo(); + return hypo ? 
&hypo->Cast() : NULL; + } + + void Delete(const HypothesisBase *hypo); + + std::string Debug(const System &system) const; + +protected: + typedef boost::unordered_set, UnorderedComparer, + MemPoolAllocator > _HCType; + + _HCType m_coll; + mutable Hypotheses *m_sortedHypos; + + SCORE m_bestScore; + SCORE m_worstScore; + + StackAdd Add(const HypothesisBase *hypo); + + void PruneHypos(const ManagerBase &mgr, ArcLists &arcLists); + void SortHypos(const ManagerBase &mgr, const HypothesisBase **sortedHypos) const; + +}; + +} /* namespace Moses2 */ + diff --git a/mosesdecoder/moses2/InMemoryTrie/InMemoryTrie.h b/mosesdecoder/moses2/InMemoryTrie/InMemoryTrie.h new file mode 100644 index 0000000000000000000000000000000000000000..ba085f6adbcfa675379a33e5623d2696d38fa3c6 --- /dev/null +++ b/mosesdecoder/moses2/InMemoryTrie/InMemoryTrie.h @@ -0,0 +1,96 @@ +#pragma once + +#include +#include "Node.h" + +namespace Moses2 +{ + +template +class InMemoryTrie +{ +public: + InMemoryTrie() { + } + Node* insert(const std::vector& word, + const ValueClass& value); + const Node* getNode( + const std::vector& words) const; + const Node &getNode(const std::vector& words, + size_t &stoppedAtInd) const; + std::vector*> getNodes( + const std::vector& words, size_t &stoppedAtInd) const; +private: + Node root; +}; + +template +Node* InMemoryTrie::insert( + const std::vector& word, const ValueClass& value) +{ + Node* cNode = &root; + for (size_t i = 0; i < word.size(); ++i) { + KeyClass cKey = word[i]; + cNode = cNode->addSubnode(cKey); + } + cNode->setValue(value); + return cNode; +} + +template +const Node* InMemoryTrie::getNode( + const std::vector& words) const +{ + size_t stoppedAtInd; + const Node &ret = getNode(words, stoppedAtInd); + if (stoppedAtInd < words.size()) { + return NULL; + } + return &ret; +} + +template +const Node &InMemoryTrie::getNode( + const std::vector& words, size_t &stoppedAtInd) const +{ + const Node *prevNode = &root, *newNode; + for (size_t i = 0; i < 
words.size(); ++i) {
+    const KeyClass &cKey = words[i];
+    newNode = prevNode->findSub(cKey);
+    if (newNode == NULL) {
+      stoppedAtInd = i;
+      return *prevNode;
+    }
+    prevNode = newNode;
+  }
+
+  stoppedAtInd = words.size();
+  // Return prevNode, not newNode: after a full match the two are equal, but
+  // for an empty 'words' the loop never runs, newNode is never assigned, and
+  // prevNode still refers to the root, which is the node an empty key matches.
+  return *prevNode;
+}
+
+template
+std::vector*> InMemoryTrie::getNodes(
+  const std::vector& words, size_t &stoppedAtInd) const
+{
+  std::vector*> ret;
+  const Node *prevNode = &root, *newNode;
+  ret.push_back(prevNode);
+
+  for (size_t i = 0; i < words.size(); ++i) {
+    const KeyClass &cKey = words[i];
+    newNode = prevNode->findSub(cKey);
+    if (newNode == NULL) {
+      stoppedAtInd = i;
+      return ret;
+    } else {
+      ret.push_back(newNode);
+    }
+    prevNode = newNode;
+  }
+
+  stoppedAtInd = words.size();
+  return ret;
+}
+
+}
+
diff --git a/mosesdecoder/moses2/InMemoryTrie/Node.h b/mosesdecoder/moses2/InMemoryTrie/Node.h
new file mode 100644
index 0000000000000000000000000000000000000000..39c38e22f7c83aeaaac86621b182a8e40606c218
--- /dev/null
+++ b/mosesdecoder/moses2/InMemoryTrie/Node.h
@@ -0,0 +1,85 @@
+#pragma once
+
+#include
+#include
+#include
+
+namespace Moses2
+{
+
+template
+class Node
+{
+public:
+  Node() {
+  }
+  Node(const ValueClass& value) :
+    m_value(value) {
+  }
+  ~Node();
+  void setKey(const KeyClass& key);
+  void setValue(const ValueClass& value) {
+    m_value = value;
+  }
+  Node* findSub(const KeyClass& key);
+  const Node* findSub(const KeyClass& key) const;
+  Node *addSubnode(const KeyClass& cKey) {
+    Node *node = findSub(cKey);
+    if (node) {
+      return node;
+    } else {
+      node = new Node();
+      subNodes[cKey] = node;
+      return node;
+    }
+  }
+
+  std::vector getSubnodes();
+  const ValueClass &getValue() const {
+    return m_value;
+  }
+
+private:
+  boost::unordered_map subNodes;
+  ValueClass m_value;
+
+};
+
+template
+Node::~Node()
+{
+  typename boost::unordered_map::iterator iter;
+  for (iter = subNodes.begin(); iter != subNodes.end(); ++iter) {
+    Node *node = iter->second;
+    delete node;
+  }
+}
+
+template
+const Node* Node::findSub(
+  const
KeyClass& cKey) const +{ + typename boost::unordered_map::const_iterator iter; + iter = subNodes.find(cKey); + if (iter != subNodes.end()) { + Node *node = iter->second; + return node; + } + return NULL; +} + +template +Node* Node::findSub( + const KeyClass& cKey) +{ + typename boost::unordered_map::iterator iter; + iter = subNodes.find(cKey); + if (iter != subNodes.end()) { + Node *node = iter->second; + return node; + } + return NULL; +} + +} + diff --git a/mosesdecoder/moses2/InMemoryTrie/utils.h b/mosesdecoder/moses2/InMemoryTrie/utils.h new file mode 100644 index 0000000000000000000000000000000000000000..eccb95a937c438ded2057d4e40bbe90cc8468679 --- /dev/null +++ b/mosesdecoder/moses2/InMemoryTrie/utils.h @@ -0,0 +1,32 @@ +#pragma once + +#include "InMemoryTrie.h" +#include +#include +#include +#include +#include "legacy/Util2.h" +#include "../legacy/Factor.h" +#include "../legacy/InputFileStream.h" + +using namespace std; + +namespace Moses2 +{ + +inline void ParseLineByChar(string& line, char c, vector& substrings) +{ + size_t i = 0; + size_t j = line.find(c); + + while (j != string::npos) { + substrings.push_back(line.substr(i, j - i)); + i = ++j; + j = line.find(c, j); + + if (j == string::npos) substrings.push_back(line.substr(i, line.length())); + } +} + +} + diff --git a/mosesdecoder/moses2/InputPathBase.cpp b/mosesdecoder/moses2/InputPathBase.cpp new file mode 100644 index 0000000000000000000000000000000000000000..c7703354819dc2f8c78a144dd71711a80846b6b8 --- /dev/null +++ b/mosesdecoder/moses2/InputPathBase.cpp @@ -0,0 +1,21 @@ +/* + * InputPath.cpp + * + * Created on: 23 Oct 2015 + * Author: hieu + */ +#include +#include "InputPathBase.h" +#include "TranslationModel/PhraseTable.h" + +namespace Moses2 +{ +InputPathBase::InputPathBase(MemPool &pool, + const Range &range, size_t numPt, const InputPathBase *prefixPath) : + range(range), prefixPath(prefixPath) +{ + +} + +} + diff --git a/mosesdecoder/moses2/InputPathBase.h 
b/mosesdecoder/moses2/InputPathBase.h new file mode 100644 index 0000000000000000000000000000000000000000..59fb219e350817df11a0dbe5680f6dbd403d9388 --- /dev/null +++ b/mosesdecoder/moses2/InputPathBase.h @@ -0,0 +1,32 @@ +/* + * InputPath.h + * + * Created on: 23 Oct 2015 + * Author: hieu + */ + +#pragma once + +#include +#include +#include "SubPhrase.h" +#include "legacy/Range.h" + +namespace Moses2 +{ + +class PhraseTable; + +class InputPathBase +{ +public: + const InputPathBase *prefixPath; + Range range; + + InputPathBase(MemPool &pool, const Range &range, + size_t numPt, const InputPathBase *prefixPath); + +}; + +} + diff --git a/mosesdecoder/moses2/InputPathsBase.cpp b/mosesdecoder/moses2/InputPathsBase.cpp new file mode 100644 index 0000000000000000000000000000000000000000..bcc57a7f7537f143425fef764a4dc02dfcfda05c --- /dev/null +++ b/mosesdecoder/moses2/InputPathsBase.cpp @@ -0,0 +1,20 @@ +/* + * InputPaths.cpp + * + * Created on: 23 Oct 2015 + * Author: hieu + */ +#include +#include "InputPathsBase.h" + +using namespace std; + +namespace Moses2 +{ + +InputPathsBase::~InputPathsBase() +{ +} + +} + diff --git a/mosesdecoder/moses2/InputPathsBase.h b/mosesdecoder/moses2/InputPathsBase.h new file mode 100644 index 0000000000000000000000000000000000000000..88e69ea04ef14800a849a12fefb099a6022ed3c4 --- /dev/null +++ b/mosesdecoder/moses2/InputPathsBase.h @@ -0,0 +1,54 @@ +/* + * InputPaths.h + * + * Created on: 23 Oct 2015 + * Author: hieu + */ + +#pragma once + +#include +#include "MemPool.h" + +namespace Moses2 +{ + +class InputType; +class System; +class ManagerBase; +class InputPathBase; + +class InputPathsBase +{ + typedef std::vector Coll; +public: + InputPathsBase() { + } + virtual ~InputPathsBase(); + + //! 
iterators
+  typedef Coll::iterator iterator;
+  typedef Coll::const_iterator const_iterator;
+
+  const_iterator begin() const {
+    return m_inputPaths.begin();
+  }
+  const_iterator end() const {
+    return m_inputPaths.end();
+  }
+
+  iterator begin() {
+    return m_inputPaths.begin();
+  }
+  iterator end() {
+    return m_inputPaths.end();
+  }
+
+  virtual void Init(const InputType &input, const ManagerBase &mgr) = 0;
+
+protected:
+  Coll m_inputPaths;
+};
+
+}
+
diff --git a/mosesdecoder/moses2/InputType.cpp b/mosesdecoder/moses2/InputType.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..889918a12bff5103bcc92f7aa3a6ec1a657aaf94
--- /dev/null
+++ b/mosesdecoder/moses2/InputType.cpp
@@ -0,0 +1,101 @@
+/*
+ * InputType.cpp
+ *
+ * Created on: 14 Dec 2015
+ * Author: hieu
+ */
+
+#include "InputType.h"
+#include "System.h"
+#include
+
+using namespace std;
+
+namespace Moses2
+{
+//////////////////////////////////////////////////////////////////////////////
+/**
+ * Builds an XML option starting at vStartPos and copies nodeName into memory
+ * taken from pool. phraseSize, the translation and the entity start out
+ * empty (0 / NULL) so the object is fully defined before the setters run.
+ */
+InputType::XMLOption::XMLOption(MemPool &pool, const std::string &nodeName, size_t vStartPos)
+  :startPos(vStartPos)
+  ,phraseSize(0)
+  ,prob(0)
+  ,m_translation(NULL)
+  ,m_entity(NULL)
+{
+  m_nodeName = pool.Allocate<char>(nodeName.size() + 1);
+  strcpy(m_nodeName, nodeName.c_str());
+}
+
+/** Stores a NUL-terminated copy of val, allocated from pool, as the translation. */
+void InputType::XMLOption::SetTranslation(MemPool &pool, const std::string &val)
+{
+  m_translation = pool.Allocate<char>(val.size() + 1);
+  strcpy(m_translation, val.c_str());
+}
+
+/** Stores a NUL-terminated copy of val, allocated from pool, as the entity. */
+void InputType::XMLOption::SetEntity(MemPool &pool, const std::string &val)
+{
+  m_entity = pool.Allocate<char>(val.size() + 1);
+  strcpy(m_entity, val.c_str());
+}
+
+/**
+ * Formats the option as "[startPos,phraseSize]=nodeName,translation,prob",
+ * followed by ",entity" when an entity has been set.
+ */
+std::string InputType::XMLOption::Debug(const System &system) const
+{
+  std::stringstream out;
+  // Streaming a null char* is undefined, so a translation that was never set
+  // is printed as an empty field.
+  out << "[" << startPos << "," << phraseSize << "]="
+      << m_nodeName << ","
+      << (m_translation ? m_translation : "") << ","
+      << prob;
+  if (m_entity) {
+    out << "," << m_entity;
+  }
+  return out.str();
+}
+
+//////////////////////////////////////////////////////////////////////////////
+
+InputType::InputType(MemPool &pool)
+
:m_reorderingConstraint(pool) + ,m_xmlOptions(pool) + ,m_xmlCoverageMap(pool) +{ +} + +InputType::~InputType() +{ + // TODO Auto-generated destructor stub +} + +void InputType::Init(const System &system, size_t size, int max_distortion) +{ + m_reorderingConstraint.InitializeWalls(size, max_distortion); + + if (system.options.input.xml_policy != XmlPassThrough) { + m_xmlCoverageMap.assign(size, false); + } +} + +void InputType::AddXMLOption(const System &system, const XMLOption *xmlOption) +{ + m_xmlOptions.push_back(xmlOption); + + if (system.options.input.xml_policy != XmlPassThrough) { + for(size_t j = xmlOption->startPos; j < xmlOption->startPos + xmlOption->phraseSize; ++j) { + m_xmlCoverageMap[j]=true; + } + } +} + +bool InputType::XmlOverlap(size_t startPos, size_t endPos) const +{ + for (size_t pos = startPos; pos <= endPos ; pos++) { + if (pos < m_xmlCoverageMap.size() && m_xmlCoverageMap[pos]) { + return true; + } + } + return false; +} + +std::string InputType::Debug(const System &system) const +{ + cerr << "InputType::Debug" << endl; + return ""; +} + +} /* namespace Moses2 */ diff --git a/mosesdecoder/moses2/InputType.h b/mosesdecoder/moses2/InputType.h new file mode 100644 index 0000000000000000000000000000000000000000..b4f901ac69c16ac8e33e580f9d7966209b8f7aef --- /dev/null +++ b/mosesdecoder/moses2/InputType.h @@ -0,0 +1,86 @@ +/* + * InputType.h + * + * Created on: 14 Dec 2015 + * Author: hieu + */ + +#pragma once + +#include "PhraseBased/ReorderingConstraint.h" +#include "TypeDef.h" + +namespace Moses2 +{ + +class InputType +{ +public: + ////////////////////////////////////////////////////////////////////////////// + class XMLOption + { + public: + size_t startPos, phraseSize; + + SCORE prob; + + XMLOption(MemPool &pool, const std::string &nodeName, size_t vStartPos); + + const char *GetNodeName() const { + return m_nodeName; + } + + const char *GetTranslation() const { + return m_translation; + } + + const char *GetEntity() const { + return 
m_entity; + } + + void SetTranslation(MemPool &pool, const std::string &val); + void SetEntity(MemPool &pool, const std::string &val); + + std::string Debug(const System &system) const; + public: + char *m_nodeName; + char *m_translation; + char *m_entity; + + }; + + ////////////////////////////////////////////////////////////////////////////// + + InputType(MemPool &pool); + virtual ~InputType(); + + virtual void Init(const System &system, size_t size, int max_distortion); + + ReorderingConstraint &GetReorderingConstraint() { + return m_reorderingConstraint; + } + + const ReorderingConstraint &GetReorderingConstraint() const { + return m_reorderingConstraint; + } + + const Vector &GetXMLOptions() const { + return m_xmlOptions; + } + + void AddXMLOption(const System &system, const XMLOption *xmlOption); + + //! Returns true if there were any XML tags parsed that at least partially covered the range passed + bool XmlOverlap(size_t startPos, size_t endPos) const; + + virtual std::string Debug(const System &system) const; + +protected: + ReorderingConstraint m_reorderingConstraint; /**< limits on reordering specified either by "-mp" switch or xml tags */ + Vector m_xmlOptions; + Vector m_xmlCoverageMap; + +}; + +} /* namespace Moses2 */ + diff --git a/mosesdecoder/moses2/Jamfile b/mosesdecoder/moses2/Jamfile new file mode 100644 index 0000000000000000000000000000000000000000..565745600141543b4e39eb70ca5e35068c65f69b --- /dev/null +++ b/mosesdecoder/moses2/Jamfile @@ -0,0 +1,195 @@ +local with-cmph = [ option.get "with-cmph" ] ; +local includes = ; + +if $(with-cmph) { + lib cmph : : $(with-cmph)/lib $(with-cmph)/lib64 ; + includes += $(with-cmph)/include ; +} +else { + alias cmph ; +} + +if [ xmlrpc ] +{ + echo "BUILDING MOSES2 SERVER!" ; + alias mserver2 : [ glob server/*.cpp ] ; +} +else +{ + echo "NOT BUILDING MOSES2 SERVER!" 
; + alias mserver2 ; +} + +max-factors = [ option.get "max-factors" : 4 : 4 ] ; +max-factors = MAX_NUM_FACTORS=$(max-factors) $(FACTOR-LOG) ; + +max-order = [ option.get "max-kenlm-order" : 6 : 6 ] ; +max-order = KENLM_MAX_ORDER=$(max-order) ; + +alias deps : ..//z ..//boost_iostreams ..//boost_filesystem : : : $(max-factors) $(max-order) ; + + + lib moses2_lib : + AlignmentInfo.cpp + AlignmentInfoCollection.cpp + ArcLists.cpp + EstimatedScores.cpp + HypothesisBase.cpp + HypothesisColl.cpp + InputPathBase.cpp + InputPathsBase.cpp + InputType.cpp + ManagerBase.cpp + MemPool.cpp + Phrase.cpp + pugixml.cpp + Scores.cpp + SubPhrase.cpp + System.cpp + TargetPhrase.cpp + TranslationTask.cpp + TrellisPaths.cpp + TypeDef.cpp + Vector.cpp + Weights.cpp + Word.cpp + FF/Distortion.cpp + FF/FeatureFunction.cpp + FF/FeatureFunctions.cpp + FF/FeatureRegistry.cpp + FF/PhrasePenalty.cpp + FF/ExampleStatefulFF.cpp + FF/ExampleStatelessFF.cpp + FF/StatefulFeatureFunction.cpp + FF/StatelessFeatureFunction.cpp + FF/WordPenalty.cpp + + FF/LexicalReordering/BidirectionalReorderingState.cpp + FF/LexicalReordering/HReorderingBackwardState.cpp + FF/LexicalReordering/HReorderingForwardState.cpp + FF/LexicalReordering/LexicalReordering.cpp + FF/LexicalReordering/LRModel.cpp + FF/LexicalReordering/LRState.cpp + FF/LexicalReordering/PhraseBasedReorderingState.cpp + FF/LexicalReordering/ReorderingStack.cpp + + FF/OSM/OpSequenceModel.cpp + FF/OSM/KenOSM.cpp + FF/OSM/osmHyp.cpp + + LM/LanguageModel.cpp + LM/KENLM.cpp + LM/KENLMBatch.cpp + LM/GPULM.cpp + + TranslationModel/PhraseTable.cpp + TranslationModel/ProbingPT.cpp + TranslationModel/Transliteration.cpp + TranslationModel/UnknownWordPenalty.cpp + TranslationModel/Memory/PhraseTableMemory.cpp + + TranslationModel/CompactPT/BlockHashIndex.cpp + TranslationModel/CompactPT/CmphStringVectorAdapter.cpp + TranslationModel/CompactPT/LexicalReorderingTableCompact.cpp + TranslationModel/CompactPT/MurmurHash3.cpp + 
TranslationModel/CompactPT/TargetPhraseCollectionCache.cpp + TranslationModel/CompactPT/ThrowingFwrite.cpp + TranslationModel/Dynamic/DynamicPhraseTable.cpp + + parameters/AllOptions.cpp + parameters/BookkeepingOptions.cpp + parameters/ContextParameters.cpp + parameters/CubePruningOptions.cpp + parameters/InputOptions.cpp + parameters/LMBR_Options.cpp + parameters/MBR_Options.cpp + parameters/NBestOptions.cpp + parameters/OOVHandlingOptions.cpp + parameters/OptionsBaseClass.cpp + parameters/ReorderingOptions.cpp + parameters/ReportingOptions.cpp + parameters/SearchOptions.cpp + parameters/ServerOptions.cpp + parameters/SyntaxOptions.cpp + + PhraseBased/Hypothesis.cpp + PhraseBased/InputPath.cpp + PhraseBased/InputPaths.cpp + PhraseBased/Manager.cpp + PhraseBased/PhraseImpl.cpp + PhraseBased/ReorderingConstraint.cpp + PhraseBased/TargetPhrases.cpp + PhraseBased/Search.cpp + PhraseBased/Sentence.cpp + PhraseBased/SentenceWithCandidates.cpp + PhraseBased/TargetPhraseImpl.cpp + PhraseBased/TrellisPath.cpp + + PhraseBased/Normal/Search.cpp + PhraseBased/Normal/Stack.cpp + PhraseBased/Normal/Stacks.cpp + + PhraseBased/CubePruningMiniStack/Misc.cpp + PhraseBased/CubePruningMiniStack/Search.cpp + PhraseBased/CubePruningMiniStack/Stack.cpp + +# PhraseBased/CubePruningCardinalStack/Misc.cpp +# PhraseBased/CubePruningCardinalStack/Search.cpp +# PhraseBased/CubePruningCardinalStack/Stack.cpp + +# PhraseBased/CubePruningBitmapStack/Misc.cpp +# PhraseBased/CubePruningBitmapStack/Search.cpp +# PhraseBased/CubePruningBitmapStack/Stack.cpp + +# PhraseBased/CubePruningPerBitmap/Misc.cpp +# PhraseBased/CubePruningPerBitmap/Search.cpp +# PhraseBased/CubePruningPerBitmap/Stacks.cpp + +# PhraseBased/CubePruningPerMiniStack/Misc.cpp +# PhraseBased/CubePruningPerMiniStack/Search.cpp +# PhraseBased/CubePruningPerMiniStack/Stacks.cpp + + legacy/Bitmap.cpp + legacy/Bitmaps.cpp + legacy/Factor.cpp + legacy/FactorCollection.cpp + legacy/InputFileStream.cpp + legacy/Matrix.cpp + 
legacy/OutputFileStream.cpp + legacy/Parameter.cpp + legacy/Range.cpp + legacy/Range.cpp + legacy/ThreadPool.cpp + legacy/Timer.cpp + legacy/Util2.cpp + + SCFG/ActiveChart.cpp + SCFG/Hypothesis.cpp + SCFG/InputPath.cpp + SCFG/InputPaths.cpp + SCFG/Manager.cpp + SCFG/Misc.cpp + SCFG/PhraseImpl.cpp + SCFG/Sentence.cpp + SCFG/Stack.cpp + SCFG/Stacks.cpp + SCFG/TargetPhraseImpl.cpp + SCFG/TargetPhrases.cpp + SCFG/Word.cpp + SCFG/nbest/KBestExtractor.cpp + SCFG/nbest/NBest.cpp + SCFG/nbest/NBests.cpp + SCFG/nbest/NBestColl.cpp + Moses2Wrapper.cpp + DLLEntryApi.cpp + deps + cmph + mserver2 + : + $(includes) + ; +#need to figure out this +lib moses2decoder : Main.cpp moses2_lib ../probingpt//probingpt ../util//kenutil ../lm//kenlm ; +exe moses2 : moses2decoder ; +echo "Building Moses2" ; +alias programs : moses2 moses2decoder ; \ No newline at end of file diff --git a/mosesdecoder/moses2/LM/GPULM.cpp b/mosesdecoder/moses2/LM/GPULM.cpp new file mode 100644 index 0000000000000000000000000000000000000000..714ff8ff2bc5e510dbbcd357012b03e177f6a424 --- /dev/null +++ b/mosesdecoder/moses2/LM/GPULM.cpp @@ -0,0 +1,242 @@ +/* + * GPULM.cpp + * + * Created on: 4 Nov 2015 + * Author: hieu + */ +#include +#include +#include + +#ifdef _linux +#include +#include +#endif +#include +#include +#include + +#include "GPULM.h" +#include "../Phrase.h" +#include "../Scores.h" +#include "../System.h" +#include "../PhraseBased/Hypothesis.h" +#include "../PhraseBased/Manager.h" +#include "../PhraseBased/TargetPhraseImpl.h" +#include "util/exception.hh" +#include "../legacy/FactorCollection.h" + +using namespace std; + +namespace Moses2 +{ + +struct GPULMState: public FFState { + virtual std::string ToString() const { + return "GPULMState"; + } + + virtual size_t hash() const { + return boost::hash_value(lastWords); + } + + virtual bool operator==(const FFState& other) const { + const GPULMState &otherCast = static_cast(other); + bool ret = lastWords == otherCast.lastWords; + + return ret; + } + + 
void SetContext(const Context &context) { + lastWords = context; + if (lastWords.size()) { + lastWords.resize(lastWords.size() - 1); + } + } + + Context lastWords; +}; + + +///////////////////////////////////////////////////////////////// +GPULM::GPULM(size_t startInd, const std::string &line) + :StatefulFeatureFunction(startInd, line) +{ + cerr << "GPULM::GPULM" << endl; + ReadParameters(); +} + +GPULM::~GPULM() +{ + // TODO Auto-generated destructor stub +} + +void GPULM::Load(System &system) +{ + cerr << "GPULM::Load" << endl; + FactorCollection &fc = system.GetVocab(); + + m_bos = fc.AddFactor(BOS_, system, false); + m_eos = fc.AddFactor(EOS_, system, false); + + FactorCollection &collection = system.GetVocab(); +} + +FFState* GPULM::BlankState(MemPool &pool, const System &sys) const +{ + GPULMState *ret = new (pool.Allocate()) GPULMState(); + return ret; +} + +//! return the state associated with the empty hypothesis for a given sentence +void GPULM::EmptyHypothesisState(FFState &state, const ManagerBase &mgr, + const InputType &input, const Hypothesis &hypo) const +{ + GPULMState &stateCast = static_cast(state); + stateCast.lastWords.push_back(m_bos); +} + +void GPULM::EvaluateInIsolation(MemPool &pool, const System &system, + const Phrase &source, const TargetPhraseImpl &targetPhrase, Scores &scores, + SCORE &estimatedScore) const +{ + if (targetPhrase.GetSize() == 0) { + return; + } + + SCORE score = 0; + SCORE nonFullScore = 0; + Context context; +// context.push_back(m_bos); + + context.reserve(m_order); + for (size_t i = 0; i < targetPhrase.GetSize(); ++i) { + const Factor *factor = targetPhrase[i][m_factorType]; + ShiftOrPush(context, factor); + + if (context.size() == m_order) { + //std::pair fromScoring = Score(context); + //score += fromScoring.first; + } else { + //std::pair fromScoring = Score(context); + //nonFullScore += fromScoring.first; + } + } + +} + +void GPULM::EvaluateInIsolation(MemPool &pool, const System &system, const Phrase &source, + 
const TargetPhrase &targetPhrase, Scores &scores, + SCORE &estimatedScore) const +{ + UTIL_THROW2("Not implemented"); +} + +void GPULM::EvaluateWhenApplied(const ManagerBase &mgr, + const Hypothesis &hypo, const FFState &prevState, Scores &scores, + FFState &state) const +{ + UTIL_THROW2("Not implemented"); +} + +void GPULM::SetParameter(const std::string& key, + const std::string& value) +{ + //cerr << "key=" << key << " " << value << endl; + if (key == "path") { + m_path = value; + } else if (key == "order") { + m_order = Scan(value); + } else if (key == "factor") { + m_factorType = Scan(value); + } else { + StatefulFeatureFunction::SetParameter(key, value); + } + + //cerr << "SetParameter done" << endl; +} + +void GPULM::EvaluateWhenAppliedBatch( + const System &system, + const Batch &batch) const +{ + // create list of ngrams + std::vector > contexts; + + for (size_t i = 0; i < batch.size(); ++i) { + Hypothesis *hypo = batch[i]; + CreateNGram(contexts, *hypo); + } + + // score ngrams + for (size_t i = 0; i < contexts.size(); ++i) { + const Context &context = contexts[i].second; + Hypothesis *hypo = contexts[i].first; + SCORE score = Score(context); + Scores &scores = hypo->GetScores(); + scores.PlusEquals(system, *this, score); + } + + +} + +void GPULM::CreateNGram(std::vector > &contexts, Hypothesis &hypo) const +{ + const TargetPhrase &tp = hypo.GetTargetPhrase(); + + if (tp.GetSize() == 0) { + return; + } + + const Hypothesis *prevHypo = hypo.GetPrevHypo(); + assert(prevHypo); + const FFState *prevState = prevHypo->GetState(GetStatefulInd()); + assert(prevState); + const GPULMState &prevStateCast = static_cast(*prevState); + + Context context = prevStateCast.lastWords; + context.reserve(m_order); + + for (size_t i = 0; i < tp.GetSize(); ++i) { + const Word &word = tp[i]; + const Factor *factor = word[m_factorType]; + ShiftOrPush(context, factor); + + std::pair ele(&hypo, context); + contexts.push_back(ele); + } + + FFState *state = 
hypo.GetState(GetStatefulInd()); + GPULMState &stateCast = static_cast(*state); + stateCast.SetContext(context); +} + +void GPULM::ShiftOrPush(std::vector &context, + const Factor *factor) const +{ + if (context.size() < m_order) { + context.resize(context.size() + 1); + } + assert(context.size()); + + for (size_t i = context.size() - 1; i > 0; --i) { + context[i] = context[i - 1]; + } + + context[0] = factor; +} + +SCORE GPULM::Score(const Context &context) const +{ + return 444; +} + +void GPULM::EvaluateWhenApplied(const SCFG::Manager &mgr, + const SCFG::Hypothesis &hypo, int featureID, Scores &scores, + FFState &state) const +{ + UTIL_THROW2("Not implemented"); +} + +} + diff --git a/mosesdecoder/moses2/LM/GPULM.h b/mosesdecoder/moses2/LM/GPULM.h new file mode 100644 index 0000000000000000000000000000000000000000..6a3fb49f757c1b4a526e8b527d5ce68f1980359a --- /dev/null +++ b/mosesdecoder/moses2/LM/GPULM.h @@ -0,0 +1,92 @@ +/* + * KENLM.h + * + * Created on: 4 Nov 2015 + * Author: hieu + */ +#pragma once + +#include +#include +#include +#ifdef __linux +#include +#endif + +#include "../FF/StatefulFeatureFunction.h" +#include "lm/model.hh" +#include "../legacy/Factor.h" +#include "../legacy/Util2.h" +#include "../Word.h" +#include "../TypeDef.h" + +namespace Moses2 +{ + +class Word; + +class GPULM: public StatefulFeatureFunction +{ +public: + GPULM(size_t startInd, const std::string &line); + + virtual ~GPULM(); + + virtual void Load(System &system); + + void SetParameter(const std::string& key, + const std::string& value); + + virtual FFState* BlankState(MemPool &pool, const System &sys) const; + + //! 
return the state associated with the empty hypothesis for a given sentence + virtual void EmptyHypothesisState(FFState &state, const ManagerBase &mgr, + const InputType &input, const Hypothesis &hypo) const; + + virtual void + EvaluateInIsolation(MemPool &pool, const System &system, const Phrase &source, + const TargetPhraseImpl &targetPhrase, Scores &scores, + SCORE &estimatedScore) const; + + virtual void + EvaluateInIsolation(MemPool &pool, const System &system, const Phrase &source, + const TargetPhrase &targetPhrase, Scores &scores, + SCORE &estimatedScore) const; + + virtual void EvaluateWhenApplied(const ManagerBase &mgr, + const Hypothesis &hypo, const FFState &prevState, Scores &scores, + FFState &state) const; + + virtual void EvaluateWhenApplied(const SCFG::Manager &mgr, + const SCFG::Hypothesis &hypo, int featureID, Scores &scores, + FFState &state) const; + + virtual void EvaluateWhenAppliedBatch( + const System &system, + const Batch &batch) const; + +protected: + std::string m_path; + FactorType m_factorType; + util::LoadMethod m_load_method; + const Factor *m_bos; + const Factor *m_eos; + size_t m_order; + + inline lm::WordIndex TranslateID(const Word &word) const { + std::size_t factor = word[m_factorType]->GetId(); + return (factor >= m_lmIdLookup.size() ? 
0 : m_lmIdLookup[factor]); + } + + std::vector m_lmIdLookup; + + // batch + void CreateNGram(std::vector > &contexts, Hypothesis &hypo) const; + + void ShiftOrPush(std::vector &context, + const Factor *factor) const; + + SCORE Score(const Context &context) const; +}; + +} diff --git a/mosesdecoder/moses2/LM/KENLM.cpp b/mosesdecoder/moses2/LM/KENLM.cpp new file mode 100644 index 0000000000000000000000000000000000000000..689d76b92b9d7d6555310907d807b6a7297c1350 --- /dev/null +++ b/mosesdecoder/moses2/LM/KENLM.cpp @@ -0,0 +1,576 @@ +/* + * KENLM.cpp + * + * Created on: 4 Nov 2015 + * Author: hieu + */ +#include +#include +#include "KENLM.h" +#include "../Phrase.h" +#include "../Scores.h" +#include "../System.h" +#include "../PhraseBased/Hypothesis.h" +#include "../PhraseBased/Manager.h" +#include "../PhraseBased/TargetPhraseImpl.h" +#include "lm/state.hh" +#include "lm/left.hh" +#include "util/exception.hh" +#include "util/tokenize_piece.hh" +#include "util/string_stream.hh" +#include "../legacy/FactorCollection.h" +#include "../SCFG/TargetPhraseImpl.h" +#include "../SCFG/Hypothesis.h" +#include "../SCFG/Manager.h" + +using namespace std; + +namespace Moses2 +{ + +struct KenLMState: public FFState { + lm::ngram::State state; + virtual size_t hash() const { + size_t ret = hash_value(state); + return ret; + } + virtual bool operator==(const FFState& o) const { + const KenLMState &other = static_cast(o); + bool ret = state == other.state; + return ret; + } + + virtual std::string ToString() const { + stringstream ss; + for (size_t i = 0; i < state.Length(); ++i) { + ss << state.words[i] << " "; + } + return ss.str(); + } + +}; + +///////////////////////////////////////////////////////////////// +class LanguageModelChartStateKenLM : public FFState +{ +public: + LanguageModelChartStateKenLM() {} + + const lm::ngram::ChartState &GetChartState() const { + return m_state; + } + lm::ngram::ChartState &GetChartState() { + return m_state; + } + + size_t hash() const { + size_t 
ret = hash_value(m_state); + return ret; + } + virtual bool operator==(const FFState& o) const { + const LanguageModelChartStateKenLM &other = static_cast(o); + bool ret = m_state == other.m_state; + return ret; + } + + virtual std::string ToString() const { + return "LanguageModelChartStateKenLM"; + } + +private: + lm::ngram::ChartState m_state; +}; + +///////////////////////////////////////////////////////////////// +class MappingBuilder: public lm::EnumerateVocab +{ +public: + MappingBuilder(FactorCollection &factorCollection, System &system, + std::vector &mapping) : + m_factorCollection(factorCollection), m_system(system), m_mapping(mapping) { + } + + void Add(lm::WordIndex index, const StringPiece &str) { + std::size_t factorId = m_factorCollection.AddFactor(str, m_system, false)->GetId(); + if (m_mapping.size() <= factorId) { + // 0 is :-) + m_mapping.resize(factorId + 1); + } + m_mapping[factorId] = index; + } + +private: + FactorCollection &m_factorCollection; + std::vector &m_mapping; + System &m_system; +}; + +///////////////////////////////////////////////////////////////// +template +KENLM::KENLM(size_t startInd, const std::string &line, + const std::string &file, FactorType factorType, + util::LoadMethod load_method) : + StatefulFeatureFunction(startInd, line), m_path(file), m_factorType( + factorType), m_load_method(load_method) +{ + ReadParameters(); +} + +template +KENLM::~KENLM() +{ + // TODO Auto-generated destructor stub +} + +template +void KENLM::Load(System &system) +{ + FactorCollection &fc = system.GetVocab(); + + m_bos = fc.AddFactor(BOS_, system, false); + m_eos = fc.AddFactor(EOS_, system, false); + + lm::ngram::Config config; + config.messages = NULL; + + FactorCollection &collection = system.GetVocab(); + MappingBuilder builder(collection, system, m_lmIdLookup); + config.enumerate_vocab = &builder; + config.load_method = m_load_method; + + m_ngram.reset(new Model(m_path.c_str(), config)); +} + +template +FFState* 
KENLM::BlankState(MemPool &pool, const System &sys) const +{ + FFState *ret; + if (sys.isPb) { + ret = new (pool.Allocate()) KenLMState(); + } else { + ret = new (pool.Allocate()) LanguageModelChartStateKenLM(); + } + return ret; +} + +//! return the state associated with the empty hypothesis for a given sentence +template +void KENLM::EmptyHypothesisState(FFState &state, const ManagerBase &mgr, + const InputType &input, const Hypothesis &hypo) const +{ + KenLMState &stateCast = static_cast(state); + stateCast.state = m_ngram->BeginSentenceState(); +} + +template +void KENLM::EvaluateInIsolation(MemPool &pool, const System &system, + const Phrase &source, const TargetPhraseImpl &targetPhrase, Scores &scores, + SCORE &estimatedScore) const +{ + // contains factors used by this LM + float fullScore, nGramScore; + size_t oovCount; + + CalcScore(targetPhrase, fullScore, nGramScore, oovCount); + + float estimateScore = fullScore - nGramScore; + + bool GetLMEnableOOVFeature = false; + if (GetLMEnableOOVFeature) { + float scoresVec[2], estimateScoresVec[2]; + scoresVec[0] = nGramScore; + scoresVec[1] = oovCount; + scores.PlusEquals(system, *this, scoresVec); + + estimateScoresVec[0] = estimateScore; + estimateScoresVec[1] = 0; + SCORE weightedScore = Scores::CalcWeightedScore(system, *this, + estimateScoresVec); + estimatedScore += weightedScore; + } else { + scores.PlusEquals(system, *this, nGramScore); + + SCORE weightedScore = Scores::CalcWeightedScore(system, *this, + estimateScore); + estimatedScore += weightedScore; + } +} + +template +void KENLM::EvaluateInIsolation(MemPool &pool, const System &system, const Phrase &source, + const TargetPhrase &targetPhrase, Scores &scores, + SCORE &estimatedScore) const +{ + // contains factors used by this LM + float fullScore, nGramScore; + size_t oovCount; + + CalcScore(targetPhrase, fullScore, nGramScore, oovCount); + + //float estimateScore = fullScore - nGramScore; + + // all LM scores are estimated + float estimateScore = 
fullScore; + nGramScore = 0; + + bool GetLMEnableOOVFeature = false; + if (GetLMEnableOOVFeature) { + float scoresVec[2], estimateScoresVec[2]; + scoresVec[0] = nGramScore; + scoresVec[1] = oovCount; + scores.PlusEquals(system, *this, scoresVec); + + estimateScoresVec[0] = estimateScore; + estimateScoresVec[1] = 0; + SCORE weightedScore = Scores::CalcWeightedScore(system, *this, + estimateScoresVec); + estimatedScore += weightedScore; + } else { + scores.PlusEquals(system, *this, nGramScore); + + SCORE weightedScore = Scores::CalcWeightedScore(system, *this, + estimateScore); + estimatedScore += weightedScore; + } +} + +template +void KENLM::EvaluateWhenApplied(const ManagerBase &mgr, + const Hypothesis &hypo, const FFState &prevState, Scores &scores, + FFState &state) const +{ + KenLMState &stateCast = static_cast(state); + + const System &system = mgr.system; + + const lm::ngram::State &in_state = + static_cast(prevState).state; + + if (!hypo.GetTargetPhrase().GetSize()) { + stateCast.state = in_state; + return; + } + + const std::size_t begin = hypo.GetCurrTargetWordsRange().GetStartPos(); + //[begin, end) in STL-like fashion. + const std::size_t end = hypo.GetCurrTargetWordsRange().GetEndPos() + 1; + const std::size_t adjust_end = std::min(end, begin + m_ngram->Order() - 1); + + std::size_t position = begin; + typename Model::State aux_state; + typename Model::State *state0 = &stateCast.state, *state1 = &aux_state; + + float score = m_ngram->Score(in_state, TranslateID(hypo.GetWord(position)), + *state0); + ++position; + for (; position < adjust_end; ++position) { + score += m_ngram->Score(*state0, TranslateID(hypo.GetWord(position)), + *state1); + std::swap(state0, state1); + } + + if (hypo.GetBitmap().IsComplete()) { + // Score end of sentence. 
+ std::vector indices(m_ngram->Order() - 1); + const lm::WordIndex *last = LastIDs(hypo, &indices.front()); + score += m_ngram->FullScoreForgotState(&indices.front(), last, + m_ngram->GetVocabulary().EndSentence(), stateCast.state).prob; + } else if (adjust_end < end) { + // Get state after adding a long phrase. + std::vector indices(m_ngram->Order() - 1); + const lm::WordIndex *last = LastIDs(hypo, &indices.front()); + m_ngram->GetState(&indices.front(), last, stateCast.state); + } else if (state0 != &stateCast.state) { + // Short enough phrase that we can just reuse the state. + stateCast.state = *state0; + } + + score = TransformLMScore(score); + + bool OOVFeatureEnabled = false; + if (OOVFeatureEnabled) { + std::vector scoresVec(2); + scoresVec[0] = score; + scoresVec[1] = 0.0; + scores.PlusEquals(system, *this, scoresVec); + } else { + scores.PlusEquals(system, *this, score); + } +} + +template +void KENLM::CalcScore(const Phrase &phrase, float &fullScore, + float &ngramScore, std::size_t &oovCount) const +{ + fullScore = 0; + ngramScore = 0; + oovCount = 0; + + if (!phrase.GetSize()) return; + + lm::ngram::ChartState discarded_sadly; + lm::ngram::RuleScore scorer(*m_ngram, discarded_sadly); + + size_t position; + if (m_bos == phrase[0][m_factorType]) { + scorer.BeginSentence(); + position = 1; + } else { + position = 0; + } + + size_t ngramBoundary = m_ngram->Order() - 1; + + size_t end_loop = std::min(ngramBoundary, phrase.GetSize()); + for (; position < end_loop; ++position) { + const Word &word = phrase[position]; + lm::WordIndex index = TranslateID(word); + scorer.Terminal(index); + if (!index) ++oovCount; + } + float before_boundary = fullScore + scorer.Finish(); + for (; position < phrase.GetSize(); ++position) { + const Word &word = phrase[position]; + lm::WordIndex index = TranslateID(word); + scorer.Terminal(index); + if (!index) ++oovCount; + } + fullScore += scorer.Finish(); + + ngramScore = TransformLMScore(fullScore - before_boundary); + 
fullScore = TransformLMScore(fullScore); +} + +template +void KENLM::CalcScore(const Phrase &phrase, float &fullScore, + float &ngramScore, std::size_t &oovCount) const +{ + fullScore = 0; + ngramScore = 0; + oovCount = 0; + + if (!phrase.GetSize()) return; + + lm::ngram::ChartState discarded_sadly; + lm::ngram::RuleScore scorer(*m_ngram, discarded_sadly); + + size_t position; + if (m_bos == phrase[0][m_factorType]) { + scorer.BeginSentence(); + position = 1; + } else { + position = 0; + } + + size_t ngramBoundary = m_ngram->Order() - 1; + + size_t end_loop = std::min(ngramBoundary, phrase.GetSize()); + for (; position < end_loop; ++position) { + const SCFG::Word &word = phrase[position]; + if (word.isNonTerminal) { + fullScore += scorer.Finish(); + scorer.Reset(); + } else { + lm::WordIndex index = TranslateID(word); + scorer.Terminal(index); + if (!index) ++oovCount; + } + } + float before_boundary = fullScore + scorer.Finish(); + for (; position < phrase.GetSize(); ++position) { + const SCFG::Word &word = phrase[position]; + if (word.isNonTerminal) { + fullScore += scorer.Finish(); + scorer.Reset(); + } else { + lm::WordIndex index = TranslateID(word); + scorer.Terminal(index); + if (!index) ++oovCount; + } + } + fullScore += scorer.Finish(); + + ngramScore = TransformLMScore(fullScore - before_boundary); + fullScore = TransformLMScore(fullScore); +} + +// Convert last words of hypothesis into vocab ids, returning an end pointer. 
+template +lm::WordIndex *KENLM::LastIDs(const Hypothesis &hypo, + lm::WordIndex *indices) const +{ + lm::WordIndex *index = indices; + lm::WordIndex *end = indices + m_ngram->Order() - 1; + int position = hypo.GetCurrTargetWordsRange().GetEndPos(); + for (;; ++index, --position) { + if (index == end) return index; + if (position == -1) { + *index = m_ngram->GetVocabulary().BeginSentence(); + return index + 1; + } + *index = TranslateID(hypo.GetWord(position)); + } +} + +template +void KENLM::EvaluateWhenApplied(const SCFG::Manager &mgr, + const SCFG::Hypothesis &hypo, int featureID, Scores &scores, + FFState &state) const +{ + LanguageModelChartStateKenLM &newState = static_cast(state); + lm::ngram::RuleScore ruleScore(*m_ngram, newState.GetChartState()); + const SCFG::TargetPhraseImpl &target = hypo.GetTargetPhrase(); + const AlignmentInfo::NonTermIndexMap &nonTermIndexMap = + target.GetAlignNonTerm().GetNonTermIndexMap(); + + const size_t size = target.GetSize(); + size_t phrasePos = 0; + // Special cases for first word. + if (size) { + const SCFG::Word &word = target[0]; + if (word[m_factorType] == m_bos) { + // Begin of sentence + ruleScore.BeginSentence(); + phrasePos++; + } else if (word.isNonTerminal) { + // Non-terminal is first so we can copy instead of rescoring. 
+ const SCFG::Hypothesis *prevHypo = hypo.GetPrevHypo(nonTermIndexMap[phrasePos]); + const lm::ngram::ChartState &prevState = static_cast(prevHypo->GetState(featureID))->GetChartState(); + ruleScore.BeginNonTerminal(prevState); + phrasePos++; + } + } + + for (; phrasePos < size; phrasePos++) { + const SCFG::Word &word = target[phrasePos]; + if (word.isNonTerminal) { + const SCFG::Hypothesis *prevHypo = hypo.GetPrevHypo(nonTermIndexMap[phrasePos]); + const lm::ngram::ChartState &prevState = static_cast(prevHypo->GetState(featureID))->GetChartState(); + ruleScore.NonTerminal(prevState); + } else { + ruleScore.Terminal(TranslateID(word)); + } + } + + float score = ruleScore.Finish(); + score = TransformLMScore(score); + + // take out score from loading. This needs reworking + //score -= target.GetScores().GetScores(*this)[0]; + + bool OOVFeatureEnabled = false; + if (OOVFeatureEnabled) { + std::vector scoresVec(2); + scoresVec[0] = score; + scoresVec[1] = 0.0; + scores.PlusEquals(mgr.system, *this, scoresVec); + } else { + scores.PlusEquals(mgr.system, *this, score); + } +} + +/////////////////////////////////////////////////////////////////////////// + +/* Instantiate LanguageModelKen here. Tells the compiler to generate code + * for the instantiations' non-inline member functions in this file. + * Otherwise, depending on the compiler, those functions may not be present + * at link time. 
+ */ +template class KENLM ; +template class KENLM ; +template class KENLM ; +template class KENLM ; +template class KENLM ; +template class KENLM ; + +FeatureFunction *ConstructKenLM(size_t startInd, const std::string &lineOrig) +{ + FactorType factorType = 0; + string filePath; + util::LoadMethod load_method = util::POPULATE_OR_READ; + + util::TokenIter argument(lineOrig, ' '); + ++argument; // KENLM + + util::StringStream line; + line << "KENLM"; + + for (; argument; ++argument) { + const char *equals = std::find(argument->data(), + argument->data() + argument->size(), '='); + UTIL_THROW_IF2(equals == argument->data() + argument->size(), + "Expected = in KenLM argument " << *argument); + StringPiece name(argument->data(), equals - argument->data()); + StringPiece value(equals + 1, + argument->data() + argument->size() - equals - 1); + if (name == "factor") { + factorType = boost::lexical_cast(value); + } else if (name == "order") { + // Ignored + } else if (name == "path") { + filePath.assign(value.data(), value.size()); + } else if (name == "lazyken") { + // deprecated: use load instead. + load_method = + boost::lexical_cast(value) ? 
+ util::LAZY : util::POPULATE_OR_READ; + } else if (name == "load") { + if (value == "lazy") { + load_method = util::LAZY; + } else if (value == "populate_or_lazy") { + load_method = util::POPULATE_OR_LAZY; + } else if (value == "populate_or_read" || value == "populate") { + load_method = util::POPULATE_OR_READ; + } else if (value == "read") { + load_method = util::READ; + } else if (value == "parallel_read") { + load_method = util::PARALLEL_READ; + } else { + UTIL_THROW2("Unknown KenLM load method " << value); + } + } else { + // pass to base class to interpret + line << " " << name << "=" << value; + } + } + + return ConstructKenLM(startInd, line.str(), filePath, factorType, load_method); +} + +FeatureFunction *ConstructKenLM(size_t startInd, const std::string &line, + const std::string &file, FactorType factorType, + util::LoadMethod load_method) +{ + lm::ngram::ModelType model_type; + if (lm::ngram::RecognizeBinary(file.c_str(), model_type)) { + switch (model_type) { + case lm::ngram::PROBING: + return new KENLM(startInd, line, file, + factorType, load_method); + case lm::ngram::REST_PROBING: + return new KENLM(startInd, line, file, + factorType, load_method); + case lm::ngram::TRIE: + return new KENLM(startInd, line, file, factorType, + load_method); + case lm::ngram::QUANT_TRIE: + return new KENLM(startInd, line, file, + factorType, load_method); + case lm::ngram::ARRAY_TRIE: + return new KENLM(startInd, line, file, + factorType, load_method); + case lm::ngram::QUANT_ARRAY_TRIE: + return new KENLM(startInd, line, file, + factorType, load_method); + default: + UTIL_THROW2("Unrecognized kenlm model type " << model_type) + ; + } + } else { + return new KENLM(startInd, line, file, factorType, + load_method); + } +} + +} + diff --git a/mosesdecoder/moses2/LM/KENLM.h b/mosesdecoder/moses2/LM/KENLM.h new file mode 100644 index 0000000000000000000000000000000000000000..3c7839bea74642fd0b67c922130f4e59af83226d --- /dev/null +++ b/mosesdecoder/moses2/LM/KENLM.h @@ -0,0 
+1,87 @@ +/* + * KENLM.h + * + * Created on: 4 Nov 2015 + * Author: hieu + */ +#pragma once +#include +#include "../FF/StatefulFeatureFunction.h" +#include "lm/model.hh" +#include "../legacy/Factor.h" +#include "../legacy/Util2.h" +#include "../Word.h" + +namespace Moses2 +{ + +class Word; + +FeatureFunction *ConstructKenLM(size_t startInd, const std::string &lineOrig); +FeatureFunction *ConstructKenLM(size_t startInd, const std::string &line, + const std::string &file, FactorType factorType, + util::LoadMethod load_method); + +template +class KENLM: public StatefulFeatureFunction +{ +public: + KENLM(size_t startInd, const std::string &line, const std::string &file, + FactorType factorType, util::LoadMethod load_method); + + virtual ~KENLM(); + + virtual void Load(System &system); + + virtual FFState* BlankState(MemPool &pool, const System &sys) const; + + //! return the state associated with the empty hypothesis for a given sentence + virtual void EmptyHypothesisState(FFState &state, const ManagerBase &mgr, + const InputType &input, const Hypothesis &hypo) const; + + virtual void + EvaluateInIsolation(MemPool &pool, const System &system, const Phrase &source, + const TargetPhraseImpl &targetPhrase, Scores &scores, + SCORE &estimatedScore) const; + + virtual void + EvaluateInIsolation(MemPool &pool, const System &system, const Phrase &source, + const TargetPhrase &targetPhrase, Scores &scores, + SCORE &estimatedScore) const; + + virtual void EvaluateWhenApplied(const ManagerBase &mgr, + const Hypothesis &hypo, const FFState &prevState, Scores &scores, + FFState &state) const; + + virtual void EvaluateWhenApplied(const SCFG::Manager &mgr, + const SCFG::Hypothesis &hypo, int featureID, Scores &scores, + FFState &state) const; + +protected: + std::string m_path; + FactorType m_factorType; + util::LoadMethod m_load_method; + const Factor *m_bos; + const Factor *m_eos; + + boost::shared_ptr m_ngram; + + void CalcScore(const Phrase &phrase, float &fullScore, float 
&ngramScore, + std::size_t &oovCount) const; + + void CalcScore(const Phrase &phrase, float &fullScore, float &ngramScore, + std::size_t &oovCount) const; + + inline lm::WordIndex TranslateID(const Word &word) const { + std::size_t factor = word[m_factorType]->GetId(); + return (factor >= m_lmIdLookup.size() ? 0 : m_lmIdLookup[factor]); + } + // Convert last words of hypothesis into vocab ids, returning an end pointer. + lm::WordIndex *LastIDs(const Hypothesis &hypo, lm::WordIndex *indices) const; + + std::vector m_lmIdLookup; + +}; + +} + diff --git a/mosesdecoder/moses2/LM/KENLMBatch.cpp b/mosesdecoder/moses2/LM/KENLMBatch.cpp new file mode 100644 index 0000000000000000000000000000000000000000..d364309613e6e1708a5d80a29b2a772000597ebc --- /dev/null +++ b/mosesdecoder/moses2/LM/KENLMBatch.cpp @@ -0,0 +1,370 @@ +/* + * KENLMBatch.cpp + * + * Created on: 4 Nov 2015 + * Author: hieu + */ +#include +#include +#include + +#ifdef _linux +#include +#include +#endif +#include +#include +#include + +#include "KENLMBatch.h" +#include "../Phrase.h" +#include "../Scores.h" +#include "../System.h" +#include "../PhraseBased/Hypothesis.h" +#include "../PhraseBased/Manager.h" +#include "../PhraseBased/TargetPhraseImpl.h" +#include "lm/state.hh" +#include "lm/left.hh" +#include "util/exception.hh" +#include "util/tokenize_piece.hh" +#include "util/string_stream.hh" +#include "../legacy/FactorCollection.h" + +using namespace std; + +namespace Moses2 +{ + +struct KenLMState: public FFState { + lm::ngram::State state; + virtual size_t hash() const { + size_t ret = hash_value(state); + return ret; + } + virtual bool operator==(const FFState& o) const { + const KenLMState &other = static_cast(o); + bool ret = state == other.state; + return ret; + } + + virtual std::string ToString() const { + stringstream ss; + for (size_t i = 0; i < state.Length(); ++i) { + ss << state.words[i] << " "; + } + return ss.str(); + } + +}; + 
+/////////////////////////////////////////////////////////////////
+//! Vocabulary callback handed to KenLM while a model is loaded: records, for
+//! every model word, the mapping from Moses factor id to KenLM word index.
+class MappingBuilder: public lm::EnumerateVocab
+{
+public:
+  // Initialisers are listed in member declaration order (see the private
+  // section below), which is the order the compiler actually uses.
+  MappingBuilder(FactorCollection &factorCollection, System &system,
+                 std::vector &mapping) :
+    m_factorCollection(factorCollection), m_mapping(mapping), m_system(system) {
+  }
+
+  //! Register the word `str` with KenLM index `index`, growing the table so
+  //! that it can be indexed by the word's factor id.
+  void Add(lm::WordIndex index, const StringPiece &str) {
+    std::size_t factorId = m_factorCollection.AddFactor(str, m_system, false)->GetId();
+    if (m_mapping.size() <= factorId) {
+      // 0 is :-)
+      m_mapping.resize(factorId + 1);
+    }
+    m_mapping[factorId] = index;
+  }
+
+private:
+  FactorCollection &m_factorCollection;
+  std::vector &m_mapping;
+  System &m_system;
+};
+
+/////////////////////////////////////////////////////////////////
+//! Construct from a feature line. Members that SetParameter() may never
+//! touch get defined defaults first, so a configuration without "factor" or
+//! "load"/"lazyken" no longer leaves them indeterminate when Load() and
+//! TranslateID() read them.
+KENLMBatch::KENLMBatch(size_t startInd, const std::string &line)
+  :StatefulFeatureFunction(startInd, line)
+  ,m_factorType(0)
+  // same method that "lazyken=0" and "load=populate" select
+  ,m_load_method(util::POPULATE_OR_READ)
+  ,m_bos(NULL)
+  ,m_eos(NULL)
+  ,m_numHypos(0)
+{
+  cerr << "KENLMBatch::KENLMBatch" << endl;
+  ReadParameters();
+}
+
+KENLMBatch::~KENLMBatch()
+{
+  // TODO Auto-generated destructor stub
+}
+
+//! Register the sentence boundary factors and load the model at m_path,
+//! filling m_lmIdLookup through a MappingBuilder during loading.
+void KENLMBatch::Load(System &system)
+{
+  cerr << "KENLMBatch::Load" << endl;
+  FactorCollection &fc = system.GetVocab();
+
+  m_bos = fc.AddFactor(BOS_, system, false);
+  m_eos = fc.AddFactor(EOS_, system, false);
+
+  lm::ngram::Config config;
+  config.messages = NULL;
+
+  // builder only has to live until the Model constructor below returns
+  MappingBuilder builder(fc, system, m_lmIdLookup);
+  config.enumerate_vocab = &builder;
+  config.load_method = m_load_method;
+
+  m_ngram.reset(new Model(m_path.c_str(), config));
+}
+
+//! Placement-construct a default KenLMState in memory obtained from `pool`.
+FFState* KENLMBatch::BlankState(MemPool &pool, const System &sys) const
+{
+  KenLMState *ret = new (pool.Allocate()) KenLMState();
+  return ret;
+}
+
+//! 
return the state associated with the empty hypothesis for a given sentence
+// Sets `state` to the model's begin-of-sentence state.
+void KENLMBatch::EmptyHypothesisState(FFState &state, const ManagerBase &mgr,
+                                      const InputType &input, const Hypothesis &hypo) const
+{
+  KenLMState &stateCast = static_cast(state);
+  stateCast.state = m_ngram->BeginSentenceState();
+}
+
+// Context-free scoring of a target phrase: adds the n-gram score from
+// CalcScore() to `scores` and the weighted remainder (full score minus
+// n-gram score) to `estimatedScore`.
+void KENLMBatch::EvaluateInIsolation(MemPool &pool, const System &system,
+                                     const Phrase &source, const TargetPhraseImpl &targetPhrase, Scores &scores,
+                                     SCORE &estimatedScore) const
+{
+  // contains factors used by this LM
+  float fullScore, nGramScore;
+  size_t oovCount;
+
+  CalcScore(targetPhrase, fullScore, nGramScore, oovCount);
+
+  float estimateScore = fullScore - nGramScore;
+
+  // hard-wired off: the two-score (LM + OOV count) branch is never taken
+  bool GetLMEnableOOVFeature = false;
+  if (GetLMEnableOOVFeature) {
+    float scoresVec[2], estimateScoresVec[2];
+    scoresVec[0] = nGramScore;
+    scoresVec[1] = oovCount;
+    scores.PlusEquals(system, *this, scoresVec);
+
+    estimateScoresVec[0] = estimateScore;
+    estimateScoresVec[1] = 0;
+    SCORE weightedScore = Scores::CalcWeightedScore(system, *this,
+                          estimateScoresVec);
+    estimatedScore += weightedScore;
+  } else {
+    scores.PlusEquals(system, *this, nGramScore);
+
+    SCORE weightedScore = Scores::CalcWeightedScore(system, *this,
+                          estimateScore);
+    estimatedScore += weightedScore;
+  }
+}
+
+// Overload taking a TargetPhrase: intentionally empty, contributes no score.
+void KENLMBatch::EvaluateInIsolation(MemPool &pool, const System &system, const Phrase &source,
+                                     const TargetPhrase &targetPhrase, Scores &scores,
+                                     SCORE &estimatedScore) const
+{
+}
+
+// Scores the words a hypothesis adds, starting from `prevState`, writes the
+// resulting LM state into `state` and adds the transformed score to `scores`.
+void KENLMBatch::EvaluateWhenApplied(const ManagerBase &mgr,
+                                     const Hypothesis &hypo, const FFState &prevState, Scores &scores,
+                                     FFState &state) const
+{
+  KenLMState &stateCast = static_cast(state);
+
+  const System &system = mgr.system;
+
+  const lm::ngram::State &in_state =
+    static_cast(prevState).state;
+
+  // empty target phrase: nothing to score, the state carries over unchanged
+  if (!hypo.GetTargetPhrase().GetSize()) {
+    stateCast.state = in_state;
+    return;
+  }
+
+  const std::size_t begin = hypo.GetCurrTargetWordsRange().GetStartPos();
+  //[begin, end) in STL-like fashion.
+  const std::size_t end = hypo.GetCurrTargetWordsRange().GetEndPos() + 1;
+  // only the first Order()-1 new words are scored in the loop below
+  const std::size_t adjust_end = std::min(end, begin + m_ngram->Order() - 1);
+
+  std::size_t position = begin;
+  Model::State aux_state;
+  // state0 always points at the most recently written state; the two
+  // buffers are swapped after every scored word
+  Model::State *state0 = &stateCast.state, *state1 = &aux_state;
+
+  float score = m_ngram->Score(in_state, TranslateID(hypo.GetWord(position)),
+                               *state0);
+  ++position;
+  for (; position < adjust_end; ++position) {
+    score += m_ngram->Score(*state0, TranslateID(hypo.GetWord(position)),
+                            *state1);
+    std::swap(state0, state1);
+  }
+
+  if (hypo.GetBitmap().IsComplete()) {
+    // Score end of sentence.
+    std::vector indices(m_ngram->Order() - 1);
+    const lm::WordIndex *last = LastIDs(hypo, &indices.front());
+    score += m_ngram->FullScoreForgotState(&indices.front(), last,
+                                           m_ngram->GetVocabulary().EndSentence(), stateCast.state).prob;
+  } else if (adjust_end < end) {
+    // Get state after adding a long phrase.
+    std::vector indices(m_ngram->Order() - 1);
+    const lm::WordIndex *last = LastIDs(hypo, &indices.front());
+    m_ngram->GetState(&indices.front(), last, stateCast.state);
+  } else if (state0 != &stateCast.state) {
+    // Short enough phrase that we can just reuse the state.
+    stateCast.state = *state0;
+  }
+
+  score = TransformLMScore(score);
+
+  // hard-wired off: only the single LM score is ever reported
+  bool OOVFeatureEnabled = false;
+  if (OOVFeatureEnabled) {
+    std::vector scoresVec(2);
+    scoresVec[0] = score;
+    scoresVec[1] = 0.0;
+    scores.PlusEquals(system, *this, scoresVec);
+  } else {
+    scores.PlusEquals(system, *this, score);
+  }
+}
+
+// Scores `phrase` on its own. All three outputs are reset to 0 first and
+// stay 0 for an empty phrase. On return:
+//   fullScore  - transformed score of the whole phrase
+//   ngramScore - transformed score of the part gathered after the first
+//                Order()-1 positions
+//   oovCount   - number of words whose vocabulary id is 0
+void KENLMBatch::CalcScore(const Phrase &phrase, float &fullScore,
+                           float &ngramScore, std::size_t &oovCount) const
+{
+  fullScore = 0;
+  ngramScore = 0;
+  oovCount = 0;
+
+  if (!phrase.GetSize()) return;
+
+  lm::ngram::ChartState discarded_sadly;
+  lm::ngram::RuleScore scorer(*m_ngram, discarded_sadly);
+
+  // a leading begin-of-sentence factor is consumed via BeginSentence()
+  // instead of being scored as a terminal
+  size_t position;
+  if (m_bos == phrase[0][m_factorType]) {
+    scorer.BeginSentence();
+    position = 1;
+  } else {
+    position = 0;
+  }
+
+  size_t ngramBoundary = m_ngram->Order() - 1;
+
+  size_t end_loop = std::min(ngramBoundary, phrase.GetSize());
+  for (; position < end_loop; ++position) {
+    const Word &word = phrase[position];
+    lm::WordIndex index = TranslateID(word);
+    scorer.Terminal(index);
+    if (!index) ++oovCount;
+  }
+  // score accumulated up to the boundary (fullScore is still 0 here)
+  float before_boundary = fullScore + scorer.Finish();
+  for (; position < phrase.GetSize(); ++position) {
+    const Word &word = phrase[position];
+    lm::WordIndex index = TranslateID(word);
+    scorer.Terminal(index);
+    if (!index) ++oovCount;
+  }
+  fullScore += scorer.Finish();
+
+  ngramScore = TransformLMScore(fullScore - before_boundary);
+  fullScore = TransformLMScore(fullScore);
+}
+
+// Convert last words of hypothesis into vocab ids, returning an end pointer.
+lm::WordIndex *KENLMBatch::LastIDs(const Hypothesis &hypo, + lm::WordIndex *indices) const +{ + lm::WordIndex *index = indices; + lm::WordIndex *end = indices + m_ngram->Order() - 1; + int position = hypo.GetCurrTargetWordsRange().GetEndPos(); + for (;; ++index, --position) { + if (index == end) return index; + if (position == -1) { + *index = m_ngram->GetVocabulary().BeginSentence(); + return index + 1; + } + *index = TranslateID(hypo.GetWord(position)); + } +} + +void KENLMBatch::SetParameter(const std::string& key, + const std::string& value) +{ + //cerr << "key=" << key << " " << value << endl; + if (key == "path") { + m_path = value; + } else if (key == "order") { + // ignore + } else if (key == "factor") { + m_factorType = Scan(value); + } else if (key == "lazyken") { + m_load_method = + boost::lexical_cast(value) ? + util::LAZY : util::POPULATE_OR_READ; + } else if (key == "load") { + if (value == "lazy") { + m_load_method = util::LAZY; + } else if (value == "populate_or_lazy") { + m_load_method = util::POPULATE_OR_LAZY; + } else if (value == "populate_or_read" || value == "populate") { + m_load_method = util::POPULATE_OR_READ; + } else if (value == "read") { + m_load_method = util::READ; + } else if (value == "parallel_read") { + m_load_method = util::PARALLEL_READ; + } else { + UTIL_THROW2("Unknown KenLM load method " << value); + } + } else { + StatefulFeatureFunction::SetParameter(key, value); + } + + //cerr << "SetParameter done" << endl; +} + +void KENLMBatch::EvaluateWhenAppliedBatch( + const Batch &batch) const +{ + { + // write lock + boost::unique_lock lock(m_accessLock); + m_batches.push_back(&batch); + m_numHypos += batch.size(); + } + //cerr << "m_numHypos=" << m_numHypos << endl; + + if (m_numHypos > 0) { + // process batch + EvaluateWhenAppliedBatch(); + + m_batches.clear(); + m_numHypos = 0; + + m_threadNeeded.notify_all(); + } else { + boost::mutex::scoped_lock lock(m_mutex); + m_threadNeeded.wait(lock); + } +} + +void 
KENLMBatch::EvaluateWhenAppliedBatch() const +{ + BOOST_FOREACH(const Batch *batch, m_batches) { + //cerr << "batch=" << batch->size() << endl; + BOOST_FOREACH(Hypothesis *hypo, *batch) { + hypo->EvaluateWhenApplied(*this); + } + } +} + +void KENLMBatch::EvaluateWhenApplied(const SCFG::Manager &mgr, + const SCFG::Hypothesis &hypo, int featureID, Scores &scores, + FFState &state) const +{ + UTIL_THROW2("Not implemented"); +} + +} + diff --git a/mosesdecoder/moses2/LM/KENLMBatch.h b/mosesdecoder/moses2/LM/KENLMBatch.h new file mode 100644 index 0000000000000000000000000000000000000000..1510381b566c7f93a7a5f04e3c9ff88a17d623c5 --- /dev/null +++ b/mosesdecoder/moses2/LM/KENLMBatch.h @@ -0,0 +1,102 @@ +/* + * KENLM.h + * + * Created on: 4 Nov 2015 + * Author: hieu + */ +#pragma once + +#include +#include +#include +#ifdef __linux +#include +#endif + +#include "../FF/StatefulFeatureFunction.h" +#include "lm/model.hh" +#include "../legacy/Factor.h" +#include "../legacy/Util2.h" +#include "../Word.h" +#include "../TypeDef.h" + +namespace Moses2 +{ + +class Word; + +class KENLMBatch: public StatefulFeatureFunction +{ +public: + KENLMBatch(size_t startInd, const std::string &line); + + virtual ~KENLMBatch(); + + virtual void Load(System &system); + + void SetParameter(const std::string& key, + const std::string& value); + + virtual FFState* BlankState(MemPool &pool, const System &sys) const; + + //! 
return the state associated with the empty hypothesis for a given sentence + virtual void EmptyHypothesisState(FFState &state, const ManagerBase &mgr, + const InputType &input, const Hypothesis &hypo) const; + + virtual void + EvaluateInIsolation(MemPool &pool, const System &system, const Phrase &source, + const TargetPhraseImpl &targetPhrase, Scores &scores, + SCORE &estimatedScore) const; + + virtual void + EvaluateInIsolation(MemPool &pool, const System &system, const Phrase &source, + const TargetPhrase &targetPhrase, Scores &scores, + SCORE &estimatedScore) const; + + virtual void EvaluateWhenApplied(const ManagerBase &mgr, + const Hypothesis &hypo, const FFState &prevState, Scores &scores, + FFState &state) const; + + virtual void EvaluateWhenApplied(const SCFG::Manager &mgr, + const SCFG::Hypothesis &hypo, int featureID, Scores &scores, + FFState &state) const; + + virtual void EvaluateWhenAppliedBatch( + const Batch &batch) const; + +protected: + std::string m_path; + FactorType m_factorType; + util::LoadMethod m_load_method; + const Factor *m_bos; + const Factor *m_eos; + + typedef lm::ngram::ProbingModel Model; + boost::shared_ptr m_ngram; + + void CalcScore(const Phrase &phrase, float &fullScore, float &ngramScore, + std::size_t &oovCount) const; + + inline lm::WordIndex TranslateID(const Word &word) const { + std::size_t factor = word[m_factorType]->GetId(); + return (factor >= m_lmIdLookup.size() ? 0 : m_lmIdLookup[factor]); + } + // Convert last words of hypothesis into vocab ids, returning an end pointer. 
+ lm::WordIndex *LastIDs(const Hypothesis &hypo, lm::WordIndex *indices) const; + + std::vector m_lmIdLookup; + + // batch + mutable std::vector m_batches; + mutable size_t m_numHypos; + + mutable boost::shared_mutex m_accessLock; + + mutable boost::mutex m_mutex; + mutable boost::condition_variable m_threadNeeded; + + void EvaluateWhenAppliedBatch() const; + +}; + +} diff --git a/mosesdecoder/moses2/LM/LanguageModel.cpp b/mosesdecoder/moses2/LM/LanguageModel.cpp new file mode 100644 index 0000000000000000000000000000000000000000..a720851ba380cac3688c96e65e857a3a57b055cb --- /dev/null +++ b/mosesdecoder/moses2/LM/LanguageModel.cpp @@ -0,0 +1,322 @@ +/* + * LanguageModel.cpp + * + * Created on: 29 Oct 2015 + * Author: hieu + */ +#include +#include "LanguageModel.h" +#include "../Phrase.h" +#include "../System.h" +#include "../PhraseBased/Manager.h" +#include "../PhraseBased/Hypothesis.h" +#include "../PhraseBased/TargetPhraseImpl.h" +#include "../FF/PointerState.h" +#include "../legacy/Util2.h" +#include "../legacy/InputFileStream.h" +#include "../legacy/Bitmap.h" +#include "../legacy/Util2.h" + +using namespace std; + +namespace Moses2 +{ + +struct LMState: public PointerState { + LMState() : + PointerState() { + // uninitialised + } + + void Set(MemPool &pool, void *lms, const std::vector &context) { + lmstate = lms; + + numWords = context.size(); + lastWords = (const Factor**) pool.Allocate( + sizeof(const Factor*) * numWords); + for (size_t i = 0; i < numWords; ++i) { + lastWords[i] = context[i]; + } + } + + void Init(MemPool &pool, const Factor *factor) { + lmstate = NULL; + numWords = 1; + lastWords = (const Factor**) pool.Allocate(sizeof(const Factor*)); + lastWords[0] = factor; + } + + size_t numWords; + const Factor** lastWords; +}; + +//////////////////////////////////////////////////////////////////////////////////////// +LanguageModel::LanguageModel(size_t startInd, const std::string &line) : + StatefulFeatureFunction(startInd, line), m_oov(-100) +{ + 
ReadParameters(); +} + +LanguageModel::~LanguageModel() +{ + // TODO Auto-generated destructor stub +} + +void LanguageModel::Load(System &system) +{ + FactorCollection &fc = system.GetVocab(); + + m_bos = fc.AddFactor(BOS_, system, false); + m_eos = fc.AddFactor(EOS_, system, false); + + InputFileStream infile(m_path); + size_t lineNum = 0; + string line; + while (getline(infile, line)) { + if (++lineNum % 100000 == 0) { + cerr << lineNum << " "; + } + + vector substrings = Tokenize(line, "\t"); + + if (substrings.size() < 2) continue; + + assert(substrings.size() == 2 || substrings.size() == 3); + + SCORE prob = TransformLMScore(Scan(substrings[0])); + if (substrings[1] == "") { + m_oov = prob; + continue; + } + + SCORE backoff = 0.f; + if (substrings.size() == 3) { + backoff = TransformLMScore(Scan(substrings[2])); + } + + // ngram + vector key = Tokenize(substrings[1], " "); + + vector factorKey(key.size()); + for (size_t i = 0; i < key.size(); ++i) { + factorKey[factorKey.size() - i - 1] = fc.AddFactor(key[i], system, false); + } + + m_root.insert(factorKey, LMScores(prob, backoff)); + } + +} + +void LanguageModel::SetParameter(const std::string& key, + const std::string& value) +{ + if (key == "path") { + m_path = value; + } else if (key == "factor") { + m_factorType = Scan(value); + } else if (key == "order") { + m_order = Scan(value); + } else { + StatefulFeatureFunction::SetParameter(key, value); + } +} + +FFState* LanguageModel::BlankState(MemPool &pool, const System &sys) const +{ + return new (pool.Allocate()) LMState(); +} + +void LanguageModel::EmptyHypothesisState(FFState &state, const ManagerBase &mgr, + const InputType &input, const Hypothesis &hypo) const +{ + LMState &stateCast = static_cast(state); + + MemPool &pool = mgr.GetPool(); + stateCast.Init(pool, m_bos); +} + +void LanguageModel::EvaluateInIsolation(MemPool &pool, const System &system, + const Phrase &source, const TargetPhraseImpl &targetPhrase, Scores &scores, + SCORE &estimatedScore) 
const +{ + if (targetPhrase.GetSize() == 0) { + return; + } + + SCORE score = 0; + SCORE nonFullScore = 0; + vector context; +// context.push_back(m_bos); + + context.reserve(m_order); + for (size_t i = 0; i < targetPhrase.GetSize(); ++i) { + const Factor *factor = targetPhrase[i][m_factorType]; + ShiftOrPush(context, factor); + + if (context.size() == m_order) { + std::pair fromScoring = Score(context); + score += fromScoring.first; + } else { + std::pair fromScoring = Score(context); + nonFullScore += fromScoring.first; + } + } + + scores.PlusEquals(system, *this, score); + SCORE weightedScore = Scores::CalcWeightedScore(system, *this, nonFullScore); + estimatedScore += weightedScore; +} + +void LanguageModel::EvaluateInIsolation(MemPool &pool, const System &system, const Phrase &source, + const TargetPhrase &targetPhrase, Scores &scores, + SCORE &estimatedScore) const +{ +} + +void LanguageModel::EvaluateWhenApplied(const ManagerBase &mgr, + const Hypothesis &hypo, const FFState &prevState, Scores &scores, + FFState &state) const +{ + const LMState &prevLMState = static_cast(prevState); + size_t numWords = prevLMState.numWords; + + // context is held backwards + vector context(numWords); + for (size_t i = 0; i < numWords; ++i) { + context[i] = prevLMState.lastWords[i]; + } + //DebugContext(context); + + SCORE score = 0; + std::pair fromScoring; + const TargetPhrase &tp = hypo.GetTargetPhrase(); + for (size_t i = 0; i < tp.GetSize(); ++i) { + const Word &word = tp[i]; + const Factor *factor = word[m_factorType]; + ShiftOrPush(context, factor); + fromScoring = Score(context); + score += fromScoring.first; + } + + const Bitmap &bm = hypo.GetBitmap(); + if (bm.IsComplete()) { + // everything translated + ShiftOrPush(context, m_eos); + fromScoring = Score(context); + score += fromScoring.first; + fromScoring.second = NULL; + context.clear(); + } else { + assert(context.size()); + if (context.size() == m_order) { + context.resize(context.size() - 1); + } + } + + 
scores.PlusEquals(mgr.system, *this, score); + + // return state + //DebugContext(context); + + LMState &stateCast = static_cast(state); + MemPool &pool = mgr.GetPool(); + stateCast.Set(pool, fromScoring.second, context); +} + +void LanguageModel::ShiftOrPush(std::vector &context, + const Factor *factor) const +{ + if (context.size() < m_order) { + context.resize(context.size() + 1); + } + assert(context.size()); + + for (size_t i = context.size() - 1; i > 0; --i) { + context[i] = context[i - 1]; + } + + context[0] = factor; +} + +std::pair LanguageModel::Score( + const std::vector &context) const +{ + //cerr << "context="; + //DebugContext(context); + + std::pair ret; + + typedef Node LMNode; + const LMNode *node = m_root.getNode(context); + if (node) { + ret.first = node->getValue().prob; + ret.second = (void*) node; + } else { + SCORE backoff = 0; + std::vector backOffContext(context.begin() + 1, + context.end()); + node = m_root.getNode(backOffContext); + if (node) { + backoff = node->getValue().backoff; + } + + std::vector newContext(context.begin(), context.end() - 1); + std::pair newRet = Score(newContext); + + ret.first = backoff + newRet.first; + ret.second = newRet.second; + } + + //cerr << "score=" << ret.first << endl; + return ret; +} + +SCORE LanguageModel::BackoffScore( + const std::vector &context) const +{ + //cerr << "backoff="; + //DebugContext(context); + + SCORE ret; + size_t stoppedAtInd; + const Node &node = m_root.getNode(context, + stoppedAtInd); + + if (stoppedAtInd == context.size()) { + // found entire ngram + ret = node.getValue().backoff; + } else { + if (stoppedAtInd == 0) { + ret = m_oov; + stoppedAtInd = 1; + } else { + ret = node.getValue().backoff; + } + + // recursive + std::vector backoff(context.begin() + stoppedAtInd, + context.end()); + ret += BackoffScore(backoff); + } + + return ret; +} + +void LanguageModel::DebugContext( + const std::vector &context) const +{ + for (size_t i = 0; i < context.size(); ++i) { + cerr << 
context[i]->GetString() << " "; + } + cerr << endl; +} + +void LanguageModel::EvaluateWhenApplied(const SCFG::Manager &mgr, + const SCFG::Hypothesis &hypo, int featureID, Scores &scores, + FFState &state) const +{ + UTIL_THROW2("Not implemented"); +} + +} + diff --git a/mosesdecoder/moses2/LM/LanguageModel.h b/mosesdecoder/moses2/LM/LanguageModel.h new file mode 100644 index 0000000000000000000000000000000000000000..12d25809fac24583569052f175021083dfb4b729 --- /dev/null +++ b/mosesdecoder/moses2/LM/LanguageModel.h @@ -0,0 +1,92 @@ +/* + * LanguageModel.h + * + * Created on: 29 Oct 2015 + * Author: hieu + */ + +#pragma once + +#include "../FF/StatefulFeatureFunction.h" +#include "../TypeDef.h" +#include "../InMemoryTrie/InMemoryTrie.h" +#include "../legacy/Factor.h" +#include "../legacy/Util2.h" + +namespace Moses2 +{ + +//////////////////////////////////////////////////////////////////////////////////////// +struct LMScores { + LMScores() { + } + + LMScores(const LMScores ©) : + prob(copy.prob), backoff(copy.backoff) { + } + + LMScores(float inProb, float inBackoff) : + prob(inProb), backoff(inBackoff) { + } + + void Debug(std::ostream &out, const System &system) const { + out << "(" << prob << "," << backoff << ")" << std::flush; + } + + float prob, backoff; +}; + +//////////////////////////////////////////////////////////////////////////////////////// +class LanguageModel: public StatefulFeatureFunction +{ +public: + LanguageModel(size_t startInd, const std::string &line); + virtual ~LanguageModel(); + + virtual void Load(System &system); + + virtual void SetParameter(const std::string& key, const std::string& value); + + virtual FFState* BlankState(MemPool &pool, const System &sys) const; + virtual void EmptyHypothesisState(FFState &state, const ManagerBase &mgr, + const InputType &input, const Hypothesis &hypo) const; + + virtual void + EvaluateInIsolation(MemPool &pool, const System &system, const Phrase &source, + const TargetPhraseImpl &targetPhrase, Scores 
&scores, + SCORE &estimatedScore) const; + + virtual void + EvaluateInIsolation(MemPool &pool, const System &system, const Phrase &source, + const TargetPhrase &targetPhrase, Scores &scores, + SCORE &estimatedScore) const; + + virtual void EvaluateWhenApplied(const ManagerBase &mgr, + const Hypothesis &hypo, const FFState &prevState, Scores &scores, + FFState &state) const; + + virtual void EvaluateWhenApplied(const SCFG::Manager &mgr, + const SCFG::Hypothesis &hypo, int featureID, Scores &scores, + FFState &state) const; + +protected: + std::string m_path; + FactorType m_factorType; + size_t m_order; + + InMemoryTrie m_root; + SCORE m_oov; + const Factor *m_bos; + const Factor *m_eos; + + void ShiftOrPush(std::vector &context, + const Factor *factor) const; + std::pair Score( + const std::vector &context) const; + SCORE BackoffScore(const std::vector &context) const; + + void DebugContext(const std::vector &context) const; +}; + +} + diff --git a/mosesdecoder/moses2/Main.cpp b/mosesdecoder/moses2/Main.cpp new file mode 100644 index 0000000000000000000000000000000000000000..6fa1f5bd8a4e070b42d0b2d4433bf948cd021559 --- /dev/null +++ b/mosesdecoder/moses2/Main.cpp @@ -0,0 +1,136 @@ +#include +#include +#include +#include "Main.h" +#include "System.h" +#include "Phrase.h" +#include "TranslationTask.h" +#include "MemPoolAllocator.h" +#ifdef HAVE_XMLRPC_C + #include "server/Server.h" +#endif // HAVE_XMLRPC_C + +#include "legacy/InputFileStream.h" +#include "legacy/Parameter.h" +#include "legacy/ThreadPool.h" +#include "legacy/Timer.h" +#include "legacy/Util2.h" +#include "util/usage.hh" + +using namespace std; + +//extern size_t g_numHypos; + +int main(int argc, char** argv) +{ + cerr << "Starting..." 
<< endl; + + Moses2::Timer timer; + timer.start(); + //Temp(); + + Moses2::Parameter params; + if (!params.LoadParam(argc, argv)) { + return EXIT_FAILURE; + } + Moses2::System system(params); + timer.check("Loaded"); + + if (params.GetParam("show-weights")) { + return EXIT_SUCCESS; + } + + //cerr << "system.numThreads=" << system.options.server.numThreads << endl; + Moses2::ThreadPool pool(system.options.server.numThreads, system.cpuAffinityOffset, system.cpuAffinityOffsetIncr); + //cerr << "CREATED POOL" << endl; + + if (params.GetParam("server")) { + std::cerr << "RUN SERVER" << std::endl; + run_as_server(system); + } + else { + std::cerr << "RUN BATCH" << std::endl; + batch_run(params, system, pool); + } + + cerr << "Decoding took " << timer.get_elapsed_time() << endl; + // cerr << "g_numHypos=" << g_numHypos << endl; + cerr << "Finished" << endl; + return EXIT_SUCCESS; +} + +//////////////////////////////////////////////////////////////////////////////////////////////// +void run_as_server(Moses2::System& system) +{ +#ifdef HAVE_XMLRPC_C + Moses2::Server server(system.options.server, system); + server.run(system); // actually: don't return. see Server::run() +#else + UTIL_THROW2("Moses2 was compiled without xmlrpc-c. 
" + << "No server functionality available."); +#endif +} + +//////////////////////////////////////////////////////////////////////////////////////////////// +istream &GetInputStream(Moses2::Parameter ¶ms) +{ + const Moses2::PARAM_VEC *vec = params.GetParam("input-file"); + if (vec && vec->size()) { + Moses2::InputFileStream *stream = new Moses2::InputFileStream(vec->at(0)); + return *stream; + } else { + return cin; + } +} + +//////////////////////////////////////////////////////////////////////////////////////////////// + +void batch_run(Moses2::Parameter& params, Moses2::System& system, Moses2::ThreadPool& pool) +{ + istream& inStream = GetInputStream(params); + + long translationId = 0; + string line; + while (getline(inStream, line)) { + //cerr << "line=" << line << endl; + boost::shared_ptr task(new Moses2::TranslationTask(system, line, translationId)); + + //cerr << "START pool.Submit()" << endl; + pool.Submit(task); + //task->Run(); + ++translationId; + } + + pool.Stop(true); + + if (&inStream != &cin) { + delete& inStream; + } + + //util::PrintUsage(std::cerr); + +} + +//////////////////////////////////////////////////////////////////////////////////////////////// +void Temp() +{ + Moses2::MemPool pool; + Moses2::MemPoolAllocator a(pool); + + boost::unordered_set, std::equal_to, Moses2::MemPoolAllocator > s(a); + s.insert(3); + s.insert(4); + s.insert(3); + s.erase(3); + + boost::pool_allocator alloc; + std::vector > v(alloc); + for (int i = 0; i < 1000; ++i) + v.push_back(i); + + v.clear(); + boost::singleton_pool:: + purge_memory(); + + abort(); +} diff --git a/mosesdecoder/moses2/Main.h b/mosesdecoder/moses2/Main.h new file mode 100644 index 0000000000000000000000000000000000000000..731d6385bc85fd2b3b9778a1c314bbe75ef603ea --- /dev/null +++ b/mosesdecoder/moses2/Main.h @@ -0,0 +1,23 @@ +/* + * Main.h + * + * Created on: 1 Apr 2016 + * Author: hieu + */ +#pragma once +#include + +namespace Moses2 +{ +class Parameter; +class System; +class ThreadPool; +} + 
+std::istream &GetInputStream(Moses2::Parameter ¶ms); +void batch_run(Moses2::Parameter ¶ms, Moses2::System &system, Moses2::ThreadPool &pool); +void run_as_server(Moses2::System &system); + +void Temp(); + + diff --git a/mosesdecoder/moses2/ManagerBase.cpp b/mosesdecoder/moses2/ManagerBase.cpp new file mode 100644 index 0000000000000000000000000000000000000000..41d3a039468fa34be41d26a0baa948dc3c7e8d9b --- /dev/null +++ b/mosesdecoder/moses2/ManagerBase.cpp @@ -0,0 +1,56 @@ +/* + * Manager.cpp + * + * Created on: 23 Oct 2015 + * Author: hieu + */ +#include +#include +#include +#include "System.h" +#include "ManagerBase.h" +#include "Phrase.h" +#include "InputPathsBase.h" +#include "InputPathBase.h" +#include "TranslationModel/PhraseTable.h" +#include "legacy/Range.h" +#include "PhraseBased/Sentence.h" + +using namespace std; + +namespace Moses2 +{ +ManagerBase::ManagerBase(System &sys, const TranslationTask &task, + const std::string &inputStr, long translationId) + :system(sys) + ,task(task) + ,m_inputStr(inputStr) + ,m_translationId(translationId) + ,m_pool(NULL) + ,m_systemPool(NULL) + ,m_hypoRecycle(NULL) + ,m_input(NULL) +{ +} + +ManagerBase::~ManagerBase() +{ + system.featureFunctions.CleanUpAfterSentenceProcessing(*m_input); + + if (m_pool) { + GetPool().Reset(); + } + if (m_hypoRecycle) { + GetHypoRecycle().Clear(); + } +} + +void ManagerBase::InitPools() +{ + m_pool = &system.GetManagerPool(); + m_systemPool = &system.GetSystemPool(); + m_hypoRecycle = &system.GetHypoRecycler(); +} + +} + diff --git a/mosesdecoder/moses2/ManagerBase.h b/mosesdecoder/moses2/ManagerBase.h new file mode 100644 index 0000000000000000000000000000000000000000..d0740109cccedc19212754307586ce8a2856f3e5 --- /dev/null +++ b/mosesdecoder/moses2/ManagerBase.h @@ -0,0 +1,81 @@ +/* + * Manager.h + * + * Created on: 23 Oct 2015 + * Author: hieu + */ + +#pragma once + +#include +#include +#include +#include +#include "Phrase.h" +#include "MemPool.h" +#include "Recycler.h" +#include 
"EstimatedScores.h" +#include "ArcLists.h" +#include "legacy/Bitmaps.h" + +namespace Moses2 +{ + +class System; +class TranslationTask; +class PhraseImpl; +class SearchNormal; +class Search; +class InputType; +class OutputCollector; +class HypothesisBase; + +class ManagerBase +{ +public: + System &system; + const TranslationTask &task; + mutable ArcLists arcLists; + + ManagerBase(System &sys, const TranslationTask &task, + const std::string &inputStr, long translationId); + virtual ~ManagerBase(); + virtual void Decode() = 0; + virtual std::string OutputBest() const = 0; + virtual std::string OutputNBest() = 0; + virtual std::string OutputTransOpt() = 0; + + MemPool &GetPool() const { + return *m_pool; + } + + MemPool &GetSystemPool() const { + return *m_systemPool; + } + + Recycler &GetHypoRecycle() const { + return *m_hypoRecycle; + } + + const InputType &GetInput() const { + return *m_input; + } + + long GetTranslationId() const { + return m_translationId; + } + +protected: + std::string m_inputStr; + long m_translationId; + InputType *m_input; + + mutable MemPool *m_pool, *m_systemPool; + mutable Recycler *m_hypoRecycle; + + void InitPools(); + +}; + +} + diff --git a/mosesdecoder/moses2/MemPool.cpp b/mosesdecoder/moses2/MemPool.cpp new file mode 100644 index 0000000000000000000000000000000000000000..31d684bfc6e2307db0119c9f646ad2f9b9a05863 --- /dev/null +++ b/mosesdecoder/moses2/MemPool.cpp @@ -0,0 +1,81 @@ +/* + * MemPool.cpp + * + * Created on: 28 Oct 2015 + * Author: hieu + */ + +#include +#include "MemPool.h" +#include "util/scoped.hh" +#include "legacy/Util2.h" + +using namespace std; + +namespace Moses2 +{ + +MemPool::Page::Page(std::size_t vSize) : + size(vSize) +{ + mem = (uint8_t*) util::MallocOrThrow(size); + end = mem + size; +} + +MemPool::Page::~Page() +{ + free(mem); +} +//////////////////////////////////////////////////// +MemPool::MemPool(size_t initSize) : + m_currSize(initSize), m_currPage(0) +{ + Page *page = new Page(m_currSize); + 
m_pages.push_back(page); + + current_ = page->mem; + //cerr << "new memory pool"; +} + +MemPool::~MemPool() +{ + //cerr << "delete memory pool" << endl; + RemoveAllInColl(m_pages); +} + +uint8_t *MemPool::More(std::size_t size) +{ + ++m_currPage; + if (m_currPage >= m_pages.size()) { + // add new page + m_currSize <<= 1; + std::size_t amount = std::max(m_currSize, size); + + Page *page = new Page(amount); + m_pages.push_back(page); + + uint8_t *ret = page->mem; + current_ = ret + size; + return ret; + } else { + // use existing page + Page &page = *m_pages[m_currPage]; + if (size <= page.size) { + uint8_t *ret = page.mem; + current_ = ret + size; + return ret; + } else { + // recursive call More() + return More(size); + } + } +} + +void MemPool::Reset() +{ + m_currPage = 0; + current_ = m_pages[0]->mem; +} + +} + diff --git a/mosesdecoder/moses2/MemPool.h b/mosesdecoder/moses2/MemPool.h new file mode 100644 index 0000000000000000000000000000000000000000..2e8fccc346bf8bd974e344b8cbbc4157f3d1544a --- /dev/null +++ b/mosesdecoder/moses2/MemPool.h @@ -0,0 +1,158 @@ +/* + * MemPool.h + * + * Created on: 28 Oct 2015 + * Author: hieu + */ + +#pragma once + +#include +#include +#include +#include +#include +#include +#include + +namespace Moses2 +{ + +class MemPool +{ + struct Page { + uint8_t *mem; + uint8_t *end; + size_t size; + + Page() { + } + Page(std::size_t size); + ~Page(); + }; + +public: + MemPool(std::size_t initSize = 10000); + + ~MemPool(); + + uint8_t *Allocate(std::size_t size) { + size = (size + 3) & 0xfffffffc; + + uint8_t *ret = current_; + current_ += size; + + Page &page = *m_pages[m_currPage]; + if (current_ <= page.end) { + // return what we got + } else { + ret = More(size); + } + return ret; + + } + + template + T *Allocate() { + uint8_t *ret = Allocate(sizeof(T)); + return (T*) ret; + } + + template + T *Allocate(size_t num) { + uint8_t *ret = Allocate(sizeof(T) * num); + return (T*) ret; + } + + // re-use pool + void Reset(); + +private: + 
uint8_t *More(std::size_t size); + + std::vector m_pages; + + size_t m_currSize; + size_t m_currPage; + uint8_t *current_; + + // no copying + MemPool(const MemPool &); + MemPool &operator=(const MemPool &); +}; + +//////////////////////////////////////////////////////////////////////////////////////////////// +template +class ObjectPoolContiguous +{ + +public: + ObjectPoolContiguous(std::size_t initSize = 100000) : + m_size(0), m_actualSize(initSize) { + m_vec = (T*) malloc(sizeof(T) * initSize); + } + + ~ObjectPoolContiguous() { + free(m_vec); + } + + void Add(T &obj) { + if (m_size >= m_actualSize) { + //std::cerr << std::endl << "MORE " << m_size << std::endl; + m_actualSize *= 2; + m_vec = (T*) realloc(m_vec, sizeof(T) * m_actualSize); + + } + m_vec[m_size] = obj; + ++m_size; + } + + bool IsEmpty() const { + return m_size == 0; + } + + void Reset() { + m_size = 0; + } + + // vector op + size_t GetSize() const { + return m_size; + } + + const T& operator[](size_t ind) const { + return m_vec[ind]; + } + + // stack op + const T &Get() const { + return m_vec[m_size - 1]; + } + + void Pop() { + --m_size; + } + + T *GetData() { + return m_vec; + } + + template + void Sort(const ORDERER &orderer) { + std::sort(m_vec, m_vec + m_size, orderer); + } + +private: + T *m_vec; + size_t m_size, m_actualSize; + + // no copying + ObjectPoolContiguous(const ObjectPoolContiguous &); + ObjectPoolContiguous &operator=(const ObjectPoolContiguous &); +}; + +////////////////////////////////////////////////////////////////////////////////////////// + + +} + diff --git a/mosesdecoder/moses2/MemPoolAllocator.h b/mosesdecoder/moses2/MemPoolAllocator.h new file mode 100644 index 0000000000000000000000000000000000000000..994bb77112e0c9cb75b09f4160c3d5a285b5e1b6 --- /dev/null +++ b/mosesdecoder/moses2/MemPoolAllocator.h @@ -0,0 +1,85 @@ +#pragma once +#include "MemPool.h" + +namespace Moses2 +{ + +template +class MemPoolAllocator +{ +public: + typedef T value_type; + typedef T* pointer; + 
typedef const T* const_pointer; + typedef T& reference; + typedef const T& const_reference; + typedef std::size_t size_type; + typedef std::ptrdiff_t difference_type; + + template + struct rebind { + typedef MemPoolAllocator other; + }; + + MemPoolAllocator(Moses2::MemPool &pool) : + m_pool(pool) { + } + MemPoolAllocator(const MemPoolAllocator &other) : + m_pool(other.m_pool) { + } + + template + MemPoolAllocator(const MemPoolAllocator& other) : + m_pool(other.m_pool) { + } + + size_type max_size() const { + return std::numeric_limits::max(); + } + + void deallocate(pointer p, size_type n) { + //std::cerr << "deallocate " << p << " " << n << std::endl; + } + + pointer allocate(size_type n, std::allocator::const_pointer hint = 0) { + //std::cerr << "allocate " << n << " " << hint << std::endl; + pointer ret = m_pool.Allocate(n); + return ret; + } + + void construct(pointer p, const_reference val) { + //std::cerr << "construct " << p << " " << n << std::endl; + new ((void *) p) T(val); + } + + void destroy(pointer p) { + //std::cerr << "destroy " << p << " " << n << std::endl; + } + + // return address of values + pointer address (reference value) const { + return &value; + } + const_pointer address (const_reference value) const { + return &value; + } + + bool operator==(const MemPoolAllocator &allocator) const { + return true; + } + + bool operator!=(const MemPoolAllocator &allocator) const { + return false; + } + + MemPoolAllocator& operator=(const MemPoolAllocator& allocator) { + return *this; + } + + MemPool &m_pool; +protected: +}; + +} + + diff --git a/mosesdecoder/moses2/Moses2Wrapper.cpp b/mosesdecoder/moses2/Moses2Wrapper.cpp new file mode 100644 index 0000000000000000000000000000000000000000..fb59d010f30ab20a7302acb12c8186ad63b46d0a --- /dev/null +++ b/mosesdecoder/moses2/Moses2Wrapper.cpp @@ -0,0 +1,55 @@ +#include "Moses2Wrapper.h" +#include "System.h" +#include "legacy/Parameter.h" +#include "TranslationTask.h" +using namespace std; +namespace Moses2 { + 
//summary :: need to update the LM path at runtime with complete artifact path. + void Moses2Wrapper::UpdateLMPath(const std::string& filePath) { + + char sep = '/'; + + #ifdef _WIN32 + sep = '\\'; + #endif + auto file = filePath.substr(filePath.find_last_of(sep) + 1); + auto path = filePath.substr(0, filePath.find_last_of(sep)); + auto a = m_param->GetParam("feature"); + std::vector feature; + for (int i = 0; i < a->size(); i++) { + auto abc = Tokenize(a->at(i)); + if (*abc.begin() == "KENLM") { + string s = ""; + for (int k = 0; k < abc.size(); k++) { + if (abc.at(k).find("path=") != string::npos) { + auto lm = abc.at(k).substr(abc.at(k).find_last_of("=") + 1); + s = s + "path=" + path + sep + lm + " "; + } + else { + s = s + abc.at(k) + " "; + } + } + feature.push_back(s.erase(s.find_last_not_of(" \n\r\t") + 1)); + } + else { + feature.push_back(a->at(i)); + } + } + m_param->OverwriteParam("feature", feature); + } + + Moses2Wrapper::Moses2Wrapper(const std::string &filePath) { + m_param = new Parameter(); + m_param->LoadParam(filePath); + UpdateLMPath(filePath); + m_system = new System(*m_param); + } + std::string Moses2Wrapper::Translate(const std::string &input , long id) { + TranslationTask task(*m_system, input, id); + return task.ReturnTranslation(); + } + Moses2Wrapper::~Moses2Wrapper() { + delete m_param; + delete m_system; + } +} \ No newline at end of file diff --git a/mosesdecoder/moses2/Moses2Wrapper.h b/mosesdecoder/moses2/Moses2Wrapper.h new file mode 100644 index 0000000000000000000000000000000000000000..c758ef2f326c520ce92a3b839774338ebb0817ba --- /dev/null +++ b/mosesdecoder/moses2/Moses2Wrapper.h @@ -0,0 +1,39 @@ +#pragma once +#include +#include +namespace Moses2 { + class Parameter; + class System; + extern "C" { + enum MosesApiErrorCode { + MS_API_OK, + MS_API_E_FAILURE, + MS_API_E_INPUT, + MS_API_E_TIMEOUT + }; + } + class Moses2Wrapper + { + Parameter* m_param; + System* m_system; + + public: + Moses2Wrapper(const std::string& filePath); + 
~Moses2Wrapper(); + std::string Translate(const std::string& input, long id); + void UpdateLMPath(const std::string& filePath); + int getEngineVersion(); + + static char* CopyString(const char* str) { + int32_t size = (int32_t)strlen(str); + char* obj = (char*)malloc(size + 1); + memcpy(obj, str, size); + obj[size] = '\0'; + return obj; + } + static void Free(void* ptr) { + free(ptr); + } + }; + +} \ No newline at end of file diff --git a/mosesdecoder/moses2/Phrase.cpp b/mosesdecoder/moses2/Phrase.cpp new file mode 100644 index 0000000000000000000000000000000000000000..dd4abf3283f571321110977df90e6c84a4b8da39 --- /dev/null +++ b/mosesdecoder/moses2/Phrase.cpp @@ -0,0 +1,23 @@ +/* + * PhraseImpl.cpp + * + * Created on: 23 Oct 2015 + * Author: hieu + */ +#include +#include "Phrase.h" +#include "Word.h" +#include "MemPool.h" +#include "Scores.h" +#include "System.h" + +using namespace std; + +namespace Moses2 +{ + + + + +} // namespace + diff --git a/mosesdecoder/moses2/Phrase.h b/mosesdecoder/moses2/Phrase.h new file mode 100644 index 0000000000000000000000000000000000000000..1007014837a9539cfca9f6cf39c61c03a1354919 --- /dev/null +++ b/mosesdecoder/moses2/Phrase.h @@ -0,0 +1,144 @@ +/* + * PhraseImpl.h + * + * Created on: 23 Oct 2015 + * Author: hieu + */ + +#pragma once + +#include +#include +#include +#include +#include "Word.h" +#include "MemPool.h" +#include "TypeDef.h" +#include "legacy/FactorCollection.h" +#include "SCFG/Word.h" + +namespace Moses2 +{ + +template +class SubPhrase; + +class Scores; +class PhraseTable; +class MemPool; +class System; + +template +class Phrase +{ +public: + virtual ~Phrase() { + } + virtual const WORD& operator[](size_t pos) const = 0; + virtual size_t GetSize() const = 0; + + virtual const WORD& Back() const { + return (*this)[GetSize() - 1]; + } + + virtual size_t hash() const { + size_t seed = 0; + + for (size_t i = 0; i < GetSize(); ++i) { + const WORD &word = (*this)[i]; + size_t wordHash = word.hash(); + 
boost::hash_combine(seed, wordHash); + } + + return seed; + } + + virtual bool operator==(const Phrase &compare) const { + if (GetSize() != compare.GetSize()) { + return false; + } + + for (size_t i = 0; i < GetSize(); ++i) { + const WORD &word = (*this)[i]; + const WORD &otherWord = compare[i]; + if (word != otherWord) { + return false; + } + } + + return true; + } + + virtual bool operator!=(const Phrase &compare) const { + return !((*this) == compare); + } + + virtual std::string GetString(const FactorList &factorTypes) const { + if (GetSize() == 0) { + return ""; + } + + std::stringstream ret; + + const WORD &word = (*this)[0]; + ret << word.GetString(factorTypes); + for (size_t i = 1; i < GetSize(); ++i) { + const WORD &word = (*this)[i]; + ret << " " << word.GetString(factorTypes); + } + return ret.str(); + } + + virtual SubPhrase GetSubPhrase(size_t start, size_t size) const = 0; + + virtual std::string Debug(const System &system) const { + std::stringstream out; + size_t size = GetSize(); + if (size) { + out << (*this)[0].Debug(system); + for (size_t i = 1; i < size; ++i) { + const WORD &word = (*this)[i]; + out << " " << word.Debug(system); + } + } + + return out.str(); + } + + virtual void OutputToStream(const System &system, std::ostream &out) const { + size_t size = GetSize(); + if (size) { + (*this)[0].OutputToStream(system, out); + for (size_t i = 1; i < size; ++i) { + const WORD &word = (*this)[i]; + out << " "; + word.OutputToStream(system, out); + } + } + } + + +}; + +//////////////////////////////////////////////////////////////////////// +template +class PhraseOrdererLexical +{ +public: + bool operator()(const Phrase &a, const Phrase &b) const { + size_t minSize = std::min(a.GetSize(), b.GetSize()); + for (size_t i = 0; i < minSize; ++i) { + const Word &aWord = a[i]; + const Word &bWord = b[i]; + int cmp = aWord.Compare(bWord); + //std::cerr << "WORD: " << aWord << " ||| " << bWord << " ||| " << lessThan << std::endl; + if (cmp) { + return (cmp < 
0); + } + } + return a.GetSize() < b.GetSize(); + } +}; + +} + diff --git a/mosesdecoder/moses2/PhraseBased/CubePruningMiniStack/Misc.cpp b/mosesdecoder/moses2/PhraseBased/CubePruningMiniStack/Misc.cpp new file mode 100644 index 0000000000000000000000000000000000000000..7fcd4fa0c1a9c04d012cd74b4ec423e7a7fd15b3 --- /dev/null +++ b/mosesdecoder/moses2/PhraseBased/CubePruningMiniStack/Misc.cpp @@ -0,0 +1,158 @@ +/* + * CubePruning.cpp + * + * Created on: 27 Nov 2015 + * Author: hieu + */ + +#include "Misc.h" +#include "Stack.h" +#include "../Manager.h" +#include "../../MemPool.h" +#include "../../System.h" +#include "../../PhraseBased/TargetPhrases.h" + +using namespace std; + +namespace Moses2 +{ + +namespace NSCubePruningMiniStack +{ + +//////////////////////////////////////////////////////////////////////// +QueueItem *QueueItem::Create(QueueItem *currItem, Manager &mgr, CubeEdge &edge, + size_t hypoIndex, size_t tpIndex, + QueueItemRecycler &queueItemRecycler) +{ + QueueItem *ret; + if (currItem) { + // reuse incoming queue item to create new item + ret = currItem; + ret->Init(mgr, edge, hypoIndex, tpIndex); + } else if (!queueItemRecycler.empty()) { + // use item from recycle bin + ret = queueItemRecycler.back(); + ret->Init(mgr, edge, hypoIndex, tpIndex); + queueItemRecycler.pop_back(); + } else { + // create new item + ret = new (mgr.GetPool().Allocate()) QueueItem(mgr, edge, + hypoIndex, tpIndex); + } + + return ret; +} + +QueueItem::QueueItem(Manager &mgr, CubeEdge &edge, size_t hypoIndex, + size_t tpIndex) : + edge(&edge), hypoIndex(hypoIndex), tpIndex(tpIndex) +{ + CreateHypothesis(mgr); +} + +void QueueItem::Init(Manager &mgr, CubeEdge &edge, size_t hypoIndex, + size_t tpIndex) +{ + this->edge = &edge; + this->hypoIndex = hypoIndex; + this->tpIndex = tpIndex; + + CreateHypothesis(mgr); +} + +void QueueItem::CreateHypothesis(Manager &mgr) +{ + const Hypothesis *prevHypo = + static_cast(edge->hypos[hypoIndex]); + const TargetPhraseImpl &tp = 
edge->tps[tpIndex]; + + //cerr << "hypoIndex=" << hypoIndex << endl; + //cerr << "edge.hypos=" << edge.hypos.size() << endl; + //cerr << prevHypo << endl; + //cerr << *prevHypo << endl; + + hypo = Hypothesis::Create(mgr.GetSystemPool(), mgr); + hypo->Init(mgr, *prevHypo, edge->path, tp, edge->newBitmap, + edge->estimatedScore); + + if (!mgr.system.options.cube.lazy_scoring) { + hypo->EvaluateWhenApplied(); + } +} + +//////////////////////////////////////////////////////////////////////// +CubeEdge::CubeEdge(Manager &mgr, const Hypotheses &hypos, const InputPath &path, + const TargetPhrases &tps, const Bitmap &newBitmap) : + hypos(hypos), path(path), tps(tps), newBitmap(newBitmap) +{ + estimatedScore = mgr.GetEstimatedScores().CalcEstimatedScore(newBitmap); +} + +std::string CubeEdge::Debug(const System &system) const +{ + stringstream out; + out << newBitmap; + return out.str(); +} + +bool CubeEdge::SetSeenPosition(const size_t x, const size_t y, + SeenPositions &seenPositions) const +{ + //UTIL_THROW_IF2(x >= (1<<17), "Error"); + //UTIL_THROW_IF2(y >= (1<<17), "Error"); + + SeenPositionItem val(this, (x << 16) + y); + std::pair pairRet = seenPositions.insert(val); + return pairRet.second; +} + +void CubeEdge::CreateFirst(Manager &mgr, Queue &queue, + SeenPositions &seenPositions, + QueueItemRecycler &queueItemRecycler) +{ + assert(hypos.size()); + assert(tps.GetSize()); + + QueueItem *item = QueueItem::Create(NULL, mgr, *this, 0, 0, + queueItemRecycler); + queue.push(item); + bool setSeen = SetSeenPosition(0, 0, seenPositions); + assert(setSeen); +} + +void CubeEdge::CreateNext(Manager &mgr, QueueItem *item, Queue &queue, + SeenPositions &seenPositions, + QueueItemRecycler &queueItemRecycler) +{ + size_t hypoIndex = item->hypoIndex; + size_t tpIndex = item->tpIndex; + + if (hypoIndex + 1 < hypos.size() + && SetSeenPosition(hypoIndex + 1, tpIndex, seenPositions)) { + // reuse incoming queue item to create new item + QueueItem *newItem = QueueItem::Create(item, mgr, 
*this, hypoIndex + 1, + tpIndex, queueItemRecycler); + assert(newItem == item); + queue.push(newItem); + item = NULL; + } + + if (tpIndex + 1 < tps.GetSize() + && SetSeenPosition(hypoIndex, tpIndex + 1, seenPositions)) { + QueueItem *newItem = QueueItem::Create(item, mgr, *this, hypoIndex, + tpIndex + 1, queueItemRecycler); + queue.push(newItem); + item = NULL; + } + + if (item) { + // recycle unused queue item + queueItemRecycler.push_back(item); + } +} + +} + +} + diff --git a/mosesdecoder/moses2/PhraseBased/CubePruningMiniStack/Misc.h b/mosesdecoder/moses2/PhraseBased/CubePruningMiniStack/Misc.h new file mode 100644 index 0000000000000000000000000000000000000000..4fc576cbaa21c179cb0c6ffb378a45e1d8158947 --- /dev/null +++ b/mosesdecoder/moses2/PhraseBased/CubePruningMiniStack/Misc.h @@ -0,0 +1,103 @@ +/* + * CubePruning.h + * + * Created on: 27 Nov 2015 + * Author: hieu + */ +#pragma once +#include +#include +#include +#include +#include +#include "../../legacy/Range.h" +#include "../Hypothesis.h" +#include "../../TypeDef.h" +#include "../../Vector.h" +#include "../../MemPoolAllocator.h" +#include "Stack.h" + +namespace Moses2 +{ + +class Manager; +class InputPath; +class TargetPhrases; +class Bitmap; + +namespace NSCubePruningMiniStack +{ +class CubeEdge; + +class QueueItem; +typedef std::deque > QueueItemRecycler; + +/////////////////////////////////////////// +class QueueItem +{ + ~QueueItem(); // NOT IMPLEMENTED. 
Use MemPool +public: + static QueueItem *Create(QueueItem *currItem, Manager &mgr, CubeEdge &edge, + size_t hypoIndex, size_t tpIndex, + QueueItemRecycler &queueItemRecycler); + QueueItem(Manager &mgr, CubeEdge &edge, size_t hypoIndex, size_t tpIndex); + + void Init(Manager &mgr, CubeEdge &edge, size_t hypoIndex, size_t tpIndex); + + CubeEdge *edge; + size_t hypoIndex, tpIndex; + Hypothesis *hypo; + +protected: + void CreateHypothesis(Manager &mgr); +}; + +/////////////////////////////////////////// +class QueueItemOrderer +{ +public: + bool operator()(QueueItem* itemA, QueueItem* itemB) const { + HypothesisFutureScoreOrderer orderer; + return !orderer(itemA->hypo, itemB->hypo); + } +}; + +/////////////////////////////////////////// +class CubeEdge +{ +public: + typedef std::priority_queue >, QueueItemOrderer> Queue; + + typedef std::pair SeenPositionItem; + typedef boost::unordered_set, + std::equal_to, MemPoolAllocator > SeenPositions; + + const Hypotheses &hypos; + const InputPath &path; + const TargetPhrases &tps; + const Bitmap &newBitmap; + SCORE estimatedScore; + + CubeEdge(Manager &mgr, const Hypotheses &hypos, const InputPath &path, + const TargetPhrases &tps, const Bitmap &newBitmap); + + bool SetSeenPosition(const size_t x, const size_t y, + SeenPositions &seenPositions) const; + + void CreateFirst(Manager &mgr, Queue &queue, SeenPositions &seenPositions, + QueueItemRecycler &queueItemRecycler); + void CreateNext(Manager &mgr, QueueItem *item, Queue &queue, + SeenPositions &seenPositions, + QueueItemRecycler &queueItemRecycler); + + std::string Debug(const System &system) const; + +protected: + +}; + +} + +} + diff --git a/mosesdecoder/moses2/PhraseBased/CubePruningMiniStack/Search.cpp b/mosesdecoder/moses2/PhraseBased/CubePruningMiniStack/Search.cpp new file mode 100644 index 0000000000000000000000000000000000000000..74103d21192b633ec0409c0bb0c9b2ec74fd14d7 --- /dev/null +++ b/mosesdecoder/moses2/PhraseBased/CubePruningMiniStack/Search.cpp @@ -0,0 
+1,248 @@ +/* + * Search.cpp + * + * Created on: 16 Nov 2015 + * Author: hieu + */ +#include +#include "Search.h" +#include "Stack.h" +#include "../Manager.h" +#include "../Hypothesis.h" +#include "../TrellisPath.h" +#include "../Sentence.h" +#include "../../TrellisPaths.h" +#include "../../InputPathsBase.h" +#include "../../InputPathBase.h" +#include "../../System.h" +#include "../../TranslationTask.h" +#include "../../legacy/Util2.h" +#include "../../PhraseBased/TargetPhrases.h" + +using namespace std; + +namespace Moses2 +{ + +namespace NSCubePruningMiniStack +{ + +//////////////////////////////////////////////////////////////////////// +Search::Search(Manager &mgr) : + Moses2::Search(mgr), m_stack(mgr), m_cubeEdgeAlloc(mgr.GetPool()) + + , m_queue(QueueItemOrderer(), + std::vector >( + MemPoolAllocator(mgr.GetPool()))) + + , m_seenPositions( + MemPoolAllocator(mgr.GetPool())) + + , m_queueItemRecycler(MemPoolAllocator(mgr.GetPool())) + +{ +} + +Search::~Search() +{ +} + +void Search::Decode() +{ + const Sentence &sentence = static_cast(mgr.GetInput()); + + // init cue edges + m_cubeEdges.resize(sentence.GetSize() + 1); + for (size_t i = 0; i < m_cubeEdges.size(); ++i) { + m_cubeEdges[i] = new (mgr.GetPool().Allocate()) CubeEdges( + m_cubeEdgeAlloc); + } + + const Bitmap &initBitmap = mgr.GetBitmaps().GetInitialBitmap(); + Hypothesis *initHypo = Hypothesis::Create(mgr.GetSystemPool(), mgr); + initHypo->Init(mgr, mgr.GetInputPaths().GetBlank(), mgr.GetInitPhrase(), + initBitmap); + initHypo->EmptyHypothesisState(mgr.GetInput()); + //cerr << "initHypo=" << *initHypo << endl; + + m_stack.Add(initHypo, mgr.GetHypoRecycle(), mgr.arcLists); + PostDecode(0); + + for (size_t stackInd = 1; stackInd < sentence.GetSize() + 1; + ++stackInd) { + //cerr << "stackInd=" << stackInd << endl; + m_stack.Clear(); + Decode(stackInd); + PostDecode(stackInd); + + //m_stack.DebugCounts(); + } + +} + +void Search::Decode(size_t stackInd) +{ + Recycler &hypoRecycler = 
mgr.GetHypoRecycle(); + + // reuse queue from previous stack. Clear it first + std::vector > &container = Container( + m_queue); + //cerr << "container=" << container.size() << endl; + BOOST_FOREACH(QueueItem *item, container) { + // recycle unused hypos from queue + Hypothesis *hypo = item->hypo; + hypoRecycler.Recycle(hypo); + + // recycle queue item + m_queueItemRecycler.push_back(item); + } + container.clear(); + + m_seenPositions.clear(); + + // add top hypo from every edge into queue + CubeEdges &edges = *m_cubeEdges[stackInd]; + + BOOST_FOREACH(CubeEdge *edge, edges) { + //cerr << *edge << " "; + edge->CreateFirst(mgr, m_queue, m_seenPositions, m_queueItemRecycler); + } + + /* + cerr << "edges: "; + boost::unordered_set uniqueBM; + BOOST_FOREACH(CubeEdge *edge, edges) { + uniqueBM.insert(&edge->newBitmap); + //cerr << *edge << " "; + } + cerr << edges.size() << " " << uniqueBM.size(); + cerr << endl; + */ + + size_t pops = 0; + while (!m_queue.empty() && pops < mgr.system.options.cube.pop_limit) { + // get best hypo from queue, add to stack + //cerr << "queue=" << queue.size() << endl; + QueueItem *item = m_queue.top(); + m_queue.pop(); + + CubeEdge *edge = item->edge; + + // add hypo to stack + Hypothesis *hypo = item->hypo; + + if (mgr.system.options.cube.lazy_scoring) { + hypo->EvaluateWhenApplied(); + } + + //cerr << "hypo=" << *hypo << " " << hypo->GetBitmap() << endl; + m_stack.Add(hypo, hypoRecycler, mgr.arcLists); + + edge->CreateNext(mgr, item, m_queue, m_seenPositions, m_queueItemRecycler); + + ++pops; + } + + // create hypo from every edge. 
Increase diversity + if (mgr.system.options.cube.diversity) { + while (!m_queue.empty()) { + QueueItem *item = m_queue.top(); + m_queue.pop(); + + if (item->hypoIndex == 0 && item->tpIndex == 0) { + // add hypo to stack + Hypothesis *hypo = item->hypo; + //cerr << "hypo=" << *hypo << " " << hypo->GetBitmap() << endl; + m_stack.Add(hypo, hypoRecycler, mgr.arcLists); + } + } + } +} + +void Search::PostDecode(size_t stackInd) +{ + MemPool &pool = mgr.GetPool(); + + const InputPaths &paths = mgr.GetInputPaths(); + const Matrix &pathMatrix = paths.GetMatrix(); + size_t inputSize = pathMatrix.GetRows(); + size_t numPaths = pathMatrix.GetCols(); + + BOOST_FOREACH(const Stack::Coll::value_type &val, m_stack.GetColl()) { + const Bitmap &hypoBitmap = *val.first.first; + size_t firstGap = hypoBitmap.GetFirstGapPos(); + size_t hypoEndPos = val.first.second; + + Moses2::HypothesisColl &hypos = *val.second; + + //cerr << "key=" << hypoBitmap << " " << firstGap << " " << inputSize << endl; + + // create edges to next hypos from existing hypos + for (size_t startPos = firstGap; startPos < inputSize; ++startPos) { + for (size_t pathInd = 0; pathInd < numPaths; ++pathInd) { + const InputPath *path = pathMatrix.GetValue(startPos, pathInd); + + if (path == NULL) { + break; + } + if (path->GetNumRules() == 0) { + continue; + } + + const Range &pathRange = path->range; + //cerr << "pathRange=" << pathRange << endl; + if (!CanExtend(hypoBitmap, hypoEndPos, pathRange)) { + continue; + } + + const ReorderingConstraint &reorderingConstraint = mgr.GetInput().GetReorderingConstraint(); + if (!reorderingConstraint.Check(hypoBitmap, startPos, pathRange.GetEndPos())) { + continue; + } + + const Bitmap &newBitmap = mgr.GetBitmaps().GetBitmap(hypoBitmap, pathRange); + size_t numWords = newBitmap.GetNumWordsCovered(); + + CubeEdges &edges = *m_cubeEdges[numWords]; + + // sort hypo for a particular bitmap and hypoEndPos + const Hypotheses &sortedHypos = hypos.GetSortedAndPrunedHypos(mgr, 
mgr.arcLists); + + size_t numPt = mgr.system.mappings.size(); + for (size_t i = 0; i < numPt; ++i) { + const TargetPhrases *tps = path->targetPhrases[i]; + if (tps && tps->GetSize()) { + CubeEdge *edge = new (pool.Allocate()) CubeEdge(mgr, sortedHypos, *path, *tps, newBitmap); + edges.push_back(edge); + } + } + } + } + } +} + +const Hypothesis *Search::GetBestHypo() const +{ + const Hypothesis *bestHypo = m_stack.GetBestHypo(); + return bestHypo; +} + +void Search::AddInitialTrellisPaths(TrellisPaths &paths) const +{ + const Stack::Coll &coll = m_stack.GetColl(); + BOOST_FOREACH(const Stack::Coll::value_type &val, coll) { + Moses2::HypothesisColl &hypos = *val.second; + const Hypotheses &sortedHypos = hypos.GetSortedAndPrunedHypos(mgr, mgr.arcLists); + + BOOST_FOREACH(const HypothesisBase *hypoBase, sortedHypos) { + const Hypothesis *hypo = static_cast(hypoBase); + TrellisPath *path = new TrellisPath(hypo, mgr.arcLists); + paths.Add(path); + } + } +} + +} + +} + diff --git a/mosesdecoder/moses2/PhraseBased/CubePruningMiniStack/Search.h b/mosesdecoder/moses2/PhraseBased/CubePruningMiniStack/Search.h new file mode 100644 index 0000000000000000000000000000000000000000..0dfe9dfb2fe4f6b72e2ba16730dc6a7676a1ef20 --- /dev/null +++ b/mosesdecoder/moses2/PhraseBased/CubePruningMiniStack/Search.h @@ -0,0 +1,62 @@ +/* + * Search.h + * + * Created on: 16 Nov 2015 + * Author: hieu + */ + +#pragma once +#include +#include "../Search.h" +#include "Misc.h" +#include "Stack.h" +#include "../../legacy/Range.h" +#include "../../MemPoolAllocator.h" + +namespace Moses2 +{ + +class Bitmap; +class Hypothesis; +class InputPath; +class TargetPhrases; +class TargetPhraseImpl; + +namespace NSCubePruningMiniStack +{ + +class Search: public Moses2::Search +{ +public: + Search(Manager &mgr); + virtual ~Search(); + + virtual void Decode(); + const Hypothesis *GetBestHypo() const; + + void AddInitialTrellisPaths(TrellisPaths &paths) const; + +protected: + Stack m_stack; + + CubeEdge::Queue 
m_queue; + CubeEdge::SeenPositions m_seenPositions; + + // CUBE PRUNING VARIABLES + // setup + MemPoolAllocator m_cubeEdgeAlloc; + typedef std::vector > CubeEdges; + std::vector m_cubeEdges; + + QueueItemRecycler m_queueItemRecycler; + + // CUBE PRUNING + // decoding + void Decode(size_t stackInd); + void PostDecode(size_t stackInd); +}; + +} + +} + diff --git a/mosesdecoder/moses2/PhraseBased/CubePruningMiniStack/Stack.cpp b/mosesdecoder/moses2/PhraseBased/CubePruningMiniStack/Stack.cpp new file mode 100644 index 0000000000000000000000000000000000000000..0565aa4025816938e2ab0095623b55394897150b --- /dev/null +++ b/mosesdecoder/moses2/PhraseBased/CubePruningMiniStack/Stack.cpp @@ -0,0 +1,123 @@ +/* + * Stack.cpp + * + * Created on: 24 Oct 2015 + * Author: hieu + */ +#include +#include +#include "Stack.h" +#include "../Hypothesis.h" +#include "../Manager.h" +#include "../../Scores.h" +#include "../../System.h" + +using namespace std; + +namespace Moses2 +{ + +namespace NSCubePruningMiniStack +{ +Stack::Stack(const Manager &mgr) : + m_mgr(mgr), m_coll( + MemPoolAllocator >( + mgr.GetPool())), m_miniStackRecycler( + MemPoolAllocator(mgr.GetPool())) +{ +} + +Stack::~Stack() +{ + BOOST_FOREACH(const Coll::value_type &val, m_coll) { + const Moses2::HypothesisColl *miniStack = val.second; + delete miniStack; + } + + while (!m_miniStackRecycler.empty()) { + Moses2::HypothesisColl *miniStack = m_miniStackRecycler.back(); + m_miniStackRecycler.pop_back(); + delete miniStack; + + } +} + +void Stack::Add(Hypothesis *hypo, Recycler &hypoRecycle, + ArcLists &arcLists) +{ + HypoCoverage key(&hypo->GetBitmap(), hypo->GetInputPath().range.GetEndPos()); + Moses2::HypothesisColl &coll = GetMiniStack(key); + coll.Add(m_mgr, hypo, hypoRecycle, arcLists); +} + +const Hypothesis *Stack::GetBestHypo() const +{ + SCORE bestScore = -std::numeric_limits::infinity(); + const HypothesisBase *bestHypo = NULL; + BOOST_FOREACH(const Coll::value_type &val, m_coll) { + const Moses2::HypothesisColl 
&hypos = *val.second; + const Moses2::HypothesisBase *hypo = hypos.GetBestHypo(); + + if (hypo && hypo->GetFutureScore() > bestScore) { + bestScore = hypo->GetFutureScore(); + bestHypo = hypo; + } + } + return &bestHypo->Cast(); +} + +size_t Stack::GetHypoSize() const +{ + size_t ret = 0; + BOOST_FOREACH(const Coll::value_type &val, m_coll) { + const Moses2::HypothesisColl &hypos = *val.second; + ret += hypos.GetSize(); + } + return ret; +} + +Moses2::HypothesisColl &Stack::GetMiniStack(const HypoCoverage &key) +{ + Moses2::HypothesisColl *ret; + Coll::iterator iter = m_coll.find(key); + if (iter == m_coll.end()) { + if (m_miniStackRecycler.empty()) { + ret = new Moses2::HypothesisColl(m_mgr); + } else { + ret = m_miniStackRecycler.back(); + ret->Clear(); + m_miniStackRecycler.pop_back(); + } + + m_coll[key] = ret; + } else { + ret = iter->second; + } + return *ret; +} + +void Stack::Clear() +{ + BOOST_FOREACH(const Coll::value_type &val, m_coll) { + Moses2::HypothesisColl *miniStack = val.second; + m_miniStackRecycler.push_back(miniStack); + } + + m_coll.clear(); +} + +void Stack::DebugCounts() +{ + cerr << "counts="; + BOOST_FOREACH(const Coll::value_type &val, GetColl()) { + const Moses2::HypothesisColl &miniStack = *val.second; + size_t count = miniStack.GetSize(); + cerr << count << " "; + } + cerr << endl; +} + +} + +} + diff --git a/mosesdecoder/moses2/PhraseBased/CubePruningMiniStack/Stack.h b/mosesdecoder/moses2/PhraseBased/CubePruningMiniStack/Stack.h new file mode 100644 index 0000000000000000000000000000000000000000..abd564b3f013d4e6910c7f4afcbe2b1873e83357 --- /dev/null +++ b/mosesdecoder/moses2/PhraseBased/CubePruningMiniStack/Stack.h @@ -0,0 +1,75 @@ +/* + * Stack.h + * + * Created on: 24 Oct 2015 + * Author: hieu + */ +#pragma once +#include +#include +#include +#include "../Hypothesis.h" +#include "../../TypeDef.h" +#include "../../Vector.h" +#include "../../MemPool.h" +#include "../../MemPoolAllocator.h" +#include "../../Recycler.h" +#include 
"../../HypothesisColl.h" +#include "../../legacy/Util2.h" + +namespace Moses2 +{ + +class Manager; +class HypothesisBase; +class ArcLists; + +namespace NSCubePruningMiniStack +{ + +class Stack +{ +protected: + +public: + typedef std::pair HypoCoverage; + // bitmap and current endPos of hypos + + typedef boost::unordered_map, std::equal_to, + MemPoolAllocator > > Coll; + + Stack(const Manager &mgr); + virtual ~Stack(); + + size_t GetHypoSize() const; + + Coll &GetColl() { + return m_coll; + } + const Coll &GetColl() const { + return m_coll; + } + + void Add(Hypothesis *hypo, Recycler &hypoRecycle, + ArcLists &arcLists); + + Moses2::HypothesisColl &GetMiniStack(const HypoCoverage &key); + + const Hypothesis *GetBestHypo() const; + void Clear(); + + void DebugCounts(); + +protected: + const Manager &m_mgr; + Coll m_coll; + + std::deque > m_miniStackRecycler; + +}; + +} + +} + diff --git a/mosesdecoder/moses2/PhraseBased/Hypothesis.cpp b/mosesdecoder/moses2/PhraseBased/Hypothesis.cpp new file mode 100644 index 0000000000000000000000000000000000000000..e907c1a8bf120be4a3a5241f233699ff8b19ec4e --- /dev/null +++ b/mosesdecoder/moses2/PhraseBased/Hypothesis.cpp @@ -0,0 +1,226 @@ +/* + * Hypothesis.cpp + * + * Created on: 24 Oct 2015 + * Author: hieu hoang + */ +#include +#include +#include +#include "Hypothesis.h" +#include "Manager.h" +#include "Sentence.h" +#include "TargetPhraseImpl.h" +#include "../InputPathBase.h" +#include "../System.h" +#include "../Scores.h" +#include "../Phrase.h" +#include "../FF/StatefulFeatureFunction.h" + +using namespace std; + +namespace Moses2 +{ +Hypothesis *Hypothesis::Create(MemPool &pool, Manager &mgr) +{ +// ++g_numHypos; + Hypothesis *ret; + + Recycler &recycler = mgr.GetHypoRecycle(); + ret = static_cast(recycler.Get()); + if (ret) { + // got new hypo from recycler. 
Do nothing + } else { + ret = new (pool.Allocate()) Hypothesis(pool, mgr.system); + //cerr << "Hypothesis=" << sizeof(Hypothesis) << " " << ret << endl; + recycler.Keep(ret); + } + return ret; +} + +Hypothesis::Hypothesis(MemPool &pool, const System &system) : + HypothesisBase(pool, system), m_currTargetWordsRange() +{ +} + +Hypothesis::~Hypothesis() +{ + // TODO Auto-generated destructor stub +} + +void Hypothesis::Init(Manager &mgr, const InputPathBase &path, + const TargetPhraseImpl &tp, const Bitmap &bitmap) +{ + m_mgr = &mgr; + m_targetPhrase = &tp; + m_sourceCompleted = &bitmap; + m_path = &path; + m_prevHypo = NULL; + + m_currTargetWordsRange.SetStartPos(NOT_FOUND); + m_currTargetWordsRange.SetEndPos(NOT_FOUND); + + m_estimatedScore = 0; + m_scores->Reset(mgr.system); +} + +void Hypothesis::Init(Manager &mgr, const Hypothesis &prevHypo, + const InputPathBase &path, const TargetPhraseImpl &tp, const Bitmap &bitmap, + SCORE estimatedScore) +{ + m_mgr = &mgr; + m_targetPhrase = &tp; + m_sourceCompleted = &bitmap; + m_path = &path; + m_prevHypo = &prevHypo; + + m_currTargetWordsRange.SetStartPos( + prevHypo.m_currTargetWordsRange.GetEndPos() + 1); + m_currTargetWordsRange.SetEndPos( + prevHypo.m_currTargetWordsRange.GetEndPos() + tp.GetSize()); + + m_estimatedScore = estimatedScore; + + m_scores->Reset(mgr.system); + m_scores->PlusEquals(mgr.system, prevHypo.GetScores()); + m_scores->PlusEquals(mgr.system, GetTargetPhrase().GetScores()); +} + +size_t Hypothesis::hash() const +{ + // coverage + size_t seed = (size_t) m_sourceCompleted; + + seed = HypothesisBase::hash(seed); + return seed; +} + +bool Hypothesis::operator==(const Hypothesis &other) const +{ + // coverage + if (m_sourceCompleted != other.m_sourceCompleted) { + return false; + } + + bool ret = HypothesisBase::operator ==(other); + return ret; +} + +std::string Hypothesis::Debug(const System &system) const +{ + stringstream out; + + // coverage + out << GetBitmap() << " " << GetInputPath().range << " 
"; + + // states + const std::vector &sfffs = + GetManager().system.featureFunctions.GetStatefulFeatureFunctions(); + size_t numStatefulFFs = sfffs.size(); + for (size_t i = 0; i < numStatefulFFs; ++i) { + const FFState &state = *GetState(i); + out << "(" << state << ") "; + } + + // string + //Debug(out, m_mgr->system); + out << " "; + out << "fc=" << GetFutureScore() << " "; + out << GetScores().Debug(GetManager().system); + + return out.str(); +} + +void Hypothesis::OutputToStream(std::ostream &out) const +{ + if (m_prevHypo) { + m_prevHypo->OutputToStream(out); + } + //cerr << "range=" << GetInputPath().range << endl; + + const TargetPhrase &tp = GetTargetPhrase(); + if (tp.GetSize()) { + const SubPhrase &subPhrase = static_cast(GetInputPath()).subPhrase; + //cerr << "tp=" << tp.Debug(m_mgr->system) << endl; + //cerr << "subPhrase=" << subPhrase.Debug(m_mgr->system) << endl; + + tp.OutputToStream(m_mgr->system, subPhrase, out); + } + + if (m_path->range.GetStartPos() != NOT_FOUND) { + if (m_mgr->system.options.output.ReportSegmentation == 1) { + // just report phrase segmentation + out << "|" << m_path->range.GetStartPos() << "-" << m_path->range.GetEndPos() << "| "; + } else if (m_mgr->system.options.output.ReportSegmentation == 2) { + // more detailed info about every segment + out << "|"; + + // phrase segmentation + out << m_path->range.GetStartPos() << "-" << m_path->range.GetEndPos() << ","; + + // score breakdown + m_scores->OutputBreakdown(out, m_mgr->system); + + out << "| "; + } + } +} + +void Hypothesis::EmptyHypothesisState(const InputType &input) +{ + const std::vector &sfffs = + GetManager().system.featureFunctions.GetStatefulFeatureFunctions(); + BOOST_FOREACH(const StatefulFeatureFunction *sfff, sfffs) { + size_t statefulInd = sfff->GetStatefulInd(); + FFState *state = m_ffStates[statefulInd]; + sfff->EmptyHypothesisState(*state, GetManager(), input, *this); + } +} + +void Hypothesis::EvaluateWhenApplied() +{ + const std::vector &sfffs = + 
GetManager().system.featureFunctions.GetStatefulFeatureFunctions(); + BOOST_FOREACH(const StatefulFeatureFunction *sfff, sfffs) { + EvaluateWhenApplied(*sfff); + } +//cerr << *this << endl; +} + +void Hypothesis::EvaluateWhenApplied(const StatefulFeatureFunction &sfff) +{ + size_t statefulInd = sfff.GetStatefulInd(); + const FFState *prevState = m_prevHypo->GetState(statefulInd); + FFState *thisState = m_ffStates[statefulInd]; + assert(prevState); + sfff.EvaluateWhenApplied(GetManager(), *this, *prevState, *m_scores, + *thisState); + +} + +/** recursive - pos is relative from start of sentence */ +const Word &Hypothesis::GetWord(size_t pos) const +{ + const Hypothesis *hypo = this; + while (pos < hypo->GetCurrTargetWordsRange().GetStartPos()) { + hypo = hypo->GetPrevHypo(); + UTIL_THROW_IF2(hypo == NULL, "Previous hypothesis should not be NULL"); + } + return hypo->GetCurrWord(pos - hypo->GetCurrTargetWordsRange().GetStartPos()); +} + +void Hypothesis::Swap(Hypothesis &other) +{ + /* + Swap(m_targetPhrase, other.m_targetPhrase); + Swap(m_sourceCompleted, other.m_sourceCompleted); + Swap(m_range, other.m_range); + Swap(m_prevHypo, other.m_prevHypo); + Swap(m_ffStates, other.m_ffStates); + Swap(m_estimatedScore, other.m_estimatedScore); + Swap(m_currTargetWordsRange, other.m_currTargetWordsRange); + */ +} + +} + diff --git a/mosesdecoder/moses2/PhraseBased/Hypothesis.h b/mosesdecoder/moses2/PhraseBased/Hypothesis.h new file mode 100644 index 0000000000000000000000000000000000000000..71b95a3e38e906123006988499ba3f74ee596a8d --- /dev/null +++ b/mosesdecoder/moses2/PhraseBased/Hypothesis.h @@ -0,0 +1,117 @@ +/* + * Hypothesis.h + * + * Created on: 24 Oct 2015 + * Author: hieu + */ +#pragma once + +#include +#include +#include "../FF/FFState.h" +#include "../legacy/Bitmap.h" +#include "../legacy/Range.h" +#include "../Scores.h" +#include "../Phrase.h" +#include "../TargetPhrase.h" +#include "../InputPathBase.h" +#include "../HypothesisBase.h" + +namespace Moses2 +{ 
+class Manager; +class InputType; +class StatefulFeatureFunction; +class TargetPhraseImpl; + +class Hypothesis: public HypothesisBase +{ + Hypothesis(MemPool &pool, const System &system); + +public: + + static Hypothesis *Create(MemPool &pool, Manager &mgr); + virtual ~Hypothesis(); + + // initial, empty hypo + void Init(Manager &mgr, const InputPathBase &path, const TargetPhraseImpl &tp, + const Bitmap &bitmap); + + void Init(Manager &mgr, const Hypothesis &prevHypo, const InputPathBase &path, + const TargetPhraseImpl &tp, const Bitmap &bitmap, SCORE estimatedScore); + + size_t hash() const; + bool operator==(const Hypothesis &other) const; + + inline const Bitmap &GetBitmap() const { + return *m_sourceCompleted; + } + + inline const InputPathBase &GetInputPath() const { + return *m_path; + } + + inline const Range &GetCurrTargetWordsRange() const { + return m_currTargetWordsRange; + } + + SCORE GetFutureScore() const { + return GetScores().GetTotalScore() + m_estimatedScore; + } + + const TargetPhrase &GetTargetPhrase() const { + return *m_targetPhrase; + } + + std::string Debug(const System &system) const; + + virtual void OutputToStream(std::ostream &out) const; + + void EmptyHypothesisState(const InputType &input); + + void EvaluateWhenApplied(); + void EvaluateWhenApplied(const StatefulFeatureFunction &sfff); + + const Hypothesis* GetPrevHypo() const { + return m_prevHypo; + } + + /** curr - pos is relative from CURRENT hypothesis's starting index + * (ie, start of sentence would be some negative number, which is + * not allowed- USE WITH CAUTION) */ + inline const Word &GetCurrWord(size_t pos) const { + return GetTargetPhrase()[pos]; + } + + /** recursive - pos is relative from start of sentence */ + const Word &GetWord(size_t pos) const; + + void Swap(Hypothesis &other); +protected: + const TargetPhrase *m_targetPhrase; + const Bitmap *m_sourceCompleted; + const InputPathBase *m_path; + const Hypothesis *m_prevHypo; + + SCORE m_estimatedScore; + Range 
m_currTargetWordsRange; +}; + +//////////////////////////////////////////////////////////////////////////////////// +class HypothesisTargetPhraseOrderer +{ +public: + bool operator()(const Hypothesis* a, const Hypothesis* b) const { + PhraseOrdererLexical phraseCmp; + bool ret = phraseCmp(a->GetTargetPhrase(), b->GetTargetPhrase()); + /* + std::cerr << (const Phrase&) a->GetTargetPhrase() << " ||| " + << (const Phrase&) b->GetTargetPhrase() << " ||| " + << ret << std::endl; + */ + return ret; + } +}; + +} + diff --git a/mosesdecoder/moses2/PhraseBased/InputPath.cpp b/mosesdecoder/moses2/PhraseBased/InputPath.cpp new file mode 100644 index 0000000000000000000000000000000000000000..3761080a42af9786a21e6f1273bd557caf990dc4 --- /dev/null +++ b/mosesdecoder/moses2/PhraseBased/InputPath.cpp @@ -0,0 +1,59 @@ +/* + * InputPath.cpp + * + * Created on: 23 Oct 2015 + * Author: hieu + */ +#include +#include +#include "InputPath.h" +#include "TargetPhrases.h" +#include "../TranslationModel/PhraseTable.h" + +using namespace std; + +namespace Moses2 +{ +InputPath::InputPath(MemPool &pool, const SubPhrase &subPhrase, + const Range &range, size_t numPt, const InputPath *prefixPath) + :InputPathBase(pool, range, numPt, prefixPath) + ,m_numRules(0) + ,subPhrase(subPhrase) +{ + targetPhrases = pool.Allocate(numPt); + Init(targetPhrases, numPt, NULL); +} + +InputPath::~InputPath() +{ + // TODO Auto-generated destructor stub +} + +void InputPath::AddTargetPhrases(const PhraseTable &pt, + const TargetPhrases *tps) +{ + size_t ptInd = pt.GetPtInd(); + targetPhrases[ptInd] = tps; + + if (tps) { + m_numRules += tps->GetSize(); + } +} + +const TargetPhrases *InputPath::GetTargetPhrases(const PhraseTable &pt) const +{ + size_t ptInd = pt.GetPtInd(); + return targetPhrases[ptInd]; +} + +std::string InputPath::Debug(const System &system) const +{ + stringstream out; + + out << range << " " << flush; + out << subPhrase.Debug(system); + return out.str(); +} + +} + diff --git 
a/mosesdecoder/moses2/PhraseBased/InputPath.h b/mosesdecoder/moses2/PhraseBased/InputPath.h new file mode 100644 index 0000000000000000000000000000000000000000..b29c7f5ec6284cea9ae16ae765ba05551b38a2ac --- /dev/null +++ b/mosesdecoder/moses2/PhraseBased/InputPath.h @@ -0,0 +1,42 @@ +/* + * InputPath.h + * + * Created on: 23 Oct 2015 + * Author: hieu + */ + +#pragma once + +#include +#include +#include "../InputPathBase.h" + +namespace Moses2 +{ +class TargetPhrases; + +class InputPath: public InputPathBase +{ +public: + const TargetPhrases** targetPhrases; + SubPhrase subPhrase; + + InputPath(MemPool &pool, const SubPhrase &subPhrase, const Range &range, + size_t numPt, const InputPath *prefixPath); + virtual ~InputPath(); + + void AddTargetPhrases(const PhraseTable &pt, const TargetPhrases *tps); + const TargetPhrases *GetTargetPhrases(const PhraseTable &pt) const; + + size_t GetNumRules() const { + return m_numRules; + } + + std::string Debug(const System &system) const; + +protected: + size_t m_numRules; +}; + +} + diff --git a/mosesdecoder/moses2/PhraseBased/InputPaths.cpp b/mosesdecoder/moses2/PhraseBased/InputPaths.cpp new file mode 100644 index 0000000000000000000000000000000000000000..50c00acbb8dd34ec8e85dc38f15731d68785d39f --- /dev/null +++ b/mosesdecoder/moses2/PhraseBased/InputPaths.cpp @@ -0,0 +1,65 @@ +/* + * InputPaths.cpp + * + * Created on: 23 Oct 2015 + * Author: hieu + */ +#include +#include "../InputPathsBase.h" +#include "../System.h" +#include "../legacy/Range.h" +#include "Manager.h" +#include "Sentence.h" + +using namespace std; + +namespace Moses2 +{ + +void InputPaths::Init(const InputType &input, const ManagerBase &mgr) +{ + const Sentence &sentence = static_cast(input); + + MemPool &pool = mgr.GetPool(); + size_t numPt = mgr.system.mappings.size(); + size_t size = sentence.GetSize(); + size_t maxLength = min(size, mgr.system.options.search.max_phrase_length); + + m_matrix = new (pool.Allocate >()) Matrix(pool, + size, maxLength); + 
m_matrix->Init(NULL); + + // create blank path for initial hypo + Range range(NOT_FOUND, NOT_FOUND); + SubPhrase subPhrase = sentence.GetSubPhrase(NOT_FOUND, NOT_FOUND); + m_blank = new (pool.Allocate()) InputPath(pool, subPhrase, range, + numPt, NULL); + + // create normal paths of subphrases through the sentence + for (size_t startPos = 0; startPos < size; ++startPos) { + const InputPath *prefixPath = NULL; + + for (size_t phaseSize = 1; phaseSize <= maxLength; ++phaseSize) { + size_t endPos = startPos + phaseSize - 1; + + if (endPos >= size) { + break; + } + + SubPhrase subPhrase = sentence.GetSubPhrase(startPos, phaseSize); + Range range(startPos, endPos); + + InputPath *path = new (pool.Allocate()) InputPath(pool, + subPhrase, range, numPt, prefixPath); + m_inputPaths.push_back(path); + + prefixPath = path; + + m_matrix->SetValue(startPos, phaseSize - 1, path); + } + } + +} + +} + diff --git a/mosesdecoder/moses2/PhraseBased/InputPaths.h b/mosesdecoder/moses2/PhraseBased/InputPaths.h new file mode 100644 index 0000000000000000000000000000000000000000..9089a7c165f438f141bb14dd948c10e1189003a6 --- /dev/null +++ b/mosesdecoder/moses2/PhraseBased/InputPaths.h @@ -0,0 +1,45 @@ +/* + * InputPaths.h + * + * Created on: 23 Oct 2015 + * Author: hieu + */ + +#pragma once + +#include +#include "InputPath.h" +#include "../MemPool.h" +#include "../InputPathsBase.h" +#include "../legacy/Matrix.h" + +namespace Moses2 +{ + +class System; + +class InputPaths: public InputPathsBase +{ +public: + void Init(const InputType &input, const ManagerBase &mgr); + + const InputPath &GetBlank() const { + return *m_blank; + } + + Matrix &GetMatrix() { + return *m_matrix; + } + + const Matrix &GetMatrix() const { + return *m_matrix; + } + +protected: + InputPath *m_blank; + Matrix *m_matrix; + +}; + +} + diff --git a/mosesdecoder/moses2/PhraseBased/Manager.cpp b/mosesdecoder/moses2/PhraseBased/Manager.cpp new file mode 100644 index 
0000000000000000000000000000000000000000..a2a0ba0ad8e7866b0c8aff149c637b05df18c154 --- /dev/null +++ b/mosesdecoder/moses2/PhraseBased/Manager.cpp @@ -0,0 +1,285 @@ +/* + * Manager.cpp + * + * Created on: 23 Oct 2015 + * Author: hieu + */ +#include +#include +#include +#include +#include +#include "Manager.h" +#include "TargetPhraseImpl.h" +#include "InputPath.h" +#include "Sentence.h" +#include "SentenceWithCandidates.h" + +#include "Normal/Search.h" +#include "CubePruningMiniStack/Search.h" + +/* + #include "CubePruningPerMiniStack/Search.h" + #include "CubePruningPerBitmap/Search.h" + #include "CubePruningCardinalStack/Search.h" + #include "CubePruningBitmapStack/Search.h" + */ +#include "../TrellisPaths.h" +#include "../System.h" +#include "../Phrase.h" +#include "../InputPathsBase.h" +#include "../TranslationModel/PhraseTable.h" +#include "../TranslationModel/UnknownWordPenalty.h" +#include "../legacy/Range.h" +#include "../PhraseBased/TargetPhrases.h" + +using namespace std; + +namespace Moses2 +{ +Manager::Manager(System &sys, const TranslationTask &task, + const std::string &inputStr, long translationId) : + ManagerBase(sys, task, inputStr, translationId) + ,m_search(NULL) + ,m_bitmaps(NULL) +{ + //cerr << translationId << " inputStr=" << inputStr << endl; +} + +Manager::~Manager() +{ + //cerr << "Start ~Manager " << this << endl; + delete m_search; + delete m_bitmaps; + //cerr << "Finish ~Manager " << this << endl; +} + +void Manager::Init() +{ + // init pools etc + InitPools(); + + FactorCollection &vocab = system.GetVocab(); + if (system.options.input.input_type == SentenceInputWithCandidates) { + m_input = Moses2::SentenceWithCandidates::CreateFromString(GetPool(), vocab, system, m_inputStr); + } + else { + m_input = Moses2::Sentence::CreateFromString(GetPool(), vocab, system, m_inputStr); + } + system.featureFunctions.InitializeForInput(*this, *m_input); + + m_bitmaps = new Bitmaps(GetPool()); + + const PhraseTable &firstPt = 
*system.featureFunctions.phraseTables[0]; + m_initPhrase = new (GetPool().Allocate()) TargetPhraseImpl( + GetPool(), firstPt, system, 0); + + const Sentence &sentence = static_cast(GetInput()); + //cerr << "sentence=" << sentence.GetSize() << " " << sentence.Debug(system) << endl; + + m_inputPaths.Init(sentence, *this); + + // xml + const UnknownWordPenalty *unkWP = system.featureFunctions.GetUnknownWordPenalty(); + UTIL_THROW_IF2(unkWP == NULL, "There must be a UnknownWordPenalty FF"); + unkWP->ProcessXML(*this, GetPool(), sentence, m_inputPaths); + + // lookup with every pt + const std::vector &pts = system.mappings; + for (size_t i = 0; i < pts.size(); ++i) { + const PhraseTable &pt = *pts[i]; + //cerr << "Looking up from " << pt.GetName() << endl; + pt.Lookup(*this, m_inputPaths); + } + //m_inputPaths.DeleteUnusedPaths(); + CalcFutureScore(); + + m_bitmaps->Init(sentence.GetSize(), vector(0)); + + switch (system.options.search.algo) { + case Normal: + m_search = new NSNormal::Search(*this); + break; + case NormalBatch: + //m_search = new NSBatch::Search(*this); + UTIL_THROW2("Not implemented"); + break; + case CubePruning: + case CubePruningMiniStack: + m_search = new NSCubePruningMiniStack::Search(*this); + break; + /* + case CubePruningPerMiniStack: + m_search = new NSCubePruningPerMiniStack::Search(*this); + break; + case CubePruningPerBitmap: + m_search = new NSCubePruningPerBitmap::Search(*this); + break; + case CubePruningCardinalStack: + m_search = new NSCubePruningCardinalStack::Search(*this); + break; + case CubePruningBitmapStack: + m_search = new NSCubePruningBitmapStack::Search(*this); + break; + */ + default: + UTIL_THROW2("Unknown search algorithm"); + } +} + +void Manager::Decode() +{ + //cerr << "Start Decode " << this << endl; + + Init(); + m_search->Decode(); + + //cerr << "Finished Decode " << this << endl; +} + +void Manager::CalcFutureScore() +{ + const Sentence &sentence = static_cast(GetInput()); + size_t size = sentence.GetSize(); + 
m_estimatedScores = + new (GetPool().Allocate()) EstimatedScores(GetPool(), + size); + m_estimatedScores->InitTriangle(-numeric_limits::infinity()); + + // walk all the translation options and record the cheapest option for each span + BOOST_FOREACH(const InputPathBase *path, m_inputPaths) { + const Range &range = path->range; + SCORE bestScore = -numeric_limits::infinity(); + + size_t numPt = system.mappings.size(); + for (size_t i = 0; i < numPt; ++i) { + const TargetPhrases *tps = static_cast(path)->targetPhrases[i]; + if (tps) { + BOOST_FOREACH(const TargetPhraseImpl *tp, *tps) { + SCORE score = tp->GetFutureScore(); + if (score > bestScore) { + bestScore = score; + } + } + } + } + m_estimatedScores->SetValue(range.GetStartPos(), range.GetEndPos(), bestScore); + } + + // now fill all the cells in the strictly upper triangle + // there is no way to modify the diagonal now, in the case + // where no translation option covers a single-word span, + // we leave the +inf in the matrix + // like in chart parsing we want each cell to contain the highest score + // of the full-span trOpt or the sum of scores of joining two smaller spans + + for (size_t colstart = 1; colstart < size; colstart++) { + for (size_t diagshift = 0; diagshift < size - colstart; diagshift++) { + size_t sPos = diagshift; + size_t ePos = colstart + diagshift; + for (size_t joinAt = sPos; joinAt < ePos; joinAt++) { + float joinedScore = m_estimatedScores->GetValue(sPos, joinAt) + + m_estimatedScores->GetValue(joinAt + 1, ePos); + // uncomment to see the cell filling scheme + // TRACE_ERR("[" << sPos << "," << ePos << "] <-? 
[" + // << sPos << "," << joinAt << "]+[" + // << joinAt+1 << "," << ePos << "] (colstart: " + // << colstart << ", diagshift: " << diagshift << ")" + // << endl); + + if (joinedScore > m_estimatedScores->GetValue(sPos, ePos)) m_estimatedScores->SetValue( + sPos, ePos, joinedScore); + } + } + } + + //cerr << "Square matrix:" << endl; + //cerr << *m_estimatedScores << endl; +} + +std::string Manager::OutputBest() const +{ + stringstream out; + Moses2::FixPrecision(out); + + const Hypothesis *bestHypo = m_search->GetBestHypo(); + if (bestHypo) { + if (system.options.output.ReportHypoScore) { + out << bestHypo->GetScores().GetTotalScore() << " "; + } + + bestHypo->OutputToStream(out); + //cerr << "BEST TRANSLATION: " << *bestHypo; + } else { + if (system.options.output.ReportHypoScore) { + out << "0 "; + } + //cerr << "NO TRANSLATION " << m_input->GetTranslationId() << endl; + } + + return out.str(); + //cerr << endl; +} + +std::string Manager::OutputNBest() +{ + arcLists.Sort(); + + boost::unordered_set distinctHypos; + + TrellisPaths contenders; + m_search->AddInitialTrellisPaths(contenders); + + long transId = GetTranslationId(); + + // MAIN LOOP + stringstream out; + //Moses2::FixPrecision(out); + + size_t maxIter = system.options.nbest.nbest_size * system.options.nbest.factor; + size_t bestInd = 0; + for (size_t i = 0; i < maxIter; ++i) { + if (bestInd > system.options.nbest.nbest_size || contenders.empty()) { + break; + } + + //cerr << "bestInd=" << bestInd << endl; + TrellisPath *path = contenders.Get(); + + bool ok = false; + if (system.options.nbest.only_distinct) { + string tgtPhrase = path->OutputTargetPhrase(system); + //cerr << "tgtPhrase=" << tgtPhrase << endl; + boost::hash string_hash; + size_t hash = string_hash(tgtPhrase); + + if (distinctHypos.insert(hash).second) { + ok = true; + } + } else { + ok = true; + } + + if (ok) { + ++bestInd; + out << transId << " ||| "; + path->OutputToStream(out, system); + out << "\n"; + } + + // create next paths + 
path->CreateDeviantPaths(contenders, arcLists, GetPool(), system); + + delete path; + } + + return out.str(); +} + +std::string Manager::OutputTransOpt() +{ + return ""; +} + +} + diff --git a/mosesdecoder/moses2/PhraseBased/Manager.h b/mosesdecoder/moses2/PhraseBased/Manager.h new file mode 100644 index 0000000000000000000000000000000000000000..1a348f75f37e7783f9cc014e683168770d6a0d86 --- /dev/null +++ b/mosesdecoder/moses2/PhraseBased/Manager.h @@ -0,0 +1,81 @@ +/* + * Manager.h + * + * Created on: 23 Oct 2015 + * Author: hieu + */ + +#pragma once + +#include +#include +#include +#include +#include "../ManagerBase.h" +#include "../Phrase.h" +#include "../TargetPhrase.h" +#include "../MemPool.h" +#include "../Recycler.h" +#include "../EstimatedScores.h" +#include "../legacy/Bitmaps.h" +#include "InputPaths.h" + +namespace Moses2 +{ + +class System; +class TranslationTask; +class PhraseImpl; +class TargetPhraseImpl; +class SearchNormal; +class Search; +class Hypothesis; +class Sentence; +class OutputCollector; + +class Manager: public ManagerBase +{ +public: + Manager(System &sys, const TranslationTask &task, const std::string &inputStr, + long translationId); + + virtual ~Manager(); + + Bitmaps &GetBitmaps() { + return *m_bitmaps; + } + + const EstimatedScores &GetEstimatedScores() const { + return *m_estimatedScores; + } + + const InputPaths &GetInputPaths() const { + return m_inputPaths; + } + + const TargetPhraseImpl &GetInitPhrase() const { + return *m_initPhrase; + } + + void Decode(); + std::string OutputBest() const; + std::string OutputNBest(); + std::string OutputTransOpt(); + +protected: + + InputPaths m_inputPaths; + Bitmaps *m_bitmaps; + EstimatedScores *m_estimatedScores; + TargetPhraseImpl *m_initPhrase; + + Search *m_search; + + // must be run in same thread as Decode() + void Init(); + void CalcFutureScore(); + +}; + +} + diff --git a/mosesdecoder/moses2/PhraseBased/Normal/Search.cpp b/mosesdecoder/moses2/PhraseBased/Normal/Search.cpp new file mode 
100644 index 0000000000000000000000000000000000000000..1c158543d41d7bf81d39884715c39feece945e93 --- /dev/null +++ b/mosesdecoder/moses2/PhraseBased/Normal/Search.cpp @@ -0,0 +1,161 @@ +/* + * SearchNormal.cpp + * + * Created on: 25 Oct 2015 + * Author: hieu + */ + +#include "Search.h" +#include +#include +#include "Stack.h" +#include "../Manager.h" +#include "../TrellisPath.h" +#include "../Sentence.h" +#include "../../TrellisPaths.h" +#include "../../InputPathsBase.h" +#include "../../Phrase.h" +#include "../../System.h" +#include "../../PhraseBased/TargetPhrases.h" + +using namespace std; + +namespace Moses2 +{ +namespace NSNormal +{ + +Search::Search(Manager &mgr) + :Moses2::Search(mgr) + , m_stacks(mgr) +{ + // TODO Auto-generated constructor stub + +} + +Search::~Search() +{ + // TODO Auto-generated destructor stub +} + +void Search::Decode() +{ + // init stacks + const Sentence &sentence = static_cast(mgr.GetInput()); + m_stacks.Init(mgr, sentence.GetSize() + 1); + + const Bitmap &initBitmap = mgr.GetBitmaps().GetInitialBitmap(); + Hypothesis *initHypo = Hypothesis::Create(mgr.GetSystemPool(), mgr); + initHypo->Init(mgr, mgr.GetInputPaths().GetBlank(), mgr.GetInitPhrase(), + initBitmap); + initHypo->EmptyHypothesisState(mgr.GetInput()); + + m_stacks.Add(initHypo, mgr.GetHypoRecycle(), mgr.arcLists); + + for (size_t stackInd = 0; stackInd < m_stacks.GetSize(); ++stackInd) { + Decode(stackInd); + //cerr << m_stacks << endl; + + // delete stack to save mem + if (stackInd < m_stacks.GetSize() - 1) { + m_stacks.Delete(stackInd); + } + //cerr << m_stacks.Debug(mgr.system) << endl; + } +} + +void Search::Decode(size_t stackInd) +{ + //cerr << "stackInd=" << stackInd << endl; + Stack &stack = m_stacks[stackInd]; + if (&stack == &m_stacks.Back()) { + // last stack. 
don't do anythin + return; + } + + const Hypotheses &hypos = stack.GetSortedAndPrunedHypos(mgr, mgr.arcLists); + //cerr << "hypos=" << hypos.size() << endl; + + const InputPaths &paths = mgr.GetInputPaths(); + + BOOST_FOREACH(const InputPathBase *path, paths) { + BOOST_FOREACH(const HypothesisBase *hypo, hypos) { + Extend(*static_cast(hypo), *static_cast(path)); + } + } +} + +void Search::Extend(const Hypothesis &hypo, const InputPath &path) +{ + const Bitmap &hypoBitmap = hypo.GetBitmap(); + const Range &hypoRange = hypo.GetInputPath().range; + const Range &pathRange = path.range; + + if (!CanExtend(hypoBitmap, hypoRange.GetEndPos(), pathRange)) { + return; + } + + const ReorderingConstraint &reorderingConstraint = mgr.GetInput().GetReorderingConstraint(); + if (!reorderingConstraint.Check(hypoBitmap, pathRange.GetStartPos(), pathRange.GetEndPos())) { + return; + } + + // extend this hypo + const Bitmap &newBitmap = mgr.GetBitmaps().GetBitmap(hypoBitmap, pathRange); + //SCORE estimatedScore = mgr.GetEstimatedScores().CalcFutureScore2(bitmap, pathRange.GetStartPos(), pathRange.GetEndPos()); + SCORE estimatedScore = mgr.GetEstimatedScores().CalcEstimatedScore(newBitmap); + + size_t numPt = mgr.system.mappings.size(); + const TargetPhrases **tpsAllPt = path.targetPhrases; + for (size_t i = 0; i < numPt; ++i) { + const TargetPhrases *tps = tpsAllPt[i]; + if (tps) { + Extend(hypo, *tps, path, newBitmap, estimatedScore); + } + } +} + +void Search::Extend(const Hypothesis &hypo, const TargetPhrases &tps, + const InputPath &path, const Bitmap &newBitmap, SCORE estimatedScore) +{ + BOOST_FOREACH(const TargetPhraseImpl *tp, tps) { + Extend(hypo, *tp, path, newBitmap, estimatedScore); + } +} + +void Search::Extend(const Hypothesis &hypo, const TargetPhraseImpl &tp, + const InputPath &path, const Bitmap &newBitmap, SCORE estimatedScore) +{ + Hypothesis *newHypo = Hypothesis::Create(mgr.GetSystemPool(), mgr); + newHypo->Init(mgr, hypo, path, tp, newBitmap, estimatedScore); + 
newHypo->EvaluateWhenApplied(); + + m_stacks.Add(newHypo, mgr.GetHypoRecycle(), mgr.arcLists); + + //m_arcLists.AddArc(stackAdded.added, newHypo, stackAdded.other); + //stack.Prune(mgr.GetHypoRecycle(), mgr.system.stackSize, mgr.system.stackSize * 2); + +} + +const Hypothesis *Search::GetBestHypo() const +{ + const Stack &lastStack = m_stacks.Back(); + const Hypothesis *best = lastStack.GetBestHypo(); + return best; +} + +void Search::AddInitialTrellisPaths(TrellisPaths &paths) const +{ + const Stack &lastStack = m_stacks.Back(); + const Hypotheses &hypos = lastStack.GetSortedAndPrunedHypos(mgr, mgr.arcLists); + + BOOST_FOREACH(const HypothesisBase *hypoBase, hypos) { + const Hypothesis *hypo = static_cast(hypoBase); + TrellisPath *path = new TrellisPath(hypo, mgr.arcLists); + paths.Add(path); + } +} + +} // namespace +} + diff --git a/mosesdecoder/moses2/PhraseBased/Normal/Search.h b/mosesdecoder/moses2/PhraseBased/Normal/Search.h new file mode 100644 index 0000000000000000000000000000000000000000..0d487e32b1071c9e0cd0e8fa5794f37977f62a6d --- /dev/null +++ b/mosesdecoder/moses2/PhraseBased/Normal/Search.h @@ -0,0 +1,51 @@ +/* + * SearchNormal.h + * + * Created on: 25 Oct 2015 + * Author: hieu + */ +#pragma once + +#include +#include "../../legacy/Range.h" +#include "../../legacy/Bitmap.h" +#include "../../TypeDef.h" +#include "../Search.h" +#include "Stacks.h" + +namespace Moses2 +{ +class Hypothesis; +class InputPath; +class TargetPhrases; +class TargetPhraseImpl; + +namespace NSNormal +{ +class Stacks; + +class Search: public Moses2::Search +{ +public: + Search(Manager &mgr); + virtual ~Search(); + + virtual void Decode(); + const Hypothesis *GetBestHypo() const; + + void AddInitialTrellisPaths(TrellisPaths &paths) const; + +protected: + Stacks m_stacks; + + void Decode(size_t stackInd); + void Extend(const Hypothesis &hypo, const InputPath &path); + void Extend(const Hypothesis &hypo, const TargetPhrases &tps, + const InputPath &path, const Bitmap &newBitmap, 
SCORE estimatedScore); + void Extend(const Hypothesis &hypo, const TargetPhraseImpl &tp, + const InputPath &path, const Bitmap &newBitmap, SCORE estimatedScore); + +}; + +} +} diff --git a/mosesdecoder/moses2/PhraseBased/Normal/Stack.cpp b/mosesdecoder/moses2/PhraseBased/Normal/Stack.cpp new file mode 100644 index 0000000000000000000000000000000000000000..efaa86f2d36c2712ed48253e36c0d70189fc02e1 --- /dev/null +++ b/mosesdecoder/moses2/PhraseBased/Normal/Stack.cpp @@ -0,0 +1,35 @@ +/* + * Stack.cpp + * + * Created on: 24 Oct 2015 + * Author: hieu + */ +#include +#include "Stack.h" +#include "../Hypothesis.h" +#include "../Manager.h" +#include "../../Scores.h" +#include "../../HypothesisColl.h" + +using namespace std; + +namespace Moses2 +{ + +namespace NSNormal +{ + +Stack::Stack(const Manager &mgr) : + HypothesisColl(mgr) +{ + // TODO Auto-generated constructor stub + +} + +Stack::~Stack() +{ + // TODO Auto-generated destructor stub +} + +} +} diff --git a/mosesdecoder/moses2/PhraseBased/Normal/Stack.h b/mosesdecoder/moses2/PhraseBased/Normal/Stack.h new file mode 100644 index 0000000000000000000000000000000000000000..4ad707ce4162ae56ac8870a98dfae10e85e997d7 --- /dev/null +++ b/mosesdecoder/moses2/PhraseBased/Normal/Stack.h @@ -0,0 +1,32 @@ +/* + * Stack.h + * + * Created on: 24 Oct 2015 + * Author: hieu + */ +#pragma once + +#include +#include +#include "../Hypothesis.h" +#include "../../TypeDef.h" +#include "../../HypothesisColl.h" +#include "../../legacy/Util2.h" + +namespace Moses2 +{ + +namespace NSNormal +{ +class Stack: public HypothesisColl +{ +public: + Stack(const Manager &mgr); + virtual ~Stack(); + +protected: + +}; + +} +} diff --git a/mosesdecoder/moses2/PhraseBased/Normal/Stacks.cpp b/mosesdecoder/moses2/PhraseBased/Normal/Stacks.cpp new file mode 100644 index 0000000000000000000000000000000000000000..a47709676e1d0d3b091a0ee9b5bede4f3642ab18 --- /dev/null +++ b/mosesdecoder/moses2/PhraseBased/Normal/Stacks.cpp @@ -0,0 +1,66 @@ +/* + * Stacks.cpp + * 
+ * Created on: 6 Nov 2015 + * Author: hieu + */ + +#include "Stacks.h" +#include "../Manager.h" +#include "../../System.h" + +using namespace std; + +namespace Moses2 +{ + +namespace NSNormal +{ + +Stacks::Stacks(const Manager &mgr) : + m_mgr(mgr) +{ + // TODO Auto-generated constructor stub + +} + +Stacks::~Stacks() +{ + for (size_t i = 0; i < m_stacks.size(); ++i) { + delete m_stacks[i]; + } +} + +void Stacks::Init(const Manager &mgr, size_t numStacks) +{ + m_stacks.resize(numStacks); + for (size_t i = 0; i < m_stacks.size(); ++i) { + m_stacks[i] = new Stack(mgr); + } +} + +std::string Stacks::Debug(const System &system) const +{ + stringstream out; + for (size_t i = 0; i < GetSize(); ++i) { + const Stack *stack = m_stacks[i]; + if (stack) { + out << stack->GetSize() << " "; + } else { + out << "N "; + } + } + return out.str(); +} + +void Stacks::Add(Hypothesis *hypo, Recycler &hypoRecycle, + ArcLists &arcLists) +{ + size_t numWordsCovered = hypo->GetBitmap().GetNumWordsCovered(); + //cerr << "numWordsCovered=" << numWordsCovered << endl; + Stack &stack = *m_stacks[numWordsCovered]; + stack.Add(m_mgr, hypo, hypoRecycle, arcLists); +} + +} +} diff --git a/mosesdecoder/moses2/PhraseBased/Normal/Stacks.h b/mosesdecoder/moses2/PhraseBased/Normal/Stacks.h new file mode 100644 index 0000000000000000000000000000000000000000..b6da78a4ee2cef9a4585670eb6b0de433bc21036 --- /dev/null +++ b/mosesdecoder/moses2/PhraseBased/Normal/Stacks.h @@ -0,0 +1,58 @@ +/* + * Stacks.h + * + * Created on: 6 Nov 2015 + * Author: hieu + */ + +#pragma once + +#include +#include "Stack.h" +#include "../../Recycler.h" + +namespace Moses2 +{ +class Manager; +class ArcLists; + +namespace NSNormal +{ + +class Stacks +{ +public: + Stacks(const Manager &mgr); + virtual ~Stacks(); + + void Init(const Manager &mgr, size_t numStacks); + + size_t GetSize() const { + return m_stacks.size(); + } + + const Stack &Back() const { + return *m_stacks.back(); + } + + Stack &operator[](size_t ind) { + return 
*m_stacks[ind]; + } + + void Delete(size_t ind) { + delete m_stacks[ind]; + m_stacks[ind] = NULL; + } + + void Add(Hypothesis *hypo, Recycler &hypoRecycle, + ArcLists &arcLists); + + std::string Debug(const System &system) const; + +protected: + const Manager &m_mgr; + std::vector m_stacks; +}; + +} +} diff --git a/mosesdecoder/moses2/PhraseBased/PhraseImpl.cpp b/mosesdecoder/moses2/PhraseBased/PhraseImpl.cpp new file mode 100644 index 0000000000000000000000000000000000000000..d72e36083060cca3285d636b294bf12ca91a53f4 --- /dev/null +++ b/mosesdecoder/moses2/PhraseBased/PhraseImpl.cpp @@ -0,0 +1,27 @@ +/* + * PhraseImpl.cpp + * + * Created on: 19 Feb 2016 + * Author: hieu + */ +#include "PhraseImpl.h" + +using namespace std; + +namespace Moses2 +{ +PhraseImpl *PhraseImpl::CreateFromString(MemPool &pool, FactorCollection &vocab, + const System &system, const std::string &str) +{ + std::vector toks = Moses2::Tokenize(str); + size_t size = toks.size(); + PhraseImpl *ret; + + ret = new (pool.Allocate()) PhraseImpl(pool, size); + + ret->PhraseImplTemplate::CreateFromString(vocab, system, toks); + return ret; +} + +} + diff --git a/mosesdecoder/moses2/PhraseBased/PhraseImpl.h b/mosesdecoder/moses2/PhraseBased/PhraseImpl.h new file mode 100644 index 0000000000000000000000000000000000000000..f199e62d4decf82d4678f10ca0bae0dc1c33774e --- /dev/null +++ b/mosesdecoder/moses2/PhraseBased/PhraseImpl.h @@ -0,0 +1,20 @@ +#pragma once +#include "../PhraseImplTemplate.h" +#include "../SubPhrase.h" + +namespace Moses2 +{ + +class PhraseImpl: public PhraseImplTemplate +{ +public: + static PhraseImpl *CreateFromString(MemPool &pool, FactorCollection &vocab, + const System &system, const std::string &str); + + PhraseImpl(MemPool &pool, size_t size) : + PhraseImplTemplate(pool, size) { + } + +}; + +} diff --git a/mosesdecoder/moses2/PhraseBased/ReorderingConstraint.cpp b/mosesdecoder/moses2/PhraseBased/ReorderingConstraint.cpp new file mode 100644 index 
0000000000000000000000000000000000000000..0e84b1f3f853db28b4354ddcfcb4bd82fe7057db --- /dev/null +++ b/mosesdecoder/moses2/PhraseBased/ReorderingConstraint.cpp @@ -0,0 +1,252 @@ +#include +#include +#include "ReorderingConstraint.h" +#include "Sentence.h" +#include "../TypeDef.h" +#include "../legacy/Bitmap.h" + +using namespace std; + +namespace Moses2 +{ +//! destructer +ReorderingConstraint::~ReorderingConstraint() +{ + //if (m_wall != NULL) free(m_wall); + //if (m_localWall != NULL) free(m_localWall); +} + +//! allocate memory for reordering walls +void ReorderingConstraint::InitializeWalls(size_t size, int max_distortion) +{ + m_size = size; + + m_wall = m_pool.Allocate(size); + m_localWall = m_pool.Allocate(size); + + m_max_distortion = max_distortion; + + for (size_t pos = 0 ; pos < m_size ; pos++) { + m_wall[pos] = false; + m_localWall[pos] = NOT_A_ZONE; + } +} + +//! has to be called to localized walls +void ReorderingConstraint::FinalizeWalls() +{ + for(size_t z = 0; z < m_zone.size(); z++ ) { + const size_t startZone = m_zone[z].first; + const size_t endZone = m_zone[z].second;// note: wall after endZone is not local + for( size_t pos = startZone; pos < endZone; pos++ ) { + if (m_wall[ pos ]) { + m_localWall[ pos ] = z; + m_wall[ pos ] = false; + //cerr << "SETTING local wall " << pos << std::endl; + } + // enforce that local walls only apply to innermost zone + else if (m_localWall[ pos ] != NOT_A_ZONE) { + size_t assigned_z = m_localWall[ pos ]; + if ((m_zone[assigned_z].first < startZone) || + (m_zone[assigned_z].second > endZone)) { + m_localWall[ pos ] = z; + } + } + } + } +} + +//! set value at a particular position +void ReorderingConstraint::SetWall( size_t pos, bool value ) +{ + //cerr << "SETTING reordering wall at position " << pos << std::endl; + UTIL_THROW_IF2(pos >= m_size, "Wall over length of sentence: " << pos << " >= " << m_size); + m_wall[pos] = value; + m_active = true; +} + +//! 
set a reordering zone (once entered, need to finish) +void ReorderingConstraint::SetZone( size_t startPos, size_t endPos ) +{ + //cerr << "SETTING zone " << startPos << "-" << endPos << std::endl; + std::pair newZone; + newZone.first = startPos; + newZone.second = endPos; + m_zone.push_back( newZone ); + m_active = true; +} + +//! set walls based on "-monotone-at-punctuation" flag +void ReorderingConstraint::SetMonotoneAtPunctuation( const Sentence &sentence ) +{ + for( size_t i=0; iGetString() == "," || + word[0]->GetString() == "." || + word[0]->GetString() == "!" || + word[0]->GetString() == "?" || + word[0]->GetString() == ":" || + word[0]->GetString() == ";" || + word[0]->GetString() == "\"") { + // set wall before and after punc, but not at sentence start, end + if (i>0 && i1) SetWall( i-1, true ); + } + } +} + +//! check if the current hypothesis extension violates reordering constraints +bool ReorderingConstraint::Check( const Bitmap &bitmap, size_t startPos, size_t endPos ) const +{ + // nothing to be checked, we are done + if (! IsActive() ) return true; + + //cerr << "Check " << bitmap << " " << startPos << "-" << endPos; + + // check walls + size_t firstGapPos = bitmap.GetFirstGapPos(); + // filling first gap -> no wall violation possible + if (firstGapPos != startPos) { + // if there is a wall before the last word, + // we created a gap while moving through wall + // -> violation + for( size_t pos = firstGapPos; pos < endPos; pos++ ) { + if( GetWall( pos ) ) { + //cerr << " hitting wall " << pos << std::endl; + return false; + } + } + } + + // monotone -> no violation possible + size_t lastPos = bitmap.GetLastPos(); + if ((lastPos == NOT_FOUND && startPos == 0) || // nothing translated + (firstGapPos > lastPos && // no gaps + firstGapPos == startPos)) { // translating first empty word + //cerr << " montone, fine." 
<< std::endl; + return true; + } + + // check zones + for(size_t z = 0; z < m_zone.size(); z++ ) { + const size_t startZone = m_zone[z].first; + const size_t endZone = m_zone[z].second; + + // fine, if translation has not reached zone yet and phrase outside zone + if (lastPos < startZone && ( endPos < startZone || startPos > endZone ) ) { + continue; + } + + // already completely translated zone, no violations possible + if (firstGapPos > endZone) { + continue; + } + + // some words are translated beyond the start + // let's look closer if some are in the zone + size_t numWordsInZoneTranslated = 0; + if (lastPos >= startZone) { + for(size_t pos = startZone; pos <= endZone; pos++ ) { + if( bitmap.GetValue( pos ) ) { + numWordsInZoneTranslated++; + } + } + } + + // all words in zone translated, no violation possible + if (numWordsInZoneTranslated == endZone-startZone+1) { + continue; + } + + // flag if this is an active zone + bool activeZone = (numWordsInZoneTranslated > 0); + + // fine, if zone completely untranslated and phrase outside zone + if (!activeZone && ( endPos < startZone || startPos > endZone ) ) { + continue; + } + + // violation, if phrase completely outside active zone + if (activeZone && ( endPos < startZone || startPos > endZone ) ) { + //cerr << " outside active zone" << std::endl; + return false; + } + + // ok, this is what we know now: + // * the phrase is in the zone (at least partially) + // * either zone is already active, or it becomes active now + + + // check, if we are setting us up for a dead end due to distortion limits + + // size_t distortionLimit = (size_t)StaticData::Instance().GetMaxDistortion(); + size_t distortionLimit = m_max_distortion; + if (startPos != firstGapPos && endZone-firstGapPos >= distortionLimit) { + //cerr << " dead end due to distortion limit" << std::endl; + return false; + } + + // let us check on phrases that are partially outside + + // phrase overlaps at the beginning, always ok + if (startPos <= startZone) { 
+ continue; + } + + // phrase goes beyond end, has to fill zone completely + if (endPos > endZone) { + if (endZone-startPos+1 < // num. words filled in by phrase + endZone-startZone+1-numWordsInZoneTranslated) { // num. untranslated + //cerr << " overlap end, but not completing" << std::endl; + return false; + } else { + continue; + } + } + + // now we are down to phrases that are completely inside the zone + // we have to check local walls + bool seenUntranslatedBeforeStartPos = false; + for(size_t pos = startZone; pos < endZone && pos < endPos; pos++ ) { + // be careful when there is a gap before phrase + if( !bitmap.GetValue( pos ) // untranslated word + && pos < startPos ) { // before startPos + seenUntranslatedBeforeStartPos = true; + } + if( seenUntranslatedBeforeStartPos && GetLocalWall( pos, z ) ) { + //cerr << " local wall violation" << std::endl; + return false; + } + } + + // passed all checks for this zone, on to the next one + } + + // passed all checks, no violations + //cerr << " fine." 
<< std::endl; + return true; +} + +std::ostream &ReorderingConstraint::Debug(std::ostream &out, const System &system) const +{ + out << "Zones:"; + for (size_t i = 0; i < m_zone.size(); ++i) { + const std::pair &zone1 = m_zone[i]; + out << zone1.first << "-" << zone1.second << " "; + } + + out << "Walls:"; + for (size_t i = 0; i < m_size; ++i) { + out << m_wall[i]; + } + + out << " Local walls:"; + for (size_t i = 0; i < m_size; ++i) { + out << m_localWall[i] << " "; + } + + return out; +} + +} // namespace + diff --git a/mosesdecoder/moses2/PhraseBased/ReorderingConstraint.h b/mosesdecoder/moses2/PhraseBased/ReorderingConstraint.h new file mode 100644 index 0000000000000000000000000000000000000000..b8d2461e5594e010963437293cf0884da785b9a4 --- /dev/null +++ b/mosesdecoder/moses2/PhraseBased/ReorderingConstraint.h @@ -0,0 +1,88 @@ +#pragma once +#include +#include +#include "../Vector.h" + +namespace Moses2 +{ +class System; +class Sentence; +class Bitmap; +class MemPool; + +#define NOT_A_ZONE 999999999 + +class ReorderingConstraint +{ +protected: + // const size_t m_size; /**< number of words in sentence */ + size_t m_size; /**< number of words in sentence */ + bool *m_wall; /**< flag for each word if it is a wall */ + //size_t *m_wall; /**< flag for each word if it is a wall */ + size_t *m_localWall; /**< flag for each word if it is a local wall */ + Vector< std::pair > m_zone; /** zones that limit reordering */ + bool m_active; /**< flag indicating, if there are any active constraints */ + int m_max_distortion; + MemPool &m_pool; + + ReorderingConstraint(const ReorderingConstraint &); // do not implement + +public: + + //! create ReorderingConstraint of length size and initialise to zero + ReorderingConstraint(MemPool &pool) + : m_wall(NULL) + , m_localWall(NULL) + , m_active(false) + , m_pool(pool) + , m_zone(pool) + {} + + //! destructer + ~ReorderingConstraint(); + + //! 
allocate memory for memory for a sentence of a given size + void InitializeWalls(size_t size, int max_distortion); + + //! changes walls in zones into local walls + void FinalizeWalls(); + + //! set value at a particular position + void SetWall( size_t pos, bool value ); + + //! whether a word has been translated at a particular position + bool GetWall(size_t pos) const { + return m_wall[pos]; + } + + //! whether a word has been translated at a particular position + bool GetLocalWall(size_t pos, size_t zone ) const { + return (m_localWall[pos] == zone); + } + + //! set a zone + void SetZone( size_t startPos, size_t endPos ); + + //! returns the vector of zones + Vector< std::pair< size_t,size_t> > & GetZones() { + return m_zone; + } + + //! set the reordering walls based on punctuation in the sentence + void SetMonotoneAtPunctuation( const Sentence & sentence ); + + //! check if all constraints are fulfilled -> all find + bool Check( const Bitmap &bitmap, size_t start, size_t end ) const; + + //! 
checks if reordering constraints will be enforced + bool IsActive() const { + return m_active; + } + + std::ostream &Debug(std::ostream &out, const System &system) const; + +}; + + +} + diff --git a/mosesdecoder/moses2/PhraseBased/Search.cpp b/mosesdecoder/moses2/PhraseBased/Search.cpp new file mode 100644 index 0000000000000000000000000000000000000000..48f9995ff3bed6c50fe5835ff980e5b32b053da3 --- /dev/null +++ b/mosesdecoder/moses2/PhraseBased/Search.cpp @@ -0,0 +1,115 @@ +/* + * Search.cpp + * + * Created on: 16 Nov 2015 + * Author: hieu + */ + +#include "Search.h" +#include "Manager.h" +#include "../System.h" +#include "../legacy/Bitmap.h" +#include "../legacy/Range.h" + +namespace Moses2 +{ + +Search::Search(Manager &mgr) : + mgr(mgr) +{ + // TODO Auto-generated constructor stub + +} + +Search::~Search() +{ + // TODO Auto-generated destructor stub +} + +bool Search::CanExtend(const Bitmap &hypoBitmap, size_t hypoRangeEndPos, + const Range &pathRange) +{ + const size_t hypoFirstGapPos = hypoBitmap.GetFirstGapPos(); + + //cerr << "DOING " << hypoBitmap << " [" << hypoRange.GetStartPos() << " " << hypoRange.GetEndPos() << "]" + // " [" << pathRange.GetStartPos() << " " << pathRange.GetEndPos() << "]"; + + if (hypoBitmap.Overlap(pathRange)) { + //cerr << " NO" << endl; + return false; + } + + if (mgr.system.options.reordering.max_distortion == -1) { + return true; + } + + if (mgr.system.options.reordering.max_distortion >= 0) { + // distortion limit + int distortion = ComputeDistortionDistance(hypoRangeEndPos, + pathRange.GetStartPos()); + if (distortion > mgr.system.options.reordering.max_distortion) { + //cerr << " NO" << endl; + return false; + } + } + + // first question: is there a path from the closest translated word to the left + // of the hypothesized extension to the start of the hypothesized extension? + // long version: + // - is there anything to our left? + // - is it farther left than where we're starting anyway? + // - can we get to it? 
+ + // closestLeft is exclusive: a value of 3 means 2 is covered, our + // arc is currently ENDING at 3 and can start at 3 implicitly + + // TODO is this relevant? only for lattice input? + + // ask second question here: we already know we can get to our + // starting point from the closest thing to the left. We now ask the + // follow up: can we get from our end to the closest thing on the + // right? + // + // long version: is anything to our right? is it farther + // right than our (inclusive) end? can our end reach it? + bool isLeftMostEdge = (hypoFirstGapPos == pathRange.GetStartPos()); + + size_t closestRight = hypoBitmap.GetEdgeToTheRightOf(pathRange.GetEndPos()); + /* + if (isWordLattice) { + if (closestRight != endPos + && ((closestRight + 1) < sourceSize) + && !m_source.CanIGetFromAToB(endPos + 1, closestRight + 1)) { + continue; + } + } + */ + + if (isLeftMostEdge) { + // any length extension is okay if starting at left-most edge + + } else { // starting somewhere other than left-most edge, use caution + // the basic idea is this: we would like to translate a phrase + // starting from a position further right than the left-most + // open gap. The distortion penalty for the following phrase + // will be computed relative to the ending position of the + // current extension, so we ask now what its maximum value will + // be (which will always be the value of the hypothesis starting + // at the left-most edge). If this value is less than the + // distortion limit, we don't allow this extension to be made. 
+ Range bestNextExtension(hypoFirstGapPos, hypoFirstGapPos); + + if (ComputeDistortionDistance(pathRange.GetEndPos(), + bestNextExtension.GetStartPos()) > mgr.system.options.reordering.max_distortion) { + //cerr << " NO" << endl; + return false; + } + + // everything is fine, we're good to go + } + + return true; +} + +} + diff --git a/mosesdecoder/moses2/PhraseBased/Search.h b/mosesdecoder/moses2/PhraseBased/Search.h new file mode 100644 index 0000000000000000000000000000000000000000..c908566769cd3cd01968ff1ce6928c5bf8a1e4e5 --- /dev/null +++ b/mosesdecoder/moses2/PhraseBased/Search.h @@ -0,0 +1,58 @@ +/* + * Search.h + * + * Created on: 16 Nov 2015 + * Author: hieu + */ + +#pragma once + +#include +#include "../legacy/Util2.h" + +namespace Moses2 +{ + +class Manager; +class Stack; +class Hypothesis; +class Bitmap; +class Range; +class TrellisPath; + +template +class TrellisPaths; + +class Search +{ +public: + Search(Manager &mgr); + virtual ~Search(); + + virtual void Decode() = 0; + virtual const Hypothesis *GetBestHypo() const = 0; + + virtual void AddInitialTrellisPaths(TrellisPaths &paths) const = 0; + +protected: + Manager &mgr; + //ArcLists m_arcLists; + + bool CanExtend(const Bitmap &hypoBitmap, size_t hypoRangeEndPos, + const Range &pathRange); + + inline int ComputeDistortionDistance(size_t prevEndPos, + size_t currStartPos) const { + int dist = 0; + if (prevEndPos == NOT_FOUND) { + dist = currStartPos; + } else { + dist = (int)prevEndPos - (int)currStartPos + 1; + } + return abs(dist); + } + +}; + +} + diff --git a/mosesdecoder/moses2/PhraseBased/Sentence.cpp b/mosesdecoder/moses2/PhraseBased/Sentence.cpp new file mode 100644 index 0000000000000000000000000000000000000000..173f00419f10dc8d4656b027a2128e27118418ca --- /dev/null +++ b/mosesdecoder/moses2/PhraseBased/Sentence.cpp @@ -0,0 +1,173 @@ +/* + * Sentence.cpp + * + * Created on: 14 Dec 2015 + * Author: hieu + */ +#include +#include +#include "Sentence.h" +#include "../System.h" +#include 
"../parameters/AllOptions.h" +#include "../legacy/Util2.h" + +using namespace std; + +namespace Moses2 +{ + +Sentence *Sentence::CreateFromString(MemPool &pool, FactorCollection &vocab, + const System &system, const std::string &str) +{ + Sentence *ret; + + if (system.options.input.xml_policy) { + // xml + ret = CreateFromStringXML(pool, vocab, system, str); + } else { + // no xml + //cerr << "PB Sentence" << endl; + std::vector toks = Tokenize(str); + + size_t size = toks.size(); + ret = new (pool.Allocate()) Sentence(pool, size); + ret->PhraseImplTemplate::CreateFromString(vocab, system, toks, false); + } + + //cerr << "REORDERING CONSTRAINTS:" << ret->GetReorderingConstraint() << endl; + //cerr << "ret=" << ret->Debug(system) << endl; + + return ret; +} + +Sentence *Sentence::CreateFromStringXML(MemPool &pool, FactorCollection &vocab, + const System &system, const std::string &str) +{ + Sentence *ret; + + vector xmlOptions; + pugi::xml_document doc; + + string str2 = "" + str + ""; + pugi::xml_parse_result result = doc.load(str2.c_str(), + pugi::parse_cdata | pugi::parse_wconv_attribute | pugi::parse_eol | pugi::parse_comments); + pugi::xml_node topNode = doc.child("xml"); + + std::vector toks; + XMLParse(pool, system, 0, topNode, toks, xmlOptions); + + // debug + /* + cerr << "xmloptions:" << endl; + for (size_t i = 0; i < xmlOptions.size(); ++i) { + cerr << xmlOptions[i]->Debug(system) << endl; + } + */ + + // create words + size_t size = toks.size(); + ret = new (pool.Allocate()) Sentence(pool, size); + ret->PhraseImplTemplate::CreateFromString(vocab, system, toks, false); + + // xml + ret->Init(system, size, system.options.reordering.max_distortion); + + ReorderingConstraint &reorderingConstraint = ret->GetReorderingConstraint(); + + // set reordering walls, if "-monotone-at-punction" is set + if (system.options.reordering.monotone_at_punct && ret->GetSize()) { + reorderingConstraint.SetMonotoneAtPunctuation(*ret); + } + + // set walls obtained from xml + 
for(size_t i=0; iGetNodeName(), "wall") == 0) { + if (xmlOption->startPos) { + UTIL_THROW_IF2(xmlOption->startPos > ret->GetSize(), "wall is beyond the sentence"); // no buggy walls, please + reorderingConstraint.SetWall(xmlOption->startPos - 1, true); + } + } else if (strcmp(xmlOption->GetNodeName(), "zone") == 0) { + reorderingConstraint.SetZone( xmlOption->startPos, xmlOption->startPos + xmlOption->phraseSize -1 ); + } else if (strcmp(xmlOption->GetNodeName(), "ne") == 0) { + FactorType placeholderFactor = system.options.input.placeholder_factor; + UTIL_THROW_IF2(placeholderFactor == NOT_FOUND, + "Placeholder XML in input. Must have argument -placeholder-factor [NUM]"); + UTIL_THROW_IF2(xmlOption->phraseSize != 1, + "Placeholder must only cover 1 word"); + + const Factor *factor = vocab.AddFactor(xmlOption->GetEntity(), system, false); + (*ret)[xmlOption->startPos][placeholderFactor] = factor; + } else { + // default - forced translation. Add to class variable + ret->AddXMLOption(system, xmlOption); + } + } + reorderingConstraint.FinalizeWalls(); + + return ret; +} + +void Sentence::XMLParse( + MemPool &pool, + const System &system, + size_t depth, + const pugi::xml_node &parentNode, + std::vector &toks, + vector &xmlOptions) +{ + // pugixml + for (pugi::xml_node childNode = parentNode.first_child(); childNode; childNode = childNode.next_sibling()) { + string nodeName = childNode.name(); + //cerr << depth << " nodeName=" << nodeName << endl; + + int startPos = toks.size(); + + string value = childNode.value(); + if (!value.empty()) { + //cerr << depth << "childNode text=" << value << endl; + std::vector subPhraseToks = Tokenize(value); + for (size_t i = 0; i < subPhraseToks.size(); ++i) { + toks.push_back(subPhraseToks[i]); + } + } + + if (!nodeName.empty()) { + XMLOption *xmlOption = new (pool.Allocate()) XMLOption(pool, nodeName, startPos); + + pugi::xml_attribute attr; + attr = childNode.attribute("translation"); + if (!attr.empty()) { + 
xmlOption->SetTranslation(pool, attr.as_string()); + } + + attr = childNode.attribute("entity"); + if (!attr.empty()) { + xmlOption->SetEntity(pool, attr.as_string()); + } + + attr = childNode.attribute("prob"); + if (!attr.empty()) { + xmlOption->prob = attr.as_float(); + } + + xmlOptions.push_back(xmlOption); + + // recursively call this function. For proper recursive trees + XMLParse(pool, system, depth + 1, childNode, toks, xmlOptions); + + size_t endPos = toks.size(); + xmlOption->phraseSize = endPos - startPos; + + /* + cerr << "xmlOptions="; + xmlOption->Debug(cerr, system); + cerr << endl; + */ + } + + } +} + +} /* namespace Moses2 */ + diff --git a/mosesdecoder/moses2/PhraseBased/Sentence.h b/mosesdecoder/moses2/PhraseBased/Sentence.h new file mode 100644 index 0000000000000000000000000000000000000000..ff7c521382a9de0c0f7f87e7425645f496a8284d --- /dev/null +++ b/mosesdecoder/moses2/PhraseBased/Sentence.h @@ -0,0 +1,52 @@ +/* + * Sentence.h + * + * Created on: 14 Dec 2015 + * Author: hieu + */ +#pragma once + +#include +#include +#include "PhraseImpl.h" +#include "../InputType.h" +#include "../MemPool.h" +#include "../pugixml.hpp" +#include "../legacy/Util2.h" + +namespace Moses2 +{ +class FactorCollection; +class System; + +class Sentence: public InputType, public PhraseImpl +{ +public: + + static Sentence *CreateFromString(MemPool &pool, FactorCollection &vocab, + const System &system, const std::string &str); + + Sentence(MemPool &pool, size_t size) + :InputType(pool) + ,PhraseImpl(pool, size) + {} + + virtual ~Sentence() + {} + +protected: + static Sentence *CreateFromStringXML(MemPool &pool, FactorCollection &vocab, + const System &system, const std::string &str); + + static void XMLParse( + MemPool &pool, + const System &system, + size_t depth, + const pugi::xml_node &parentNode, + std::vector &toks, + std::vector &xmlOptions); + +}; + +} /* namespace Moses2 */ + diff --git a/mosesdecoder/moses2/PhraseBased/SentenceWithCandidates.cpp 
b/mosesdecoder/moses2/PhraseBased/SentenceWithCandidates.cpp new file mode 100644 index 0000000000000000000000000000000000000000..b13a648ee6d7960bed6c741df3aeb7cf6072be44 --- /dev/null +++ b/mosesdecoder/moses2/PhraseBased/SentenceWithCandidates.cpp @@ -0,0 +1,97 @@ +/* + * SentenceWithCandidates.cpp + * + * Created on: 14 Dec 2015 + * Author: hieu + */ +#include +#include +#include + +#include "SentenceWithCandidates.h" +#include "../System.h" +#include "../parameters/AllOptions.h" +#include "../legacy/Util2.h" +#include + +using namespace std; +using namespace boost; + +namespace Moses2 +{ + +const string SentenceWithCandidates::INPUT_PART_DELIM = "@@@"; +const string SentenceWithCandidates::PT_LINE_DELIM = "$$$"; + +SentenceWithCandidates *SentenceWithCandidates::CreateFromString(MemPool &pool, FactorCollection &vocab, + const System &system, const std::string &str) +{ + SentenceWithCandidates *ret; + + // Break input into two parts: the parts are delimited by + typedef split_iterator string_split_iterator; + vector input_parts; + for(string_split_iterator It= make_split_iterator(str, first_finder(SentenceWithCandidates::INPUT_PART_DELIM, is_iequal())); + It!=string_split_iterator(); + ++It) + { + input_parts.push_back(copy_range(*It)); + } + + //cerr << "Number of subparts: " << input_parts.size() << endl; + + if (input_parts.size() ==2 ) { + //cerr << "correct number of parts" << endl ; + } else { + // TODO: how to handle wrong input format + cerr << "INCORRECT number of parts" << endl ; + exit(1); + } + + trim(input_parts[0]); + trim(input_parts[1]); + //cerr << "Input String: " << input_parts[0] << endl ; + //cerr << "Phrase Table: " << input_parts[1] << endl ; + + ///// Process the text part of the input + const string partstr = input_parts[0]; + + // no xml + //cerr << "PB SentenceWithCandidates" << endl; + std::vector toks = Tokenize(partstr); + + size_t size = toks.size(); + ret = new (pool.Allocate()) SentenceWithCandidates(pool, size); + 
ret->PhraseImplTemplate::CreateFromString(vocab, system, toks, false); + + //cerr << "REORDERING CONSTRAINTS:" << ret->GetReorderingConstraint() << endl; + //cerr << "ret=" << ret->Debug(system) << endl; + + + //// Parse the phrase table of the input + ret->m_phraseTableString = replace_all_copy(input_parts[1],PT_LINE_DELIM,"\n"); + // ret->m_phraseTableString="constant phrase table"; +// cerr << "Extracted Phrase Table String: " << ret->m_phraseTableString << endl; + //cerr << "Extracted Phrase Table String: " << ret->getPhraseTableString() << endl; + + return ret; +} + +SentenceWithCandidates::SentenceWithCandidates(MemPool &pool, size_t size) +:Sentence(pool, size) +{ + //cerr << "SentenceWithCandidates::SentenceWithCandidates" << endl; +} + +SentenceWithCandidates::~SentenceWithCandidates() +{ + //cerr << "SentenceWithCandidates::~SentenceWithCandidates" << endl; +} + +std::string SentenceWithCandidates::Debug(const System &system) const +{ + return "SentenceWithCandidates::Debug"; +} + +} /* namespace Moses2 */ + diff --git a/mosesdecoder/moses2/PhraseBased/SentenceWithCandidates.h b/mosesdecoder/moses2/PhraseBased/SentenceWithCandidates.h new file mode 100644 index 0000000000000000000000000000000000000000..fb550d57791f17a2e78e6e336ea2de158488837f --- /dev/null +++ b/mosesdecoder/moses2/PhraseBased/SentenceWithCandidates.h @@ -0,0 +1,46 @@ +/* + * SentenceWithCandidates.h + * + * Created on: 14 Dec 2015 + * Author: hieu + */ +#pragma once + +#include +#include +#include "PhraseImpl.h" +#include "Sentence.h" +#include "../MemPool.h" +#include "../pugixml.hpp" +#include "../legacy/Util2.h" + +namespace Moses2 +{ +class FactorCollection; +class System; + +class SentenceWithCandidates: public Sentence +{ +public: + + static const std::string INPUT_PART_DELIM; + static const std::string PT_LINE_DELIM; + + static SentenceWithCandidates *CreateFromString(MemPool &pool, FactorCollection &vocab, + const System &system, const std::string &str); + + 
SentenceWithCandidates(MemPool &pool, size_t size); + virtual ~SentenceWithCandidates(); + + virtual std::string Debug(const System &system) const; + std::string virtual getPhraseTableString() const{ + return m_phraseTableString; + } + +private: + std::string m_phraseTableString; + +}; + +} /* namespace Moses2 */ + diff --git a/mosesdecoder/moses2/PhraseBased/TargetPhraseImpl.cpp b/mosesdecoder/moses2/PhraseBased/TargetPhraseImpl.cpp new file mode 100644 index 0000000000000000000000000000000000000000..d9bc766d9c3bb6c9976437a0b343b3b19c70f4d4 --- /dev/null +++ b/mosesdecoder/moses2/PhraseBased/TargetPhraseImpl.cpp @@ -0,0 +1,52 @@ +/* + * TargetPhraseImpl.cpp + * + * Created on: 23 Oct 2015 + * Author: hieu + */ + +#include +#include +#include "TargetPhraseImpl.h" +#include "../Scores.h" +#include "../System.h" +#include "../MemPool.h" +#include "Manager.h" + +using namespace std; + +namespace Moses2 +{ + +TargetPhraseImpl *TargetPhraseImpl::CreateFromString(MemPool &pool, + const PhraseTable &pt, const System &system, const std::string &str) +{ + FactorCollection &vocab = system.GetVocab(); + + vector toks = Tokenize(str); + size_t size = toks.size(); + TargetPhraseImpl *ret = + new (pool.Allocate()) TargetPhraseImpl(pool, pt, system, + size); + ret->PhraseImplTemplate::CreateFromString(vocab, system, toks); + + return ret; +} + +TargetPhraseImpl::TargetPhraseImpl(MemPool &pool, const PhraseTable &pt, + const System &system, size_t size) + :Moses2::TargetPhrase(pool, pt, system, size) +{ + m_scores = new (pool.Allocate()) Scores(system, pool, + system.featureFunctions.GetNumScores()); + + size_t numWithPtData = system.featureFunctions.GetWithPhraseTableInd().size(); + ffData = new (pool.Allocate(numWithPtData)) void *[numWithPtData]; +} + +TargetPhraseImpl::~TargetPhraseImpl() +{ + // TODO Auto-generated destructor stub +} + +} diff --git a/mosesdecoder/moses2/PhraseBased/TargetPhraseImpl.h b/mosesdecoder/moses2/PhraseBased/TargetPhraseImpl.h new file mode 100644 
index 0000000000000000000000000000000000000000..026414b5dc930764cd0e32fabda308da1985032d --- /dev/null +++ b/mosesdecoder/moses2/PhraseBased/TargetPhraseImpl.h @@ -0,0 +1,57 @@ +/* + * TargetPhraseImpl.h + * + * Created on: 23 Oct 2015 + * Author: hieu + */ + +#pragma once + +#include +#include "../Phrase.h" +#include "../PhraseImplTemplate.h" +#include "../TargetPhrase.h" +#include "../MemPool.h" +#include "../Word.h" +#include "../SubPhrase.h" + +namespace Moses2 +{ + +class Scores; +class Manager; +class System; +class PhraseTable; + +class TargetPhraseImpl: public TargetPhrase +{ +public: + typedef TargetPhrase Parent; + + static TargetPhraseImpl *CreateFromString(MemPool &pool, + const PhraseTable &pt, const System &system, const std::string &str); + TargetPhraseImpl(MemPool &pool, const PhraseTable &pt, const System &system, + size_t size); + //TargetPhraseImpl(MemPool &pool, const System &system, const TargetPhraseImpl ©); + + virtual ~TargetPhraseImpl(); + + SCORE GetFutureScore() const { + return m_scores->GetTotalScore() + m_estimatedScore; + } + + void SetEstimatedScore(const SCORE &value) { + m_estimatedScore = value; + } + + virtual SCORE GetScoreForPruning() const { + return GetFutureScore(); + } + +protected: + SCORE m_estimatedScore; + +}; + +} + diff --git a/mosesdecoder/moses2/PhraseBased/TargetPhrases.cpp b/mosesdecoder/moses2/PhraseBased/TargetPhrases.cpp new file mode 100644 index 0000000000000000000000000000000000000000..5533127534576ca175632691dc15b84c8f9d1111 --- /dev/null +++ b/mosesdecoder/moses2/PhraseBased/TargetPhrases.cpp @@ -0,0 +1,78 @@ +/* + * TargetPhrases.cpp + * + * Created on: 23 Oct 2015 + * Author: hieu + */ +#include +#include +#include "TargetPhrases.h" +#include "TargetPhraseImpl.h" +#include "../Phrase.h" +#include "../TargetPhrase.h" + +using namespace std; + +namespace Moses2 +{ + +TargetPhrases::TargetPhrases(MemPool &pool, size_t size) : + m_coll(pool, size), m_currInd(0) +{ +} + +/* + 
TargetPhrases::TargetPhrases(MemPool &pool, const System &system, const TargetPhrases ©) + :m_coll(pool, copy.m_coll.size()) + { + for (size_t i = 0; i < copy.m_coll.size(); ++i) { + const TargetPhrase *tpOrig = copy.m_coll[i]; + assert(tpOrig); + const TargetPhrase *tpClone = new (pool.Allocate()) TargetPhrase(pool, system, *tpOrig); + m_coll[i] = tpClone; + } + } + */ + +TargetPhrases::~TargetPhrases() +{ + // TODO Auto-generated destructor stub +} + +std::string TargetPhrases::Debug(const System &system) const +{ + stringstream out; + BOOST_FOREACH(const TargetPhraseImpl *tp, *this) { + out << tp->Debug(system); + out << endl; + } + return out.str(); +} + +void TargetPhrases::SortAndPrune(size_t tableLimit) +{ + iterator iterMiddle; + iterMiddle = + (tableLimit == 0 || m_coll.size() < tableLimit) ? + m_coll.end() : m_coll.begin() + tableLimit; + + std::partial_sort(m_coll.begin(), iterMiddle, m_coll.end(), + CompareScoreForPruning()); + + if (tableLimit && m_coll.size() > tableLimit) { + m_coll.resize(tableLimit); + } + + //cerr << "TargetPhrases=" << GetSize() << endl; +} + +/* + const TargetPhrases *TargetPhrases::Clone(MemPool &pool, const System &system) const + { + const TargetPhrases *ret = new (pool.Allocate()) TargetPhrases(pool, system, *this); + return ret; + } + */ + +} + diff --git a/mosesdecoder/moses2/PhraseBased/TargetPhrases.h b/mosesdecoder/moses2/PhraseBased/TargetPhrases.h new file mode 100644 index 0000000000000000000000000000000000000000..79595ab40224a98927e2d2cd04104d0ef0f2adec --- /dev/null +++ b/mosesdecoder/moses2/PhraseBased/TargetPhrases.h @@ -0,0 +1,61 @@ +/* + * TargetPhrases.h + * + * Created on: 23 Oct 2015 + * Author: hieu + */ + +#pragma once +#include +#include "../Array.h" + +namespace Moses2 +{ + +class TargetPhraseImpl; + +class Word; +class System; + +class TargetPhrases +{ + typedef TargetPhraseImpl TP; + typedef Array Coll; +public: + typedef Coll::iterator iterator; + typedef Coll::const_iterator const_iterator; + //! 
iterators + const_iterator begin() const { + return m_coll.begin(); + } + const_iterator end() const { + return m_coll.end(); + } + + TargetPhrases(MemPool &pool, size_t size); + //TargetPhrases(MemPool &pool, const System &system, const TargetPhrases ©); + virtual ~TargetPhrases(); + + void AddTargetPhrase(const TP &targetPhrase) { + m_coll[m_currInd++] = &targetPhrase; + } + + size_t GetSize() const { + return m_coll.size(); + } + + const TP& operator[](size_t ind) const { + return *m_coll[ind]; + } + + void SortAndPrune(size_t tableLimit); + + std::string Debug(const System &system) const; + +protected: + Coll m_coll; + size_t m_currInd; +}; + +} + diff --git a/mosesdecoder/moses2/PhraseBased/TrellisPath.cpp b/mosesdecoder/moses2/PhraseBased/TrellisPath.cpp new file mode 100644 index 0000000000000000000000000000000000000000..5a1132c600a514f867f4419f82d504700dbbcd05 --- /dev/null +++ b/mosesdecoder/moses2/PhraseBased/TrellisPath.cpp @@ -0,0 +1,175 @@ +/* + * TrellisPath.cpp + * + * Created on: 16 Mar 2016 + * Author: hieu + */ +#include +#include +#include "TrellisPath.h" +#include "Hypothesis.h" +#include "InputPath.h" +#include "../TrellisPaths.h" +#include "../System.h" +#include "../SubPhrase.h" + +using namespace std; + +namespace Moses2 +{ + +std::string TrellisNode::Debug(const System &system) const +{ + stringstream out; + out << "arcList=" << arcList->size() << " " << ind; + return out.str(); +} + +///////////////////////////////////////////////////////////////////////////////// +TrellisPath::TrellisPath(const Hypothesis *hypo, const ArcLists &arcLists) : + prevEdgeChanged(-1) +{ + AddNodes(hypo, arcLists); + m_scores = &hypo->GetScores(); +} + +TrellisPath::TrellisPath(const TrellisPath &origPath, size_t edgeIndex, + const TrellisNode &newNode, const ArcLists &arcLists, MemPool &pool, + const System &system) : + prevEdgeChanged(edgeIndex) +{ + nodes.reserve(origPath.nodes.size()); + for (size_t currEdge = 0; currEdge < edgeIndex; currEdge++) { + // copy 
path from parent + nodes.push_back(origPath.nodes[currEdge]); + } + + // 1 deviation + nodes.push_back(newNode); + + // rest of path comes from following best path backwards + const Hypothesis *arc = static_cast(newNode.GetHypo()); + + const Hypothesis *prevHypo = arc->GetPrevHypo(); + while (prevHypo != NULL) { + const ArcList &arcList = arcLists.GetArcList(prevHypo); + TrellisNode node(arcList, 0); + nodes.push_back(node); + + prevHypo = prevHypo->GetPrevHypo(); + } + + const TrellisNode &origNode = origPath.nodes[edgeIndex]; + const HypothesisBase *origHypo = origNode.GetHypo(); + const HypothesisBase *newHypo = newNode.GetHypo(); + + CalcScores(origPath.GetScores(), origHypo->GetScores(), newHypo->GetScores(), + pool, system); +} + +TrellisPath::~TrellisPath() +{ + // TODO Auto-generated destructor stub +} + +SCORE TrellisPath::GetFutureScore() const +{ + return m_scores->GetTotalScore(); +} + +std::string TrellisPath::Debug(const System &system) const +{ + stringstream out; + + out << OutputTargetPhrase(system); + out << "||| "; + + out << GetScores().Debug(system); + out << "||| "; + + out << GetScores().GetTotalScore(); + + return out.str(); +} + +void TrellisPath::OutputToStream(std::ostream &out, const System &system) const +{ + out << OutputTargetPhrase(system); + out << "||| "; + + GetScores().OutputBreakdown(out, system); + out << "||| "; + + out << GetScores().GetTotalScore(); +} + +std::string TrellisPath::OutputTargetPhrase(const System &system) const +{ + std::stringstream out; + for (int i = nodes.size() - 2; i >= 0; --i) { + const TrellisNode &node = nodes[i]; + + const Hypothesis *hypo = static_cast(node.GetHypo()); + const TargetPhrase &tp = hypo->GetTargetPhrase(); + + const InputPath &path = static_cast(hypo->GetInputPath()); + const SubPhrase &subPhrase = path.subPhrase; + + tp.OutputToStream(system, subPhrase, out); + } + return out.str(); +} + +void TrellisPath::CreateDeviantPaths(TrellisPaths &paths, + const ArcLists &arcLists, MemPool 
&pool, const System &system) const +{ + const size_t sizePath = nodes.size(); + + //cerr << "prevEdgeChanged=" << prevEdgeChanged << endl; + for (size_t currEdge = prevEdgeChanged + 1; currEdge < sizePath; currEdge++) { + TrellisNode newNode = nodes[currEdge]; + assert(newNode.ind == 0); + const ArcList &arcList = *newNode.arcList; + + //cerr << "arcList=" << arcList.size() << endl; + for (size_t i = 1; i < arcList.size(); ++i) { + //cerr << "i=" << i << endl; + newNode.ind = i; + + TrellisPath *deviantPath = new TrellisPath(*this, currEdge, newNode, + arcLists, pool, system); + //cerr << "deviantPath=" << deviantPath << endl; + paths.Add(deviantPath); + } + } +} + +void TrellisPath::CalcScores(const Scores &origScores, + const Scores &origHypoScores, const Scores &newHypoScores, MemPool &pool, + const System &system) +{ + Scores *scores = new (pool.Allocate()) Scores(system, pool, + system.featureFunctions.GetNumScores(), origScores); + scores->PlusEquals(system, newHypoScores); + scores->MinusEquals(system, origHypoScores); + + m_scores = scores; +} + +void TrellisPath::AddNodes(const Hypothesis *hypo, const ArcLists &arcLists) +{ + if (hypo) { + // add this hypo + //cerr << "hypo=" << hypo << " " << flush; + //cerr << *hypo << endl; + const ArcList &list = arcLists.GetArcList(hypo); + TrellisNode node(list, 0); + nodes.push_back(node); + + // add prev hypos + const Hypothesis *prev = hypo->GetPrevHypo(); + AddNodes(prev, arcLists); + } +} + +} /* namespace Moses2 */ diff --git a/mosesdecoder/moses2/PhraseBased/TrellisPath.h b/mosesdecoder/moses2/PhraseBased/TrellisPath.h new file mode 100644 index 0000000000000000000000000000000000000000..6852b43ba9ebb4b6dbda300fdcf001a4b4c5f00d --- /dev/null +++ b/mosesdecoder/moses2/PhraseBased/TrellisPath.h @@ -0,0 +1,84 @@ +/* + * TrellisPath.h + * + * Created on: 16 Mar 2016 + * Author: hieu + */ +#pragma once +#include +#include "../TypeDef.h" +#include "../ArcLists.h" + +namespace Moses2 +{ + +class Scores; +class 
MemPool; +class Hypothesis; +class System; + +template +class TrellisPaths; + +class TrellisNode +{ +public: + const ArcList *arcList; + size_t ind; + + TrellisNode(const ArcList &varcList, size_t vind) : + arcList(&varcList), ind(vind) { + } + + const HypothesisBase *GetHypo() const { + return (*arcList)[ind]; + } + + std::string Debug(const System &system) const; + +}; + +class TrellisPath +{ +public: + std::vector nodes; + int prevEdgeChanged; + + /**< the last node that was wiggled to create this path + , or NOT_FOUND if this path is the best trans so consist of only hypos + */ + TrellisPath(const Hypothesis *hypo, const ArcLists &arcLists); + + /** create path from another path, deviate at edgeIndex by using arc instead, + * which may change other hypo back from there + */ + TrellisPath(const TrellisPath &origPath, size_t edgeIndex, + const TrellisNode &newNode, const ArcLists &arcLists, MemPool &pool, + const System &system); + + virtual ~TrellisPath(); + + const Scores &GetScores() const { + return *m_scores; + } + SCORE GetFutureScore() const; + + std::string Debug(const System &system) const; + + void OutputToStream(std::ostream &out, const System &system) const; + std::string OutputTargetPhrase(const System &system) const; + + //! create a set of next best paths by wiggling 1 of the node at a time. 
+ void CreateDeviantPaths(TrellisPaths &paths, const ArcLists &arcLists, + MemPool &pool, const System &system) const; + +protected: + const Scores *m_scores; + + void AddNodes(const Hypothesis *hypo, const ArcLists &arcLists); + void CalcScores(const Scores &origScores, const Scores &origHypoScores, + const Scores &newHypoScores, MemPool &pool, const System &system); +}; + +} /* namespace Moses2 */ + diff --git a/mosesdecoder/moses2/PhraseImplTemplate.h b/mosesdecoder/moses2/PhraseImplTemplate.h new file mode 100644 index 0000000000000000000000000000000000000000..a3ef32a7f78c09609beb29d63f42c4fc2c3d440e --- /dev/null +++ b/mosesdecoder/moses2/PhraseImplTemplate.h @@ -0,0 +1,81 @@ +/* + * PhraseImplTemplate.h + * + * Created on: 22 Feb 2016 + * Author: hieu + */ + +#pragma once + +#include +#include +#include "Phrase.h" +#include "SubPhrase.h" +#include "legacy/Util2.h" + +namespace Moses2 +{ + +template +class PhraseImplTemplate : public Phrase +{ +public: + PhraseImplTemplate(MemPool &pool, size_t size) : + m_size(size) { + m_words = new (pool.Allocate(size)) WORD[size]; + + } + + PhraseImplTemplate(MemPool &pool, const PhraseImplTemplate ©) : + m_size(copy.GetSize()) { + m_words = new (pool.Allocate(m_size)) WORD[m_size]; + for (size_t i = 0; i < m_size; ++i) { + const WORD &word = copy[i]; + (*this)[i] = word; + } + } + + virtual ~PhraseImplTemplate() { + } + + size_t GetSize() const { + return m_size; + } + + WORD& operator[](size_t pos) { + return m_words[pos]; + } + + const WORD& operator[](size_t pos) const { + return m_words[pos]; + } + + SubPhrase GetSubPhrase(size_t start, size_t size) const { + SubPhrase ret(*this, start, size); + return ret; + } + +protected: + size_t m_size; + WORD *m_words; + + void CreateFromString(FactorCollection &vocab, const System &system, + const std::vector &toks, bool addBOSEOS = false) { + size_t startPos = 0; + if (addBOSEOS) { + startPos = 1; + + m_words[0].CreateFromString(vocab, system, ""); + 
/**
 * Keeps track of objects so that they can be handed out again instead of
 * being created anew.
 *
 *  - Keep()    registers a freshly created object (assumed to be in use).
 *  - Recycle() gives an object back so that a later Get() can return it.
 *  - Clear()   forgets the recycled objects and makes every object registered
 *              with Keep() available again.
 *  - Get()     returns an available object, or NULL when there is none.
 *
 * T is expected to be a pointer-like type, since Get() returns NULL when
 * nothing is available.
 */
template<typename T>
class Recycler
{
public:
  Recycler() :
    m_currInd(0) {
  }
  virtual ~Recycler() {
  }

  // Returns a recycled object if there is one, otherwise the next unused
  // object registered with Keep(), otherwise NULL.
  T Get() {
    if (!m_coll.empty()) {
      // Copy the element BEFORE pop_back(): pop_back() destroys it, so a
      // reference to back() must not be read afterwards.
      T obj = m_coll.back();
      m_coll.pop_back();
      return obj;
    } else if (m_currInd) {
      --m_currInd;
      return m_all[m_currInd];
    } else {
      return NULL;
    }
  }

  // Drops the recycled objects and marks everything in m_all as available.
  void Clear() {
    m_coll.clear();
    m_currInd = m_all.size();
  }

  // call this for new objects when u 1st create it. It is assumed the object will be used right away
  void Keep(const T& val) {
    m_all.push_back(val);
  }

  // call this for existing object to put back into queue for reuse
  void Recycle(const T& val) {
    m_coll.push_back(val);
  }

protected:
  // all objects we're looking after
  std::vector<T> m_all;

  // pointer to the object that's just been given out.
  // to give out another obj, must decrement THEN give out
  size_t m_currInd;

  // objects that have been give back to us
  std::deque<T> m_coll;
};
+ + return ret; +} + +std::string SymbolBind::Debug(const System &system) const +{ + stringstream out; + BOOST_FOREACH(const SymbolBindElement &ele, coll) { + out << ele.Debug(system) << " "; + } + return out.str(); +} +//////////////////////////////////////////////////////////////////////////// +ActiveChartEntry::ActiveChartEntry(MemPool &pool) + :m_symbolBind(pool) +{ +} + +//////////////////////////////////////////////////////////////////////////// +ActiveChart::ActiveChart(MemPool &pool) + :entries(pool) +{ +} + +ActiveChart::~ActiveChart() +{ + +} + +} +} + diff --git a/mosesdecoder/moses2/SCFG/ActiveChart.h b/mosesdecoder/moses2/SCFG/ActiveChart.h new file mode 100644 index 0000000000000000000000000000000000000000..baf3a09dd398449a07272b69d42ddb24e1f98b66 --- /dev/null +++ b/mosesdecoder/moses2/SCFG/ActiveChart.h @@ -0,0 +1,126 @@ +#pragma once +#include +#include +#include +#include "../legacy/Range.h" +#include "../HypothesisColl.h" + +namespace Moses2 +{ +class System; +class PhraseTable; + +namespace SCFG +{ +class InputPath; +class Word; + +//////////////////////////////////////////////////////////////////////////// +//! The range covered by each symbol in the source +//! 
Terminals will cover only 1 word, NT can cover multiple words +class SymbolBindElement +{ +public: + const SCFG::Word *word; // can be term or non-term + + const Moses2::Hypotheses *hypos; // NULL if terminal + + SymbolBindElement(); + SymbolBindElement(const Moses2::Range &range, const SCFG::Word &word, const Moses2::Hypotheses *hypos); + + const Range &GetRange() const { + return *m_range; + } + + bool operator==(const SymbolBindElement &compare) const { + bool ret = hypos == compare.hypos + && word == compare.word; + return ret; + } + + std::string Debug(const System &system) const; + +protected: + const Moses2::Range *m_range; + +}; + +size_t hash_value(const SymbolBindElement &obj); + +//////////////////////////////////////////////////////////////////////////// +class SymbolBind +{ +public: + typedef Vector Coll; + Coll coll; + size_t numNT; + + SymbolBind(MemPool &pool); + + SymbolBind(MemPool &pool, const SymbolBind ©) + :coll(copy.coll) + ,numNT(copy.numNT) + {} + + size_t GetSize() const { + return coll.size(); + } + + std::vector GetNTElements() const; + + void Add(const Range &range, const SCFG::Word &word, const Moses2::Hypotheses *hypos); + + bool operator==(const SymbolBind &compare) const { + return coll == compare.coll; + } + + std::string Debug(const System &system) const; + +}; + +inline size_t hash_value(const SymbolBind &obj) +{ + return boost::hash_value(obj.coll); +} + +//////////////////////////////////////////////////////////////////////////// +class ActiveChartEntry +{ +public: + ActiveChartEntry(MemPool &pool); + + ActiveChartEntry(MemPool &pool, const ActiveChartEntry &prevEntry) + :m_symbolBind(pool, prevEntry.GetSymbolBind()) { + //symbolBinds = new (pool.Allocate()) SymbolBind(pool, *prevEntry.symbolBinds); + } + + const SymbolBind &GetSymbolBind() const { + return m_symbolBind; + } + + virtual void AddSymbolBindElement( + const Range &range, + const SCFG::Word &word, + const Moses2::Hypotheses *hypos, + const PhraseTable &pt) { + 
m_symbolBind.Add(range, word, hypos); + } + +protected: + SymbolBind m_symbolBind; + +}; + +//////////////////////////////////////////////////////////////////////////// +class ActiveChart +{ +public: + ActiveChart(MemPool &pool); + ~ActiveChart(); + + Vector entries; +}; + +} +} + diff --git a/mosesdecoder/moses2/SCFG/Hypothesis.cpp b/mosesdecoder/moses2/SCFG/Hypothesis.cpp new file mode 100644 index 0000000000000000000000000000000000000000..c7ae8c798e4b59cbe2c2e9ac67e1bea0d009c9e3 --- /dev/null +++ b/mosesdecoder/moses2/SCFG/Hypothesis.cpp @@ -0,0 +1,178 @@ +#include +#include +#include "Hypothesis.h" +#include "Manager.h" +#include "ActiveChart.h" +#include "TargetPhraseImpl.h" +#include "Sentence.h" +#include "../System.h" +#include "../Scores.h" +#include "../InputPathBase.h" +#include "../FF/StatefulFeatureFunction.h" + +using namespace std; + +namespace Moses2 +{ +namespace SCFG +{ +Hypothesis *Hypothesis::Create(MemPool &pool, Manager &mgr) +{ + // ++g_numHypos; + Hypothesis *ret; + //ret = new (pool.Allocate()) Hypothesis(pool, mgr.system); + + Recycler &recycler = mgr.GetHypoRecycle(); + ret = static_cast(recycler.Get()); + if (ret) { + // got new hypo from recycler. 
Do nothing + } else { + ret = new (pool.Allocate()) Hypothesis(pool, mgr.system); + //cerr << "Hypothesis=" << sizeof(Hypothesis) << " " << ret << endl; + recycler.Keep(ret); + } + return ret; +} + +Hypothesis::Hypothesis(MemPool &pool, + const System &system) + :HypothesisBase(pool, system) + ,m_prevHypos(pool) +{ + +} + +void Hypothesis::Init(SCFG::Manager &mgr, + const SCFG::InputPath &path, + const SCFG::SymbolBind &symbolBind, + const SCFG::TargetPhraseImpl &tp, + const Vector &prevHyposIndices) +{ + m_mgr = &mgr; + m_targetPhrase = &tp; + m_path = &path; + m_symbolBind = &symbolBind; + + m_scores->Reset(mgr.system); + m_scores->PlusEquals(mgr.system, GetTargetPhrase().GetScores()); + + //cerr << "tp=" << tp << endl; + //cerr << "symbolBind=" << symbolBind << endl; + //cerr << endl; + m_prevHypos.resize(symbolBind.numNT); + + size_t currInd = 0; + for (size_t i = 0; i < symbolBind.coll.size(); ++i) { + const SymbolBindElement &ele = symbolBind.coll[i]; + //cerr << "ele=" << ele.word->isNonTerminal << " " << ele.hypos << endl; + + if (ele.hypos) { + const Hypotheses &sortedHypos = *ele.hypos; + + size_t prevHyposInd = prevHyposIndices[currInd]; + assert(prevHyposInd < sortedHypos.size()); + + const Hypothesis *prevHypo = static_cast(sortedHypos[prevHyposInd]); + m_prevHypos[currInd] = prevHypo; + + m_scores->PlusEquals(mgr.system, prevHypo->GetScores()); + + ++currInd; + } + } +} + +SCORE Hypothesis::GetFutureScore() const +{ + return GetScores().GetTotalScore(); +} + +void Hypothesis::EvaluateWhenApplied() +{ + const std::vector &sfffs = + GetManager().system.featureFunctions.GetStatefulFeatureFunctions(); + BOOST_FOREACH(const StatefulFeatureFunction *sfff, sfffs) { + EvaluateWhenApplied(*sfff); + } +//cerr << *this << endl; + +} + +void Hypothesis::EvaluateWhenApplied(const StatefulFeatureFunction &sfff) +{ + const SCFG::Manager &mgr = static_cast(GetManager()); + size_t statefulInd = sfff.GetStatefulInd(); + FFState *thisState = m_ffStates[statefulInd]; + 
sfff.EvaluateWhenApplied(mgr, *this, statefulInd, GetScores(), + *thisState); + +} + +void Hypothesis::OutputToStream(std::ostream &strm) const +{ + const SCFG::TargetPhraseImpl &tp = GetTargetPhrase(); + //cerr << "tp=" << tp.Debug(m_mgr->system) << endl; + + for (size_t targetPos = 0; targetPos < tp.GetSize(); ++targetPos) { + const SCFG::Word &word = tp[targetPos]; + //cerr << "word " << targetPos << "=" << word << endl; + if (word.isNonTerminal) { + //cerr << "is nt" << endl; + // non-term. fill out with prev hypo + size_t nonTermInd = tp.GetAlignNonTerm().GetNonTermIndexMap()[targetPos]; + const Hypothesis *prevHypo = m_prevHypos[nonTermInd]; + prevHypo->OutputToStream(strm); + } else { + word.OutputToStream(*m_mgr, targetPos, *this, strm); + strm << " "; + } + + } +} + +std::string Hypothesis::Debug(const System &system) const +{ + stringstream out; + out << this << flush; + + out << " RANGE:"; + out << m_path->range << " "; + out << m_symbolBind->Debug(system) << " "; + + // score + out << " SCORE:" << GetScores().Debug(GetManager().system) << flush; + + out << m_targetPhrase->Debug(GetManager().system); + + out << "PREV:"; + for (size_t i = 0; i < m_prevHypos.size(); ++i) { + const Hypothesis *prevHypo = m_prevHypos[i]; + out << prevHypo << prevHypo->GetInputPath().range << "(" << prevHypo->GetFutureScore() << ") "; + } + out << endl; + + /* + // recursive + for (size_t i = 0; i < m_prevHypos.size(); ++i) { + const Hypothesis *prevHypo = m_prevHypos[i]; + out << prevHypo->Debug(GetManager().system) << " "; + } + */ + + return out.str(); +} + +void Hypothesis::OutputTransOpt(std::ostream &out) const +{ + out << GetInputPath().range << " " + << "score=" << GetScores().GetTotalScore() << " " + << GetTargetPhrase().Debug(m_mgr->system) << endl; + + BOOST_FOREACH(const Hypothesis *prevHypo, m_prevHypos) { + prevHypo->OutputTransOpt(out); + } +} + +} // namespaces +} + diff --git a/mosesdecoder/moses2/SCFG/Hypothesis.h b/mosesdecoder/moses2/SCFG/Hypothesis.h new 
file mode 100644 index 0000000000000000000000000000000000000000..fbbd663aaa78fa2f99d535ffc60455bd5858c309 --- /dev/null +++ b/mosesdecoder/moses2/SCFG/Hypothesis.h @@ -0,0 +1,76 @@ +#pragma once +#include +#include "InputPath.h" +#include "../HypothesisBase.h" +#include "../MemPool.h" +#include "../Vector.h" + +namespace Moses2 +{ +class InputPathBase; +class StatefulFeatureFunction; + +namespace SCFG +{ +class TargetPhraseImpl; +class Manager; +class SymbolBind; +class InputPath; + +class Hypothesis: public HypothesisBase +{ +public: + static Hypothesis *Create(MemPool &pool, Manager &mgr); + + void Init(SCFG::Manager &mgr, + const SCFG::InputPath &path, + const SCFG::SymbolBind &symbolBind, + const SCFG::TargetPhraseImpl &tp, + const Vector &prevHyposIndices); + + virtual SCORE GetFutureScore() const; + virtual void EvaluateWhenApplied(); + + const SCFG::TargetPhraseImpl &GetTargetPhrase() const { + return *m_targetPhrase; + } + + const SCFG::InputPath &GetInputPath() const { + return *m_path; + } + + const SCFG::SymbolBind &GetSymbolBind() const { + return *m_symbolBind; + } + + const Vector &GetPrevHypos() const { + return m_prevHypos; + } + + //! get a particular previous hypos + const Hypothesis* GetPrevHypo(size_t ind) const { + return m_prevHypos[ind]; + } + + void OutputToStream(std::ostream &strm) const; + void OutputTransOpt(std::ostream &strm) const; + + std::string Debug(const System &system) const; + +protected: + const SCFG::TargetPhraseImpl *m_targetPhrase; + const SCFG::InputPath *m_path; + const SCFG::SymbolBind *m_symbolBind; + + Vector m_prevHypos; // always sorted by source position? 
+ + Hypothesis(MemPool &pool, + const System &system); + + void EvaluateWhenApplied(const StatefulFeatureFunction &sfff); + +}; + +} +} + diff --git a/mosesdecoder/moses2/SCFG/InputPath.cpp b/mosesdecoder/moses2/SCFG/InputPath.cpp new file mode 100644 index 0000000000000000000000000000000000000000..4fcbbb2b0caf05eba8f29532a26bc00f074e8b19 --- /dev/null +++ b/mosesdecoder/moses2/SCFG/InputPath.cpp @@ -0,0 +1,120 @@ +/* + * InputPath.cpp + * + * Created on: 23 Oct 2015 + * Author: hieu + */ +#include +#include "InputPath.h" +#include "TargetPhrases.h" +#include "ActiveChart.h" +#include "../TranslationModel/PhraseTable.h" +#include "../MemPoolAllocator.h" + +using namespace std; + +namespace Moses2 +{ +namespace SCFG +{ + +InputPath::InputPath(MemPool &pool, const SubPhrase &subPhrase, + const Range &range, size_t numPt, const InputPath *prefixPath) + :InputPathBase(pool, range, numPt, prefixPath) + ,subPhrase(subPhrase) + ,targetPhrases(MemPoolAllocator(pool)) +{ + m_activeChart = pool.Allocate(numPt); + for (size_t i = 0; i < numPt; ++i) { + ActiveChart &memAddr = m_activeChart[i]; + new (&memAddr) ActiveChart(pool); + } +} + +InputPath::~InputPath() +{ + // TODO Auto-generated destructor stub +} + +std::string InputPath::Debug(const System &system) const +{ + stringstream out; + out << range << " "; + out << subPhrase.Debug(system); + out << " " << prefixPath << " "; + + const Vector &activeEntries = GetActiveChart(1).entries; + out << "m_activeChart=" << activeEntries.size() << " "; + + for (size_t i = 0; i < activeEntries.size(); ++i) { + const ActiveChartEntry &entry = *activeEntries[i]; + out << entry.GetSymbolBind().Debug(system); + out << "| "; + } + + // tps + out << "tps=" << targetPhrases.size(); + + out << " "; + BOOST_FOREACH(const SCFG::InputPath::Coll::value_type &valPair, targetPhrases) { + const SymbolBind &symbolBind = valPair.first; + const SCFG::TargetPhrases &tps = *valPair.second; + out << symbolBind.Debug(system); + //out << "=" << 
tps.GetSize() << " "; + out << tps.Debug(system); + } + + return out.str(); +} + +void InputPath::AddTargetPhrasesToPath( + MemPool &pool, + const System &system, + const PhraseTable &pt, + const SCFG::TargetPhrases &tps, + const SCFG::SymbolBind &symbolBind) +{ + targetPhrases.push_back(Element(symbolBind, &tps)); + /* + Coll::iterator iterColl; + iterColl = targetPhrases.find(symbolBind); + assert(iterColl == targetPhrases.end()); + + targetPhrases[symbolBind] = &tps; + //cerr << "range=" << range << " symbolBind=" << symbolBind.Debug(system) << " tps=" << tps.Debug(system); + */ + /* + SCFG::TargetPhrases *tpsNew; + tpsNew = new (pool.Allocate()) SCFG::TargetPhrases(pool); + targetPhrases[symbolBind] = tpsNew; + + SCFG::TargetPhrases::const_iterator iter; + for (iter = tps.begin(); iter != tps.end(); ++iter) { + const SCFG::TargetPhraseImpl *tp = *iter; + //cerr << "tpCast=" << *tp << endl; + tpsNew->AddTargetPhrase(*tp); + } + cerr << "range=" << range << " symbolBind=" << symbolBind.Debug(system) << " tpsNew=" << tpsNew->Debug(system); + */ +} + +void InputPath::AddActiveChartEntry(size_t ptInd, ActiveChartEntry *chartEntry) +{ + //cerr << " added " << chartEntry << " " << range << " " << ptInd << endl; + ActiveChart &activeChart = m_activeChart[ptInd]; + activeChart.entries.push_back(chartEntry); +} + +size_t InputPath::GetNumRules() const +{ + size_t ret = 0; + BOOST_FOREACH(const Coll::value_type &valPair, targetPhrases) { + const SCFG::TargetPhrases &tps = *valPair.second; + ret += tps.GetSize(); + } + return ret; +} + +} +} + diff --git a/mosesdecoder/moses2/SCFG/InputPath.h b/mosesdecoder/moses2/SCFG/InputPath.h new file mode 100644 index 0000000000000000000000000000000000000000..bef9e0a792af7f9d4df0586ac6262f66ab8eba95 --- /dev/null +++ b/mosesdecoder/moses2/SCFG/InputPath.h @@ -0,0 +1,64 @@ +/* + * InputPath.h + * + * Created on: 23 Oct 2015 + * Author: hieu + */ + +#pragma once + +#include +#include +#include +#include "../InputPathBase.h" +#include 
"../MemPoolAllocator.h" +#include "TargetPhrases.h" +#include "ActiveChart.h" +#include "Word.h" + +namespace Moses2 +{ +namespace SCFG +{ +class TargetPhrases; +class TargetPhraseImpl; + + +//////////////////////////////////////////////////////////////////////////// +class InputPath: public InputPathBase +{ +public: + typedef std::pair Element; + typedef std::list > Coll; + Coll targetPhrases; + + SubPhrase subPhrase; + + InputPath(MemPool &pool, const SubPhrase &subPhrase, const Range &range, + size_t numPt, const InputPath *prefixPath); + virtual ~InputPath(); + + const ActiveChart &GetActiveChart(size_t ptInd) const { + return m_activeChart[ptInd]; + } + + void AddActiveChartEntry(size_t ptInd, ActiveChartEntry *chartEntry); + + void AddTargetPhrasesToPath( + MemPool &pool, + const System &system, + const PhraseTable &pt, + const SCFG::TargetPhrases &tps, + const SCFG::SymbolBind &symbolBind); + + size_t GetNumRules() const; + + std::string Debug(const System &system) const; + +protected: + ActiveChart *m_activeChart; +}; + +} +} + diff --git a/mosesdecoder/moses2/SCFG/InputPaths.cpp b/mosesdecoder/moses2/SCFG/InputPaths.cpp new file mode 100644 index 0000000000000000000000000000000000000000..77478cd987ed10b1d08f98ff4dc37ef3cc6bd626 --- /dev/null +++ b/mosesdecoder/moses2/SCFG/InputPaths.cpp @@ -0,0 +1,88 @@ +/* + * InputPaths.cpp + * + * Created on: 23 Oct 2015 + * Author: hieu + */ +#include +#include +#include "InputPaths.h" +#include "Sentence.h" +#include "../System.h" +#include "../legacy/Range.h" +#include "Manager.h" +#include "InputPath.h" +#include "Word.h" + +using namespace std; + +namespace Moses2 +{ +namespace SCFG +{ +void InputPaths::Init(const InputType &input, const ManagerBase &mgr) +{ + const Sentence &sentence = static_cast(input); + MemPool &pool = mgr.GetPool(); + size_t numPt = mgr.system.mappings.size(); + size_t size = sentence.GetSize(); + //cerr << "size=" << size << endl; + + m_matrix = new (pool.Allocate< Matrix >()) Matrix(pool, + 
size, size + 1); + m_matrix->Init(NULL); + + for (size_t startPos = 0; startPos < size; ++startPos) { + // create path for 0 length string + Range range(startPos, startPos - 1); + SubPhrase subPhrase = sentence.GetSubPhrase(startPos, 0); + + SCFG::InputPath *path = new (pool.Allocate()) SCFG::InputPath(pool, + subPhrase, range, numPt, NULL); + //cerr << "path=" << *path << endl; + m_inputPaths.push_back(path); + m_matrix->SetValue(startPos, 0, path); + + // create normal paths of subphrases through the sentence + const SCFG::InputPath *prefixPath = path; + for (size_t phaseSize = 1; phaseSize <= size; ++phaseSize) { + size_t endPos = startPos + phaseSize - 1; // pb-like indexing. eg. [1-1] covers 1 word, NOT 0 + + if (endPos >= size) { + break; + } + + SubPhrase subPhrase = sentence.GetSubPhrase(startPos, phaseSize); + Range range(startPos, endPos); + + SCFG::InputPath *path = new (pool.Allocate()) + SCFG::InputPath(pool, subPhrase, range, numPt, prefixPath); + //cerr << "path=" << *path << endl; + m_inputPaths.push_back(path); + + prefixPath = path; + m_matrix->SetValue(startPos, phaseSize, path); + } + } + +} + +std::string InputPaths::Debug(const System &system) const +{ + stringstream out; + const Matrix &matrix = GetMatrix(); + for (size_t i = 0; i < matrix.GetRows(); ++i) { + for (size_t j = 0; j < matrix.GetCols(); ++j) { + SCFG::InputPath *path = matrix.GetValue(i, j); + if (path) { + out << path->Debug(system); + out << endl; + } + } + } + return out.str(); +} + +} +} + diff --git a/mosesdecoder/moses2/SCFG/InputPaths.h b/mosesdecoder/moses2/SCFG/InputPaths.h new file mode 100644 index 0000000000000000000000000000000000000000..57c45414fd37a2fcbff5f30da4767072ae8b5212 --- /dev/null +++ b/mosesdecoder/moses2/SCFG/InputPaths.h @@ -0,0 +1,42 @@ +/* + * InputPaths.h + * + * Created on: 23 Oct 2015 + * Author: hieu + */ + +#pragma once + +#include +#include "../InputPathsBase.h" +#include "../legacy/Matrix.h" + +namespace Moses2 +{ + +class Sentence; +class 
System; + +namespace SCFG +{ +class InputPath; + +class InputPaths: public InputPathsBase +{ +public: + void Init(const InputType &input, const ManagerBase &mgr); + + const Matrix &GetMatrix() const { + return *m_matrix; + } + + std::string Debug(const System &system) const; + +protected: + Matrix *m_matrix; + +}; + +} +} + diff --git a/mosesdecoder/moses2/SCFG/Manager.cpp b/mosesdecoder/moses2/SCFG/Manager.cpp new file mode 100644 index 0000000000000000000000000000000000000000..6e10b32edc715eab788354b2a3f75ff041b3342a --- /dev/null +++ b/mosesdecoder/moses2/SCFG/Manager.cpp @@ -0,0 +1,388 @@ +/* + * Manager.cpp + * + * Created on: 23 Oct 2015 + * Author: hieu + */ +#include +#include +#include +#include +#include "../System.h" +#include "../TranslationModel/PhraseTable.h" +#include "Manager.h" +#include "InputPath.h" +#include "Hypothesis.h" +#include "TargetPhraseImpl.h" +#include "ActiveChart.h" +#include "Sentence.h" + +#include "nbest/KBestExtractor.h" + +using namespace std; + +namespace Moses2 +{ + +namespace SCFG +{ + +Manager::Manager(System &sys, const TranslationTask &task, + const std::string &inputStr, long translationId) + :ManagerBase(sys, task, inputStr, translationId) +{ + +} + +Manager::~Manager() +{ + +} + +void Manager::Decode() +{ + // init pools etc + //cerr << "START InitPools()" << endl; + InitPools(); + //cerr << "START ParseInput()" << endl; + + FactorCollection &vocab = system.GetVocab(); + m_input = Sentence::CreateFromString(GetPool(), vocab, system, m_inputStr, + m_translationId); + + const SCFG::Sentence &sentence = static_cast(GetInput()); + + size_t inputSize = sentence.GetSize(); + //cerr << "inputSize=" << inputSize << endl; + + m_inputPaths.Init(sentence, *this); + //cerr << "CREATED m_inputPaths" << endl; + + m_stacks.Init(*this, inputSize); + //cerr << "CREATED m_stacks" << endl; + + for (int startPos = inputSize - 1; startPos >= 0; --startPos) { + //cerr << endl << "startPos=" << startPos << endl; + SCFG::InputPath &initPath = 
*m_inputPaths.GetMatrix().GetValue(startPos, 0); + + //cerr << "BEFORE InitActiveChart=" << initPath.Debug(system) << endl; + InitActiveChart(initPath); + //cerr << "AFTER InitActiveChart=" << initPath.Debug(system) << endl; + + int maxPhraseSize = inputSize - startPos + 1; + for (int phraseSize = 1; phraseSize < maxPhraseSize; ++phraseSize) { + //cerr << endl << "phraseSize=" << phraseSize << endl; + + SCFG::InputPath &path = *m_inputPaths.GetMatrix().GetValue(startPos, phraseSize); + + Stack &stack = m_stacks.GetStack(startPos, phraseSize); + + //cerr << "BEFORE LOOKUP path=" << path.Debug(system) << endl; + Lookup(path); + //cerr << "AFTER LOOKUP path=" << path.Debug(system) << endl; + Decode(path, stack); + //cerr << "AFTER DECODE path=" << path.Debug(system) << endl; + + LookupUnary(path); + //cerr << "AFTER LookupUnary path=" << path.Debug(system) << endl; + + //cerr << "#rules=" << path.GetNumRules() << endl; + } + } + + /* + const Stack *stack; + stack = &m_stacks.GetStack(0, 5); + cerr << "stack 0,12:" << stack->Debug(system) << endl; + */ + //m_stacks.OutputStacks(); +} + +void Manager::InitActiveChart(SCFG::InputPath &path) +{ + size_t numPt = system.mappings.size(); + //cerr << "numPt=" << numPt << endl; + + for (size_t i = 0; i < numPt; ++i) { + const PhraseTable &pt = *system.mappings[i]; + //cerr << "START InitActiveChart" << endl; + pt.InitActiveChart(GetPool(), *this, path); + //cerr << "FINISHED InitActiveChart" << endl; + } +} + +void Manager::Lookup(SCFG::InputPath &path) +{ + size_t numPt = system.mappings.size(); + //cerr << "numPt=" << numPt << endl; + + for (size_t i = 0; i < numPt; ++i) { + const PhraseTable &pt = *system.mappings[i]; + size_t maxChartSpan = system.maxChartSpans[i]; + pt.Lookup(GetPool(), *this, maxChartSpan, m_stacks, path); + } + + /* + size_t tpsNum = path.targetPhrases.GetSize(); + if (tpsNum) { + cerr << tpsNum << " " << path << endl; + } + */ +} + +void Manager::LookupUnary(SCFG::InputPath &path) +{ + size_t numPt = 
system.mappings.size(); + //cerr << "numPt=" << numPt << endl; + + for (size_t i = 0; i < numPt; ++i) { + const PhraseTable &pt = *system.mappings[i]; + pt.LookupUnary(GetPool(), *this, m_stacks, path); + } + + /* + size_t tpsNum = path.targetPhrases.GetSize(); + if (tpsNum) { + cerr << tpsNum << " " << path << endl; + } + */ +} + +/////////////////////////////////////////////////////////////// +// CUBE-PRUNING +/////////////////////////////////////////////////////////////// +void Manager::Decode(SCFG::InputPath &path, Stack &stack) +{ + // clear cube pruning data + //std::vector &container = Container(m_queue); + //container.clear(); + Recycler &hypoRecycler = GetHypoRecycle(); + while (!m_queue.empty()) { + QueueItem *item = m_queue.top(); + m_queue.pop(); + // recycle unused hypos from queue + Hypothesis *hypo = item->hypo; + hypoRecycler.Recycle(hypo); + + // recycle queue item + m_queueItemRecycler.push_back(item); + } + + m_seenPositions.clear(); + + // init queue + BOOST_FOREACH(const InputPath::Coll::value_type &valPair, path.targetPhrases) { + const SymbolBind &symbolBind = valPair.first; + const SCFG::TargetPhrases &tps = *valPair.second; + + CreateQueue(path, symbolBind, tps); + } + + // MAIN LOOP + size_t pops = 0; + while (!m_queue.empty() && pops < system.options.cube.pop_limit) { + //cerr << "pops=" << pops << endl; + QueueItem *item = m_queue.top(); + m_queue.pop(); + + // add hypo to stack + Hypothesis *hypo = item->hypo; + + //cerr << "hypo=" << *hypo << " " << endl; + stack.Add(hypo, GetHypoRecycle(), arcLists); + //cerr << "Added " << *hypo << " " << endl; + + item->CreateNext(GetSystemPool(), GetPool(), *this, m_queue, m_seenPositions, path); + //cerr << "Created next " << endl; + m_queueItemRecycler.push_back(item); + + ++pops; + } + +} + +void Manager::CreateQueue( + const SCFG::InputPath &path, + const SymbolBind &symbolBind, + const SCFG::TargetPhrases &tps) +{ + MemPool &pool = GetPool(); + + SeenPosition *seenItem = new (pool.Allocate()) 
SeenPosition(pool, symbolBind, tps, symbolBind.numNT); + bool unseen = m_seenPositions.Add(seenItem); + assert(unseen); + + QueueItem *item = QueueItem::Create(GetPool(), *this); + item->Init(GetPool(), symbolBind, tps, seenItem->hypoIndColl); + for (size_t i = 0; i < symbolBind.coll.size(); ++i) { + const SymbolBindElement &ele = symbolBind.coll[i]; + if (ele.hypos) { + const Moses2::Hypotheses *hypos = ele.hypos; + item->AddHypos(*hypos); + } + } + + item->CreateHypo(GetSystemPool(), *this, path, symbolBind); + + //cerr << "hypo=" << item->hypo->Debug(system) << endl; + + m_queue.push(item); +} + +/////////////////////////////////////////////////////////////// +// NON CUBE-PRUNING +/////////////////////////////////////////////////////////////// +/* +void Manager::Decode(SCFG::InputPath &path, Stack &stack) +{ + //cerr << "path=" << path << endl; + + boost::unordered_map::const_iterator iterOuter; + for (iterOuter = path.targetPhrases->begin(); iterOuter != path.targetPhrases->end(); ++iterOuter) { + const SCFG::SymbolBind &symbolBind = iterOuter->first; + + const SCFG::TargetPhrases &tps = *iterOuter->second; + //cerr << "symbolBind=" << symbolBind << " tps=" << tps.GetSize() << endl; + + SCFG::TargetPhrases::const_iterator iter; + for (iter = tps.begin(); iter != tps.end(); ++iter) { + const SCFG::TargetPhraseImpl &tp = **iter; + //cerr << "tp=" << tp << endl; + ExpandHypo(path, symbolBind, tp, stack); + } + } +} +*/ + +void Manager::ExpandHypo( + const SCFG::InputPath &path, + const SCFG::SymbolBind &symbolBind, + const SCFG::TargetPhraseImpl &tp, + Stack &stack) +{ + Recycler &hypoRecycler = GetHypoRecycle(); + + std::vector ntEles = symbolBind.GetNTElements(); + Vector prevHyposIndices(GetPool(), symbolBind.numNT); + assert(ntEles.size() == symbolBind.numNT); + //cerr << "ntEles:" << ntEles.size() << endl; + + size_t ind = 0; + while (IncrPrevHypoIndices(prevHyposIndices, ind, ntEles)) { + SCFG::Hypothesis *hypo = SCFG::Hypothesis::Create(GetSystemPool(), 
*this); + hypo->Init(*this, path, symbolBind, tp, prevHyposIndices); + hypo->EvaluateWhenApplied(); + + stack.Add(hypo, hypoRecycler, arcLists); + + ++ind; + } +} + +bool Manager::IncrPrevHypoIndices( + Vector &prevHyposIndices, + size_t ind, + const std::vector ntEles) +{ + if (ntEles.size() == 0) { + // no nt. Do the 1st + return ind ? false : true; + } + + size_t numHypos = 0; + + //cerr << "IncrPrevHypoIndices:" << ind << " " << ntEles.size() << " "; + for (size_t i = 0; i < ntEles.size() - 1; ++i) { + const SymbolBindElement &ele = *ntEles[i]; + const Hypotheses &hypos = *ele.hypos; + numHypos = hypos.size(); + + std::div_t divRet = std::div((int)ind, (int)numHypos); + ind = divRet.quot; + + size_t hypoInd = divRet.rem; + prevHyposIndices[i] = hypoInd; + //cerr << "(" << i << "," << ind << "," << numHypos << "," << hypoInd << ")"; + } + + // last + prevHyposIndices.back() = ind; + + + // check if last is over limit + const SymbolBindElement &ele = *ntEles.back(); + const Hypotheses &hypos = *ele.hypos; + numHypos = hypos.size(); + + //cerr << "(" << (ntEles.size() - 1) << "," << ind << "," << numHypos << "," << ind << ")"; + //cerr << endl; + + if (ind >= numHypos) { + return false; + } else { + return true; + } +} + +std::string Manager::OutputBest() const +{ + string out; + const Stack &lastStack = m_stacks.GetLastStack(); + const SCFG::Hypothesis *bestHypo = lastStack.GetBestHypo(); + + if (bestHypo) { + //cerr << "BEST TRANSLATION: " << bestHypo << bestHypo->Debug(system) << endl; + //cerr << " " << out.str() << endl; + stringstream outStrm; + Moses2::FixPrecision(outStrm); + + bestHypo->OutputToStream(outStrm); + + out = outStrm.str(); + out = out.substr(4, out.size() - 10); + + if (system.options.output.ReportHypoScore) { + out = SPrint(bestHypo->GetScores().GetTotalScore()) + " " + out; + } + } else { + if (system.options.output.ReportHypoScore) { + out = "0 "; + } + + //cerr << "NO TRANSLATION " << GetTranslationId() << endl; + } + + return out; +} + 
+std::string Manager::OutputNBest() +{ + stringstream out; + //Moses2::FixPrecision(out); + + arcLists.Sort(); + //cerr << "arcs=" << arcLists.Debug(system) << endl; + + KBestExtractor extractor(*this); + extractor.OutputToStream(out); + + return out.str(); +} + +std::string Manager::OutputTransOpt() +{ + const Stack &lastStack = m_stacks.GetLastStack(); + const SCFG::Hypothesis *bestHypo = lastStack.GetBestHypo(); + + if (bestHypo) { + stringstream outStrm; + bestHypo->OutputTransOpt(outStrm); + return outStrm.str(); + } else { + return ""; + } +} + +} // namespace +} + diff --git a/mosesdecoder/moses2/SCFG/Manager.h b/mosesdecoder/moses2/SCFG/Manager.h new file mode 100644 index 0000000000000000000000000000000000000000..a9a575896b4803cce7ac404f3b9534972fa0f373 --- /dev/null +++ b/mosesdecoder/moses2/SCFG/Manager.h @@ -0,0 +1,86 @@ +/* + * Manager.h + * + * Created on: 23 Oct 2015 + * Author: hieu + */ + +#pragma once + +#include +#include +#include +#include +#include "../ManagerBase.h" +#include "Stacks.h" +#include "InputPaths.h" +#include "Misc.h" + +namespace Moses2 +{ + +namespace SCFG +{ +class SymbolBind; +class TargetPhraseImpl; +class SymbolBindElement; + +class Manager: public Moses2::ManagerBase +{ +public: + Manager(System &sys, const TranslationTask &task, const std::string &inputStr, + long translationId); + + virtual ~Manager(); + void Decode(); + std::string OutputBest() const; + std::string OutputNBest(); + std::string OutputTransOpt(); + + const InputPaths &GetInputPaths() const { + return m_inputPaths; + } + + QueueItemRecycler &GetQueueItemRecycler() { + return m_queueItemRecycler; + } + + const Stacks &GetStacks() const { + return m_stacks; + } + +protected: + Stacks m_stacks; + SCFG::InputPaths m_inputPaths; + + void InitActiveChart(SCFG::InputPath &path); + void Lookup(SCFG::InputPath &path); + void LookupUnary(SCFG::InputPath &path); + void Decode(SCFG::InputPath &path, Stack &stack); + + void ExpandHypo( + const SCFG::InputPath &path, + 
const SCFG::SymbolBind &symbolBind, + const SCFG::TargetPhraseImpl &tp, + Stack &stack); + + bool IncrPrevHypoIndices( + Vector &prevHyposIndices, + size_t ind, + const std::vector ntEles); + + // cube pruning + Queue m_queue; + SeenPositions m_seenPositions; + + QueueItemRecycler m_queueItemRecycler; + + void CreateQueue( + const SCFG::InputPath &path, + const SymbolBind &symbolBind, + const SCFG::TargetPhrases &tps); +}; + +} +} + diff --git a/mosesdecoder/moses2/SCFG/Misc.cpp b/mosesdecoder/moses2/SCFG/Misc.cpp new file mode 100644 index 0000000000000000000000000000000000000000..9a340928a0e745bb222cceccb92e8a5d1a6d1c93 --- /dev/null +++ b/mosesdecoder/moses2/SCFG/Misc.cpp @@ -0,0 +1,230 @@ +/* + * Misc.cpp + * + * Created on: 2 Jun 2016 + * Author: hieu + */ +#include +#include +#include "Misc.h" +#include "Manager.h" +#include "TargetPhrases.h" + +using namespace std; + +namespace Moses2 +{ + +namespace SCFG +{ + +//////////////////////////////////////////////////////// +SeenPosition::SeenPosition(MemPool &pool, + const SymbolBind &vSymbolBind, + const SCFG::TargetPhrases &vtps, + size_t numNT) + :symbolBind(vSymbolBind) + ,tps(vtps) + ,tpInd(0) + ,hypoIndColl(pool, numNT, 0) +{ +} + +SeenPosition::SeenPosition(MemPool &pool, + const SymbolBind &vSymbolBind, + const SCFG::TargetPhrases &vtps, + size_t vtpInd, + const Vector &vhypoIndColl) + :symbolBind(vSymbolBind) + ,tps(vtps) + ,tpInd(vtpInd) + ,hypoIndColl(pool, vhypoIndColl.size()) +{ + for (size_t i = 0; i < hypoIndColl.size(); ++i) { + hypoIndColl[i] = vhypoIndColl[i]; + } +} + +std::string SeenPosition::Debug(const System &system) const +{ + stringstream out; + out << &tps << " " << tpInd << " "; + + for (size_t i = 0; i < hypoIndColl.size(); ++i) { + out << hypoIndColl[i] << " "; + } + + return out.str(); +} + +bool SeenPosition::operator==(const SeenPosition &compare) const +{ + if (&symbolBind != &compare.symbolBind) { + return false; + } + + if (&tps != &compare.tps) { + return false; + } + + if 
(tpInd != compare.tpInd) { + return false; + } + + if (hypoIndColl != compare.hypoIndColl) { + return false; + } + + return true; +} + +size_t SeenPosition::hash() const +{ + size_t ret = (size_t) &symbolBind; + boost::hash_combine(ret, &tps); + boost::hash_combine(ret, tpInd); + boost::hash_combine(ret, hypoIndColl); + return ret; +} + +//////////////////////////////////////////////////////// +bool SeenPositions::Add(const SeenPosition *item) +{ + std::pair ret = m_coll.insert(item); + return ret.second; +} + +//////////////////////////////////////////////////////// +QueueItem *QueueItem::Create(MemPool &pool, SCFG::Manager &mgr) +{ + //QueueItem *item = new (pool.Allocate()) QueueItem(pool); + //return item; + + QueueItemRecycler &queueItemRecycler = mgr.GetQueueItemRecycler(); + QueueItem *ret; + if (!queueItemRecycler.empty()) { + // use item from recycle bin + ret = queueItemRecycler.back(); + queueItemRecycler.pop_back(); + } else { + // create new item + ret = new (pool.Allocate()) QueueItem(pool); + } + + return ret; + +} + +QueueItem::QueueItem(MemPool &pool) + :m_hypoIndColl(NULL) +{ + +} + +void QueueItem::Init( + MemPool &pool, + const SymbolBind &vSymbolBind, + const SCFG::TargetPhrases &vTPS, + const Vector &hypoIndColl) +{ + symbolBind = &vSymbolBind; + tps = &vTPS; + tpInd = 0; + m_hyposColl = new (pool.Allocate()) HyposColl(pool); + m_hypoIndColl = &hypoIndColl; +} + +void QueueItem::Init( + MemPool &pool, + const SymbolBind &vSymbolBind, + const SCFG::TargetPhrases &vTPS, + size_t vTPInd, + const Vector &hypoIndColl) +{ + symbolBind = &vSymbolBind; + tps = &vTPS; + tpInd = vTPInd; + m_hyposColl = NULL; + m_hypoIndColl = &hypoIndColl; +} + +void QueueItem::AddHypos(const Moses2::Hypotheses &hypos) +{ + m_hyposColl->push_back(&hypos); +} + +void QueueItem::CreateHypo( + MemPool &systemPool, + SCFG::Manager &mgr, + const SCFG::InputPath &path, + const SCFG::SymbolBind &symbolBind) +{ + const SCFG::TargetPhraseImpl &tp = (*tps)[tpInd]; + + hypo = 
SCFG::Hypothesis::Create(systemPool, mgr); + hypo->Init(mgr, path, symbolBind, tp, *m_hypoIndColl); + hypo->EvaluateWhenApplied(); +} + +void QueueItem::CreateNext( + MemPool &systemPool, + MemPool &mgrPool, + SCFG::Manager &mgr, + SCFG::Queue &queue, + SeenPositions &seenPositions, + const SCFG::InputPath &path) +{ + //cerr << "tpInd=" << tpInd << " " << tps->GetSize() << endl; + if (tpInd + 1 < tps->GetSize()) { + + const SCFG::TargetPhraseImpl &tp = (*tps)[tpInd + 1]; + SeenPosition *seenItem = new (mgrPool.Allocate()) SeenPosition(mgrPool, *symbolBind, *tps, tpInd + 1, *m_hypoIndColl); + bool unseen = seenPositions.Add(seenItem); + + if (unseen) { + QueueItem *item = QueueItem::Create(mgrPool, mgr); + item->Init(mgrPool, *symbolBind, *tps, tpInd + 1, *m_hypoIndColl); + item->m_hyposColl = m_hyposColl; + item->CreateHypo(systemPool, mgr, path, *symbolBind); + + queue.push(item); + } + } + + assert(m_hyposColl->size() == m_hypoIndColl->size()); + const SCFG::TargetPhraseImpl &tp = (*tps)[tpInd]; + for (size_t i = 0; i < m_hyposColl->size(); ++i) { + const Moses2::Hypotheses &hypos = *(*m_hyposColl)[i]; + size_t hypoInd = (*m_hypoIndColl)[i] + 1; // increment hypo + + if (hypoInd < hypos.size()) { + SeenPosition *seenItem = new (mgrPool.Allocate()) SeenPosition(mgrPool, *symbolBind, *tps, tpInd, *m_hypoIndColl); + seenItem->hypoIndColl[i] = hypoInd; + bool unseen = seenPositions.Add(seenItem); + + if (unseen) { + QueueItem *item = QueueItem::Create(mgrPool, mgr); + item->Init(mgrPool, *symbolBind, *tps, tpInd, seenItem->hypoIndColl); + + item->m_hyposColl = m_hyposColl; + item->CreateHypo(systemPool, mgr, path, *symbolBind); + + queue.push(item); + } + } + } +} + +std::string QueueItem::Debug(const System &system) const +{ + stringstream out; + out << hypo << " " << &(*tps)[tpInd] << "(" << tps << " " << tpInd << ") "; + for (size_t i = 0; i < m_hypoIndColl->size(); ++i) { + out << (*m_hypoIndColl)[i] << " "; + } + + return out.str(); +} + +} +} diff --git 
a/mosesdecoder/moses2/SCFG/Misc.h b/mosesdecoder/moses2/SCFG/Misc.h new file mode 100644 index 0000000000000000000000000000000000000000..85c7584407d503b137a1296e2cd5a278623f4b05 --- /dev/null +++ b/mosesdecoder/moses2/SCFG/Misc.h @@ -0,0 +1,146 @@ +/* + * Misc.h + * + * Created on: 2 Jun 2016 + * Author: hieu + */ +#pragma once +#include +#include +#include +#include "../HypothesisColl.h" +#include "../Vector.h" +#include "Hypothesis.h" + +namespace Moses2 +{ + +namespace SCFG +{ +class SymbolBind; +class TargetPhrases; +class Queue; + +/////////////////////////////////////////// +class SeenPosition +{ +public: + const SymbolBind &symbolBind; + const SCFG::TargetPhrases &tps; + size_t tpInd; + Vector hypoIndColl; + + SeenPosition(MemPool &pool, + const SymbolBind &vSymbolBind, + const SCFG::TargetPhrases &vtps, + size_t numNT); + SeenPosition(MemPool &pool, + const SymbolBind &vSymbolBind, + const SCFG::TargetPhrases &vtps, + size_t vtpInd, + const Vector &vhypoIndColl); + + bool operator==(const SeenPosition &compare) const; + size_t hash() const; + + std::string Debug(const System &system) const; + +}; + +/////////////////////////////////////////// + +class SeenPositions +{ +public: + bool Add(const SeenPosition *item); + + void clear() { + m_coll.clear(); + } + + +protected: + typedef boost::unordered_set, UnorderedComparer > Coll; + Coll m_coll; +}; + +/////////////////////////////////////////// +class QueueItem +{ +public: + SCFG::Hypothesis *hypo; + + static QueueItem *Create(MemPool &pool, SCFG::Manager &mgr); + + void Init( + MemPool &pool, + const SymbolBind &symbolBind, + const SCFG::TargetPhrases &tps, + const Vector &hypoIndColl); + void Init( + MemPool &pool, + const SymbolBind &symbolBind, + const SCFG::TargetPhrases &tps, + size_t vTPInd, + const Vector &hypoIndColl); + void AddHypos(const Moses2::Hypotheses &hypos); + void CreateHypo( + MemPool &systemPool, + SCFG::Manager &mgr, + const SCFG::InputPath &path, + const SCFG::SymbolBind &symbolBind); + 
+ void CreateNext( + MemPool &systemPool, + MemPool &mgrPool, + SCFG::Manager &mgr, + SCFG::Queue &queue, + SeenPositions &seenPositions, + const SCFG::InputPath &path); + + std::string Debug(const System &system) const; + +protected: + typedef Vector HyposColl; + HyposColl *m_hyposColl; + + const SymbolBind *symbolBind; + const SCFG::TargetPhrases *tps; + size_t tpInd; + + const Vector *m_hypoIndColl; // pointer to variable in seen position + // hypos and ind to the 1 we're using + + QueueItem(MemPool &pool); + +}; + +/////////////////////////////////////////// + +typedef std::deque QueueItemRecycler; + +/////////////////////////////////////////// +class QueueItemOrderer +{ +public: + bool operator()(QueueItem* itemA, QueueItem* itemB) const { + HypothesisFutureScoreOrderer orderer; + return !orderer(itemA->hypo, itemB->hypo); + } +}; + +/////////////////////////////////////////// +class Queue : public std::priority_queue, + QueueItemOrderer> +{ + +}; + + +} +} + + + diff --git a/mosesdecoder/moses2/SCFG/PhraseImpl.cpp b/mosesdecoder/moses2/SCFG/PhraseImpl.cpp new file mode 100644 index 0000000000000000000000000000000000000000..398e8e21760f1104e0f3a3285e8c58ac07a7d1a8 --- /dev/null +++ b/mosesdecoder/moses2/SCFG/PhraseImpl.cpp @@ -0,0 +1,37 @@ +/* + * PhraseImpl.cpp + * + * Created on: 19 Feb 2016 + * Author: hieu + */ +#include "PhraseImpl.h" + +using namespace std; + +namespace Moses2 +{ +namespace SCFG +{ +PhraseImpl *PhraseImpl::CreateFromString(MemPool &pool, FactorCollection &vocab, + const System &system, const std::string &str, bool skipLastWord) +{ + std::vector toks = Moses2::Tokenize(str); + size_t size = toks.size(); + if (skipLastWord && size > 0) { // empty input: do not wrap size_t below zero + --size; + } + PhraseImpl *ret; + + ret = new (pool.Allocate()) PhraseImpl(pool, size); + + for (size_t i = 0; i < size; ++i) { + SCFG::Word &word = (*ret)[i]; + word.CreateFromString(vocab, system, toks[i]); + } + + return ret; +} + +} +} + + diff --git a/mosesdecoder/moses2/SCFG/PhraseImpl.h 
b/mosesdecoder/moses2/SCFG/PhraseImpl.h new file mode 100644 index 0000000000000000000000000000000000000000..f61bf29150c7295b91eb0592063ac8ae272d0a26 --- /dev/null +++ b/mosesdecoder/moses2/SCFG/PhraseImpl.h @@ -0,0 +1,25 @@ +#pragma once +#include "../PhraseImplTemplate.h" +#include "../SubPhrase.h" +#include "Word.h" + +namespace Moses2 +{ +namespace SCFG +{ + +class PhraseImpl: public PhraseImplTemplate +{ +public: + static PhraseImpl *CreateFromString(MemPool &pool, FactorCollection &vocab, + const System &system, const std::string &str, bool skipLastWord = true); + + PhraseImpl(MemPool &pool, size_t size) : + PhraseImplTemplate(pool, size) { + } + +}; + +} +} + diff --git a/mosesdecoder/moses2/SCFG/Sentence.cpp b/mosesdecoder/moses2/SCFG/Sentence.cpp new file mode 100644 index 0000000000000000000000000000000000000000..de82e3ee9ec1b4fa48eb8f35398779956130c8bb --- /dev/null +++ b/mosesdecoder/moses2/SCFG/Sentence.cpp @@ -0,0 +1,154 @@ +/* + * Sentence.cpp + * + * Created on: 14 Dec 2015 + * Author: hieu + */ + +#include "Sentence.h" +#include "../System.h" + +using namespace std; + +namespace Moses2 +{ +namespace SCFG +{ +Sentence *Sentence::CreateFromString(MemPool &pool, FactorCollection &vocab, + const System &system, const std::string &str, long translationId) +{ + //cerr << "SCFG Sentence" << endl; + + Sentence *ret; + + if (system.options.input.xml_policy) { + // xml + ret = CreateFromStringXML(pool, vocab, system, str); + //cerr << "ret=" << ret->Debug(system) << endl; + } else { + std::vector toks = Tokenize(str); + size_t size = toks.size() + 2; + + ret = new (pool.Allocate()) Sentence(pool, size); + ret->PhraseImplTemplate::CreateFromString(vocab, system, toks, true); + + } + + return ret; +} + +Sentence *Sentence::CreateFromStringXML(MemPool &pool, FactorCollection &vocab, + const System &system, const std::string &str) +{ + Sentence *ret; + + vector xmlOptions; + pugi::xml_document doc; + + string str2 = "" + str + ""; + pugi::xml_parse_result result 
= doc.load(str2.c_str(), + pugi::parse_cdata | pugi::parse_wconv_attribute | pugi::parse_eol | pugi::parse_comments); + pugi::xml_node topNode = doc.child("xml"); + + std::vector toks; + XMLParse(pool, system, 0, topNode, toks, xmlOptions); + + // debug + /* + cerr << "xmloptions:" << endl; + for (size_t i = 0; i < xmlOptions.size(); ++i) { + cerr << xmlOptions[i]->Debug(system) << endl; + } + */ + + // create words + size_t size = toks.size() + 2; + ret = new (pool.Allocate()) Sentence(pool, size); + ret->PhraseImplTemplate::CreateFromString(vocab, system, toks, true); + + // xml + for(size_t i=0; iGetNodeName(), "ne") == 0) { + FactorType placeholderFactor = system.options.input.placeholder_factor; + UTIL_THROW_IF2(placeholderFactor == NOT_FOUND, + "Placeholder XML in input. Must have argument -placeholder-factor [NUM]"); + UTIL_THROW_IF2(xmlOption->phraseSize != 1, + "Placeholder must only cover 1 word"); + + const Factor *factor = vocab.AddFactor(xmlOption->GetEntity(), system, false); + (*ret)[xmlOption->startPos + 1][placeholderFactor] = factor; + } else { + // default - forced translation. 
Add to class variable + ret->AddXMLOption(system, xmlOption); + } + } + + //cerr << "ret=" << ret->Debug(system) << endl; + return ret; +} + +void Sentence::XMLParse( + MemPool &pool, + const System &system, + size_t depth, + const pugi::xml_node &parentNode, + std::vector &toks, + vector &xmlOptions) +{ + // pugixml + for (pugi::xml_node childNode = parentNode.first_child(); childNode; childNode = childNode.next_sibling()) { + string nodeName = childNode.name(); + //cerr << depth << " nodeName=" << nodeName << endl; + + int startPos = toks.size(); + + string value = childNode.value(); + if (!value.empty()) { + //cerr << depth << "childNode text=" << value << endl; + std::vector subPhraseToks = Tokenize(value); + for (size_t i = 0; i < subPhraseToks.size(); ++i) { + toks.push_back(subPhraseToks[i]); + } + } + + if (!nodeName.empty()) { + XMLOption *xmlOption = new (pool.Allocate()) XMLOption(pool, nodeName, startPos); + + pugi::xml_attribute attr; + attr = childNode.attribute("translation"); + if (!attr.empty()) { + xmlOption->SetTranslation(pool, attr.as_string()); + } + + attr = childNode.attribute("entity"); + if (!attr.empty()) { + xmlOption->SetEntity(pool, attr.as_string()); + } + + attr = childNode.attribute("prob"); + if (!attr.empty()) { + xmlOption->prob = attr.as_float(); + } + + xmlOptions.push_back(xmlOption); + + // recursively call this function. 
For proper recursive trees + XMLParse(pool, system, depth + 1, childNode, toks, xmlOptions); + + size_t endPos = toks.size(); + xmlOption->phraseSize = endPos - startPos; + + /* + cerr << "xmlOptions="; + xmlOption->Debug(cerr, system); + cerr << endl; + */ + } + + } +} + +} +} /* namespace Moses2 */ + diff --git a/mosesdecoder/moses2/SCFG/Sentence.h b/mosesdecoder/moses2/SCFG/Sentence.h new file mode 100644 index 0000000000000000000000000000000000000000..1f4378caf8d6083bd647cbd9f210454f14bd8d78 --- /dev/null +++ b/mosesdecoder/moses2/SCFG/Sentence.h @@ -0,0 +1,54 @@ +/* + * Sentence.h + * + * Created on: 14 Dec 2015 + * Author: hieu + */ +#pragma once + +#include +#include "PhraseImpl.h" +#include "../InputType.h" +#include "../MemPool.h" +#include "../legacy/Util2.h" +#include "../pugixml.hpp" + +namespace Moses2 +{ +class FactorCollection; +class System; + +namespace SCFG +{ + +class Sentence: public InputType, public PhraseImpl +{ +public: + static Sentence *CreateFromString(MemPool &pool, FactorCollection &vocab, + const System &system, const std::string &str, long translationId); + + Sentence(MemPool &pool, size_t size) + :InputType(pool) + ,PhraseImpl(pool, size) + {} + + virtual ~Sentence() + {} + +protected: + static Sentence *CreateFromStringXML(MemPool &pool, FactorCollection &vocab, + const System &system, const std::string &str); + + static void XMLParse( + MemPool &pool, + const System &system, + size_t depth, + const pugi::xml_node &parentNode, + std::vector &toks, + std::vector &xmlOptions); + +}; + +} +} /* namespace Moses2 */ + diff --git a/mosesdecoder/moses2/SCFG/Stack.cpp b/mosesdecoder/moses2/SCFG/Stack.cpp new file mode 100644 index 0000000000000000000000000000000000000000..25517d00697f05453435b746ef8d81823f980eea --- /dev/null +++ b/mosesdecoder/moses2/SCFG/Stack.cpp @@ -0,0 +1,106 @@ +#include +#include "Stacks.h" +#include "Hypothesis.h" +#include "TargetPhraseImpl.h" +#include "Manager.h" + +using namespace std; + +namespace Moses2 +{ + 
+namespace SCFG +{ + +Stack::Stack(const Manager &mgr) + :m_mgr(mgr) +{ +} + +Stack::~Stack() +{ + BOOST_FOREACH (const Coll::value_type &valPair, m_coll) { + Moses2::HypothesisColl *hypos = valPair.second; + delete hypos; + } +} + +void Stack::Add(SCFG::Hypothesis *hypo, Recycler &hypoRecycle, + ArcLists &arcLists) +{ + const SCFG::TargetPhraseImpl &tp = hypo->GetTargetPhrase(); + const SCFG::Word &lhs = tp.lhs; + //cerr << "lhs=" << lhs << endl; + + HypothesisColl &coll = GetColl(lhs); + coll.Add(m_mgr, hypo, hypoRecycle, arcLists); +} + +size_t Stack::GetSize() const +{ + size_t ret = 0; + BOOST_FOREACH (const Coll::value_type &valPair, m_coll) { + Moses2::HypothesisColl &hypos = *valPair.second; + ret += hypos.GetSize(); + } + return ret; +} + +const Moses2::HypothesisColl *Stack::GetColl(const SCFG::Word &nt) const +{ + assert(nt.isNonTerminal); + Coll::const_iterator iter = m_coll.find(nt); + if (iter == m_coll.end()) { // nothing stored for this non-terminal + return NULL; + } else { + return iter->second; + } +} + +Moses2::HypothesisColl &Stack::GetColl(const SCFG::Word &nt) +{ + Moses2::HypothesisColl *ret; + Coll::iterator iter; + iter = m_coll.find(nt); + if (iter == m_coll.end()) { + ret = new Moses2::HypothesisColl(m_mgr); + m_coll[nt] = ret; + } else { + ret = iter->second; + } + return *ret; +} + +const Hypothesis *Stack::GetBestHypo() const +{ + SCORE bestScore = -std::numeric_limits::infinity(); + const HypothesisBase *bestHypo = NULL; + BOOST_FOREACH(const Coll::value_type &val, m_coll) { + const Moses2::HypothesisColl &hypos = *val.second; + const Moses2::HypothesisBase *hypo = hypos.GetBestHypo(); + + if (hypo->GetFutureScore() > bestScore) { + bestScore = hypo->GetFutureScore(); + bestHypo = hypo; + } + } + return bestHypo ? &bestHypo->Cast() : NULL; // NULL when the stack holds no hypotheses +} + +std::string Stack::Debug(const System &system) const +{ + stringstream out; + BOOST_FOREACH (const SCFG::Stack::Coll::value_type &valPair, m_coll) { + const SCFG::Word &lhs = valPair.first; + const Moses2::HypothesisColl &hypos = *valPair.second; + 
out << "lhs=" << lhs.Debug(system); + out << "=" << hypos.GetSize() << endl; + out << hypos.Debug(system); + out << endl; + } + + return out.str(); +} + +} +} diff --git a/mosesdecoder/moses2/SCFG/Stack.h b/mosesdecoder/moses2/SCFG/Stack.h new file mode 100644 index 0000000000000000000000000000000000000000..413f0749b4fa89179203a264e71354d9a2bb0f17 --- /dev/null +++ b/mosesdecoder/moses2/SCFG/Stack.h @@ -0,0 +1,51 @@ +#pragma once +#include +#include "../HypothesisColl.h" +#include "../Recycler.h" +#include "Word.h" + +namespace Moses2 +{ +class HypothesisBase; +class ArcLists; + +namespace SCFG +{ +class Hypothesis; +class Manager; + +class Stack +{ +public: + typedef boost::unordered_map Coll; + + Stack(const Manager &mgr); + virtual ~Stack(); + + const Coll &GetColl() const { + return m_coll; + } + + const Moses2::HypothesisColl *GetColl(const SCFG::Word &nt) const; + + size_t GetSize() const; + + void Add(SCFG::Hypothesis *hypo, Recycler &hypoRecycle, + ArcLists &arcLists); + + const Hypothesis *GetBestHypo() const; + + std::string Debug(const System &system) const; + +protected: + const Manager &m_mgr; + Coll m_coll; + + Moses2::HypothesisColl &GetColl(const SCFG::Word &nt); + +}; + +} + +} + diff --git a/mosesdecoder/moses2/SCFG/Stacks.cpp b/mosesdecoder/moses2/SCFG/Stacks.cpp new file mode 100644 index 0000000000000000000000000000000000000000..63214c7c341d7cc37fc78e6713e3494217945986 --- /dev/null +++ b/mosesdecoder/moses2/SCFG/Stacks.cpp @@ -0,0 +1,56 @@ +#include "Stacks.h" +#include "Stack.h" + +using namespace std; + +namespace Moses2 +{ +namespace SCFG +{ +Stacks::~Stacks() +{ + for (size_t i = 0; i < m_cells.size(); ++i) { + std::vector &inner = m_cells[i]; + for (size_t j = 0; j < inner.size(); ++j) { + Stack *stack = inner[j]; + delete stack; + } + } +} + +void Stacks::Init(SCFG::Manager &mgr, size_t size) +{ + m_cells.resize(size); + for (size_t startPos = 0; startPos < size; ++startPos) { + std::vector &inner = m_cells[startPos]; + 
inner.reserve(size - startPos); + for (size_t endPos = startPos; endPos < size; ++endPos) { + inner.push_back(new Stack(mgr)); + } + } +} + +void Stacks::OutputStacks() const +{ + size_t size = m_cells.size(); + + for (size_t startPos = 0; startPos < size; ++startPos) { + cerr.width(3); + cerr << startPos << " "; + } + cerr << endl; + for (size_t width = 1; width <= size; width++) { + for( size_t space = 0; space < width-1; space++ ) { + cerr << " "; + } + for (size_t startPos = 0; startPos <= size-width; ++startPos) { + cerr.width(3); + cerr << GetStack(startPos, width).GetSize() << " "; + } + cerr << endl; + } + +} + +} +} diff --git a/mosesdecoder/moses2/SCFG/Stacks.h b/mosesdecoder/moses2/SCFG/Stacks.h new file mode 100644 index 0000000000000000000000000000000000000000..09aedb01e412808f8fbd16a5c2392628af4f53f9 --- /dev/null +++ b/mosesdecoder/moses2/SCFG/Stacks.h @@ -0,0 +1,42 @@ +#pragma once + +#include +#include +#include "Stack.h" + +namespace Moses2 +{ +class ManagerBase; + +namespace SCFG +{ +class Stacks +{ +public: + virtual ~Stacks(); + + void Init(SCFG::Manager &mgr, size_t size); + + const Stack &GetStack(size_t startPos, size_t size) const { + return *m_cells[startPos][size - 1]; + } + + Stack &GetStack(size_t startPos, size_t size) { + return *m_cells[startPos][size - 1]; + } + + void OutputStacks() const; + + const Stack &GetLastStack() const { + return GetStack(0, m_cells.size()); + } + +protected: + std::vector > m_cells; + +}; + +} + +} + diff --git a/mosesdecoder/moses2/SCFG/TargetPhraseImpl.cpp b/mosesdecoder/moses2/SCFG/TargetPhraseImpl.cpp new file mode 100644 index 0000000000000000000000000000000000000000..e58e057e48f5714cad9d48a00ccd1d0aa09272c0 --- /dev/null +++ b/mosesdecoder/moses2/SCFG/TargetPhraseImpl.cpp @@ -0,0 +1,125 @@ +/* + * TargetPhraseImpl.cpp + * + * Created on: 23 Oct 2015 + * Author: hieu + */ + +#include +#include "TargetPhraseImpl.h" +#include "../Scores.h" +#include "../System.h" +#include "../MemPool.h" +#include 
"../PhraseBased/Manager.h" +#include "../AlignmentInfoCollection.h" +#include "../TranslationModel/PhraseTable.h" + +using namespace std; + +namespace Moses2 +{ +namespace SCFG +{ + +TargetPhraseImpl *TargetPhraseImpl::CreateFromString(MemPool &pool, + const PhraseTable &pt, const System &system, const std::string &str) +{ + //cerr << "str=" << str << endl; + FactorCollection &vocab = system.GetVocab(); + + vector toks = Tokenize(str); + size_t size = toks.size() - 1; + TargetPhraseImpl *ret = + new (pool.Allocate()) TargetPhraseImpl(pool, pt, system, + size); + + for (size_t i = 0; i < size; ++i) { + SCFG::Word &word = (*ret)[i]; + word.CreateFromString(vocab, system, toks[i]); + } + + // lhs + ret->lhs.CreateFromString(vocab, system, toks.back()); + //cerr << "ret=" << *ret << endl; + return ret; +} + +TargetPhraseImpl::TargetPhraseImpl(MemPool &pool, + const PhraseTable &pt, + const System &system, + size_t size) + :Moses2::TargetPhrase(pool, pt, system, size) + ,m_alignNonTerm(&AlignmentInfoCollection::Instance().GetEmptyAlignmentInfo()) + +{ + m_scores = new (pool.Allocate()) Scores(system, pool, + system.featureFunctions.GetNumScores()); + +} + +TargetPhraseImpl::~TargetPhraseImpl() +{ + // TODO Auto-generated destructor stub +} + +std::string TargetPhraseImpl::Debug(const System &system) const +{ + stringstream out; + out << lhs.Debug(system); + out << " -> "; + for (size_t i = 0; i < GetSize(); ++i) { + const SCFG::Word &word = (*this)[i]; + out << word.Debug(system) << " "; + } + out << "pt=" << pt.GetName(); + out << " SCORES:" << GetScores().Debug(system); + out << " ALIGN-T:"; + out << GetAlignTerm().Debug(system); + out << " ALIGN-NT:"; + out << GetAlignNonTerm().Debug(system); + + return out.str(); +} + +void TargetPhraseImpl::SetAlignmentInfo(const std::string &alignString) +{ + AlignmentInfo::CollType alignTerm, alignNonTerm; + + vector toks = Tokenize(alignString); + for (size_t i = 0; i < toks.size(); ++i) { + vector alignPair = Tokenize(toks[i], 
"-"); + UTIL_THROW_IF2(alignPair.size() != 2, "Wrong alignment format"); + + size_t sourcePos = alignPair[0]; + size_t targetPos = alignPair[1]; + + if ((*this)[targetPos].isNonTerminal) { + alignNonTerm.insert(std::pair(sourcePos, targetPos)); + } else { + alignTerm.insert(std::pair(sourcePos, targetPos)); + } + } + + SetAlignTerm(alignTerm); + SetAlignNonTerm(alignNonTerm); + // cerr << "TargetPhrase::SetAlignmentInfo(const StringPiece &alignString) this:|" << *this << "|\n"; + + //cerr << "alignTerm=" << alignTerm.size() << endl; + //cerr << "alignNonTerm=" << alignNonTerm.size() << endl; + +} + +size_t TargetPhraseImpl::GetNumNonTerms() const +{ + size_t ret = 0; + for (size_t i = 0; i < GetSize(); ++i) { + if ((*this)[i].isNonTerminal) { + ++ret; + } + } + return ret; +} + + +} +} diff --git a/mosesdecoder/moses2/SCFG/TargetPhraseImpl.h b/mosesdecoder/moses2/SCFG/TargetPhraseImpl.h new file mode 100644 index 0000000000000000000000000000000000000000..286ce215753eb937410185eb2ea62869c0b18292 --- /dev/null +++ b/mosesdecoder/moses2/SCFG/TargetPhraseImpl.h @@ -0,0 +1,91 @@ +/* + * TargetPhraseImpl.h + * + * Created on: 23 Oct 2015 + * Author: hieu + */ + +#pragma once + +#include +#include "../Phrase.h" +#include "../PhraseImplTemplate.h" +#include "../TargetPhrase.h" +#include "../MemPool.h" +#include "../SubPhrase.h" +#include "../AlignmentInfoCollection.h" +#include "Word.h" + +namespace Moses2 +{ +class Scores; +class Manager; +class System; +class PhraseTable; +class AlignmentInfo; + +namespace SCFG +{ + +class TargetPhraseImpl: public Moses2::TargetPhrase +{ +public: + typedef Moses2::TargetPhrase Parent; + + SCFG::Word lhs; + + static TargetPhraseImpl *CreateFromString(MemPool &pool, + const PhraseTable &pt, const System &system, const std::string &str); + + TargetPhraseImpl(MemPool &pool, const PhraseTable &pt, const System &system, + size_t size); + //TargetPhraseImpl(MemPool &pool, const System &system, const TargetPhraseImpl ©); + + virtual 
~TargetPhraseImpl(); + + const AlignmentInfo &GetAlignNonTerm() const { + return *m_alignNonTerm; + } + + void SetAlignNonTerm(const AlignmentInfo &alignInfo) { + m_alignNonTerm = &alignInfo; + } + + void SetAlignmentInfo(const std::string &alignString); + + SCORE GetFutureScore() const { + return m_scores->GetTotalScore() + m_estimatedScore; + } + + virtual SCORE GetScoreForPruning() const { + return GetFutureScore(); + } + + void SetEstimatedScore(const SCORE &value) { + m_estimatedScore = value; + } + + std::string Debug(const System &system) const; + + size_t GetNumNonTerms() const; + + //mutable void *chartState; +protected: + SCORE m_estimatedScore; + + const AlignmentInfo *m_alignNonTerm; + + // ALNREP = alignment representation, + // see AlignmentInfo constructors for supported representations + template + void + SetAlignNonTerm(const ALNREP &coll) { + m_alignNonTerm = AlignmentInfoCollection::Instance().Add(coll); + } + +}; + + +} +} + diff --git a/mosesdecoder/moses2/SCFG/TargetPhrases.cpp b/mosesdecoder/moses2/SCFG/TargetPhrases.cpp new file mode 100644 index 0000000000000000000000000000000000000000..fbef79e9ccb46dc14799ebcbcd9dbf73641874da --- /dev/null +++ b/mosesdecoder/moses2/SCFG/TargetPhrases.cpp @@ -0,0 +1,66 @@ +/* + * TargetPhrases.cpp + * + * Created on: 15 Apr 2016 + * Author: hieu + */ + +#include +#include +#include +#include "TargetPhrases.h" +#include "TargetPhraseImpl.h" +#include "../TargetPhrase.h" +#include "../TranslationModel/PhraseTable.h" + +namespace Moses2 +{ +namespace SCFG +{ +TargetPhrases::TargetPhrases(MemPool &pool) + :m_coll(pool) +{ +} + +TargetPhrases::TargetPhrases(MemPool &pool, size_t size) + :m_coll(pool) +{ + m_coll.reserve(size); +} + +TargetPhrases::~TargetPhrases() +{ + // TODO Auto-generated destructor stub +} + +void TargetPhrases::SortAndPrune(size_t tableLimit) +{ + iterator iterMiddle; + iterMiddle = + (tableLimit == 0 || m_coll.size() < tableLimit) ? 
+ m_coll.end() : m_coll.begin() + tableLimit; + + std::partial_sort(m_coll.begin(), iterMiddle, m_coll.end(), + CompareScoreForPruning()); + + if (tableLimit && m_coll.size() > tableLimit) { + m_coll.resize(tableLimit); + } + + //cerr << "TargetPhrases=" << GetSize() << endl; +} + +std::string TargetPhrases::Debug(const System &system) const +{ + std::stringstream out; + + out << m_coll.size() << std::endl; + BOOST_FOREACH(const SCFG::TargetPhraseImpl *tp, m_coll) { + out << tp->Debug(system); + out << std::endl; + } + return out.str(); +} + +} +} /* namespace Moses2 */ diff --git a/mosesdecoder/moses2/SCFG/TargetPhrases.h b/mosesdecoder/moses2/SCFG/TargetPhrases.h new file mode 100644 index 0000000000000000000000000000000000000000..8bdea7d09088da495b23bbe4e55033c1a8635771 --- /dev/null +++ b/mosesdecoder/moses2/SCFG/TargetPhrases.h @@ -0,0 +1,64 @@ +/* + * TargetPhrases.h + * + * Created on: 15 Apr 2016 + * Author: hieu + */ + +#pragma once +#include +#include +#include "../Vector.h" + +namespace Moses2 +{ +class MemPool; +class System; + +namespace SCFG +{ +class TargetPhraseImpl; + +class TargetPhrases +{ + typedef Moses2::Vector Coll; + +public: + typedef Coll::iterator iterator; + typedef Coll::const_iterator const_iterator; + //! 
iterators + const_iterator begin() const { + return m_coll.begin(); + } + const_iterator end() const { + return m_coll.end(); + } + + const SCFG::TargetPhraseImpl& operator[](size_t ind) const { + return *m_coll[ind]; + } + + TargetPhrases(MemPool &pool); + TargetPhrases(MemPool &pool, size_t size); + virtual ~TargetPhrases(); + + size_t GetSize() const { + return m_coll.size(); + } + + void AddTargetPhrase(const SCFG::TargetPhraseImpl &targetPhrase) { + m_coll.push_back(&targetPhrase); + } + + void SortAndPrune(size_t tableLimit); + + std::string Debug(const System &system) const; + +protected: + Coll m_coll; + +}; + +} +} /* namespace Moses2 */ + diff --git a/mosesdecoder/moses2/SCFG/Word.cpp b/mosesdecoder/moses2/SCFG/Word.cpp new file mode 100644 index 0000000000000000000000000000000000000000..8f67fb0fa5816a98d6438f75e3c9a23d54c7c54b --- /dev/null +++ b/mosesdecoder/moses2/SCFG/Word.cpp @@ -0,0 +1,147 @@ +/* + * Word.cpp + * + * Created on: 23 Oct 2015 + * Author: hieu + */ +#include +#include "Word.h" +#include "Hypothesis.h" +#include "ActiveChart.h" +#include "TargetPhraseImpl.h" +#include "Sentence.h" +#include "../legacy/Util2.h" +#include "../System.h" +#include "../AlignmentInfo.h" +#include "../ManagerBase.h" + +using namespace std; + +namespace Moses2 +{ +namespace SCFG +{ +Word::Word(const SCFG::Word ©) + :Moses2::Word(copy) + ,isNonTerminal(copy.isNonTerminal) +{ +} + +void Word::CreateFromString(FactorCollection &vocab, + const System &system, + const std::string &str) +{ + vector toks; + + if (str[0] == '[' && str[str.size() - 1] == ']') { + isNonTerminal = true; + + size_t startPos = str.find("[", 1); + bool doubleNT = startPos != string::npos; + + if (doubleNT) { + assert(startPos != string::npos); + string str2 = str.substr(startPos + 1, str.size() - startPos - 2); + toks = Tokenize(str2, "|"); + } else { + string str2 = str.substr(1, str.size() - 2); + toks = Tokenize(str2, "|"); + } + } else { + isNonTerminal = false; + toks = Tokenize(str, 
"|"); + } + + // parse string + for (size_t i = 0; i < toks.size(); ++i) { + const string &tok = toks[i]; + //cerr << "tok=" << tok << endl; + + const Factor *factor = vocab.AddFactor(tok, system, isNonTerminal); + m_factors[i] = factor; + } +} + +size_t Word::hash() const +{ + size_t ret = Moses2::Word::hash(); + boost::hash_combine(ret, isNonTerminal); + return ret; +} + +size_t Word::hash(const std::vector &factors) const +{ + size_t seed = isNonTerminal; + for (size_t i = 0; i < factors.size(); ++i) { + FactorType factorType = factors[i]; + const Factor *factor = m_factors[factorType]; + boost::hash_combine(seed, factor); + } + return seed; +} + +void Word::OutputToStream(const System &system, std::ostream &out) const +{ + if (isNonTerminal) { + out << "["; + } + Moses2::Word::OutputToStream(system, out); + if (isNonTerminal) { + out << "]"; + } +} + +void Word::OutputToStream( + const ManagerBase &mgr, + size_t targetPos, + const SCFG::Hypothesis &hypo, + std::ostream &out) const +{ + const SCFG::TargetPhraseImpl &tp = hypo.GetTargetPhrase(); + const SCFG::SymbolBind &symbolBind = hypo.GetSymbolBind(); + + bool outputWord = true; + if (mgr.system.options.input.placeholder_factor != NOT_FOUND) { + const AlignmentInfo &alignInfo = tp.GetAlignTerm(); + std::set sourceAligns = alignInfo.GetAlignmentsForTarget(targetPos); + if (sourceAligns.size() == 1) { + size_t sourcePos = *sourceAligns.begin(); + /* + cerr << "sourcePos=" << sourcePos << endl; + cerr << "tp=" << tp.Debug(mgr.system) << endl; + cerr << "m_symbolBind=" << symbolBind.Debug(mgr.system) << endl; + */ + assert(sourcePos < symbolBind.GetSize()); + const Range &inputRange = symbolBind.coll[sourcePos].GetRange(); + assert(inputRange.GetNumWordsCovered() == 1); + const SCFG::Sentence &sentence = static_cast(mgr.GetInput()); + const SCFG::Word &sourceWord = sentence[inputRange.GetStartPos()]; + const Factor *factor = sourceWord[mgr.system.options.input.placeholder_factor]; + if (factor) { + out << 
factor->GetString(); + outputWord = false; + } + } + } + + if (outputWord) { + OutputToStream(mgr.system, out); + } +} + +std::string Word::Debug(const System &system) const +{ + stringstream out; + if (isNonTerminal) { + out << "["; + } + out << Moses2::Word::Debug(system); + if (isNonTerminal) { + out << "]"; + } + return out.str(); +} + +} +} + diff --git a/mosesdecoder/moses2/SCFG/Word.h b/mosesdecoder/moses2/SCFG/Word.h new file mode 100644 index 0000000000000000000000000000000000000000..e039f92e844f6b2f27b1d6c997982571d1f6d419 --- /dev/null +++ b/mosesdecoder/moses2/SCFG/Word.h @@ -0,0 +1,63 @@ +/* + * Word.h + * + * Created on: 23 Oct 2015 + * Author: hieu + */ + +#pragma once + +#include "../Word.h" + +namespace Moses2 +{ +class ManagerBase; + +namespace SCFG +{ +class Hypothesis; + +class Word: public Moses2::Word +{ +public: + bool isNonTerminal; + + explicit Word() {} + explicit Word(const SCFG::Word ©); + + void CreateFromString(FactorCollection &vocab, + const System &system, + const std::string &str); + + bool operator==(const SCFG::Word &compare) const { + int cmp = Moses2::Word::Compare(compare); + if (cmp == 0 && isNonTerminal == compare.isNonTerminal) { + return true; + } else { + return false; + } + } + + size_t hash() const; + virtual size_t hash(const std::vector &factors) const; + + virtual void OutputToStream(const System &system, std::ostream &out) const; + virtual void OutputToStream( + const ManagerBase &mgr, + size_t targetPos, + const SCFG::Hypothesis &hypo, + std::ostream &out) const; + + virtual std::string Debug(const System &system) const; + +protected: +}; + +inline size_t hash_value(const SCFG::Word &word) +{ + return word.hash(); +} + +} +} + diff --git a/mosesdecoder/moses2/SCFG/nbest/KBestExtractor.cpp b/mosesdecoder/moses2/SCFG/nbest/KBestExtractor.cpp new file mode 100644 index 0000000000000000000000000000000000000000..14d12c4b5a33f56d98395fc5eb329b55c7b18ba5 --- /dev/null +++ 
b/mosesdecoder/moses2/SCFG/nbest/KBestExtractor.cpp @@ -0,0 +1,74 @@ +/* + * KBestExtractor.cpp + * + * Created on: 2 Aug 2016 + * Author: hieu + */ +#include +#include +#include "KBestExtractor.h" +#include "../Manager.h" +#include "../Hypothesis.h" +#include "../Stacks.h" +#include "../Stack.h" +#include "../Sentence.h" +#include "../../System.h" +#include "../../Scores.h" +#include "../../legacy/Util2.h" + +using namespace std; + +namespace Moses2 +{ +//bool g_debug = false; + +namespace SCFG +{ +///////////////////////////////////////////////////////////// +KBestExtractor::KBestExtractor(const SCFG::Manager &mgr) + :m_mgr(mgr) +{ + +} + +KBestExtractor::~KBestExtractor() +{ +} + +void KBestExtractor::OutputToStream(std::stringstream &strm) +{ + //cerr << "1" << flush; + const Stack &lastStack = m_mgr.GetStacks().GetLastStack(); + UTIL_THROW_IF2(lastStack.GetColl().size() != 1, "Only suppose to be 1 hypo coll in last stack"); + UTIL_THROW_IF2(lastStack.GetColl().begin()->second == NULL, "NULL hypo collection"); + + const Hypotheses &hypos = lastStack.GetColl().begin()->second->GetSortedAndPrunedHypos(m_mgr, m_mgr.arcLists); + UTIL_THROW_IF2(hypos.size() != 1, "Only suppose to be 1 hypo in collection"); + const HypothesisBase *hypo = hypos[0]; + + const ArcLists &arcLists = m_mgr.arcLists; + const ArcList &arcList = arcLists.GetArcList(hypo); + NBests &nbests = m_nbestColl.GetOrCreateNBests(m_mgr, arcList); + + size_t ind = 0; + while (nbests.Extend(m_mgr, m_nbestColl, ind)) { + const NBest &deriv = nbests.Get(ind); + strm << m_mgr.GetTranslationId() << " ||| "; + //cerr << "1" << flush; + strm << deriv.GetStringExclSentenceMarkers(); + //cerr << "2" << flush; + strm << " ||| "; + deriv.GetScores().OutputBreakdown(strm, m_mgr.system); + //cerr << "3" << flush; + strm << "||| "; + strm << deriv.GetScores().GetTotalScore(); + //cerr << "4" << flush; + + strm << endl; + + ++ind; + } +} + +} +} /* namespace Moses2 */ diff --git 
a/mosesdecoder/moses2/SCFG/nbest/KBestExtractor.h b/mosesdecoder/moses2/SCFG/nbest/KBestExtractor.h new file mode 100644 index 0000000000000000000000000000000000000000..91b62d60b4e2a453ea323e818e3ddca67c80a5bf --- /dev/null +++ b/mosesdecoder/moses2/SCFG/nbest/KBestExtractor.h @@ -0,0 +1,40 @@ +/* + * KBestExtractor.h + * + * Created on: 2 Aug 2016 + * Author: hieu + */ +#pragma once +#include +#include +#include +#include "NBest.h" +#include "NBests.h" +#include "NBestColl.h" + +namespace Moses2 +{ +class Scores; + +namespace SCFG +{ +class Manager; +class Hypothesis; +class NBests; +class NBestScoreOrderer; + +///////////////////////////////////////////////////////////// +class KBestExtractor +{ +public: + KBestExtractor(const SCFG::Manager &mgr); + virtual ~KBestExtractor(); + + void OutputToStream(std::stringstream &strm); +protected: + const SCFG::Manager &m_mgr; + NBestColl m_nbestColl; +}; + +} +} /* namespace Moses2 */ diff --git a/mosesdecoder/moses2/SCFG/nbest/NBest.cpp b/mosesdecoder/moses2/SCFG/nbest/NBest.cpp new file mode 100644 index 0000000000000000000000000000000000000000..1057fa0046a4be5ee748ed8164b3eb2c54738f2a --- /dev/null +++ b/mosesdecoder/moses2/SCFG/nbest/NBest.cpp @@ -0,0 +1,192 @@ +/* + * NBest.cpp + * + * Created on: 24 Aug 2016 + * Author: hieu + */ +#include +#include +#include "util/exception.hh" +#include "NBest.h" +#include "NBests.h" +#include "NBestColl.h" +#include "../Manager.h" +#include "../TargetPhraseImpl.h" +#include "../../System.h" + +using namespace std; + +namespace Moses2 +{ +namespace SCFG +{ + +NBest::NBest( + const SCFG::Manager &mgr, + const ArcList &varcList, + size_t vind, + NBestColl &nbestColl) + :arcList(&varcList) + ,arcInd(vind) +{ + const SCFG::Hypothesis &hypo = GetHypo(); + + // copy scores from best hypo + MemPool &pool = mgr.GetPool(); + m_scores = new (pool.Allocate()) + Scores(mgr.system, pool, mgr.system.featureFunctions.GetNumScores(), hypo.GetScores()); + + // children + const ArcLists &arcLists = 
mgr.arcLists; + //const SCFG::TargetPhraseImpl &tp = hypo.GetTargetPhrase(); + + const Vector &prevHypos = hypo.GetPrevHypos(); + for (size_t i = 0; i < prevHypos.size(); ++i) { + const SCFG::Hypothesis *prevHypo = prevHypos[i]; + const ArcList &childArc = arcLists.GetArcList(prevHypo); + NBests &childNBests = nbestColl.GetOrCreateNBests(mgr, childArc); + Child child(&childNBests, 0); + children.push_back(child); + } + + stringstream strm; + OutputToStream(mgr, strm); + m_str = strm.str(); +} + +NBest::NBest(const SCFG::Manager &mgr, + const NBest &orig, + size_t childInd, + NBestColl &nbestColl) + :arcList(orig.arcList) + ,arcInd(orig.arcInd) + ,children(orig.children) +{ + Child &child = children[childInd]; + size_t &ind = child.second; + ++ind; + UTIL_THROW_IF2(ind >= child.first->GetSize(), + "out of bound:" << ind << ">=" << child.first->GetSize()); + + // scores + MemPool &pool = mgr.GetPool(); + m_scores = new (pool.Allocate()) + Scores(mgr.system, + pool, + mgr.system.featureFunctions.GetNumScores(), + orig.GetScores()); + + const Scores &origScores = orig.GetChild(childInd).GetScores(); + const Scores &newScores = GetChild(childInd).GetScores(); + + m_scores->MinusEquals(mgr.system, origScores); + m_scores->PlusEquals(mgr.system, newScores); + + stringstream strm; + OutputToStream(mgr, strm); + m_str = strm.str(); +} + +const SCFG::Hypothesis &NBest::GetHypo() const +{ + const HypothesisBase *hypoBase = (*arcList)[arcInd]; + const SCFG::Hypothesis &hypo = *static_cast(hypoBase); + return hypo; +} + +const NBest &NBest::GetChild(size_t ind) const +{ + const Child &child = children[ind]; + const NBests &nbests = *child.first; + const NBest &nbest = nbests.Get(child.second); + return nbest; +} + + +void NBest::CreateDeviants( + const SCFG::Manager &mgr, + NBestColl &nbestColl, + Contenders &contenders) const +{ + if (arcInd + 1 < arcList->size()) { + // to use next arclist, all children must be 1st. 
Not sure if this is correct + bool ok = true; + BOOST_FOREACH(const Child &child, children) { + if (child.second) { + ok = false; + break; + } + } + + if (ok) { + NBest *next = new NBest(mgr, *arcList, arcInd + 1, nbestColl); + contenders.push(next); + } + } + + for (size_t childInd = 0; childInd < children.size(); ++childInd) { + const Child &child = children[childInd]; + NBests &childNBests = *child.first; + bool extended = childNBests.Extend(mgr, nbestColl, child.second + 1); + if (extended) { + //cerr << "HH1 " << childInd << endl; + NBest *next = new NBest(mgr, *this, childInd, nbestColl); + + //cerr << "HH2 " << childInd << endl; + contenders.push(next); + //cerr << "HH3 " << childInd << endl; + } + } +} + +void NBest::OutputToStream( + const SCFG::Manager &mgr, + std::stringstream &strm) const +{ + const SCFG::Hypothesis &hypo = GetHypo(); + //strm << &hypo << " "; + + const SCFG::TargetPhraseImpl &tp = hypo.GetTargetPhrase(); + + for (size_t targetPos = 0; targetPos < tp.GetSize(); ++targetPos) { + const SCFG::Word &word = tp[targetPos]; + //cerr << "word " << pos << "=" << word << endl; + if (word.isNonTerminal) { + //cerr << "is nt" << endl; + // non-term. 
fill out with prev hypo + size_t nonTermInd = tp.GetAlignNonTerm().GetNonTermIndexMap()[targetPos]; + + UTIL_THROW_IF2(nonTermInd >= children.size(), "Out of bounds:" << nonTermInd << ">=" << children.size()); + + const NBest &nbest = GetChild(nonTermInd); + strm << nbest.GetString(); + } else { + //cerr << "not nt" << endl; + word.OutputToStream(hypo.GetManager(), targetPos, hypo, strm); + + strm << " "; + } + } +} + +std::string NBest::Debug(const System &system) const +{ + stringstream strm; + strm << GetScores().GetTotalScore() << " " + << arcList << "(" + << arcList->size() << ")[" + << arcInd << "] "; + for (size_t i = 0; i < children.size(); ++i) { + const Child &child = children[i]; + const NBest &childNBest = child.first->Get(child.second); + + strm << child.first << "(" + << child.first->GetSize() << ")[" + << child.second << "]"; + strm << childNBest.GetScores().GetTotalScore() << " "; + } + return strm.str(); +} + +} +} diff --git a/mosesdecoder/moses2/SCFG/nbest/NBest.h b/mosesdecoder/moses2/SCFG/nbest/NBest.h new file mode 100644 index 0000000000000000000000000000000000000000..6b406fa170ba3a3f9aa85f6efb4be1e6b98d162a --- /dev/null +++ b/mosesdecoder/moses2/SCFG/nbest/NBest.h @@ -0,0 +1,99 @@ +/* + * NBest.h + * + * Created on: 24 Aug 2016 + * Author: hieu + */ + +#pragma once +#include +#include +#include +#include +#include "../../Scores.h" +#include "../../ArcLists.h" + +namespace Moses2 +{ +class Scores; +class System; + +namespace SCFG +{ +class NBest; +class NBests; +class NBestScoreOrderer; +class Manager; +class NBestColl; +class Hypothesis; + +///////////////////////////////////////////////////////////// +typedef std::priority_queue, NBestScoreOrderer> Contenders; + +///////////////////////////////////////////////////////////// +class NBest +{ +public: + const ArcList *arcList; + size_t arcInd; + + typedef std::pair Child; // key to another NBest + typedef std::vector Children; + Children children; + + NBest(const SCFG::Manager &mgr, + const 
ArcList &varcList, + size_t vind, + NBestColl &nbestColl); + + NBest(const SCFG::Manager &mgr, + const NBest &orig, + size_t childInd, + NBestColl &nbestColl); + + + void CreateDeviants( + const SCFG::Manager &mgr, + NBestColl &nbestColl, + Contenders &contenders) const; + + const Scores &GetScores() const { + return *m_scores; + } + + const NBest &GetChild(size_t ind) const; + + const std::string &GetString() const { + return m_str; + } + + std::string GetStringExclSentenceMarkers() const { + std::string ret = m_str.substr(4, m_str.size() - 10); + return ret; + } + + std::string Debug(const System &system) const; + +protected: + Scores *m_scores; + std::string m_str; + + const SCFG::Hypothesis &GetHypo() const; + + void OutputToStream( + const SCFG::Manager &mgr, + std::stringstream &strm) const; +}; + +///////////////////////////////////////////////////////////// +class NBestScoreOrderer +{ +public: + bool operator()(const NBest* a, const NBest* b) const { + return a->GetScores().GetTotalScore() < b->GetScores().GetTotalScore(); + } +}; + +} +} + diff --git a/mosesdecoder/moses2/SCFG/nbest/NBestColl.cpp b/mosesdecoder/moses2/SCFG/nbest/NBestColl.cpp new file mode 100644 index 0000000000000000000000000000000000000000..38a9ac867ff35ffe63f3eab9062f4884b2427f6e --- /dev/null +++ b/mosesdecoder/moses2/SCFG/nbest/NBestColl.cpp @@ -0,0 +1,52 @@ +/* + * NBestColl.cpp + * + * Created on: 24 Aug 2016 + * Author: hieu + */ +#include +#include "util/exception.hh" +#include "NBestColl.h" +#include "NBests.h" +#include "../Manager.h" +#include "../../System.h" + +using namespace std; + +namespace Moses2 +{ +namespace SCFG +{ + +///////////////////////////////////////////////////////////// +NBestColl::~NBestColl() +{ + BOOST_FOREACH(const Coll::value_type &valPair, m_candidates) { + NBests *nbests = valPair.second; + delete nbests; + } +} + +void NBestColl::Add(const SCFG::Manager &mgr, const ArcList &arcList) +{ + NBests &nbests = GetOrCreateNBests(mgr, arcList); + //cerr << 
"nbests for " << &nbests << ":"; +} + +NBests &NBestColl::GetOrCreateNBests(const SCFG::Manager &mgr, const ArcList &arcList) +{ + NBests *ret; + Coll::iterator iter = m_candidates.find(&arcList); + if(iter == m_candidates.end()) { + ret = new NBests(mgr, arcList, *this); + m_candidates[&arcList] = ret; + } else { + ret = iter->second; + } + return *ret; +} + + +} +} + diff --git a/mosesdecoder/moses2/SCFG/nbest/NBestColl.h b/mosesdecoder/moses2/SCFG/nbest/NBestColl.h new file mode 100644 index 0000000000000000000000000000000000000000..01e5763e49eda14d53ceafba19b11a465e11026b --- /dev/null +++ b/mosesdecoder/moses2/SCFG/nbest/NBestColl.h @@ -0,0 +1,36 @@ +/* + * NBestColl.h + * + * Created on: 24 Aug 2016 + * Author: hieu + */ +#pragma once +#include +#include "../../ArcLists.h" + + +namespace Moses2 +{ +namespace SCFG +{ +class NBests; +class Manager; + +class NBestColl +{ +public: + virtual ~NBestColl(); + + void Add(const SCFG::Manager &mgr, const ArcList &arcList); + NBests &GetOrCreateNBests(const SCFG::Manager &mgr, const ArcList &arcList); + +protected: + typedef boost::unordered_map Coll; + Coll m_candidates; + +}; + +} +} + + diff --git a/mosesdecoder/moses2/SCFG/nbest/NBests.cpp b/mosesdecoder/moses2/SCFG/nbest/NBests.cpp new file mode 100644 index 0000000000000000000000000000000000000000..27376977fd2bbf50955ba7cf67fb8713f26cceab --- /dev/null +++ b/mosesdecoder/moses2/SCFG/nbest/NBests.cpp @@ -0,0 +1,109 @@ +/* + * NBests.cpp + * + * Created on: 24 Aug 2016 + * Author: hieu + */ + +#include +#include "NBests.h" +#include "../Manager.h" +#include "../../System.h" + +using namespace std; + +namespace Moses2 +{ +namespace SCFG +{ +NBests::NBests(const SCFG::Manager &mgr, + const ArcList &arcList, + NBestColl &nbestColl) + :indIter(0) +{ + // best + NBest *contender = new NBest(mgr, arcList, 0, nbestColl); + contenders.push(contender); + bool extended = Extend(mgr, nbestColl, 0); + assert(extended); +} + +NBests::~NBests() +{ + BOOST_FOREACH(const NBest 
*nbest, m_coll) { + delete nbest; + } + + // delete bad contenders left in queue + while (!contenders.empty()) { + NBest *contender = contenders.top(); + contenders.pop(); + delete contender; + } +} + +bool NBests::Extend(const SCFG::Manager &mgr, + NBestColl &nbestColl, + size_t ind) +{ + if (ind < m_coll.size()) { + // asking for 1 we've dont already + return true; + } + + assert(ind == m_coll.size()); + + // checks + if (ind >= mgr.system.options.nbest.nbest_size) { + return false; + } + + size_t maxIter = mgr.system.options.nbest.nbest_size * mgr.system.options.nbest.factor; + + // MAIN LOOP, create 1 new deriv. + // The loop is for distinct nbest + bool ok = false; + while (!ok) { + ++indIter; + if (indIter > maxIter) { + return false; + } + + if (contenders.empty()) { + return false; + } + + NBest *contender = contenders.top(); + contenders.pop(); + + contender->CreateDeviants(mgr, nbestColl, contenders); + + if (mgr.system.options.nbest.only_distinct) { + const string &tgtPhrase = contender->GetString(); + //cerr << "tgtPhrase=" << tgtPhrase << endl; + boost::hash string_hash; + size_t hash = string_hash(tgtPhrase); + + if (distinctHypos.insert(hash).second) { + ok = true; + } + } else { + ok = true; + } + + if (ok) { + Add(contender); + //cerr << best->GetScores().GetTotalScore() << " "; + //cerr << best->Debug(mgr.system) << endl; + return true; + } else { + delete contender; + } + } + + return false; +} + +} +} + diff --git a/mosesdecoder/moses2/SCFG/nbest/NBests.h b/mosesdecoder/moses2/SCFG/nbest/NBests.h new file mode 100644 index 0000000000000000000000000000000000000000..97fe9a025e12ef5cda0f8c39ac6056dd98321409 --- /dev/null +++ b/mosesdecoder/moses2/SCFG/nbest/NBests.h @@ -0,0 +1,54 @@ +/* + * NBests.h + * + * Created on: 24 Aug 2016 + * Author: hieu + */ + +#pragma once +#include +#include "NBest.h" + +namespace Moses2 +{ +namespace SCFG +{ + +class NBests +{ +public: + Contenders contenders; + boost::unordered_set distinctHypos; + + NBests(const 
SCFG::Manager &mgr, + const ArcList &arcList, + NBestColl &nbestColl); + + virtual ~NBests(); + + size_t GetSize() const { + return m_coll.size(); + } + + const NBest &Get(size_t ind) const { + return *m_coll[ind]; + } + + bool Extend(const SCFG::Manager &mgr, + NBestColl &nbestColl, + size_t ind); + +protected: + std::vector m_coll; + size_t indIter; + + void Add(const NBest *nbest) { + m_coll.push_back(nbest); + } + +}; + + +} +} + diff --git a/mosesdecoder/moses2/Scores.cpp b/mosesdecoder/moses2/Scores.cpp new file mode 100644 index 0000000000000000000000000000000000000000..6cf12142227574a815acf06b62c8bed9d82ed6b2 --- /dev/null +++ b/mosesdecoder/moses2/Scores.cpp @@ -0,0 +1,283 @@ +/* + * Scores.cpp + * + * Created on: 23 Oct 2015 + * Author: hieu + */ + +#include +#include +#include +#include +#include "Scores.h" +#include "Weights.h" +#include "System.h" +#include "FF/FeatureFunction.h" +#include "FF/FeatureFunctions.h" +#include "legacy/Util2.h" + +using namespace std; + +namespace Moses2 +{ + +Scores::Scores(const System &system, MemPool &pool, size_t numScores) : + m_total(0) +{ + if (system.options.nbest.nbest_size) { + m_scores = new (pool.Allocate(numScores)) SCORE[numScores]; + Init(m_scores, numScores, 0); + } else { + m_scores = NULL; + } +} + +Scores::Scores(const System &system, MemPool &pool, size_t numScores, + const Scores &origScores) : + m_total(origScores.m_total) +{ + if (system.options.nbest.nbest_size) { + m_scores = new (pool.Allocate(numScores)) SCORE[numScores]; + memcpy(m_scores, origScores.m_scores, sizeof(SCORE) * numScores); + } else { + m_scores = NULL; + } +} + +Scores::~Scores() +{ + +} + +const SCORE *Scores::GetScores(const FeatureFunction &featureFunction) const +{ + assert(m_scores); + size_t ffStartInd = featureFunction.GetStartInd(); + const SCORE &scores = m_scores[ffStartInd]; + return &scores; +} + +void Scores::Reset(const System &system) +{ + if (system.options.nbest.nbest_size) { + size_t numScores = 
system.featureFunctions.GetNumScores(); + Init(m_scores, numScores, 0); + } + m_total = 0; +} + +void Scores::PlusEquals(const System &system, + const FeatureFunction &featureFunction, const SCORE &score) +{ + assert(featureFunction.GetNumScores() == 1); + + const Weights &weights = system.weights; + + size_t ffStartInd = featureFunction.GetStartInd(); + if (system.options.nbest.nbest_size) { + m_scores[ffStartInd] += score; + } + SCORE weight = weights[ffStartInd]; + m_total += score * weight; +} + +void Scores::PlusEquals(const System &system, + const FeatureFunction &featureFunction, const SCORE &score, size_t offset) +{ + assert(offset < featureFunction.GetNumScores()); + + const Weights &weights = system.weights; + + size_t ffStartInd = featureFunction.GetStartInd(); + if (system.options.nbest.nbest_size) { + m_scores[ffStartInd + offset] += score; + } + SCORE weight = weights[ffStartInd + offset]; + m_total += score * weight; +} + +void Scores::PlusEquals(const System &system, + const FeatureFunction &featureFunction, const std::vector &scores) +{ + assert(scores.size() == featureFunction.GetNumScores()); + + const Weights &weights = system.weights; + + size_t ffStartInd = featureFunction.GetStartInd(); + for (size_t i = 0; i < scores.size(); ++i) { + SCORE incrScore = scores[i]; + if (system.options.nbest.nbest_size) { + m_scores[ffStartInd + i] += incrScore; + } + //cerr << "ffStartInd=" << ffStartInd << " " << i << endl; + SCORE weight = weights[ffStartInd + i]; + m_total += incrScore * weight; + } +} + +void Scores::PlusEquals(const System &system, + const FeatureFunction &featureFunction, SCORE scores[]) +{ + //assert(scores.size() == featureFunction.GetNumScores()); + + const Weights &weights = system.weights; + + size_t ffStartInd = featureFunction.GetStartInd(); + for (size_t i = 0; i < featureFunction.GetNumScores(); ++i) { + SCORE incrScore = scores[i]; + if (system.options.nbest.nbest_size) { + m_scores[ffStartInd + i] += incrScore; + } + //cerr << 
"ffStartInd=" << ffStartInd << " " << i << endl; + SCORE weight = weights[ffStartInd + i]; + m_total += incrScore * weight; + } +} + +void Scores::PlusEquals(const System &system, const Scores &other) +{ + size_t numScores = system.featureFunctions.GetNumScores(); + if (system.options.nbest.nbest_size) { + for (size_t i = 0; i < numScores; ++i) { + m_scores[i] += other.m_scores[i]; + } + } + m_total += other.m_total; +} + +void Scores::MinusEquals(const System &system, const Scores &other) +{ + size_t numScores = system.featureFunctions.GetNumScores(); + if (system.options.nbest.nbest_size) { + for (size_t i = 0; i < numScores; ++i) { + m_scores[i] -= other.m_scores[i]; + } + } + m_total -= other.m_total; +} + +void Scores::Assign(const System &system, + const FeatureFunction &featureFunction, const SCORE &score) +{ + assert(featureFunction.GetNumScores() == 1); + + const Weights &weights = system.weights; + + size_t ffStartInd = featureFunction.GetStartInd(); + + if (system.options.nbest.nbest_size) { + assert(m_scores[ffStartInd] == 0); + m_scores[ffStartInd] = score; + } + SCORE weight = weights[ffStartInd]; + m_total += score * weight; + +} + +void Scores::Assign(const System &system, + const FeatureFunction &featureFunction, const std::vector &scores) +{ + assert(scores.size() == featureFunction.GetNumScores()); + + const Weights &weights = system.weights; + + size_t ffStartInd = featureFunction.GetStartInd(); + for (size_t i = 0; i < scores.size(); ++i) { + SCORE incrScore = scores[i]; + + if (system.options.nbest.nbest_size) { + assert(m_scores[ffStartInd + i] == 0); + m_scores[ffStartInd + i] = incrScore; + } + //cerr << "ffStartInd=" << ffStartInd << " " << i << endl; + SCORE weight = weights[ffStartInd + i]; + m_total += incrScore * weight; + } +} + +void Scores::CreateFromString(const std::string &str, + const FeatureFunction &featureFunction, const System &system, + bool transformScores) +{ + vector scores = Tokenize(str); + if (transformScores) { + 
std::transform(scores.begin(), scores.end(), scores.begin(), + TransformScore); + std::transform(scores.begin(), scores.end(), scores.begin(), FloorScore); + } + + /* + std::copy(scores.begin(),scores.end(), + std::ostream_iterator(cerr," ")); + */ + + PlusEquals(system, featureFunction, scores); +} + +std::string Scores::Debug(const System &system) const +{ + stringstream out; + out << "total=" << m_total; + + if (system.options.nbest.nbest_size) { + out << ", "; + BOOST_FOREACH(const FeatureFunction *ff, system.featureFunctions.GetFeatureFunctions()) { + out << ff->GetName() << "= "; + for (size_t i = ff->GetStartInd(); i < (ff->GetStartInd() + ff->GetNumScores()); ++i) { + out << m_scores[i] << " "; + } + } + } + + return out.str(); +} + +void Scores::OutputBreakdown(std::ostream &out, const System &system) const +{ + if (system.options.nbest.nbest_size) { + BOOST_FOREACH(const FeatureFunction *ff, system.featureFunctions.GetFeatureFunctions()) { + if (ff->IsTuneable()) { + out << ff->GetName() << "= "; + for (size_t i = ff->GetStartInd(); i < (ff->GetStartInd() + ff->GetNumScores()); ++i) { + out << m_scores[i] << " "; + } + } + } + } +} + +// static functions to work out estimated scores +SCORE Scores::CalcWeightedScore(const System &system, + const FeatureFunction &featureFunction, SCORE scores[]) +{ + SCORE ret = 0; + + const Weights &weights = system.weights; + + size_t ffStartInd = featureFunction.GetStartInd(); + for (size_t i = 0; i < featureFunction.GetNumScores(); ++i) { + SCORE incrScore = scores[i]; + + //cerr << "ffStartInd=" << ffStartInd << " " << i << endl; + SCORE weight = weights[ffStartInd + i]; + ret += incrScore * weight; + } + + return ret; +} + +SCORE Scores::CalcWeightedScore(const System &system, + const FeatureFunction &featureFunction, SCORE score) +{ + const Weights &weights = system.weights; + assert(featureFunction.GetNumScores() == 1); + + size_t ffStartInd = featureFunction.GetStartInd(); + SCORE weight = weights[ffStartInd]; + 
SCORE ret = score * weight; + + return ret; +} + +} + diff --git a/mosesdecoder/moses2/Scores.h b/mosesdecoder/moses2/Scores.h new file mode 100644 index 0000000000000000000000000000000000000000..5069fda3608039d549a53c7704e6a61bb174c462 --- /dev/null +++ b/mosesdecoder/moses2/Scores.h @@ -0,0 +1,81 @@ +/* + * Scores.h + * + * Created on: 23 Oct 2015 + * Author: hieu + */ + +#pragma once +#include +#include +#include "TypeDef.h" +#include "MemPool.h" + +namespace Moses2 +{ + +class FeatureFunction; +class FeatureFunctions; +class System; + +class Scores +{ +public: + Scores(const System &system, MemPool &pool, size_t numScores); + Scores(const System &system, MemPool &pool, size_t numScores, + const Scores &origScores); + + virtual ~Scores(); + + SCORE GetTotalScore() const { + return m_total; + } + + const SCORE *GetScores(const FeatureFunction &featureFunction) const; + + void Reset(const System &system); + + void CreateFromString(const std::string &str, + const FeatureFunction &featureFunction, const System &system, + bool transformScores); + + void PlusEquals(const System &system, const FeatureFunction &featureFunction, + const SCORE &score); + + void PlusEquals(const System &system, const FeatureFunction &featureFunction, + const SCORE &score, size_t offset); + + void PlusEquals(const System &system, const FeatureFunction &featureFunction, + const std::vector &scores); + + void PlusEquals(const System &system, const FeatureFunction &featureFunction, + SCORE scores[]); + + void PlusEquals(const System &system, const Scores &scores); + + void MinusEquals(const System &system, const Scores &scores); + + void Assign(const System &system, const FeatureFunction &featureFunction, + const SCORE &score); + + void Assign(const System &system, const FeatureFunction &featureFunction, + const std::vector &scores); + + std::string Debug(const System &system) const; + + void OutputBreakdown(std::ostream &out, const System &system) const; + + // static functions to work out 
estimated scores + static SCORE CalcWeightedScore(const System &system, + const FeatureFunction &featureFunction, SCORE scores[]); + + static SCORE CalcWeightedScore(const System &system, + const FeatureFunction &featureFunction, SCORE score); + +protected: + SCORE *m_scores; + SCORE m_total; +}; + +} + diff --git a/mosesdecoder/moses2/SubPhrase.cpp b/mosesdecoder/moses2/SubPhrase.cpp new file mode 100644 index 0000000000000000000000000000000000000000..4d3c20f145048409c78b8d82d6e7608a2eab545c --- /dev/null +++ b/mosesdecoder/moses2/SubPhrase.cpp @@ -0,0 +1,17 @@ +/* + * SubPhrase.cpp + * + * Created on: 19 Feb 2016 + * Author: hieu + */ +#include "SubPhrase.h" + +using namespace std; + +namespace Moses2 +{ + + + +} + diff --git a/mosesdecoder/moses2/SubPhrase.h b/mosesdecoder/moses2/SubPhrase.h new file mode 100644 index 0000000000000000000000000000000000000000..21b003912666f5150f9314660595873b83b6d825 --- /dev/null +++ b/mosesdecoder/moses2/SubPhrase.h @@ -0,0 +1,54 @@ +#pragma once +#include +#include "Phrase.h" +#include "Word.h" +#include "SCFG/Word.h" + +namespace Moses2 +{ +class System; + +template +class SubPhrase: public Phrase +{ +public: + SubPhrase(const Phrase &origPhrase, size_t start, size_t size) + :m_origPhrase(&origPhrase) + ,m_start(start) + ,m_size(size) + {} + + virtual const WORD& operator[](size_t pos) const { + return (*m_origPhrase)[pos + m_start]; + } + + virtual size_t GetSize() const { + return m_size; + } + + SubPhrase GetSubPhrase(size_t start, size_t size) const { + SubPhrase ret(*m_origPhrase, m_start + start, size); + return ret; + } + + virtual std::string Debug(const System &system) const { + std::stringstream out; + if (GetSize()) { + out << (*this)[0].Debug(system); + for (size_t i = 1; i < GetSize(); ++i) { + const WORD &word = (*this)[i]; + out << " " << word.Debug(system); + } + } + + return out.str(); + } + +protected: + const Phrase *m_origPhrase; + size_t m_start, m_size; +}; + + +} + diff --git 
a/mosesdecoder/moses2/System.cpp b/mosesdecoder/moses2/System.cpp new file mode 100644 index 0000000000000000000000000000000000000000..91c247b75d7c35e7458110d878b7e602f0f6bfd2 --- /dev/null +++ b/mosesdecoder/moses2/System.cpp @@ -0,0 +1,224 @@ +/* + * System.cpp + * + * Created on: 23 Oct 2015 + * Author: hieu + */ +#include +#include +#include +#include +#include +#include "System.h" +#include "FF/FeatureFunction.h" +#include "TranslationModel/UnknownWordPenalty.h" +#include "legacy/Util2.h" +#include "util/exception.hh" + +using namespace std; + +namespace Moses2 +{ + +thread_local MemPool System::m_managerPool; +thread_local MemPool System::m_systemPool; +thread_local Recycler System::m_hypoRecycler; + +System::System(const Parameter ¶msArg) : + params(paramsArg), featureFunctions(*this) +{ + options.init(paramsArg); + IsPb(); + + bestCollector.reset(new OutputCollector()); + + params.SetParameter(cpuAffinityOffset, "cpu-affinity-offset", -1); + params.SetParameter(cpuAffinityOffsetIncr, "cpu-affinity-increment", 1); + + const PARAM_VEC *section; + + // output collectors + if (options.nbest.nbest_size) { + nbestCollector.reset(new OutputCollector(options.nbest.output_file_path)); + } + + if (!options.output.detailed_transrep_filepath.empty()) { + detailedTranslationCollector.reset(new OutputCollector(options.output.detailed_transrep_filepath)); + } + + featureFunctions.Create(); + LoadWeights(); + + if (params.GetParam("show-weights")) { + cerr << "Showing weights then exit" << endl; + featureFunctions.ShowWeights(weights); + //return; + } + + cerr << "START featureFunctions.Load()" << endl; + featureFunctions.Load(); + cerr << "START LoadMappings()" << endl; + LoadMappings(); + cerr << "END LoadMappings()" << endl; + LoadDecodeGraphBackoff(); + cerr << "END LoadDecodeGraphBackoff()" << endl; + + UTIL_THROW_IF2(options.input.xml_policy == XmlConstraint, "XmlConstraint not supported"); + + // max spans for scfg decoding + if (!isPb) { + section = 
params.GetParam("max-chart-span"); + if (section && section->size()) { + maxChartSpans = Scan(*section); + maxChartSpans.resize(mappings.size(), DEFAULT_MAX_CHART_SPAN); + + /* + cerr << "maxChartSpans=" << maxChartSpans.size(); + for (size_t i = 0; i < maxChartSpans.size(); ++i) { + cerr << " " << mappings[i]->GetName() << "=" << maxChartSpans[i]; + } + cerr << endl; + */ + } + } + +} + +System::~System() +{ +} + +void System::LoadWeights() +{ + weights.Init(featureFunctions); + + //cerr << "Weights:" << endl; + typedef std::map > WeightMap; + const WeightMap &allWeights = params.GetAllWeights(); + + // check all weights are there for all FF + const std::vector &ffs = featureFunctions.GetFeatureFunctions(); + BOOST_FOREACH(const FeatureFunction *ff, ffs) { + if (ff->IsTuneable()) { + const std::string &ffName = ff->GetName(); + WeightMap::const_iterator iterWeight = allWeights.find(ffName); + UTIL_THROW_IF2(iterWeight == allWeights.end(), "Must specify weight for " << ffName); + } + } + + + // set weight + BOOST_FOREACH(const WeightMap::value_type &valPair, allWeights) { + const string &ffName = valPair.first; + const std::vector &ffWeights = valPair.second; + /* + cerr << ffName << "="; + for (size_t i = 0; i < ffWeights.size(); ++i) { + cerr << ffWeights[i] << " "; + } + cerr << endl; + */ + weights.SetWeights(featureFunctions, ffName, ffWeights); + } +} + +void System::LoadMappings() +{ + const PARAM_VEC *vec = params.GetParam("mapping"); + UTIL_THROW_IF2(vec == NULL, "Must have [mapping] section"); + + BOOST_FOREACH(const std::string &line, *vec) { + vector toks = Tokenize(line); + assert( (toks.size() == 2 && toks[0] == "T") || (toks.size() == 3 && toks[1] == "T") ); + + size_t ptInd; + if (toks.size() == 2) { + ptInd = Scan(toks[1]); + } else { + ptInd = Scan(toks[2]); + } + const PhraseTable *pt = featureFunctions.GetPhraseTableExcludeUnknownWordPenalty(ptInd); + mappings.push_back(pt); + } + +// unk pt + const UnknownWordPenalty *unkWP = 
featureFunctions.GetUnknownWordPenalty(); + if (unkWP) { + mappings.push_back(unkWP); + } +} + +void System::LoadDecodeGraphBackoff() +{ + /* For every entry in mappings, set decodeGraphBackoff: the i-th "decoding-graph-backoff" value when one was supplied, otherwise 1 for the unknown-word-penalty table and 0 for every other table. */ + const PARAM_VEC *vec = params.GetParam("decoding-graph-backoff"); + + for (size_t i = 0; i < mappings.size(); ++i) { + PhraseTable *pt = const_cast(mappings[i]); + + /* Only read (*vec)[i] when index i exists. The former test (vec->size() < i) skipped every valid index and indexed past the end of vec whenever it fired. */ + if (vec && i < vec->size()) { + pt->decodeGraphBackoff = Scan((*vec)[i]); + } else if (pt == featureFunctions.GetUnknownWordPenalty()) { + pt->decodeGraphBackoff = 1; + } else { + pt->decodeGraphBackoff = 0; + } + } +} + +MemPool &System::GetSystemPool() const +{ + return m_systemPool; +} + +MemPool &System::GetManagerPool() const +{ + return m_managerPool; +} + +FactorCollection &System::GetVocab() const +{ + return m_vocab; +} + +Recycler &System::GetHypoRecycler() const +{ + return m_hypoRecycler; +} + +Batch &System::GetBatch(MemPool &pool) const +{ + Batch *obj; + obj = m_batch.get(); + if (obj == NULL) { + obj = new Batch(pool); + m_batch.reset(obj); + } + assert(obj); + return *obj; +} + +void System::IsPb() +{ + switch (options.search.algo) { + case Normal: + case NormalBatch: + case CubePruning: + case CubePruningPerMiniStack: + case CubePruningPerBitmap: + case CubePruningCardinalStack: + case CubePruningBitmapStack: + case CubePruningMiniStack: + isPb = true; + break; + case CYKPlus: + isPb = false; + break; + default: + abort(); + break; + } +} + + +} + diff --git a/mosesdecoder/moses2/System.h b/mosesdecoder/moses2/System.h new file mode 100644 index 0000000000000000000000000000000000000000..732b2ed4d36e398e6e360a1834008490a2620614 --- /dev/null +++ b/mosesdecoder/moses2/System.h @@ -0,0 +1,87 @@ +/* + * System.h + * + * Created on: 23 Oct 2015 + * Author: hieu + */ + +#pragma once +#include +#include +#include +#include +#include +#include "FF/FeatureFunctions.h" +#include "Weights.h" +#include "MemPool.h" +#include "Recycler.h" +#include "legacy/FactorCollection.h" +#include "legacy/Parameter.h" +#include "TypeDef.h" +#include "legacy/Bitmaps.h"
+#include "legacy/OutputCollector.h" +#include "parameters/AllOptions.h" + +namespace Moses2 +{ +namespace NSCubePruning +{ +class Stack; +} + +class FeatureFunction; +class StatefulFeatureFunction; +class PhraseTable; +class HypothesisBase; + +class System +{ +public: + const Parameter ¶ms; + AllOptions options; + FeatureFunctions featureFunctions; + Weights weights; + std::vector mappings; + + std::vector maxChartSpans; + bool isPb; + + mutable boost::shared_ptr bestCollector, nbestCollector, detailedTranslationCollector; + + // moses.ini params + int cpuAffinityOffset; + int cpuAffinityOffsetIncr; + + System(const Parameter ¶msArg); + virtual ~System(); + + MemPool &GetSystemPool() const; + MemPool &GetManagerPool() const; + FactorCollection &GetVocab() const; + + Recycler &GetHypoRecycler() const; + + Batch &GetBatch(MemPool &pool) const; + +protected: + mutable FactorCollection m_vocab; + //mutable boost::thread_specific_ptr m_managerPool; + //mutable boost::thread_specific_ptr m_systemPool; + thread_local static MemPool m_managerPool; + thread_local static MemPool m_systemPool; + thread_local static Recycler m_hypoRecycler; + + //thread_local static MemPool d; + + mutable boost::thread_specific_ptr m_batch; + + void LoadWeights(); + void LoadMappings(); + void LoadDecodeGraphBackoff(); + + void IsPb(); + +}; + +} + diff --git a/mosesdecoder/moses2/TargetPhrase.cpp b/mosesdecoder/moses2/TargetPhrase.cpp new file mode 100644 index 0000000000000000000000000000000000000000..600d41ae75fd19578737d70a01f0b57b9c853b1b --- /dev/null +++ b/mosesdecoder/moses2/TargetPhrase.cpp @@ -0,0 +1,15 @@ +/* + * TargetPhrase.cpp + * + * Created on: 26 Apr 2016 + * Author: hieu + */ + +#include "TargetPhrase.h" +#include "System.h" +#include "Scores.h" + +namespace Moses2 +{ + +} /* namespace Moses2 */ diff --git a/mosesdecoder/moses2/TargetPhrase.h b/mosesdecoder/moses2/TargetPhrase.h new file mode 100644 index 
0000000000000000000000000000000000000000..2522f85df5d2c9eb8763de84591a7bfdf475cef1 --- /dev/null +++ b/mosesdecoder/moses2/TargetPhrase.h @@ -0,0 +1,164 @@ +/* + * TargetPhrase.h + * + * Created on: 26 Apr 2016 + * Author: hieu + */ + +#pragma once +#include +#include "PhraseImplTemplate.h" +#include "System.h" +#include "Scores.h" +#include "AlignmentInfoCollection.h" +#include "TranslationModel/PhraseTable.h" + +namespace Moses2 +{ +class AlignmentInfo; + +template +class TargetPhrase: public PhraseImplTemplate +{ +public: + typedef PhraseImplTemplate Parent; + const PhraseTable &pt; + mutable void **ffData; + SCORE *scoreProperties; + + TargetPhrase(MemPool &pool, const PhraseTable &pt, const System &system, size_t size) + : PhraseImplTemplate(pool, size) + , pt(pt) + , scoreProperties(NULL) + , m_alignTerm(&AlignmentInfoCollection::Instance().GetEmptyAlignmentInfo()) { + m_scores = new (pool.Allocate()) Scores(system, pool, + system.featureFunctions.GetNumScores()); + } + + Scores &GetScores() { + return *m_scores; + } + + const Scores &GetScores() const { + return *m_scores; + } + + virtual SCORE GetScoreForPruning() const = 0; + + SCORE *GetScoresProperty(int propertyInd) const { + return scoreProperties ? 
scoreProperties + propertyInd : NULL; + } + + const AlignmentInfo &GetAlignTerm() const { + return *m_alignTerm; + } + + void SetAlignTerm(const AlignmentInfo &alignInfo) { + m_alignTerm = &alignInfo; + } + + // ALNREP = alignment representation, + // see AlignmentInfo constructors for supported representations + template + void + SetAlignTerm(const ALNREP &coll) { + m_alignTerm = AlignmentInfoCollection::Instance().Add(coll); + } + + virtual void SetAlignmentInfo(const std::string &alignString) { + AlignmentInfo::CollType alignTerm; + + std::vector toks = Tokenize(alignString); + for (size_t i = 0; i < toks.size(); ++i) { + std::vector alignPair = Tokenize(toks[i], "-"); + UTIL_THROW_IF2(alignPair.size() != 2, "Wrong alignment format"); + + size_t sourcePos = alignPair[0]; + size_t targetPos = alignPair[1]; + + alignTerm.insert(std::pair(sourcePos, targetPos)); + } + + SetAlignTerm(alignTerm); + // cerr << "TargetPhrase::SetAlignmentInfo(const StringPiece &alignString) this:|" << *this << "|\n"; + + //cerr << "alignTerm=" << alignTerm.size() << endl; + //cerr << "alignNonTerm=" << alignNonTerm.size() << endl; + + } + + void OutputToStream(const System &system, const Phrase &inputPhrase, std::ostream &out) const { + // get placeholders + FactorType placeholderFactor = system.options.input.placeholder_factor; + std::map placeholders; + if (placeholderFactor != NOT_FOUND) { + // creates map of target position -> factor for placeholders + placeholders = GetPlaceholders(system, inputPhrase); + } + + size_t size = PhraseImplTemplate::GetSize(); + for (size_t i = 0; i < size; ++i) { + // output placeholder, if any + std::map::const_iterator iter = placeholders.find(i); + if (iter == placeholders.end()) { + const WORD &word = (*this)[i]; + word.OutputToStream(system, out); + } else { + const Factor *factor = iter->second; + out << *factor; + } + + out << " "; + } + } + + std::map GetPlaceholders(const System &system, const Phrase &inputPhrase) const { + FactorType 
placeholderFactor = system.options.input.placeholder_factor; + std::map ret; + //std::cerr << "inputPhrase=" << inputPhrase.Debug(system) << std::endl; + + for (size_t sourcePos = 0; sourcePos < inputPhrase.GetSize(); ++sourcePos) { + const Factor *factor = inputPhrase[sourcePos][placeholderFactor]; + if (factor) { + //std::cerr << "factor=" << *factor << std::endl; + //std::cerr << "tp=" << Debug(system) << std::endl; + std::set targetPos = GetAlignTerm().GetAlignmentsForSource(sourcePos); + UTIL_THROW_IF2(targetPos.size() != 1, + "Placeholder should be aligned to 1, and only 1, word:" << targetPos.size() << "!=1"); + ret[*targetPos.begin()] = factor; + } + } + + return ret; + } + + virtual std::string Debug(const System &system) const { + std::stringstream out; + out << Phrase::Debug(system); + out << " pt=" << pt.GetName() << " "; + out << " SCORES:" << GetScores().Debug(system); + out << " ALIGN-T:"; + out << GetAlignTerm().Debug(system); + + return out.str(); + } + +protected: + Scores *m_scores; + const AlignmentInfo *m_alignTerm; +}; + +/////////////////////////////////////////////////////////////////////// +template +struct CompareScoreForPruning { + bool operator()(const TP *a, const TP *b) const { + return a->GetScoreForPruning() > b->GetScoreForPruning(); + } + + bool operator()(const TP &a, const TP &b) const { + return a.GetScoreForPruning() > b.GetScoreForPruning(); + } +}; + +} /* namespace Moses2a */ + diff --git a/mosesdecoder/moses2/TranslationModel/CompactPT/BlockHashIndex.cpp b/mosesdecoder/moses2/TranslationModel/CompactPT/BlockHashIndex.cpp new file mode 100644 index 0000000000000000000000000000000000000000..47f03626ab9b06bd090e9fffdcb6551dbaf47ec4 --- /dev/null +++ b/mosesdecoder/moses2/TranslationModel/CompactPT/BlockHashIndex.cpp @@ -0,0 +1,418 @@ +// $Id$ +// vim:tabstop=2 +/*********************************************************************** + Moses - factored phrase-based language decoder + Copyright (C) 2006 University of Edinburgh + 
+ This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + ***********************************************************************/ + +#include "ThrowingFwrite.h" +#include "BlockHashIndex.h" +#include "CmphStringVectorAdapter.h" +#include "util/exception.hh" +#include "util/string_stream.hh" + +#ifdef HAVE_CMPH +#include "cmph.h" +#endif + +namespace Moses2 +{ +#ifdef WITH_THREADS +BlockHashIndex::BlockHashIndex(size_t orderBits, size_t fingerPrintBits, + size_t threadsNum) : + m_orderBits(orderBits), m_fingerPrintBits(fingerPrintBits), m_fileHandle(0), m_fileHandleStart( + 0), m_landmarks(true), m_size(0), m_lastSaved(-1), m_lastDropped(-1), m_numLoadedRanges( + 0), m_threadPool(threadsNum) +{ +#ifndef HAVE_CMPH + std::cerr << "minphr: CMPH support not compiled in." << std::endl; + exit(1); +#endif +} +#else +BlockHashIndex::BlockHashIndex(size_t orderBits, size_t fingerPrintBits) + : m_orderBits(orderBits), m_fingerPrintBits(fingerPrintBits), + m_fileHandle(0), m_fileHandleStart(0), m_size(0), + m_lastSaved(-1), m_lastDropped(-1), m_numLoadedRanges(0) +{ +#ifndef HAVE_CMPH + std::cerr << "minphr: CMPH support not compiled in." 
<< std::endl; + exit(1); +#endif +} +#endif + +BlockHashIndex::~BlockHashIndex() +{ +#ifdef HAVE_CMPH + for (std::vector::iterator it = m_hashes.begin(); it != m_hashes.end(); + it++) + if (*it != 0) cmph_destroy((cmph_t*) *it); + + for (std::vector*>::iterator it = m_arrays.begin(); + it != m_arrays.end(); it++) + if (*it != 0) delete *it; +#endif +} + +size_t BlockHashIndex::GetHash(const char* key) +{ + std::string keyStr(key); + size_t i = std::distance(m_landmarks.begin(), + std::upper_bound(m_landmarks.begin(), m_landmarks.end(), keyStr)) - 1; + + if (i == 0ul - 1) return GetSize(); + + size_t pos = GetHash(i, key); + if (pos != GetSize()) return (1ul << m_orderBits) * i + pos; + else return GetSize(); +} + +size_t BlockHashIndex::GetFprint(const char* key) const +{ + /* Returns the Murmur hash of key (seed 100000) reduced to the low m_fingerPrintBits bits. */ + /* MurmurHash3_x86_32 stores a 32-bit result through the out pointer; start from zero so the remaining bytes of the wider size_t are defined before masking. */ + size_t hash = 0; + MurmurHash3_x86_32(key, std::strlen(key), 100000, &hash); + hash &= (1ul << m_fingerPrintBits) - 1; + return hash; +} + +size_t BlockHashIndex::GetHash(size_t i, const char* key) +{ +//#ifdef WITH_THREADS +// boost::mutex::scoped_lock lock(m_mutex); +//#endif + //if(m_hashes[i] == 0) + //LoadRange(i); +#ifdef HAVE_CMPH + size_t idx = cmph_search((cmph_t*) m_hashes[i], key, + (cmph_uint32) strlen(key)); +#else + assert(0); + size_t idx = 0; +#endif + + std::pair orderPrint = m_arrays[i]->Get(idx, m_orderBits, + m_fingerPrintBits); + m_clocks[i] = clock(); + + if (GetFprint(key) == orderPrint.second) return orderPrint.first; + else return GetSize(); +} + +size_t BlockHashIndex::GetHash(std::string key) +{ + return GetHash(key.c_str()); +} + +size_t BlockHashIndex::operator[](std::string key) +{ + return GetHash(key); +} + +size_t BlockHashIndex::operator[](char* key) +{ + return GetHash(key); +} + +size_t BlockHashIndex::Save(std::string filename) +{ + /* Opens filename, writes the whole index through Save(std::FILE*) and returns the size that call reports. Throws if the file cannot be opened. */ + /* "wb": the index holds raw size_t fields, so it must not pass through text-mode newline translation. */ + std::FILE* mphf = std::fopen(filename.c_str(), "wb"); + /* Fail loudly instead of handing a null handle to Save() and fclose(). */ + UTIL_THROW_IF2(mphf == NULL, "Cannot open " << filename << " for writing"); + size_t size = Save(mphf); + std::fclose(mphf); + return size; +} + +void BlockHashIndex::BeginSave(std::FILE * mphf) +{ + m_fileHandle = mphf; + ThrowingFwrite(&m_orderBits,
sizeof(size_t), 1, m_fileHandle); + ThrowingFwrite(&m_fingerPrintBits, sizeof(size_t), 1, m_fileHandle); + + m_fileHandleStart = std::ftell(m_fileHandle); + + size_t relIndexPos = 0; + ThrowingFwrite(&relIndexPos, sizeof(size_t), 1, m_fileHandle); +} + +void BlockHashIndex::SaveRange(size_t i) +{ +#ifdef HAVE_CMPH + if (m_seekIndex.size() <= i) m_seekIndex.resize(i + 1); + m_seekIndex[i] = std::ftell(m_fileHandle) - m_fileHandleStart; + cmph_dump((cmph_t*) m_hashes[i], m_fileHandle); + m_arrays[i]->Save(m_fileHandle); +#endif +} + +void BlockHashIndex::SaveLastRange() +{ +#ifdef WITH_THREADS + boost::mutex::scoped_lock lock(m_mutex); +#endif + + while (!m_queue.empty() && m_lastSaved + 1 == -m_queue.top()) { + size_t current = -m_queue.top(); + m_queue.pop(); + SaveRange(current); + m_lastSaved = current; + } +} + +void BlockHashIndex::DropRange(size_t i) +{ +#ifdef HAVE_CMPH + if (m_hashes[i] != 0) { + cmph_destroy((cmph_t*) m_hashes[i]); + m_hashes[i] = 0; + } + if (m_arrays[i] != 0) { + delete m_arrays[i]; + m_arrays[i] = 0; + m_clocks[i] = 0; + } + m_numLoadedRanges--; +#endif +} + +void BlockHashIndex::DropLastRange() +{ +#ifdef WITH_THREADS + boost::mutex::scoped_lock lock(m_mutex); +#endif + + while (m_lastDropped != m_lastSaved) + DropRange(++m_lastDropped); +} + +#ifdef WITH_THREADS +void BlockHashIndex::WaitAll() +{ + m_threadPool.Stop(true); +} +#endif + +size_t BlockHashIndex::FinalizeSave() +{ +#ifdef WITH_THREADS + m_threadPool.Stop(true); +#endif + + SaveLastRange(); + + size_t relIndexPos = std::ftell(m_fileHandle) - m_fileHandleStart; + + std::fseek(m_fileHandle, m_fileHandleStart, SEEK_SET); + ThrowingFwrite(&relIndexPos, sizeof(size_t), 1, m_fileHandle); + + std::fseek(m_fileHandle, m_fileHandleStart + relIndexPos, SEEK_SET); + m_landmarks.save(m_fileHandle); + + size_t seekIndexSize = m_seekIndex.size(); + ThrowingFwrite(&seekIndexSize, sizeof(size_t), 1, m_fileHandle); + ThrowingFwrite(&m_seekIndex[0], sizeof(size_t), seekIndexSize, 
m_fileHandle); + + ThrowingFwrite(&m_size, sizeof(size_t), 1, m_fileHandle); + + size_t fileHandleStop = std::ftell(m_fileHandle); + return fileHandleStop - m_fileHandleStart + sizeof(m_orderBits) + + sizeof(m_fingerPrintBits); +} + +size_t BlockHashIndex::Save(std::FILE * mphf) +{ + m_queue = std::priority_queue(); + BeginSave(mphf); + for (size_t i = 0; i < m_hashes.size(); i++) + SaveRange(i); + return FinalizeSave(); +} + +size_t BlockHashIndex::LoadIndex(std::FILE* mphf) +{ + m_fileHandle = mphf; + + size_t beginning = std::ftell(mphf); + + size_t read = 0; + read += std::fread(&m_orderBits, sizeof(size_t), 1, mphf); + read += std::fread(&m_fingerPrintBits, sizeof(size_t), 1, mphf); + m_fileHandleStart = std::ftell(m_fileHandle); + + size_t relIndexPos; + read += std::fread(&relIndexPos, sizeof(size_t), 1, mphf); + std::fseek(m_fileHandle, m_fileHandleStart + relIndexPos, SEEK_SET); + + m_landmarks.load(mphf); + + size_t seekIndexSize; + read += std::fread(&seekIndexSize, sizeof(size_t), 1, m_fileHandle); + m_seekIndex.resize(seekIndexSize); + read += std::fread(&m_seekIndex[0], sizeof(size_t), seekIndexSize, + m_fileHandle); + m_hashes.resize(seekIndexSize, 0); + m_clocks.resize(seekIndexSize, 0); + m_arrays.resize(seekIndexSize, 0); + + read += std::fread(&m_size, sizeof(size_t), 1, m_fileHandle); + + size_t end = std::ftell(mphf); + + return end - beginning; +} + +void BlockHashIndex::LoadRange(size_t i) +{ +#ifdef HAVE_CMPH + std::fseek(m_fileHandle, m_fileHandleStart + m_seekIndex[i], SEEK_SET); + cmph_t* hash = cmph_load(m_fileHandle); + m_arrays[i] = new PairedPackedArray<>(0, m_orderBits, m_fingerPrintBits); + m_arrays[i]->Load(m_fileHandle); + + m_hashes[i] = (void*) hash; + m_clocks[i] = clock(); + + m_numLoadedRanges++; +#endif +} + +size_t BlockHashIndex::Load(std::string filename) +{ + /* Opens filename, reads the whole index through Load(std::FILE*) and returns the size that call reports. Throws if the file cannot be opened. */ + /* "rb": LoadIndex reads raw size_t fields with fread, so the stream must be binary. */ + std::FILE* mphf = std::fopen(filename.c_str(), "rb"); + /* Fail loudly instead of handing a null handle to Load() and fclose(). */ + UTIL_THROW_IF2(mphf == NULL, "Cannot open " << filename << " for reading"); + size_t size = Load(mphf); + std::fclose(mphf); + return size; +} + +size_t BlockHashIndex::Load(std::FILE *
mphf) +{ + size_t byteSize = LoadIndex(mphf); + size_t end = std::ftell(mphf); + + for (size_t i = 0; i < m_seekIndex.size(); i++) + LoadRange(i); + std::fseek(m_fileHandle, end, SEEK_SET); + return byteSize; +} + +size_t BlockHashIndex::GetSize() const +{ + return m_size; +} + +void BlockHashIndex::KeepNLastRanges(float ratio, float tolerance) +{ + /* + #ifdef WITH_THREADS + boost::mutex::scoped_lock lock(m_mutex); + #endif + size_t n = m_hashes.size() * ratio; + size_t max = n * (1 + tolerance); + if(m_numLoadedRanges > max) { + typedef std::vector > LastLoaded; + LastLoaded lastLoaded; + for(size_t i = 0; i < m_hashes.size(); i++) + if(m_hashes[i] != 0) + lastLoaded.push_back(std::make_pair(m_clocks[i], i)); + + std::sort(lastLoaded.begin(), lastLoaded.end()); + for(LastLoaded::reverse_iterator it = lastLoaded.rbegin() + size_t(n * (1 - tolerance)); + it != lastLoaded.rend(); it++) + DropRange(it->second); + }*/ +} + +void BlockHashIndex::CalcHash(size_t current, void* source_void) +{ +#ifdef HAVE_CMPH + cmph_io_adapter_t* source = (cmph_io_adapter_t*) source_void; + cmph_config_t *config = cmph_config_new(source); + cmph_config_set_algo(config, CMPH_CHD); + + cmph_t* hash = cmph_new(config); + PairedPackedArray<> *pv = new PairedPackedArray<>(source->nkeys, m_orderBits, + m_fingerPrintBits); + + size_t i = 0; + + source->rewind(source->data); + + std::string lastKey = ""; + while (i < source->nkeys) { + unsigned keylen; + char* key; + source->read(source->data, &key, &keylen); + std::string temp(key, keylen); + source->dispose(source->data, key, keylen); + + if (lastKey > temp) { + if (source->nkeys != 2 || temp != "###DUMMY_KEY###") { + util::StringStream strme; + strme + << "ERROR: Input file does not appear to be sorted with LC_ALL=C sort\n"; + strme << "1: " << lastKey << "\n"; + strme << "2: " << temp << "\n"; + UTIL_THROW2(strme.str()); + } + } + lastKey = temp; + + size_t fprint = GetFprint(temp.c_str()); + size_t idx = cmph_search(hash, temp.c_str(), 
(cmph_uint32) temp.size()); + + pv->Set(idx, i, fprint, m_orderBits, m_fingerPrintBits); + i++; + } + + cmph_config_destroy(config); + +#ifdef WITH_THREADS + boost::mutex::scoped_lock lock(m_mutex); +#endif + + if (m_hashes.size() <= current) { + m_hashes.resize(current + 1, 0); + m_arrays.resize(current + 1, 0); + m_clocks.resize(current + 1, 0); + } + + m_hashes[current] = (void*) hash; + m_arrays[current] = pv; + m_clocks[current] = clock(); + m_queue.push(-current); +#endif +} + +#ifdef HAVE_CMPH +void* BlockHashIndex::vectorAdapter(std::vector& v) +{ + return (void*) CmphVectorAdapter(v); +} + +void* BlockHashIndex::vectorAdapter( + StringVector& sv) +{ + return (void*) CmphStringVectorAdapter(sv); +} + +void* BlockHashIndex::vectorAdapter( + StringVector& sv) +{ + return (void*) CmphStringVectorAdapter(sv); +} +#endif + +} diff --git a/mosesdecoder/moses2/TranslationModel/CompactPT/BlockHashIndex.h b/mosesdecoder/moses2/TranslationModel/CompactPT/BlockHashIndex.h new file mode 100644 index 0000000000000000000000000000000000000000..10c55601eea33f3dbb4797b3753daf57c358ae40 --- /dev/null +++ b/mosesdecoder/moses2/TranslationModel/CompactPT/BlockHashIndex.h @@ -0,0 +1,195 @@ +// $Id$ +// vim:tabstop=2 +/*********************************************************************** + Moses - factored phrase-based language decoder + Copyright (C) 2006 University of Edinburgh + + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. 
+ + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + ***********************************************************************/ + +#ifndef moses_BlockHashIndex_h +#define moses_BlockHashIndex_h + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "MurmurHash3.h" +#include "StringVector.h" +#include "PackedArray.h" +#include "util/exception.hh" +#include "util/string_stream.hh" + +#ifdef WITH_THREADS +#include "../../legacy/ThreadPool.h" +#else +#include +#endif + +#include + +namespace Moses2 +{ + +class BlockHashIndex +{ +private: + std::priority_queue m_queue; + + size_t m_orderBits; + size_t m_fingerPrintBits; + + std::FILE* m_fileHandle; + size_t m_fileHandleStart; + + StringVector m_landmarks; + + std::vector m_hashes; + std::vector m_clocks; + std::vector*> m_arrays; + + std::vector m_seekIndex; + + size_t m_size; + int m_lastSaved; + int m_lastDropped; + size_t m_numLoadedRanges; + +#ifdef WITH_THREADS + ThreadPool m_threadPool; + boost::mutex m_mutex; + + template + class HashTask: public Task + { + public: + HashTask(int id, BlockHashIndex& hash, Keys& keys) : + m_id(id), m_hash(hash), m_keys(new Keys(keys)) { + } + + virtual void Run() { + m_hash.CalcHash(m_id, *m_keys); + } + + virtual ~HashTask() { + delete m_keys; + } + + private: + int m_id; + BlockHashIndex& m_hash; + Keys* m_keys; + }; +#endif + + size_t GetFprint(const char* key) const; + size_t GetHash(size_t i, const char* key); + +public: +#ifdef WITH_THREADS + BlockHashIndex(size_t orderBits, size_t fingerPrintBits, + size_t threadsNum = 2); +#else + BlockHashIndex(size_t orderBits, size_t fingerPrintBits); +#endif + + ~BlockHashIndex(); + + size_t GetHash(const char* key); + size_t GetHash(std::string key); + + size_t operator[](std::string key); + size_t operator[](char* key); + + void 
BeginSave(std::FILE* mphf); + void SaveRange(size_t i); + void SaveLastRange(); + size_t FinalizeSave(); + +#ifdef WITH_THREADS + void WaitAll(); +#endif + + void DropRange(size_t i); + void DropLastRange(); + + size_t LoadIndex(std::FILE* mphf); + void LoadRange(size_t i); + + size_t Save(std::string filename); + size_t Save(std::FILE * mphf); + + size_t Load(std::string filename); + size_t Load(std::FILE * mphf); + + size_t GetSize() const; + + void KeepNLastRanges(float ratio = 0.1, float tolerance = 0.1); + + template + void AddRange(Keys &keys) { + size_t current = m_landmarks.size(); + + if (m_landmarks.size() && m_landmarks.back().str() >= keys[0]) { + util::StringStream strme; + strme + << "ERROR: Input file does not appear to be sorted with LC_ALL=C sort\n"; + strme << "1: " << m_landmarks.back().str() << "\n"; + strme << "2: " << keys[0] << "\n"; + UTIL_THROW2(strme.str()); + } + + m_landmarks.push_back(keys[0]); + m_size += keys.size(); + + if (keys.size() == 1) { + // add dummy key to avoid null hash + keys.push_back("###DUMMY_KEY###"); + } + +#ifdef WITH_THREADS + + boost::shared_ptr > ht( + new HashTask(current, *this, keys)); + m_threadPool.Submit(ht); +#else + CalcHash(current, keys); +#endif + } + + template + void CalcHash(size_t current, Keys &keys) { +#ifdef HAVE_CMPH + void* source = vectorAdapter(keys); + CalcHash(current, source); +#endif + } + + void CalcHash(size_t current, void* source); + +#ifdef HAVE_CMPH + void* vectorAdapter(std::vector& v); + void* vectorAdapter(StringVector& sv); + void* vectorAdapter(StringVector& sv); +#endif +}; + +} +#endif diff --git a/mosesdecoder/moses2/TranslationModel/CompactPT/CanonicalHuffman.h b/mosesdecoder/moses2/TranslationModel/CompactPT/CanonicalHuffman.h new file mode 100644 index 0000000000000000000000000000000000000000..eb11c730adaf3ad558868f6a1a5f2d5c9faf5f11 --- /dev/null +++ b/mosesdecoder/moses2/TranslationModel/CompactPT/CanonicalHuffman.h @@ -0,0 +1,321 @@ +// $Id$ +// vim:tabstop=2 
+/*********************************************************************** + Moses - factored phrase-based language decoder + Copyright (C) 2006 University of Edinburgh + + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + ***********************************************************************/ + +#ifndef moses_CanonicalHuffman_h +#define moses_CanonicalHuffman_h + +#include +#include +#include +#include + +#include "ThrowingFwrite.h" + +namespace Moses2 +{ + +template +class CanonicalHuffman +{ +private: + std::vector m_symbols; + std::vector m_firstCodes; + std::vector m_lengthIndex; + + typedef boost::unordered_map > EncodeMap; + EncodeMap m_encodeMap; + + struct MinHeapSorter { + std::vector& m_vec; + + MinHeapSorter(std::vector& vec) : + m_vec(vec) { + } + + bool operator()(size_t a, size_t b) { + return m_vec[a] > m_vec[b]; + } + }; + + template + void CalcLengths(Iterator begin, Iterator end, std::vector& lengths) { + size_t n = std::distance(begin, end); + std::vector A(2 * n, 0); + + m_symbols.resize(n); + size_t i = 0; + for (Iterator it = begin; it != end; it++) { + m_symbols[i] = it->first; + + A[i] = n + i; + A[n + i] = it->second; + i++; + } + + if (n == 1) { + lengths.push_back(1); + return; + } + + MinHeapSorter hs(A); + std::make_heap(A.begin(), A.begin() + n, hs); + + size_t h = n; + 
size_t m1, m2; + while (h > 1) { + m1 = A[0]; + std::pop_heap(A.begin(), A.begin() + h, hs); + + h--; + + m2 = A[0]; + std::pop_heap(A.begin(), A.begin() + h, hs); + + A[h] = A[m1] + A[m2]; + A[h - 1] = h; + A[m1] = A[m2] = h; + + std::push_heap(A.begin(), A.begin() + h, hs); + } + + A[1] = 0; + for (size_t i = 2; i < 2 * n; i++) + A[i] = A[A[i]] + 1; + + lengths.resize(n); + for (size_t i = 0; i < n; i++) + lengths[i] = A[i + n]; + } + + void CalcCodes(std::vector& lengths) { + std::vector numLength; + for (std::vector::iterator it = lengths.begin(); + it != lengths.end(); it++) { + size_t length = *it; + if (numLength.size() <= length) numLength.resize(length + 1, 0); + numLength[length]++; + } + + m_lengthIndex.resize(numLength.size()); + m_lengthIndex[0] = 0; + for (size_t l = 1; l < numLength.size(); l++) + m_lengthIndex[l] = m_lengthIndex[l - 1] + numLength[l - 1]; + + size_t maxLength = numLength.size() - 1; + + m_firstCodes.resize(maxLength + 1, 0); + for (size_t l = maxLength - 1; l > 0; l--) + m_firstCodes[l] = (m_firstCodes[l + 1] + numLength[l + 1]) / 2; + + std::vector t_symbols; + t_symbols.resize(lengths.size()); + + std::vector nextCode = m_firstCodes; + for (size_t i = 0; i < lengths.size(); i++) { + Data data = m_symbols[i]; + size_t length = lengths[i]; + + size_t pos = m_lengthIndex[length] + + (nextCode[length] - m_firstCodes[length]); + t_symbols[pos] = data; + + nextCode[length] = nextCode[length] + 1; + } + + m_symbols.swap(t_symbols); + } + + void CreateCodeMap() { + for (size_t l = 1; l < m_lengthIndex.size(); l++) { + size_t intCode = m_firstCodes[l]; + size_t num = ( + (l + 1 < m_lengthIndex.size()) ? 
+ m_lengthIndex[l + 1] : m_symbols.size()) - m_lengthIndex[l]; + + for (size_t i = 0; i < num; i++) { + Data data = m_symbols[m_lengthIndex[l] + i]; + boost::dynamic_bitset<> bitCode(l, intCode); + m_encodeMap[data] = bitCode; + intCode++; + } + } + } + + const boost::dynamic_bitset<>& Encode(Data data) const { + typename EncodeMap::const_iterator it = m_encodeMap.find(data); + UTIL_THROW_IF2(it == m_encodeMap.end(), + "Cannot find symbol in encoding map"); + return it->second; + } + + template + void PutCode(BitWrapper& bitWrapper, const boost::dynamic_bitset<>& code) { + for (int j = code.size() - 1; j >= 0; j--) + bitWrapper.Put(code[j]); + } + +public: + + template + CanonicalHuffman(Iterator begin, Iterator end, bool forEncoding = true) { + std::vector lengths; + CalcLengths(begin, end, lengths); + CalcCodes(lengths); + + if (forEncoding) CreateCodeMap(); + } + + CanonicalHuffman(std::FILE* pFile, bool forEncoding = false) { + Load(pFile); + + if (forEncoding) CreateCodeMap(); + } + + template + void Put(BitWrapper& bitWrapper, Data data) { + PutCode(bitWrapper, Encode(data)); + } + + template + Data Read(BitWrapper& bitWrapper) { + if (bitWrapper.TellFromEnd()) { + size_t intCode = bitWrapper.Read(); + size_t len = 1; + while (intCode < m_firstCodes[len]) { + intCode = 2 * intCode + bitWrapper.Read(); + len++; + } + return m_symbols[m_lengthIndex[len] + (intCode - m_firstCodes[len])]; + } + return Data(); + } + + size_t Load(std::FILE* pFile) { + size_t start = std::ftell(pFile); + size_t read = 0; + + size_t size; + read += std::fread(&size, sizeof(size_t), 1, pFile); + m_symbols.resize(size); + read += std::fread(&m_symbols[0], sizeof(Data), size, pFile); + + read += std::fread(&size, sizeof(size_t), 1, pFile); + m_firstCodes.resize(size); + read += std::fread(&m_firstCodes[0], sizeof(size_t), size, pFile); + + read += std::fread(&size, sizeof(size_t), 1, pFile); + m_lengthIndex.resize(size); + read += std::fread(&m_lengthIndex[0], sizeof(size_t), size, 
pFile); + + return std::ftell(pFile) - start; + } + + size_t Save(std::FILE* pFile) { + size_t start = std::ftell(pFile); + + size_t size = m_symbols.size(); + ThrowingFwrite(&size, sizeof(size_t), 1, pFile); + ThrowingFwrite(&m_symbols[0], sizeof(Data), size, pFile); + + size = m_firstCodes.size(); + ThrowingFwrite(&size, sizeof(size_t), 1, pFile); + ThrowingFwrite(&m_firstCodes[0], sizeof(size_t), size, pFile); + + size = m_lengthIndex.size(); + ThrowingFwrite(&size, sizeof(size_t), 1, pFile); + ThrowingFwrite(&m_lengthIndex[0], sizeof(size_t), size, pFile); + + return std::ftell(pFile) - start; + } +}; + +template +class BitWrapper +{ +private: + Container& m_data; + + typename Container::iterator m_iterator; + typename Container::value_type m_currentValue; + + size_t m_valueBits; + typename Container::value_type m_mask; + size_t m_bitPos; + +public: + + BitWrapper(Container &data) : + m_data(data), m_iterator(m_data.begin()), m_currentValue(0), m_valueBits( + sizeof(typename Container::value_type) * 8), m_mask(1), m_bitPos(0) { + } + + bool Read() { + if (m_bitPos % m_valueBits == 0) { + if (m_iterator != m_data.end()) m_currentValue = *m_iterator++; + } else m_currentValue = m_currentValue >> 1; + + m_bitPos++; + return (m_currentValue & m_mask); + } + + void Put(bool bit) { + if (m_bitPos % m_valueBits == 0) m_data.push_back(0); + + if (bit) m_data[m_data.size() - 1] |= m_mask << (m_bitPos % m_valueBits); + + m_bitPos++; + } + + size_t Tell() { + return m_bitPos; + } + + size_t TellFromEnd() { + if (m_data.size() * m_valueBits < m_bitPos) return 0; + return m_data.size() * m_valueBits - m_bitPos; + } + + void Seek(size_t bitPos) { + m_bitPos = bitPos; + m_iterator = m_data.begin() + int((m_bitPos - 1) / m_valueBits); + m_currentValue = (*m_iterator) >> ((m_bitPos - 1) % m_valueBits); + m_iterator++; + } + + void SeekFromEnd(size_t bitPosFromEnd) { + size_t bitPos = m_data.size() * m_valueBits - bitPosFromEnd; + Seek(bitPos); + } + + void Reset() { + 
m_iterator = m_data.begin(); + m_currentValue = 0; + m_bitPos = 0; + } + + Container& GetContainer() { + return m_data; + } +}; + +} + +#endif diff --git a/mosesdecoder/moses2/TranslationModel/CompactPT/CmphStringVectorAdapter.cpp b/mosesdecoder/moses2/TranslationModel/CompactPT/CmphStringVectorAdapter.cpp new file mode 100644 index 0000000000000000000000000000000000000000..8dc3ebde6f9bdc52ea78a1529005488f5a13b277 --- /dev/null +++ b/mosesdecoder/moses2/TranslationModel/CompactPT/CmphStringVectorAdapter.cpp @@ -0,0 +1,95 @@ +// $Id$ +// vim:tabstop=2 +/*********************************************************************** + Moses - factored phrase-based language decoder + Copyright (C) 2006 University of Edinburgh + + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. 
+ + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + ***********************************************************************/ + +#ifdef HAVE_CMPH + +#include "CmphStringVectorAdapter.h" + +namespace Moses2 +{ + +void CmphStringVectorAdapterDispose(void *data, char *key, cmph_uint32 keylen) +{ + delete[] key; +} + +void CmphStringVectorAdapterRewind(void *data) +{ + cmph_vector_t *cmph_vector = (cmph_vector_t *) data; + cmph_vector->position = 0; +} + +//************************************************************************// + +cmph_io_adapter_t *CmphVectorAdapterNew(std::vector& v) +{ + cmph_io_adapter_t * key_source = (cmph_io_adapter_t *) malloc( + sizeof(cmph_io_adapter_t)); + cmph_vector_t * cmph_vector = (cmph_vector_t *) malloc(sizeof(cmph_vector_t)); + assert(key_source); + assert(cmph_vector); + + cmph_vector->vector = (void *) &v; + cmph_vector->position = 0; + key_source->data = (void *) cmph_vector; + key_source->nkeys = v.size(); + + return key_source; +} + +int CmphVectorAdapterRead(void *data, char **key, cmph_uint32 *keylen) +{ + cmph_vector_t *cmph_vector = (cmph_vector_t *) data; + std::vector* v = (std::vector*) cmph_vector->vector; + size_t size; + *keylen = (*v)[cmph_vector->position].size(); + size = *keylen; + *key = new char[size + 1]; + std::string temp = (*v)[cmph_vector->position]; + strcpy(*key, temp.c_str()); + cmph_vector->position = cmph_vector->position + 1; + return (int) (*keylen); +} + +void CmphVectorAdapterDispose(void *data, char *key, cmph_uint32 keylen) +{ + delete[] key; +} + +void CmphVectorAdapterRewind(void *data) +{ + cmph_vector_t *cmph_vector = (cmph_vector_t *) data; + cmph_vector->position = 0; +} + +cmph_io_adapter_t* CmphVectorAdapter(std::vector& v) +{ + cmph_io_adapter_t * key_source = CmphVectorAdapterNew(v); + + key_source->read = 
CmphVectorAdapterRead; + key_source->dispose = CmphVectorAdapterDispose; + key_source->rewind = CmphVectorAdapterRewind; + return key_source; +} + +} + +#endif diff --git a/mosesdecoder/moses2/TranslationModel/CompactPT/CmphStringVectorAdapter.h b/mosesdecoder/moses2/TranslationModel/CompactPT/CmphStringVectorAdapter.h new file mode 100644 index 0000000000000000000000000000000000000000..8d23b4f412be699a420ea8baa43798181eade4a2 --- /dev/null +++ b/mosesdecoder/moses2/TranslationModel/CompactPT/CmphStringVectorAdapter.h @@ -0,0 +1,107 @@ +// $Id$ +// vim:tabstop=2 +/*********************************************************************** + Moses - factored phrase-based language decoder + Copyright (C) 2006 University of Edinburgh + + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. 
+ + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + ***********************************************************************/ + +#ifndef moses_CmphStringVectorAdapterNew_h +#define moses_CmphStringVectorAdapterNew_h + +#include +#include + +#ifdef HAVE_CMPH +#include "cmph.h" + +#include "StringVector.h" + +namespace Moses2 +{ + +typedef struct { + void *vector; + cmph_uint32 position; +} cmph_vector_t; + +template class Allocator> +cmph_io_adapter_t *CmphStringVectorAdapterNew( + StringVector& sv) +{ + cmph_io_adapter_t * key_source = (cmph_io_adapter_t *) malloc( + sizeof(cmph_io_adapter_t)); + cmph_vector_t * cmph_vector = (cmph_vector_t *) malloc(sizeof(cmph_vector_t)); + assert(key_source); + assert(cmph_vector); + + cmph_vector->vector = (void *) &sv; + cmph_vector->position = 0; + key_source->data = (void *) cmph_vector; + key_source->nkeys = sv.size(); + + return key_source; +} + +template class Allocator> +int CmphStringVectorAdapterRead(void *data, char **key, cmph_uint32 *keylen) +{ + cmph_vector_t *cmph_vector = (cmph_vector_t *) data; + StringVector* sv = (StringVector*) cmph_vector->vector; + size_t size; + *keylen = (*sv)[cmph_vector->position].size(); + size = *keylen; + *key = new char[size + 1]; + std::string temp = (*sv)[cmph_vector->position]; + std::strcpy(*key, temp.c_str()); + cmph_vector->position = cmph_vector->position + 1; + return (int) (*keylen); +} + +void CmphStringVectorAdapterDispose(void *data, char *key, cmph_uint32 keylen); + +void CmphStringVectorAdapterRewind(void *data); + +template class Allocator> +cmph_io_adapter_t* CmphStringVectorAdapter( + StringVector& sv) +{ + cmph_io_adapter_t * key_source = CmphStringVectorAdapterNew(sv); + + key_source->read = CmphStringVectorAdapterRead; + key_source->dispose = CmphStringVectorAdapterDispose; + key_source->rewind = 
CmphStringVectorAdapterRewind; + return key_source; +} + +//************************************************************************// + +cmph_io_adapter_t *CmphVectorAdapterNew(std::vector& v); + +int CmphVectorAdapterRead(void *data, char **key, cmph_uint32 *keylen); + +void CmphVectorAdapterDispose(void *data, char *key, cmph_uint32 keylen); + +void CmphVectorAdapterRewind(void *data); + +cmph_io_adapter_t* CmphVectorAdapter(std::vector& v); + +} + +#endif + +#endif diff --git a/mosesdecoder/moses2/TranslationModel/CompactPT/LexicalReorderingTableCompact.cpp b/mosesdecoder/moses2/TranslationModel/CompactPT/LexicalReorderingTableCompact.cpp new file mode 100644 index 0000000000000000000000000000000000000000..051116decb24c0fa45e7c8f2b7422b12cc81b172 --- /dev/null +++ b/mosesdecoder/moses2/TranslationModel/CompactPT/LexicalReorderingTableCompact.cpp @@ -0,0 +1,172 @@ +// -*- c++ -*- +// vim:tabstop=2 +// $Id$ +/*********************************************************************** + Moses - factored phrase-based language decoder + Copyright (C) 2006 University of Edinburgh + + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. 
+ + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + ***********************************************************************/ + +#include "LexicalReorderingTableCompact.h" +#include "../../SubPhrase.h" +#include "../../legacy/Util2.h" + +namespace Moses2 +{ + +////////////////////////////////////////////////////////////////////////////////////////////// + +bool LexicalReorderingTableCompact::s_inMemoryByDefault = false; + +LexicalReorderingTableCompact::LexicalReorderingTableCompact( + const std::string& filePath, const std::vector& f_factors, + const std::vector& e_factors, + const std::vector& c_factors) : + LexicalReorderingTable(f_factors, e_factors, c_factors), m_inMemory( + s_inMemoryByDefault), m_numScoreComponent(6), m_multipleScoreTrees( + true), m_hash(10, 16), m_scoreTrees(1) +{ + Load(filePath); +} + +LexicalReorderingTableCompact::LexicalReorderingTableCompact( + const std::vector& f_factors, + const std::vector& e_factors, + const std::vector& c_factors) : + LexicalReorderingTable(f_factors, e_factors, c_factors), m_inMemory( + s_inMemoryByDefault), m_numScoreComponent(6), m_multipleScoreTrees( + true), m_hash(10, 16), m_scoreTrees(1) +{ +} + +LexicalReorderingTableCompact::~LexicalReorderingTableCompact() +{ + for (size_t i = 0; i < m_scoreTrees.size(); i++) + delete m_scoreTrees[i]; +} + +std::vector LexicalReorderingTableCompact::GetScore(const Phrase& f, + const Phrase& e, const Phrase& c) +{ + std::string key; + std::vector scores; + + if (0 == c.GetSize()) key = MakeKey(f, e, c); + else { + for (size_t i = 0; i <= c.GetSize(); ++i) { + SubPhrase sub_c = c.GetSubPhrase(i, c.GetSize() - i); + key = MakeKey(f, e, sub_c); + } + } + + size_t index = m_hash[key]; + if (m_hash.GetSize() != index) { + std::string scoresString; + if (m_inMemory) scoresString = m_scoresMemory[index].str(); + 
else scoresString = m_scoresMapped[index].str(); + + BitWrapper<> bitStream(scoresString); + for (size_t i = 0; i < m_numScoreComponent; i++) + scores.push_back( + m_scoreTrees[m_multipleScoreTrees ? i : 0]->Read(bitStream)); + + return scores; + } + + return std::vector(); +} + +std::string LexicalReorderingTableCompact::MakeKey(const Phrase& f, + const Phrase& e, const Phrase& c) const +{ + return MakeKey(Trim(f.GetString(m_FactorsF)), Trim(e.GetString(m_FactorsE)), + Trim(c.GetString(m_FactorsC))); +} + +std::string LexicalReorderingTableCompact::MakeKey(const std::string& f, + const std::string& e, const std::string& c) const +{ + std::string key; + if (!f.empty()) key += f; + if (!m_FactorsE.empty()) { + if (!key.empty()) key += " ||| "; + key += e; + } + if (!m_FactorsC.empty()) { + if (!key.empty()) key += " ||| "; + key += c; + } + key += " ||| "; + return key; +} + +LexicalReorderingTable* +LexicalReorderingTableCompact::CheckAndLoad(const std::string& filePath, + const std::vector& f_factors, + const std::vector& e_factors, + const std::vector& c_factors) +{ +#ifdef HAVE_CMPH + std::string minlexr = ".minlexr"; + // file name is specified without suffix + if (FileExists(filePath + minlexr)) { + //there exists a compact binary version use that + std::cerr << "Using compact lexical reordering table" << std::endl; + return new LexicalReorderingTableCompact(filePath + minlexr, f_factors, + e_factors, c_factors); + } + // file name is specified with suffix + if (filePath.substr(filePath.length() - minlexr.length(), minlexr.length()) + == minlexr && FileExists(filePath)) { + //there exists a compact binary version use that + std::cerr << "Using compact lexical reordering table" << std::endl; + return new LexicalReorderingTableCompact(filePath, f_factors, e_factors, + c_factors); + } +#endif + return 0; +} + +void LexicalReorderingTableCompact::Load(std::string filePath) +{ + std::FILE* pFile = std::fopen(filePath.c_str(), "r"); + UTIL_THROW_IF2(pFile == NULL, 
"File " << filePath << " could not be opened"); + + //if(m_inMemory) + m_hash.Load(pFile); + //else + //m_hash.LoadIndex(pFile); + + size_t read = 0; + read += std::fread(&m_numScoreComponent, sizeof(m_numScoreComponent), 1, + pFile); + read += std::fread(&m_multipleScoreTrees, sizeof(m_multipleScoreTrees), 1, + pFile); + + if (m_multipleScoreTrees) { + m_scoreTrees.resize(m_numScoreComponent); + for (size_t i = 0; i < m_numScoreComponent; i++) + m_scoreTrees[i] = new CanonicalHuffman(pFile); + } else { + m_scoreTrees.resize(1); + m_scoreTrees[0] = new CanonicalHuffman(pFile); + } + + if (m_inMemory) m_scoresMemory.load(pFile, false); + else m_scoresMapped.load(pFile, true); +} + +} diff --git a/mosesdecoder/moses2/TranslationModel/CompactPT/LexicalReorderingTableCompact.h b/mosesdecoder/moses2/TranslationModel/CompactPT/LexicalReorderingTableCompact.h new file mode 100644 index 0000000000000000000000000000000000000000..cef6ae108088666dba58b7189e4bcad54c9b7a68 --- /dev/null +++ b/mosesdecoder/moses2/TranslationModel/CompactPT/LexicalReorderingTableCompact.h @@ -0,0 +1,135 @@ +// $Id$ +// vim:tabstop=2 +/*********************************************************************** + Moses - factored phrase-based language decoder + Copyright (C) 2006 University of Edinburgh + + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. 
+ + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + ***********************************************************************/ + +#ifndef moses_LexicalReorderingTableCompact_h +#define moses_LexicalReorderingTableCompact_h + +#include "BlockHashIndex.h" +#include "CanonicalHuffman.h" +#include "StringVector.h" +#include "../../TypeDef.h" +#include "../../Phrase.h" + +namespace Moses2 +{ + +//! additional types +class LexicalReorderingTable +{ +public: + LexicalReorderingTable(const FactorList& f_factors, + const FactorList& e_factors, const FactorList& c_factors) : + m_FactorsF(f_factors), m_FactorsE(e_factors), m_FactorsC(c_factors) { + } + + virtual ~LexicalReorderingTable() { + } + +public: + + virtual std::vector + GetScore(const Phrase& f, const Phrase& e, const Phrase& c) = 0; + + virtual + void InitializeForInput() { + /* override for on-demand loading */ + } + ; + + virtual + void InitializeForInputPhrase(const Phrase&) { + } + + const FactorList& GetFFactorMask() const { + return m_FactorsF; + } + const FactorList& GetEFactorMask() const { + return m_FactorsE; + } + const FactorList& GetCFactorMask() const { + return m_FactorsC; + } + + virtual + void DbgDump(std::ostream* out) const { + *out << "Overwrite in subclass...\n"; + } + ; + // why is this not a pure virtual function? 
- UG + +protected: + FactorList m_FactorsF; + FactorList m_FactorsE; + FactorList m_FactorsC; +}; + +////////////////////////////////////////////////////////////////////////////////////////////// +class LexicalReorderingTableCompact: public LexicalReorderingTable +{ +private: + static bool s_inMemoryByDefault; + bool m_inMemory; + + size_t m_numScoreComponent; + bool m_multipleScoreTrees; + + BlockHashIndex m_hash; + + typedef CanonicalHuffman ScoreTree; + std::vector m_scoreTrees; + + StringVector m_scoresMapped; + StringVector m_scoresMemory; + + std::string MakeKey(const Phrase& f, const Phrase& e, const Phrase& c) const; + std::string MakeKey(const std::string& f, const std::string& e, + const std::string& c) const; + +public: + LexicalReorderingTableCompact(const std::string& filePath, + const std::vector& f_factors, + const std::vector& e_factors, + const std::vector& c_factors); + + LexicalReorderingTableCompact(const std::vector& f_factors, + const std::vector& e_factors, + const std::vector& c_factors); + + virtual + ~LexicalReorderingTableCompact(); + + virtual std::vector + GetScore(const Phrase& f, const Phrase& e, const Phrase& c); + + static LexicalReorderingTable* + CheckAndLoad(const std::string& filePath, + const std::vector& f_factors, + const std::vector& e_factors, + const std::vector& c_factors); + + void + Load(std::string filePath); + +}; + +} + +#endif diff --git a/mosesdecoder/moses2/TranslationModel/CompactPT/ListCoders.h b/mosesdecoder/moses2/TranslationModel/CompactPT/ListCoders.h new file mode 100644 index 0000000000000000000000000000000000000000..540f50a595285081f7033378aa0581abda54215a --- /dev/null +++ b/mosesdecoder/moses2/TranslationModel/CompactPT/ListCoders.h @@ -0,0 +1,383 @@ +// $Id$ +// vim:tabstop=2 +/*********************************************************************** + Moses - factored phrase-based language decoder + Copyright (C) 2006 University of Edinburgh + + This library is free software; you can redistribute it 
and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + ***********************************************************************/ + +#ifndef moses_ListCoders_h +#define moses_ListCoders_h + +#include +#include + +namespace Moses2 +{ + +template +class VarIntType +{ +private: + template + static void EncodeSymbol(IntType input, OutIt output) { + if (input == 0) { + *output = 0; + output++; + return; + } + + T msb = 1 << (sizeof(T) * 8 - 1); + IntType mask = ~msb; + IntType shift = (sizeof(T) * 8 - 1); + + while (input) { + T res = input & mask; + input >>= shift; + if (input) res |= msb; + *output = res; + output++; + } + } + ; + + template + static void DecodeSymbol(InIt &it, InIt end, IntType &output) { + T msb = 1 << (sizeof(T) * 8 - 1); + IntType shift = (sizeof(T) * 8 - 1); + + output = 0; + size_t i = 0; + while (it != end && *it & msb) { + IntType temp = *it & ~msb; + temp <<= shift * i; + output |= temp; + it++; + i++; + } + assert(it != end); + + IntType temp = *it; + temp <<= shift * i; + output |= temp; + it++; + } + +public: + + template + static void Encode(InIt it, InIt end, OutIt outIt) { + while (it != end) { + EncodeSymbol(*it, outIt); + it++; + } + } + + template + static void Decode(InIt &it, InIt end, OutIt outIt) { + while (it != end) { + size_t output; + DecodeSymbol(it, end, output); + *outIt = output; + outIt++; + } + } + + template + static 
size_t DecodeAndSum(InIt &it, InIt end, size_t num) { + size_t sum = 0; + size_t curr = 0; + + while (it != end && curr < num) { + size_t output; + DecodeSymbol(it, end, output); + sum += output; + curr++; + } + + return sum; + } + +}; + +typedef VarIntType VarByte; + +typedef VarByte VarInt8; +typedef VarIntType VarInt16; +typedef VarIntType VarInt32; + +class Simple9 +{ +private: + typedef unsigned int uint; + + template + inline static void EncodeSymbol(uint &output, InIt it, InIt end) { + uint length = end - it; + + uint type = 0; + uint bitlength = 0; + + switch (length) { + case 1: + type = 1; + bitlength = 28; + break; + case 2: + type = 2; + bitlength = 14; + break; + case 3: + type = 3; + bitlength = 9; + break; + case 4: + type = 4; + bitlength = 7; + break; + case 5: + type = 5; + bitlength = 5; + break; + case 7: + type = 6; + bitlength = 4; + break; + case 9: + type = 7; + bitlength = 3; + break; + case 14: + type = 8; + bitlength = 2; + break; + case 28: + type = 9; + bitlength = 1; + break; + } + + output = 0; + output |= (type << 28); + + uint i = 0; + while (it != end) { + UTIL_THROW_IF2(*it > 268435455, + "You are trying to encode " << *it + << " with Simple9. 
Cannot encode numbers larger than 268435455 (2^28-1)"); + + uint l = bitlength * (length - i - 1); + output |= *it << l; + it++; + i++; + } + } + + template + static inline void DecodeSymbol(uint input, OutIt outIt) { + uint type = (input >> 28); + + uint bitlen = 0; + uint shift = 0; + uint mask = 0; + + switch (type) { + case 1: + bitlen = 28; + shift = 0; + mask = 268435455; + break; + case 2: + bitlen = 14; + shift = 14; + mask = 16383; + break; + case 3: + bitlen = 9; + shift = 18; + mask = 511; + break; + case 4: + bitlen = 7; + shift = 21; + mask = 127; + break; + case 5: + bitlen = 5; + shift = 20; + mask = 31; + break; + case 6: + bitlen = 4; + shift = 24; + mask = 15; + break; + case 7: + bitlen = 3; + shift = 24; + mask = 7; + break; + case 8: + bitlen = 2; + shift = 26; + mask = 3; + break; + case 9: + bitlen = 1; + shift = 27; + mask = 1; + break; + } + + while (shift > 0) { + *outIt = (input >> shift) & mask; + shift -= bitlen; + outIt++; + } + *outIt = input & mask; + outIt++; + } + + static inline size_t DecodeAndSumSymbol(uint input, size_t num, size_t &curr) { + uint type = (input >> 28); + + uint bitlen = 0; + uint shift = 0; + uint mask = 0; + + switch (type) { + case 1: + bitlen = 28; + shift = 0; + mask = 268435455; + break; + case 2: + bitlen = 14; + shift = 14; + mask = 16383; + break; + case 3: + bitlen = 9; + shift = 18; + mask = 511; + break; + case 4: + bitlen = 7; + shift = 21; + mask = 127; + break; + case 5: + bitlen = 5; + shift = 20; + mask = 31; + break; + case 6: + bitlen = 4; + shift = 24; + mask = 15; + break; + case 7: + bitlen = 3; + shift = 24; + mask = 7; + break; + case 8: + bitlen = 2; + shift = 26; + mask = 3; + break; + case 9: + bitlen = 1; + shift = 27; + mask = 1; + break; + } + + size_t sum = 0; + while (shift > 0) { + sum += (input >> shift) & mask; + shift -= bitlen; + if (++curr == num) return sum; + } + sum += input & mask; + curr++; + return sum; + } + +public: + template + static void Encode(InIt it, InIt end, 
OutIt outIt) { + uint parts[] = { 1, 2, 3, 4, 5, 7, 9, 14, 28 }; + + uint buffer[28]; + for (InIt i = it; i < end; i++) { + uint lastbit = 1; + uint lastpos = 0; + uint lastyes = 0; + uint j = 0; + + double log2 = log(2); + while (j < 9 && lastpos < 28 && (i + lastpos) < end) { + if (lastpos >= parts[j]) j++; + + buffer[lastpos] = *(i + lastpos); + + uint reqbit = ceil(log(buffer[lastpos] + 1) / log2); + assert(reqbit <= 28); + + uint bit = 28 / floor(28 / reqbit); + if (lastbit < bit) lastbit = bit; + + if (parts[j] > 28 / lastbit) break; + else if (lastpos == parts[j] - 1) lastyes = lastpos; + + lastpos++; + } + i += lastyes; + + uint length = lastyes + 1; + uint output; + EncodeSymbol(output, buffer, buffer + length); + + *outIt = output; + outIt++; + } + } + + template + static void Decode(InIt &it, InIt end, OutIt outIt) { + while (it != end) { + DecodeSymbol(*it, outIt); + it++; + } + } + + template + static size_t DecodeAndSum(InIt &it, InIt end, size_t num) { + size_t sum = 0; + size_t curr = 0; + while (it != end && curr < num) { + sum += DecodeAndSumSymbol(*it, num, curr); + it++; + } + assert(curr == num); + return sum; + } +}; + +} + +#endif diff --git a/mosesdecoder/moses2/TranslationModel/CompactPT/MmapAllocator.h b/mosesdecoder/moses2/TranslationModel/CompactPT/MmapAllocator.h new file mode 100644 index 0000000000000000000000000000000000000000..09ba58d931eaf94678f48d9e4af9cd4079aebd56 --- /dev/null +++ b/mosesdecoder/moses2/TranslationModel/CompactPT/MmapAllocator.h @@ -0,0 +1,206 @@ +// $Id$ +// vim:tabstop=2 +/*********************************************************************** + Moses - factored phrase-based language decoder + Copyright (C) 2006 University of Edinburgh + + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. 
+ + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + ***********************************************************************/ + +#ifndef moses_MmapAllocator_h +#define moses_MmapAllocator_h + +#include +#include +#include + +#if defined(_WIN32) || defined(_WIN64) +#define _WINSOCKAPI_ +#include +#include +#else +#include +#include +#endif + +#include "util/mmap.hh" + +namespace Moses2 +{ +template +class MmapAllocator +{ +protected: + std::FILE* m_file_ptr; + size_t m_file_desc; + + size_t m_page_size; + size_t m_map_size; + + char* m_data_ptr; + size_t m_data_offset; + bool m_fixed; + size_t* m_count; + +public: + typedef T value_type; + typedef T* pointer; + typedef const T* const_pointer; + typedef T& reference; + typedef const T& const_reference; + typedef std::size_t size_type; + typedef std::ptrdiff_t difference_type; + + MmapAllocator() throw () : + m_file_ptr(std::tmpfile()), m_file_desc(fileno(m_file_ptr)), m_page_size( + util::SizePage()), m_map_size(0), m_data_ptr(0), m_data_offset(0), m_fixed( + false), m_count(new size_t(0)) { + } + + MmapAllocator(std::FILE* f_ptr) throw () : + m_file_ptr(f_ptr), m_file_desc(fileno(m_file_ptr)), m_page_size( + util::SizePage()), m_map_size(0), m_data_ptr(0), m_data_offset(0), m_fixed( + false), m_count(new size_t(0)) { + } + + MmapAllocator(std::FILE* f_ptr, size_t data_offset) throw () : + m_file_ptr(f_ptr), m_file_desc(fileno(m_file_ptr)), m_page_size( + util::SizePage()), m_map_size(0), m_data_ptr(0), m_data_offset( + data_offset), m_fixed(true), m_count(new size_t(0)) { + } + + MmapAllocator(std::string 
fileName) throw () : + m_file_ptr(std::fopen(fileName.c_str(), "wb+")), m_file_desc( + fileno(m_file_ptr)), m_page_size(util::SizePage()), m_map_size(0), m_data_ptr( + 0), m_data_offset(0), m_fixed(false), m_count(new size_t(0)) { + } + + MmapAllocator(const MmapAllocator& c) throw () : + m_file_ptr(c.m_file_ptr), m_file_desc(c.m_file_desc), m_page_size( + c.m_page_size), m_map_size(c.m_map_size), m_data_ptr(c.m_data_ptr), m_data_offset( + c.m_data_offset), m_fixed(c.m_fixed), m_count(c.m_count) { + (*m_count)++; + } + + ~MmapAllocator() throw () { + if (m_data_ptr && *m_count == 0) { + util::UnmapOrThrow(m_data_ptr, m_map_size); + if (!m_fixed && std::ftell(m_file_ptr) != -1) std::fclose(m_file_ptr); + } + (*m_count)--; + } + + template + struct rebind { + typedef MmapAllocator other; + }; + + pointer address(reference value) const { + return &value; + } + + const_pointer address(const_reference value) const { + return &value; + } + + size_type max_size() const throw () { + return std::numeric_limits::max() / sizeof(value_type); + } + + pointer allocate(size_type num, const void* = 0) { + m_map_size = num * sizeof(T); + +#if defined(_WIN32) || defined(_WIN64) + // On Windows, MAP_SHARED is not defined and MapOrThrow ignores the flags. 
+ const int map_shared = 0; +#else + const int map_shared = MAP_SHARED; +#endif + if (!m_fixed) { + size_t read = 0; +#ifdef _WIN32 + read += _chsize_s(m_file_desc, m_map_size); +#else + read += ftruncate(m_file_desc, m_map_size); +#endif + m_data_ptr = (char *) util::MapOrThrow(m_map_size, true, map_shared, + false, m_file_desc, 0); + return (pointer) m_data_ptr; + } else { + const size_t map_offset = (m_data_offset / m_page_size) * m_page_size; + const size_t relative_offset = m_data_offset - map_offset; + const size_t adjusted_map_size = m_map_size + relative_offset; + + m_data_ptr = (char *) util::MapOrThrow(adjusted_map_size, false, + map_shared, false, m_file_desc, map_offset); + + return (pointer) (m_data_ptr + relative_offset); + } + } + + void deallocate(pointer p, size_type num) { + if (!m_fixed) { + util::UnmapOrThrow(p, num * sizeof(T)); + } else { + const size_t map_offset = (m_data_offset / m_page_size) * m_page_size; + const size_t relative_offset = m_data_offset - map_offset; + const size_t adjusted_map_size = m_map_size + relative_offset; + + util::UnmapOrThrow((pointer) ((char*) p - relative_offset), + adjusted_map_size); + } + } + + void construct(pointer p, const T& value) { + if (!m_fixed) new (p) value_type(value); + } + void destroy(pointer p) { + if (!m_fixed) p->~T(); + } + + template + friend bool operator==(const MmapAllocator&, + const MmapAllocator&) throw (); + + template + friend bool operator!=(const MmapAllocator&, + const MmapAllocator&) throw (); +}; + +template +bool operator==(const MmapAllocator& a1, + const MmapAllocator& a2) throw () +{ + bool equal = true; + equal &= a1.m_file_ptr == a2.m_file_ptr; + equal &= a1.m_file_desc == a2.m_file_desc; + equal &= a1.m_page_size == a2.m_page_size; + equal &= a1.m_map_size == a2.m_map_size; + equal &= a1.m_data_ptr == a2.m_data_ptr; + equal &= a1.m_data_offset == a2.m_data_offset; + equal &= a1.m_fixed == a2.m_fixed; + return equal; +} + +template +bool operator!=(const MmapAllocator& 
a1, + const MmapAllocator& a2) throw () +{ + return !(a1 == a2); +} + +} + +#endif diff --git a/mosesdecoder/moses2/TranslationModel/CompactPT/MonotonicVector.h b/mosesdecoder/moses2/TranslationModel/CompactPT/MonotonicVector.h new file mode 100644 index 0000000000000000000000000000000000000000..17935465776ceb5eeb1ebbbe1314ebc6ebe444bc --- /dev/null +++ b/mosesdecoder/moses2/TranslationModel/CompactPT/MonotonicVector.h @@ -0,0 +1,232 @@ +// $Id$ +// vim:tabstop=2 +/*********************************************************************** + Moses - factored phrase-based language decoder + Copyright (C) 2006 University of Edinburgh + + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + ***********************************************************************/ + +#ifndef moses_MonotonicVector_h +#define moses_MonotonicVector_h + +// MonotonicVector - Represents a monotonic increasing function that maps +// positive integers of any size onto a given number type. Each value has to be +// equal or larger than the previous one. Depending on the stepSize it can save +// up to 90% of memory compared to a std::vector. Time complexity is roughly +// constant, in the worst case, however, stepSize times slower than a normal +// std::vector. 
+ +#include +#include +#include +#include +#include + +#include "ThrowingFwrite.h" +#include "ListCoders.h" +#include "MmapAllocator.h" + +namespace Moses2 +{ + +template class Allocator = std::allocator> +class MonotonicVector +{ +private: + typedef std::vector > Anchors; + typedef std::vector > Diffs; + + Anchors m_anchors; + Diffs m_diffs; + std::vector m_tempDiffs; + + size_t m_size; + PosT m_last; + bool m_final; + +public: + typedef PosT value_type; + + MonotonicVector() : + m_size(0), m_last(0), m_final(false) { + } + + size_t size() const { + return m_size + m_tempDiffs.size(); + } + + PosT at(size_t i) const { + PosT s = stepSize; + PosT j = m_anchors[i / s]; + PosT r = i % s; + + typename Diffs::const_iterator it = m_diffs.begin() + j; + + PosT k = 0; + k += VarInt32::DecodeAndSum(it, m_diffs.end(), 1); + if (i < m_size) k += Simple9::DecodeAndSum(it, m_diffs.end(), r); + else if (i < m_size + m_tempDiffs.size()) for (size_t l = 0; l < r; l++) + k += m_tempDiffs[l]; + + return k; + } + + PosT operator[](PosT i) const { + return at(i); + } + + PosT back() const { + return at(size() - 1); + } + + void push_back(PosT i) { + assert(m_final != true); + + if (m_anchors.size() == 0 && m_tempDiffs.size() == 0) { + m_anchors.push_back(0); + VarInt32::Encode(&i, &i + 1, std::back_inserter(m_diffs)); + m_last = i; + m_size++; + + return; + } + + if (m_tempDiffs.size() == stepSize - 1) { + Simple9::Encode(m_tempDiffs.begin(), m_tempDiffs.end(), + std::back_inserter(m_diffs)); + m_anchors.push_back(m_diffs.size()); + VarInt32::Encode(&i, &i + 1, std::back_inserter(m_diffs)); + + m_size += m_tempDiffs.size() + 1; + m_tempDiffs.clear(); + } else { + PosT last = m_last; + PosT diff = i - last; + m_tempDiffs.push_back(diff); + } + m_last = i; + } + + void commit() { + assert(m_final != true); + Simple9::Encode(m_tempDiffs.begin(), m_tempDiffs.end(), + std::back_inserter(m_diffs)); + m_size += m_tempDiffs.size(); + m_tempDiffs.clear(); + m_final = true; + } + + size_t 
usage() { + return m_diffs.size() * sizeof(unsigned int) + + m_anchors.size() * sizeof(NumT); + } + + size_t load(std::FILE* in, bool map = false) { + size_t byteSize = 0; + + byteSize += fread(&m_final, sizeof(bool), 1, in) * sizeof(bool); + byteSize += fread(&m_size, sizeof(size_t), 1, in) * sizeof(size_t); + byteSize += fread(&m_last, sizeof(PosT), 1, in) * sizeof(PosT); + + byteSize += loadVector(m_diffs, in, map); + byteSize += loadVector(m_anchors, in, map); + + return byteSize; + } + + template + size_t loadVector(std::vector >& v, + std::FILE* in, bool map = false) { + // Can only be read into memory. Mapping not possible with std:allocator. + assert(map == false); + + size_t byteSize = 0; + + size_t valSize; + byteSize += std::fread(&valSize, sizeof(size_t), 1, in) * sizeof(size_t); + + v.resize(valSize, 0); + byteSize += std::fread(&v[0], sizeof(ValueT), valSize, in) * sizeof(ValueT); + + return byteSize; + } + + template + size_t loadVector(std::vector >& v, + std::FILE* in, bool map = false) { + size_t byteSize = 0; + + size_t valSize; + byteSize += std::fread(&valSize, sizeof(size_t), 1, in) * sizeof(size_t); + + if (map == false) { + // Read data into temporary file (default constructor of MmapAllocator) + // and map memory onto temporary file. Can be resized. + + v.resize(valSize, 0); + byteSize += std::fread(&v[0], sizeof(ValueT), valSize, in) + * sizeof(ValueT); + } else { + // Map it directly on specified region of file "in" starting at valPos + // with length valSize * sizeof(ValueT). Mapped region cannot be resized. 
+ + size_t valPos = std::ftell(in); + + Allocator alloc(in, valPos); + std::vector > vTemp(alloc); + vTemp.resize(valSize); + v.swap(vTemp); + + std::fseek(in, valSize * sizeof(ValueT), SEEK_CUR); + byteSize += valSize * sizeof(ValueT); + } + + return byteSize; + } + + size_t save(std::FILE* out) { + if (!m_final) commit(); + + bool byteSize = 0; + byteSize += ThrowingFwrite(&m_final, sizeof(bool), 1, out) * sizeof(bool); + byteSize += ThrowingFwrite(&m_size, sizeof(size_t), 1, out) + * sizeof(size_t); + byteSize += ThrowingFwrite(&m_last, sizeof(PosT), 1, out) * sizeof(PosT); + + size_t size = m_diffs.size(); + byteSize += ThrowingFwrite(&size, sizeof(size_t), 1, out) * sizeof(size_t); + byteSize += ThrowingFwrite(&m_diffs[0], sizeof(unsigned int), size, out) + * sizeof(unsigned int); + + size = m_anchors.size(); + byteSize += ThrowingFwrite(&size, sizeof(size_t), 1, out) * sizeof(size_t); + byteSize += ThrowingFwrite(&m_anchors[0], sizeof(NumT), size, out) + * sizeof(NumT); + + return byteSize; + } + + void swap(MonotonicVector &mv) { + if (!m_final) commit(); + + m_diffs.swap(mv.m_diffs); + m_anchors.swap(mv.m_anchors); + } +}; + +} +#endif diff --git a/mosesdecoder/moses2/TranslationModel/CompactPT/MurmurHash3.cpp b/mosesdecoder/moses2/TranslationModel/CompactPT/MurmurHash3.cpp new file mode 100644 index 0000000000000000000000000000000000000000..988c1627f208ceba3f137d8778d03f3b5f52f90d --- /dev/null +++ b/mosesdecoder/moses2/TranslationModel/CompactPT/MurmurHash3.cpp @@ -0,0 +1,424 @@ +//----------------------------------------------------------------------------- +// MurmurHash3 was written by Austin Appleby, and is placed in the public +// domain. The author hereby disclaims copyright to this source code. + +// Note - The x86 and x64 versions do _not_ produce the same results, as the +// algorithms are optimized for their respective platforms. 
You can still +// compile and run any of them on any platform, but your performance with the +// non-native version will be less than optimal. + +#include "MurmurHash3.h" + +//----------------------------------------------------------------------------- +// Platform-specific functions and macros + +// Microsoft Visual Studio + +#if defined(_MSC_VER) + +#define FORCE_INLINE __forceinline + +#include + +#define ROTL32(x,y) _rotl(x,y) +#define ROTL64(x,y) _rotl64(x,y) + +#define BIG_CONSTANT(x) (x) + +// Other compilers + +#else // defined(_MSC_VER) + +#define FORCE_INLINE inline __attribute__((always_inline)) + +inline uint32_t rotl32(uint32_t x, int8_t r) +{ + return (x << r) | (x >> (32 - r)); +} + +inline uint64_t rotl64(uint64_t x, int8_t r) +{ + return (x << r) | (x >> (64 - r)); +} + +#define ROTL32(x,y) rotl32(x,y) +#define ROTL64(x,y) rotl64(x,y) + +#define BIG_CONSTANT(x) (x##LLU) + +#endif // !defined(_MSC_VER) + +//----------------------------------------------------------------------------- +// Block read - if your platform needs to do endian-swapping or can only +// handle aligned reads, do the conversion here + +FORCE_INLINE uint32_t getblock(const uint32_t * p, int i) +{ + return p[i]; +} + +FORCE_INLINE uint64_t getblock(const uint64_t * p, int i) +{ + return p[i]; +} + +//----------------------------------------------------------------------------- +// Finalization mix - force all bits of a hash block to avalanche + +FORCE_INLINE uint32_t fmix(uint32_t h) +{ + h ^= h >> 16; + h *= 0x85ebca6b; + h ^= h >> 13; + h *= 0xc2b2ae35; + h ^= h >> 16; + + return h; +} + +//---------- + +FORCE_INLINE uint64_t fmix(uint64_t k) +{ + k ^= k >> 33; + k *= BIG_CONSTANT(0xff51afd7ed558ccd); + k ^= k >> 33; + k *= BIG_CONSTANT(0xc4ceb9fe1a85ec53); + k ^= k >> 33; + + return k; +} + +//----------------------------------------------------------------------------- + +void MurmurHash3_x86_32(const void * key, int len, uint32_t seed, void * out) +{ + const uint8_t * data 
= (const uint8_t*) key; + const int nblocks = len / 4; + + uint32_t h1 = seed; + + uint32_t c1 = 0xcc9e2d51; + uint32_t c2 = 0x1b873593; + + //---------- + // body + + const uint32_t * blocks = (const uint32_t *) (data + nblocks * 4); + + for (int i = -nblocks; i; i++) { + uint32_t k1 = getblock(blocks, i); + + k1 *= c1; + k1 = ROTL32(k1, 15); + k1 *= c2; + + h1 ^= k1; + h1 = ROTL32(h1, 13); + h1 = h1 * 5 + 0xe6546b64; + } + + //---------- + // tail + + const uint8_t * tail = (const uint8_t*) (data + nblocks * 4); + + uint32_t k1 = 0; + + switch (len & 3) { + case 3: + k1 ^= tail[2] << 16; + case 2: + k1 ^= tail[1] << 8; + case 1: + k1 ^= tail[0]; + k1 *= c1; + k1 = ROTL32(k1, 15); + k1 *= c2; + h1 ^= k1; + }; + + //---------- + // finalization + + h1 ^= len; + + h1 = fmix(h1); + + *(uint32_t*) out = h1; +} + +//----------------------------------------------------------------------------- + +void MurmurHash3_x86_128(const void * key, const int len, uint32_t seed, + void * out) +{ + const uint8_t * data = (const uint8_t*) key; + const int nblocks = len / 16; + + uint32_t h1 = seed; + uint32_t h2 = seed; + uint32_t h3 = seed; + uint32_t h4 = seed; + + uint32_t c1 = 0x239b961b; + uint32_t c2 = 0xab0e9789; + uint32_t c3 = 0x38b34ae5; + uint32_t c4 = 0xa1e38b93; + + //---------- + // body + + const uint32_t * blocks = (const uint32_t *) (data + nblocks * 16); + + for (int i = -nblocks; i; i++) { + uint32_t k1 = getblock(blocks, i * 4 + 0); + uint32_t k2 = getblock(blocks, i * 4 + 1); + uint32_t k3 = getblock(blocks, i * 4 + 2); + uint32_t k4 = getblock(blocks, i * 4 + 3); + + k1 *= c1; + k1 = ROTL32(k1, 15); + k1 *= c2; + h1 ^= k1; + + h1 = ROTL32(h1, 19); + h1 += h2; + h1 = h1 * 5 + 0x561ccd1b; + + k2 *= c2; + k2 = ROTL32(k2, 16); + k2 *= c3; + h2 ^= k2; + + h2 = ROTL32(h2, 17); + h2 += h3; + h2 = h2 * 5 + 0x0bcaa747; + + k3 *= c3; + k3 = ROTL32(k3, 17); + k3 *= c4; + h3 ^= k3; + + h3 = ROTL32(h3, 15); + h3 += h4; + h3 = h3 * 5 + 0x96cd1c35; + + k4 *= c4; + k4 = 
ROTL32(k4, 18); + k4 *= c1; + h4 ^= k4; + + h4 = ROTL32(h4, 13); + h4 += h1; + h4 = h4 * 5 + 0x32ac3b17; + } + + //---------- + // tail + + const uint8_t * tail = (const uint8_t*) (data + nblocks * 16); + + uint32_t k1 = 0; + uint32_t k2 = 0; + uint32_t k3 = 0; + uint32_t k4 = 0; + + switch (len & 15) { + case 15: + k4 ^= tail[14] << 16; + case 14: + k4 ^= tail[13] << 8; + case 13: + k4 ^= tail[12] << 0; + k4 *= c4; + k4 = ROTL32(k4, 18); + k4 *= c1; + h4 ^= k4; + + case 12: + k3 ^= tail[11] << 24; + case 11: + k3 ^= tail[10] << 16; + case 10: + k3 ^= tail[9] << 8; + case 9: + k3 ^= tail[8] << 0; + k3 *= c3; + k3 = ROTL32(k3, 17); + k3 *= c4; + h3 ^= k3; + + case 8: + k2 ^= tail[7] << 24; + case 7: + k2 ^= tail[6] << 16; + case 6: + k2 ^= tail[5] << 8; + case 5: + k2 ^= tail[4] << 0; + k2 *= c2; + k2 = ROTL32(k2, 16); + k2 *= c3; + h2 ^= k2; + + case 4: + k1 ^= tail[3] << 24; + case 3: + k1 ^= tail[2] << 16; + case 2: + k1 ^= tail[1] << 8; + case 1: + k1 ^= tail[0] << 0; + k1 *= c1; + k1 = ROTL32(k1, 15); + k1 *= c2; + h1 ^= k1; + }; + + //---------- + // finalization + + h1 ^= len; + h2 ^= len; + h3 ^= len; + h4 ^= len; + + h1 += h2; + h1 += h3; + h1 += h4; + h2 += h1; + h3 += h1; + h4 += h1; + + h1 = fmix(h1); + h2 = fmix(h2); + h3 = fmix(h3); + h4 = fmix(h4); + + h1 += h2; + h1 += h3; + h1 += h4; + h2 += h1; + h3 += h1; + h4 += h1; + + ((uint32_t*) out)[0] = h1; + ((uint32_t*) out)[1] = h2; + ((uint32_t*) out)[2] = h3; + ((uint32_t*) out)[3] = h4; +} + +//----------------------------------------------------------------------------- + +void MurmurHash3_x64_128(const void * key, const int len, const uint32_t seed, + void * out) +{ + const uint8_t * data = (const uint8_t*) key; + const int nblocks = len / 16; + + uint64_t h1 = seed; + uint64_t h2 = seed; + + uint64_t c1 = BIG_CONSTANT(0x87c37b91114253d5); + uint64_t c2 = BIG_CONSTANT(0x4cf5ad432745937f); + + //---------- + // body + + const uint64_t * blocks = (const uint64_t *) (data); + + for (int i = 0; i < 
nblocks; i++) { + uint64_t k1 = getblock(blocks, i * 2 + 0); + uint64_t k2 = getblock(blocks, i * 2 + 1); + + k1 *= c1; + k1 = ROTL64(k1, 31); + k1 *= c2; + h1 ^= k1; + + h1 = ROTL64(h1, 27); + h1 += h2; + h1 = h1 * 5 + 0x52dce729; + + k2 *= c2; + k2 = ROTL64(k2, 33); + k2 *= c1; + h2 ^= k2; + + h2 = ROTL64(h2, 31); + h2 += h1; + h2 = h2 * 5 + 0x38495ab5; + } + + //---------- + // tail + + const uint8_t * tail = (const uint8_t*) (data + nblocks * 16); + + uint64_t k1 = 0; + uint64_t k2 = 0; + + switch (len & 15) { + case 15: + k2 ^= uint64_t(tail[14]) << 48; + case 14: + k2 ^= uint64_t(tail[13]) << 40; + case 13: + k2 ^= uint64_t(tail[12]) << 32; + case 12: + k2 ^= uint64_t(tail[11]) << 24; + case 11: + k2 ^= uint64_t(tail[10]) << 16; + case 10: + k2 ^= uint64_t(tail[9]) << 8; + case 9: + k2 ^= uint64_t(tail[8]) << 0; + k2 *= c2; + k2 = ROTL64(k2, 33); + k2 *= c1; + h2 ^= k2; + + case 8: + k1 ^= uint64_t(tail[7]) << 56; + case 7: + k1 ^= uint64_t(tail[6]) << 48; + case 6: + k1 ^= uint64_t(tail[5]) << 40; + case 5: + k1 ^= uint64_t(tail[4]) << 32; + case 4: + k1 ^= uint64_t(tail[3]) << 24; + case 3: + k1 ^= uint64_t(tail[2]) << 16; + case 2: + k1 ^= uint64_t(tail[1]) << 8; + case 1: + k1 ^= uint64_t(tail[0]) << 0; + k1 *= c1; + k1 = ROTL64(k1, 31); + k1 *= c2; + h1 ^= k1; + }; + + //---------- + // finalization + + h1 ^= len; + h2 ^= len; + + h1 += h2; + h2 += h1; + + h1 = fmix(h1); + h2 = fmix(h2); + + h1 += h2; + h2 += h1; + + ((uint64_t*) out)[0] = h1; + ((uint64_t*) out)[1] = h2; +} + +//----------------------------------------------------------------------------- + diff --git a/mosesdecoder/moses2/TranslationModel/CompactPT/MurmurHash3.h b/mosesdecoder/moses2/TranslationModel/CompactPT/MurmurHash3.h new file mode 100644 index 0000000000000000000000000000000000000000..ef885a6d4b86aca918205dfe44c10201b4f7d015 --- /dev/null +++ b/mosesdecoder/moses2/TranslationModel/CompactPT/MurmurHash3.h @@ -0,0 +1,38 @@ 
+//----------------------------------------------------------------------------- +// MurmurHash3 was written by Austin Appleby, and is placed in the public +// domain. The author hereby disclaims copyright to this source code. + +#ifndef _MURMURHASH3_H_ +#define _MURMURHASH3_H_ + +//----------------------------------------------------------------------------- +// Platform-specific functions and macros + +// Microsoft Visual Studio + +#if defined(_MSC_VER) + +#include +//typedef unsigned char uint8_t; +//typedef unsigned long uint32_t; +//typedef unsigned __int64 uint64_t; + +// Other compilers + +#else // defined(_MSC_VER) + +#include + +#endif // !defined(_MSC_VER) + +//----------------------------------------------------------------------------- + +void MurmurHash3_x86_32(const void * key, int len, uint32_t seed, void * out); + +void MurmurHash3_x86_128(const void * key, int len, uint32_t seed, void * out); + +void MurmurHash3_x64_128(const void * key, int len, uint32_t seed, void * out); + +//----------------------------------------------------------------------------- + +#endif // _MURMURHASH3_H_ diff --git a/mosesdecoder/moses2/TranslationModel/CompactPT/PackedArray.h b/mosesdecoder/moses2/TranslationModel/CompactPT/PackedArray.h new file mode 100644 index 0000000000000000000000000000000000000000..2da59a9f2dd4d6b3ad0593533790746afea6cd68 --- /dev/null +++ b/mosesdecoder/moses2/TranslationModel/CompactPT/PackedArray.h @@ -0,0 +1,191 @@ +// $Id$ +// vim:tabstop=2 +/*********************************************************************** + Moses - factored phrase-based language decoder + Copyright (C) 2006 University of Edinburgh + + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. 
+ + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + ***********************************************************************/ + +#ifndef moses_PackedArray_h +#define moses_PackedArray_h + +#include +#include +#include +#include + +#include "ThrowingFwrite.h" + +namespace Moses2 +{ + +template +class PackedArray +{ +protected: + static size_t m_dataBits; + + size_t m_size; + size_t m_storageSize; + D* m_storage; + +public: + PackedArray() { + m_size = 0; + m_storageSize = 0; + m_storage = new D[0]; + } + + PackedArray(size_t size, size_t bits) : + m_size(size) { + m_storageSize = ceil(float(bits * size) / float(m_dataBits)); + m_storage = new D[m_storageSize]; + } + + PackedArray(const PackedArray &c) { + m_size = c.m_size; + + m_storageSize = c.m_storageSize; + m_storage = new D[m_storageSize]; + + std::memcpy(m_storage, c.m_storage, m_storageSize * sizeof(D)); + } + + virtual ~PackedArray() { + delete[] m_storage; + m_size = 0; + m_storageSize = 0; + m_storage = 0; + } + + T Get(size_t i, size_t bits) const { + T out = 0; + + size_t bitstart = (i * bits); + size_t bitpos = bitstart; + + size_t zero = ((1ul << (bits)) - 1); + + while (bitpos - bitstart < bits) { + size_t pos = bitpos / m_dataBits; + size_t off = bitpos % m_dataBits; + + out |= (T(m_storage[pos]) << (bitpos - bitstart)) >> off; + + bitpos += (m_dataBits - off); + } + + out &= zero; + return out; + } + + void Set(size_t i, T v, size_t bits) { + size_t bitstart = (i * bits); + size_t bitpos = bitstart; + + while (bitpos - bitstart < bits) { + size_t pos = bitpos / m_dataBits; + size_t off = 
bitpos % m_dataBits; + + size_t rest = bits - (bitpos - bitstart); + D zero = ~((1ul << (rest + off)) - 1) | ((1ul << off) - 1); + + m_storage[pos] &= zero; + m_storage[pos] |= v << off; + v = v >> (m_dataBits - off); + bitpos += (m_dataBits - off); + } + } + + virtual D*& GetStorage() { + return m_storage; + } + + virtual size_t GetStorageSize() const { + return m_storageSize; + } + + virtual size_t Size() const { + return m_size; + } + + virtual size_t Load(std::FILE* in) { + size_t a1 = std::ftell(in); + + size_t read = 0; + read += std::fread(&m_size, sizeof(m_size), 1, in); + read += std::fread(&m_storageSize, sizeof(m_storageSize), 1, in); + delete[] m_storage; + m_storage = new D[m_storageSize]; + read += std::fread(m_storage, sizeof(D), m_storageSize, in); + + size_t a2 = std::ftell(in); + return a2 - a1; + } + + virtual size_t Save(std::FILE* out) { + size_t a1 = std::ftell(out); + + ThrowingFwrite(&m_size, sizeof(m_size), 1, out); + ThrowingFwrite(&m_storageSize, sizeof(m_storageSize), 1, out); + ThrowingFwrite(m_storage, sizeof(D), m_storageSize, out); + + size_t a2 = std::ftell(out); + return a2 - a1; + } + +}; + +template +size_t PackedArray::m_dataBits = sizeof(D) * 8; + +/**************************************************************************/ + +template +class PairedPackedArray: public PackedArray +{ +public: + PairedPackedArray() : + PackedArray() { + } + + PairedPackedArray(size_t size, size_t bits1, size_t bits2) : + PackedArray(size, bits1 + bits2) { + } + + void Set(size_t i, T a, T b, size_t bits1, size_t bits2) { + T c = 0; + c = a | (b << bits1); + PackedArray::Set(i, c, bits1 + bits2); + } + + void Set(size_t i, std::pair p, size_t bits1, size_t bits2) { + T c = 0; + c = p.second | (p.first << bits1); + PackedArray::Set(i, c); + } + + std::pair Get(size_t i, size_t bits1, size_t bits2) { + T v = PackedArray::Get(i, bits1 + bits2); + T a = v & ((1 << bits1) - 1); + T b = v >> bits1; + return std::pair(a, b); + } +}; + +} + +#endif diff 
--git a/mosesdecoder/moses2/TranslationModel/CompactPT/StringVector.h b/mosesdecoder/moses2/TranslationModel/CompactPT/StringVector.h new file mode 100644 index 0000000000000000000000000000000000000000..0b2aa176ff9ebf0488e8428226dc3cecef9dc8c5 --- /dev/null +++ b/mosesdecoder/moses2/TranslationModel/CompactPT/StringVector.h @@ -0,0 +1,650 @@ +// $Id$ +// vim:tabstop=2 +/*********************************************************************** + Moses - factored phrase-based language decoder + Copyright (C) 2006 University of Edinburgh + + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. 
+ + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + ***********************************************************************/ + +#ifndef moses_StringVector_h +#define moses_StringVector_h + +#include +#include +#include +#include +#include +#include + +#include + +#include "ThrowingFwrite.h" +#include "MonotonicVector.h" +#include "MmapAllocator.h" + +namespace Moses2 +{ + +// ********** ValueIteratorRange ********** + +template +class ValueIteratorRange +{ +private: + ValueIteratorT m_begin; + ValueIteratorT m_end; + +public: + ValueIteratorRange(ValueIteratorT begin, ValueIteratorT end); + + const ValueIteratorT& begin() const; + const ValueIteratorT& end() const; + const std::string str() const; + operator const std::string() { + return str(); + } + + size_t size() { + return std::distance(m_begin, m_end); + } + + template + bool operator==(const StringT& o) const; + bool operator==(const char* c) const; + + template + bool operator<(const StringT& o) const; + bool operator<(const char* c) const; +}; + +// ********** StringVector ********** + +template class Allocator = std::allocator> +class StringVector +{ +protected: + bool m_sorted; + bool m_memoryMapped; + + std::vector >* m_charArray; + MonotonicVector m_positions; + + virtual const ValueT* value_ptr(PosT i) const; + +public: + //typedef ValueIteratorRange >::const_iterator> range; + typedef ValueIteratorRange range; + + // ********** RangeIterator ********** + + class RangeIterator: public boost::iterator_facade + { + + private: + PosT m_index; + StringVector* m_container; + + public: + RangeIterator(); + RangeIterator(StringVector &sv, PosT index = 0); + + PosT get_index(); + + private: + friend class boost::iterator_core_access; + + range dereference() const; + bool equal(RangeIterator const& other) const; + void increment(); + void 
decrement(); + void advance(PosT n); + + PosT distance_to(RangeIterator const& other) const; + }; + + // ********** StringIterator ********** + + class StringIterator: public boost::iterator_facade + { + + private: + PosT m_index; + StringVector* m_container; + + public: + StringIterator(); + StringIterator(StringVector &sv, PosT index = 0); + + PosT get_index(); + + private: + friend class boost::iterator_core_access; + + const std::string dereference() const; + bool equal(StringIterator const& other) const; + void increment(); + void decrement(); + void advance(PosT n); + PosT distance_to(StringIterator const& other) const; + }; + + typedef RangeIterator iterator; + typedef StringIterator string_iterator; + + StringVector(bool allocate = false); + StringVector(Allocator& alloc); + + virtual ~StringVector() { + delete m_charArray; + } + + void swap(StringVector &c) { + m_positions.commit(); + m_positions.swap(c.m_positions); + m_charArray->swap(*c.m_charArray); + + bool temp = m_sorted; + m_sorted = c.m_sorted; + c.m_sorted = temp; + } + + bool is_sorted() const; + PosT size() const; + virtual PosT size2() const; + + template Iterator begin() const; + template Iterator end() const; + + iterator begin() const; + iterator end() const; + + PosT length(PosT i) const; + //typename std::vector >::const_iterator begin(PosT i) const; + //typename std::vector >::const_iterator end(PosT i) const; + const ValueT* begin(PosT i) const; + const ValueT* end(PosT i) const; + + void clear() { + m_charArray->clear(); + m_sorted = true; + m_positions = MonotonicVector(); + } + + range at(PosT i) const; + range operator[](PosT i) const; + range back() const; + + template + void push_back(StringT s); + void push_back(const char* c); + + template + PosT find(StringT &s) const; + PosT find(const char* c) const; + + virtual size_t load(std::FILE* in, bool memoryMapped = false) { + size_t size = 0; + m_memoryMapped = memoryMapped; + + size += std::fread(&m_sorted, sizeof(bool), 1, in) * 
sizeof(bool); + size += m_positions.load(in, false); + + size += loadCharArray(m_charArray, in, m_memoryMapped); + return size; + } + + size_t loadCharArray(std::vector >*& c, + std::FILE* in, bool map = false) { + // Can only be read into memory. Mapping not possible with std:allocator. + assert(map == false); + + size_t byteSize = 0; + + size_t valSize; + byteSize += std::fread(&valSize, sizeof(size_t), 1, in) * sizeof(size_t); + + c = new std::vector >(valSize, 0); + byteSize += std::fread(&(*c)[0], sizeof(ValueT), valSize, in) + * sizeof(ValueT); + + return byteSize; + } + + size_t loadCharArray(std::vector >*& c, + std::FILE* in, bool map = false) { + size_t byteSize = 0; + + size_t valSize; + byteSize += std::fread(&valSize, sizeof(size_t), 1, in) * sizeof(size_t); + + if (map == false) { + // Read data into temporary file (default constructor of MmapAllocator) + // and map memory onto temporary file. Can be resized. + c = new std::vector >(valSize, 0); + byteSize += std::fread(&(*c)[0], sizeof(ValueT), valSize, in) + * sizeof(ValueT); + } else { + // Map it directly on specified region of file "in" starting at valPos + // with length valSize * sizeof(ValueT). Mapped region cannot be resized. 
+ + size_t valPos = std::ftell(in); + Allocator alloc(in, valPos); + c = new std::vector >(alloc); + c->resize(valSize, 0); + + byteSize += valSize * sizeof(ValueT); + } + + return byteSize; + } + + size_t load(std::string filename, bool memoryMapped = false) { + std::FILE* pFile = fopen(filename.c_str(), "r"); + size_t byteSize = load(pFile, memoryMapped); + fclose(pFile); + return byteSize; + } + + size_t save(std::FILE* out) { + size_t byteSize = 0; + byteSize += ThrowingFwrite(&m_sorted, sizeof(bool), 1, out) * sizeof(bool); + + byteSize += m_positions.save(out); + + size_t valSize = size2(); + byteSize += ThrowingFwrite(&valSize, sizeof(size_t), 1, out) + * sizeof(size_t); + byteSize += ThrowingFwrite(&(*m_charArray)[0], sizeof(ValueT), valSize, out) + * sizeof(ValueT); + + return byteSize; + } + + size_t save(std::string filename) { + std::FILE* pFile = fopen(filename.c_str(), "w"); + size_t byteSize = save(pFile); + fclose(pFile); + return byteSize; + } + +}; + +// ********** Implementation ********** + +// ValueIteratorRange + +template +ValueIteratorRange::ValueIteratorRange(ValueIteratorT begin, + ValueIteratorT end) : + m_begin(begin), m_end(end) +{ +} + +template +const ValueIteratorT& ValueIteratorRange::begin() const +{ + return m_begin; +} + +template +const ValueIteratorT& ValueIteratorRange::end() const +{ + return m_end; +} + +template +const std::string ValueIteratorRange::str() const +{ + std::string dummy; + for (ValueIteratorT it = m_begin; it != m_end; it++) + dummy.push_back(*it); + return dummy; +} + +template +template +bool ValueIteratorRange::operator==(const StringT& o) const +{ + if (std::distance(m_begin, m_end) == std::distance(o.begin(), o.end())) return std::equal( + m_begin, m_end, o.begin()); + else return false; +} + +template +bool ValueIteratorRange::operator==(const char* c) const +{ + return *this == std::string(c); +} + +template +template +bool ValueIteratorRange::operator<(const StringT &s2) const +{ + return 
std::lexicographical_compare(m_begin, m_end, s2.begin(), s2.end(), + std::less::value_type>()); +} + +template +bool ValueIteratorRange::operator<(const char* c) const +{ + return *this < std::string(c); +} + +template +bool operator<(const StringT &s1, const ValueIteratorRange &s2) +{ + return std::lexicographical_compare(s1.begin(), s1.end(), s2.begin(), + s2.end(), + std::less::value_type>()); +} + +template +bool operator<(const char* c, const ValueIteratorRange &s2) +{ + size_t len = std::char_traits::length(c); + return std::lexicographical_compare(c, c + len, s2.begin(), s2.end(), + std::less::value_type>()); +} + +template +OStream& operator<<(OStream &os, ValueIteratorRange cr) +{ + ValueIteratorT it = cr.begin(); + while (it != cr.end()) + os << *(it++); + return os; +} + +// StringVector + +template class Allocator> +StringVector::StringVector(bool allocate) : + m_sorted(true), m_memoryMapped(false), m_charArray( + allocate ? new std::vector >() : 0) +{ +} + +template class Allocator> +StringVector::StringVector(Allocator &alloc) : + m_sorted(true), m_memoryMapped(false), m_charArray( + new std::vector >(alloc)) +{ +} + +template class Allocator> +template +void StringVector::push_back(StringT s) +{ + if (is_sorted() && size() && !(back() < s)) m_sorted = false; + + m_positions.push_back(size2()); + std::copy(s.begin(), s.end(), std::back_inserter(*m_charArray)); +} + +template class Allocator> +void StringVector::push_back(const char* c) +{ + std::string dummy(c); + push_back(dummy); +} + +template class Allocator> +template +Iterator StringVector::begin() const +{ + return Iterator(const_cast&>(*this), 0); +} + +template class Allocator> +template +Iterator StringVector::end() const +{ + return Iterator(const_cast&>(*this), + size()); +} + +template class Allocator> +typename StringVector::iterator StringVector::begin() const +{ + return begin(); +} +; + +template class Allocator> +typename StringVector::iterator StringVector::end() const +{ + return 
end(); +} +; + +template class Allocator> +bool StringVector::is_sorted() const +{ + return m_sorted; +} + +template class Allocator> +PosT StringVector::size() const +{ + return m_positions.size(); +} + +template class Allocator> +PosT StringVector::size2() const +{ + return m_charArray->size(); +} + +template class Allocator> +typename StringVector::range StringVector::at(PosT i) const +{ + return range(begin(i), end(i)); +} + +template class Allocator> +typename StringVector::range StringVector::operator[](PosT i) const +{ + return at(i); +} + +template class Allocator> +typename StringVector::range StringVector::back() const +{ + return at(size() - 1); +} + +template class Allocator> +PosT StringVector::length(PosT i) const +{ + if (i + 1 < size()) return m_positions[i + 1] - m_positions[i]; + else return size2() - m_positions[i]; +} + +template class Allocator> +const ValueT* StringVector::value_ptr(PosT i) const +{ + return &(*m_charArray)[m_positions[i]]; +} + +template class Allocator> +//typename std::vector >::const_iterator StringVector::begin(PosT i) const +const ValueT* StringVector::begin(PosT i) const +{ + //return typename std::vector >::const_iterator(value_ptr(i)); + return value_ptr(i); +} + +template class Allocator> +//typename std::vector >::const_iterator StringVector::end(PosT i) const +const ValueT* StringVector::end(PosT i) const +{ + //return typename std::vector >::const_iterator(value_ptr(i) + length(i)); + return value_ptr(i) + length(i); +} + +template class Allocator> +template +PosT StringVector::find(StringT &s) const +{ + if (m_sorted) return std::distance(begin(), + std::lower_bound(begin(), end(), s)); + return std::distance(begin(), std::find(begin(), end(), s)); +} + +template class Allocator> +PosT StringVector::find(const char* c) const +{ + std::string s(c); + return find(s); +} + +// RangeIterator + +template class Allocator> +StringVector::RangeIterator::RangeIterator() : + m_index(0), m_container(0) +{ +} + +template 
class Allocator> +StringVector::RangeIterator::RangeIterator( + StringVector &sv, PosT index) : + m_index(index), m_container(&sv) +{ +} + +template class Allocator> +PosT StringVector::RangeIterator::get_index() +{ + return m_index; +} + +template class Allocator> +typename StringVector::range StringVector::RangeIterator::dereference() const +{ + return typename StringVector::range( + m_container->begin(m_index), m_container->end(m_index)); +} + +template class Allocator> +bool StringVector::RangeIterator::equal( + StringVector::RangeIterator const& other) const +{ + return m_index == other.m_index && m_container == other.m_container; +} + +template class Allocator> +void StringVector::RangeIterator::increment() +{ + m_index++; +} + +template class Allocator> +void StringVector::RangeIterator::decrement() +{ + m_index--; +} + +template class Allocator> +void StringVector::RangeIterator::advance(PosT n) +{ + m_index += n; +} + +template class Allocator> +PosT StringVector::RangeIterator::distance_to( + StringVector::RangeIterator const& other) const +{ + return other.m_index - m_index; +} + +// StringIterator + +template class Allocator> +StringVector::StringIterator::StringIterator() : + m_index(0), m_container(0) +{ +} + +template class Allocator> +StringVector::StringIterator::StringIterator( + StringVector &sv, PosT index) : + m_index(index), m_container(&sv) +{ +} + +template class Allocator> +PosT StringVector::StringIterator::get_index() +{ + return m_index; +} + +template class Allocator> +const std::string StringVector::StringIterator::dereference() const +{ + return StringVector::range( + m_container->begin(m_index), m_container->end(m_index)).str(); +} + +template class Allocator> +bool StringVector::StringIterator::equal( + StringVector::StringIterator const& other) const +{ + return m_index == other.m_index && m_container == other.m_container; +} + +template class Allocator> +void StringVector::StringIterator::increment() +{ + m_index++; +} + +template 
class Allocator> +void StringVector::StringIterator::decrement() +{ + m_index--; +} + +template class Allocator> +void StringVector::StringIterator::advance(PosT n) +{ + m_index += n; +} + +template class Allocator> +PosT StringVector::StringIterator::distance_to( + StringVector::StringIterator const& other) const +{ + return other.m_index - m_index; +} + +// ********** Some typedefs ********** + +typedef StringVector MediumStringVector; +typedef StringVector LongStringVector; + +} + +#endif diff --git a/mosesdecoder/moses2/TranslationModel/CompactPT/TargetPhraseCollectionCache.cpp b/mosesdecoder/moses2/TranslationModel/CompactPT/TargetPhraseCollectionCache.cpp new file mode 100644 index 0000000000000000000000000000000000000000..07d0469e062e7f05befbed7c900ab58a7a0c4f48 --- /dev/null +++ b/mosesdecoder/moses2/TranslationModel/CompactPT/TargetPhraseCollectionCache.cpp @@ -0,0 +1,39 @@ +// $Id$ +// vim:tabstop=2 +/*********************************************************************** +Moses - factored phrase-based language decoder +Copyright (C) 2006 University of Edinburgh + +This library is free software; you can redistribute it and/or +modify it under the terms of the GNU Lesser General Public +License as published by the Free Software Foundation; either +version 2.1 of the License, or (at your option) any later version. + +This library is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +Lesser General Public License for more details. 
+ +You should have received a copy of the GNU Lesser General Public +License along with this library; if not, write to the Free Software +Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +***********************************************************************/ + +#include "TargetPhraseCollectionCache.h" + +namespace Moses2 +{ + +boost::thread_specific_ptr +TargetPhraseCollectionCache::m_phraseCache; + +PhraseCompact::PhraseCompact(const Phrase ©) +{ + for (size_t i = 0; i < copy.GetSize(); ++i) { + const Word &word = copy[i]; + push_back(word); + } +} + +} + diff --git a/mosesdecoder/moses2/TranslationModel/CompactPT/TargetPhraseCollectionCache.h b/mosesdecoder/moses2/TranslationModel/CompactPT/TargetPhraseCollectionCache.h new file mode 100644 index 0000000000000000000000000000000000000000..75ab40c9327210723ee369736ce3c9399a88bcef --- /dev/null +++ b/mosesdecoder/moses2/TranslationModel/CompactPT/TargetPhraseCollectionCache.h @@ -0,0 +1,174 @@ +// $Id$ +// vim:tabstop=2 +/*********************************************************************** +Moses - factored phrase-based language decoder +Copyright (C) 2006 University of Edinburgh + +This library is free software; you can redistribute it and/or +modify it under the terms of the GNU Lesser General Public +License as published by the Free Software Foundation; either +version 2.1 of the License, or (at your option) any later version. + +This library is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +Lesser General Public License for more details. 
+ +You should have received a copy of the GNU Lesser General Public +License along with this library; if not, write to the Free Software +Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +***********************************************************************/ + +#pragma once + +#include +#include +#include + +#include +#include + +#include "../../Word.h" +#include "../../Phrase.h" + +namespace Moses2 +{ +typedef std::pair AlignPointSizeT; + +struct PhraseCompact : public std::vector { +public: + PhraseCompact(const Phrase ©); +}; + +struct TPCompact { + std::vector words; + std::set alignment; + std::vector scores; + +}; + +// Avoid using new due to locking +typedef std::vector TargetPhraseVector; +typedef boost::shared_ptr TargetPhraseVectorPtr; + +/** Implementation of Persistent Cache **/ +class TargetPhraseCollectionCache +{ +private: + size_t m_max; + float m_tolerance; + + struct LastUsed { + clock_t m_clock; + TargetPhraseVectorPtr m_tpv; + size_t m_bitsLeft; + + LastUsed() : m_clock(0), m_bitsLeft(0) {} + + LastUsed(clock_t clock, TargetPhraseVectorPtr tpv, size_t bitsLeft = 0) + : m_clock(clock), m_tpv(tpv), m_bitsLeft(bitsLeft) {} + }; + + typedef std::map CacheMap; + static boost::thread_specific_ptr m_phraseCache; + +public: + + typedef CacheMap::iterator iterator; + typedef CacheMap::const_iterator const_iterator; + + TargetPhraseCollectionCache(size_t max = 5000, float tolerance = 0.2) + : m_max(max), m_tolerance(tolerance) { + } + + iterator Begin() { + if(!m_phraseCache.get()) + m_phraseCache.reset(new CacheMap()); + return m_phraseCache->begin(); + } + + const_iterator Begin() const { + if(!m_phraseCache.get()) + m_phraseCache.reset(new CacheMap()); + return m_phraseCache->begin(); + } + + iterator End() { + if(!m_phraseCache.get()) + m_phraseCache.reset(new CacheMap()); + return m_phraseCache->end(); + } + + const_iterator End() const { + if(!m_phraseCache.get()) + m_phraseCache.reset(new CacheMap()); + return 
m_phraseCache->end(); + } + + /** retrieve translations for source phrase from persistent cache **/ + void Cache(const Phrase &sourcePhrase, TargetPhraseVectorPtr tpv, + size_t bitsLeft = 0, size_t maxRank = 0) { + if(!m_phraseCache.get()) + m_phraseCache.reset(new CacheMap()); + // check if source phrase is already in cache + iterator it = m_phraseCache->find(sourcePhrase); + if(it != m_phraseCache->end()) + // if found, just update clock + it->second.m_clock = clock(); + else { + // else, add to cache + if(maxRank && tpv->size() > maxRank) { + TargetPhraseVectorPtr tpv_temp(new TargetPhraseVector()); + tpv_temp->resize(maxRank); + std::copy(tpv->begin(), tpv->begin() + maxRank, tpv_temp->begin()); + (*m_phraseCache)[sourcePhrase] = LastUsed(clock(), tpv_temp, bitsLeft); + } else + (*m_phraseCache)[sourcePhrase] = LastUsed(clock(), tpv, bitsLeft); + } + } + + std::pair Retrieve(const Phrase &sourcePhrase) { + if(!m_phraseCache.get()) + m_phraseCache.reset(new CacheMap()); + iterator it = m_phraseCache->find(sourcePhrase); + if(it != m_phraseCache->end()) { + LastUsed &lu = it->second; + lu.m_clock = clock(); + return std::make_pair(lu.m_tpv, lu.m_bitsLeft); + } else + return std::make_pair(TargetPhraseVectorPtr(), 0); + } + + // if cache full, reduce + void Prune() { + if(!m_phraseCache.get()) + m_phraseCache.reset(new CacheMap()); + if(m_phraseCache->size() > m_max * (1 + m_tolerance)) { + typedef std::set > Cands; + Cands cands; + for(CacheMap::iterator it = m_phraseCache->begin(); + it != m_phraseCache->end(); it++) { + LastUsed &lu = it->second; + cands.insert(std::make_pair(lu.m_clock, it->first)); + } + + for(Cands::iterator it = cands.begin(); it != cands.end(); it++) { + const PhraseCompact& p = it->second; + m_phraseCache->erase(p); + + if(m_phraseCache->size() < (m_max * (1 - m_tolerance))) + break; + } + } + } + + void CleanUp() { + if(!m_phraseCache.get()) + m_phraseCache.reset(new CacheMap()); + m_phraseCache->clear(); + } + +}; + +} + diff --git 
a/mosesdecoder/moses2/TranslationModel/CompactPT/ThrowingFwrite.cpp b/mosesdecoder/moses2/TranslationModel/CompactPT/ThrowingFwrite.cpp new file mode 100644 index 0000000000000000000000000000000000000000..d9fec5013e35882d518016ed70ab2bdd043b8ce2 --- /dev/null +++ b/mosesdecoder/moses2/TranslationModel/CompactPT/ThrowingFwrite.cpp @@ -0,0 +1,30 @@ +// $Id$ +// vim:tabstop=2 +/*********************************************************************** + Moses - factored phrase-based language decoder + Copyright (C) 2006 University of Edinburgh + + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. 
+ + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + ***********************************************************************/ + +#include "ThrowingFwrite.h" + +size_t ThrowingFwrite(const void *ptr, size_t size, size_t count, FILE* stream) +{ + assert(size); + size_t returnValue = std::fwrite(ptr, size, count, stream); + UTIL_THROW_IF2(count != returnValue, "Short fwrite; requested size " << size); + return returnValue; +} diff --git a/mosesdecoder/moses2/TranslationModel/CompactPT/ThrowingFwrite.h b/mosesdecoder/moses2/TranslationModel/CompactPT/ThrowingFwrite.h new file mode 100644 index 0000000000000000000000000000000000000000..2a0c71a2714dac17fb2f2cef15989e8b230a6c58 --- /dev/null +++ b/mosesdecoder/moses2/TranslationModel/CompactPT/ThrowingFwrite.h @@ -0,0 +1,31 @@ +// $Id$ +// vim:tabstop=2 +/*********************************************************************** + Moses - factored phrase-based language decoder + Copyright (C) 2006 University of Edinburgh + + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. 
+ + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + ***********************************************************************/ + +#ifndef moses_ThrowingFwrite_h +#define moses_ThrowingFwrite_h + +#include +#include +#include "util/exception.hh" + +size_t ThrowingFwrite(const void *ptr, size_t size, size_t count, FILE* stream); + +#endif diff --git a/mosesdecoder/moses2/TranslationModel/Dynamic/DynamicPhraseTable.cpp b/mosesdecoder/moses2/TranslationModel/Dynamic/DynamicPhraseTable.cpp new file mode 100644 index 0000000000000000000000000000000000000000..a7edd3f846736e20baff74fb4ef0a815c4be27db --- /dev/null +++ b/mosesdecoder/moses2/TranslationModel/Dynamic/DynamicPhraseTable.cpp @@ -0,0 +1,180 @@ +/* + * DynamicPhraseTable.cpp + * + * Created on: 28 Oct 2015 + * Author: hieu + */ + +#include +#include +#include +#include "DynamicPhraseTable.h" +#include "../../PhraseBased/PhraseImpl.h" +#include "../../Phrase.h" +#include "../../System.h" +#include "../../Scores.h" +#include "../../InputPathsBase.h" +#include "../../legacy/InputFileStream.h" +#include "util/exception.hh" + +#include "../../PhraseBased/InputPath.h" +#include "../../PhraseBased/TargetPhraseImpl.h" +#include "../../PhraseBased/TargetPhrases.h" +#include "../../PhraseBased/SentenceWithCandidates.h" + +#include "../../SCFG/PhraseImpl.h" +#include "../../SCFG/TargetPhraseImpl.h" +#include "../../SCFG/InputPath.h" +#include "../../SCFG/Stack.h" +#include "../../SCFG/Stacks.h" +#include "../../SCFG/Manager.h" + +#include "../../PhraseBased/SentenceWithCandidates.h" +#include "../../PhraseBased/Manager.h" + +using namespace std; + +namespace Moses2 +{ +thread_local DynamicPhraseTable::PBNODE DynamicPhraseTable::m_rootPb; + +//////////////////////////////////////////////////////////////////////// + +DynamicPhraseTable::DynamicPhraseTable(size_t 
startInd, const std::string &line) + :PhraseTable(startInd, line) +{ + ReadParameters(); +} + +DynamicPhraseTable::~DynamicPhraseTable() +{ + m_rootPb.CleanNode(); +} + +void DynamicPhraseTable::CreatePTForInput(const ManagerBase &mgr, string phraseTableString) +{ + //cerr << "In CreatePTForInput" << endl << flush; + const System &system = mgr.system; + FactorCollection &vocab = system.GetVocab(); + MemPool &pool = mgr.GetPool(); + MemPool tmpSourcePool; + + if (system.isPb) { + //m_rootPb = new PBNODE(); + } else { + abort(); + //cerr << "m_rootSCFG=" << m_rootSCFG << endl; + } + + vector toks; + size_t lineNum = 0; + istringstream strme(phraseTableString); + string line; + while (getline(strme, line)) { + if (++lineNum % 1000000 == 0) { + cerr << lineNum << " "; + } + toks.clear(); + TokenizeMultiCharSeparator(toks, line, "|||"); + UTIL_THROW_IF2(toks.size() < 3, "Wrong format"); + //cerr << "line=" << line << endl; + //cerr << "system.isPb=" << system.isPb << endl; + + if (system.isPb) { + PhraseImpl *source = PhraseImpl::CreateFromString(tmpSourcePool, vocab, system, + toks[0]); + //cerr << "created soure" << endl; + TargetPhraseImpl *target = TargetPhraseImpl::CreateFromString(pool, *this, system, + toks[1]); + //cerr << "created target" << endl; + target->GetScores().CreateFromString(toks[2], *this, system, true); + //cerr << "created scores:" << *target << endl; + + if (toks.size() >= 4) { + //cerr << "alignstr=" << toks[3] << endl; + target->SetAlignmentInfo(toks[3]); + } + + // properties + if (toks.size() == 7) { + //target->properties = (char*) system.systemPool.Allocate(toks[6].size() + 1); + //strcpy(target->properties, toks[6].c_str()); + } + + system.featureFunctions.EvaluateInIsolation(pool, system, *source, + *target); + //cerr << "EvaluateInIsolation:" << target->Debug(system) << endl; + m_rootPb.AddRule(m_input, *source, target); + + //cerr << "target=" << target->Debug(system) << endl; + } else { + abort(); + } + } + + if (system.isPb) { + 
m_rootPb.SortAndPrune(m_tableLimit, pool, system); + //cerr << "root=" << &m_rootPb << endl; + } else { + abort(); + } + /* + BOOST_FOREACH(const PtMem::Node::Children::value_type &valPair, m_rootPb.GetChildren()) { + const Word &word = valPair.first; + cerr << word << " "; + } + cerr << endl; + */ + +} + +void DynamicPhraseTable::InitializeForInput(const ManagerBase &mgr, const InputType &input) +{ + // downcast to SentenceWithCandidates + const SentenceWithCandidates &inputObj = static_cast(input); + CreatePTForInput(mgr, inputObj.getPhraseTableString()); +} + +TargetPhrases* DynamicPhraseTable::Lookup(const Manager &mgr, MemPool &pool, + InputPath &inputPath) const +{ + const SubPhrase &phrase = inputPath.subPhrase; + TargetPhrases *tps = m_rootPb.Find(m_input, phrase); + return tps; +} + +void DynamicPhraseTable::CleanUpAfterSentenceProcessing(const System &system, const InputType &input) const { + m_rootPb.CleanNode(); //TODO : clean this +} + +void DynamicPhraseTable::InitActiveChart( + MemPool &pool, + const SCFG::Manager &mgr, + SCFG::InputPath &path) const +{ + abort(); +} + +void DynamicPhraseTable::Lookup(MemPool &pool, + const SCFG::Manager &mgr, + size_t maxChartSpan, + const SCFG::Stacks &stacks, + SCFG::InputPath &path) const +{ + abort(); +} + +void DynamicPhraseTable::LookupGivenNode( + MemPool &pool, + const SCFG::Manager &mgr, + const SCFG::ActiveChartEntry &prevEntry, + const SCFG::Word &wordSought, + const Moses2::Hypotheses *hypos, + const Moses2::Range &subPhraseRange, + SCFG::InputPath &outPath) const +{ + abort(); +} + +} + diff --git a/mosesdecoder/moses2/TranslationModel/Dynamic/DynamicPhraseTable.h b/mosesdecoder/moses2/TranslationModel/Dynamic/DynamicPhraseTable.h new file mode 100644 index 0000000000000000000000000000000000000000..99588eda84ec01c21289bff62b9babc47a1ebef8 --- /dev/null +++ b/mosesdecoder/moses2/TranslationModel/Dynamic/DynamicPhraseTable.h @@ -0,0 +1,88 @@ +/* + * MSPT.h + * + * Created on: 28 Oct 2015 + * Author: hieu 
+ */ +#pragma once + +#include "../PhraseTable.h" +#include "../../legacy/Util2.h" +#include "../../SCFG/InputPath.h" +#include "DynamicPhraseTableNode.h" +#include "../../PhraseBased/PhraseImpl.h" +#include "../../PhraseBased/TargetPhraseImpl.h" +#include "../../PhraseBased/TargetPhrases.h" +#include "../../SCFG/PhraseImpl.h" +#include "../../SCFG/TargetPhraseImpl.h" +#include "../../SCFG/TargetPhrases.h" + +namespace Moses2 +{ + +class DynamicPhraseTable: public PhraseTable +{ + typedef DynamicPhraseTableNS::Node, TargetPhraseImpl, TargetPhrases> PBNODE; + typedef DynamicPhraseTableNS::Node, SCFG::TargetPhraseImpl, SCFG::TargetPhrases> SCFGNODE; + +////////////////////////////////////// + class ActiveChartEntryMem : public SCFG::ActiveChartEntry + { + typedef SCFG::ActiveChartEntry Parent; + public: + const DynamicPhraseTable::SCFGNODE &node; + + ActiveChartEntryMem(MemPool &pool, const DynamicPhraseTable::SCFGNODE &vnode) + :Parent(pool) + ,node(vnode) + {} + + ActiveChartEntryMem( + MemPool &pool, + const DynamicPhraseTable::SCFGNODE &vnode, + const ActiveChartEntry &prevEntry) + :Parent(prevEntry) + ,node(vnode) + {} + }; + + ////////////////////////////////////// +public: + DynamicPhraseTable(size_t startInd, const std::string &line); + virtual ~DynamicPhraseTable(); + + virtual TargetPhrases *Lookup(const Manager &mgr, MemPool &pool, + InputPath &inputPath) const; + + virtual void InitActiveChart( + MemPool &pool, + const SCFG::Manager &mgr, + SCFG::InputPath &path) const; + + void Lookup(MemPool &pool, + const SCFG::Manager &mgr, + size_t maxChartSpan, + const SCFG::Stacks &stacks, + SCFG::InputPath &path) const; + + virtual void InitializeForInput(const ManagerBase &mgr, const InputType &input); + virtual void CleanUpAfterSentenceProcessing(const System &system, const InputType &input) const; + +protected: + thread_local static PBNODE m_rootPb; + + void LookupGivenNode( + MemPool &pool, + const SCFG::Manager &mgr, + const SCFG::ActiveChartEntry &prevEntry, 
+ const SCFG::Word &wordSought, + const Moses2::Hypotheses *hypos, + const Moses2::Range &subPhraseRange, + SCFG::InputPath &outPath) const; + + void CreatePTForInput(const ManagerBase &mgr, std::string phraseTableString); + +}; + +} + diff --git a/mosesdecoder/moses2/TranslationModel/Dynamic/DynamicPhraseTableNode.h b/mosesdecoder/moses2/TranslationModel/Dynamic/DynamicPhraseTableNode.h new file mode 100644 index 0000000000000000000000000000000000000000..56eb181cb00d380678c6d73fe322e69754edd656 --- /dev/null +++ b/mosesdecoder/moses2/TranslationModel/Dynamic/DynamicPhraseTableNode.h @@ -0,0 +1,133 @@ +/* + * Node.h + * + * Created on: 22 Apr 2016 + * Author: hieu + */ +#pragma once +#include +#include +#include "../../PhraseBased/TargetPhrases.h" +#include "../../System.h" +#include "../../Phrase.h" + +namespace Moses2 +{ +class System; + +namespace DynamicPhraseTableNS +{ + +template +class Node +{ +public: + typedef boost::unordered_map Children; + + Node() + :m_targetPhrases(NULL) + ,m_unsortedTPS(NULL) + {} + + ~Node() + {} + + void AddRule(const std::vector &factors, SP &source, TP *target) { + AddRule(factors, source, target, 0); + } + + TPS *Find(const std::vector &factors, const SP &source, size_t pos = 0) const { + assert(source.GetSize()); + if (pos == source.GetSize()) { + return m_targetPhrases; + } else { + const WORD &word = source[pos]; + //cerr << "word=" << word << endl; + typename Children::const_iterator iter = m_children.find(word.hash(factors)); + if (iter == m_children.end()) { + return NULL; + } else { + const Node &child = iter->second; + return child.Find(factors, source, pos + 1); + } + } + } + + const Node *Find(const std::vector &factors, const WORD &word) const { + typename Children::const_iterator iter = m_children.find(word.hash(factors)); + if (iter == m_children.end()) { + return NULL; + } else { + const Node &child = iter->second; + return &child; + } + } + + const TPS *GetTargetPhrases() const { + return m_targetPhrases; + } + + 
void SortAndPrune(size_t tableLimit, MemPool &pool, const System &system) { + BOOST_FOREACH(typename Children::value_type &val, m_children) { + Node &child = val.second; + child.SortAndPrune(tableLimit, pool, system); + } + + // prune target phrases in this node + if (m_unsortedTPS) { + m_targetPhrases = new (pool.Allocate()) TPS(pool, m_unsortedTPS->size()); + + for (size_t i = 0; i < m_unsortedTPS->size(); ++i) { + TP *tp = (*m_unsortedTPS)[i]; + m_targetPhrases->AddTargetPhrase(*tp); + } + + m_targetPhrases->SortAndPrune(tableLimit); + system.featureFunctions.EvaluateAfterTablePruning(system.GetSystemPool(), *m_targetPhrases, *m_source); + + delete m_unsortedTPS; + } + } + void CleanNode() { + m_children.clear(); + } + const Children &GetChildren() const { + return m_children; + } + + void Debug(std::ostream &out, const System &system) const { + BOOST_FOREACH(const typename Children::value_type &valPair, m_children) { + const WORD &word = valPair.first; + //std::cerr << word << "(" << word.hash() << ") "; + } + } +protected: + Children m_children; + TPS *m_targetPhrases; + Phrase *m_source; + std::vector *m_unsortedTPS; + + Node &AddRule(const std::vector &factors, SP &source, TP *target, size_t pos) { + if (pos == source.GetSize()) { + if (m_unsortedTPS == NULL) { + m_unsortedTPS = new std::vector(); + m_source = &source; + } + + m_unsortedTPS->push_back(target); + return *this; + } else { + const WORD &word = source[pos]; + Node &child = m_children[word.hash(factors)]; + //std::cerr << "added " << word << " " << &child << " from " << this << std::endl; + + return child.AddRule(factors, source, target, pos + 1); + } + } + +}; + + +} +} // namespace + diff --git a/mosesdecoder/moses2/TranslationModel/MSPT/MSNode.h b/mosesdecoder/moses2/TranslationModel/MSPT/MSNode.h new file mode 100644 index 0000000000000000000000000000000000000000..ad6d0842d22c7b31e461a8dc5b308f621161de73 --- /dev/null +++ b/mosesdecoder/moses2/TranslationModel/MSPT/MSNode.h @@ -0,0 +1,131 @@ 
+/* + * Node.h + * + * Created on: 22 Apr 2016 + * Author: hieu + */ +#pragma once +#include +#include +#include "../../PhraseBased/TargetPhrases.h" +#include "../../System.h" +#include "../../Phrase.h" + +namespace Moses2 +{ +class System; + +namespace MSPTNS +{ + +template +class Node +{ +public: + typedef boost::unordered_map Children; + + Node() + :m_targetPhrases(NULL) + ,m_unsortedTPS(NULL) + {} + + ~Node() + {} + + void AddRule(const std::vector &factors, SP &source, TP *target) { + AddRule(factors, source, target, 0); + } + + TPS *Find(const std::vector &factors, const SP &source, size_t pos = 0) const { + assert(source.GetSize()); + if (pos == source.GetSize()) { + return m_targetPhrases; + } else { + const WORD &word = source[pos]; + //cerr << "word=" << word << endl; + typename Children::const_iterator iter = m_children.find(word.hash(factors)); + if (iter == m_children.end()) { + return NULL; + } else { + const Node &child = iter->second; + return child.Find(factors, source, pos + 1); + } + } + } + + const Node *Find(const std::vector &factors, const WORD &word) const { + typename Children::const_iterator iter = m_children.find(word.hash(factors)); + if (iter == m_children.end()) { + return NULL; + } else { + const Node &child = iter->second; + return &child; + } + } + + const TPS *GetTargetPhrases() const { + return m_targetPhrases; + } + + void SortAndPrune(size_t tableLimit, MemPool &pool, System &system) { + BOOST_FOREACH(typename Children::value_type &val, m_children) { + Node &child = val.second; + child.SortAndPrune(tableLimit, pool, system); + } + + // prune target phrases in this node + if (m_unsortedTPS) { + m_targetPhrases = new (pool.Allocate()) TPS(pool, m_unsortedTPS->size()); + + for (size_t i = 0; i < m_unsortedTPS->size(); ++i) { + TP *tp = (*m_unsortedTPS)[i]; + m_targetPhrases->AddTargetPhrase(*tp); + } + + m_targetPhrases->SortAndPrune(tableLimit); + system.featureFunctions.EvaluateAfterTablePruning(system.GetSystemPool(), 
*m_targetPhrases, *m_source); + + delete m_unsortedTPS; + } + } + + const Children &GetChildren() const { + return m_children; + } + + void Debug(std::ostream &out, const System &system) const { + BOOST_FOREACH(const typename Children::value_type &valPair, m_children) { + const WORD &word = valPair.first; + //std::cerr << word << "(" << word.hash() << ") "; + } + } +protected: + Children m_children; + TPS *m_targetPhrases; + Phrase *m_source; + std::vector *m_unsortedTPS; + + Node &AddRule(const std::vector &factors, SP &source, TP *target, size_t pos) { + if (pos == source.GetSize()) { + if (m_unsortedTPS == NULL) { + m_unsortedTPS = new std::vector(); + m_source = &source; + } + + m_unsortedTPS->push_back(target); + return *this; + } else { + const WORD &word = source[pos]; + Node &child = m_children[word.hash(factors)]; + //std::cerr << "added " << word << " " << &child << " from " << this << std::endl; + + return child.AddRule(factors, source, target, pos + 1); + } + } + +}; + + +} +} // namespace + diff --git a/mosesdecoder/moses2/TranslationModel/MSPT/MSPT.cpp b/mosesdecoder/moses2/TranslationModel/MSPT/MSPT.cpp new file mode 100644 index 0000000000000000000000000000000000000000..c905d52406cc9dda32b65e7f14ad2b67d26e8c33 --- /dev/null +++ b/mosesdecoder/moses2/TranslationModel/MSPT/MSPT.cpp @@ -0,0 +1,265 @@ +/* + * MSPT.cpp + * + * Created on: 28 Oct 2015 + * Author: hieu + */ + +#include +#include +#include "MSPT.h" +#include "../../PhraseBased/PhraseImpl.h" +#include "../../Phrase.h" +#include "../../System.h" +#include "../../Scores.h" +#include "../../InputPathsBase.h" +#include "../../legacy/InputFileStream.h" +#include "util/exception.hh" + +#include "../../PhraseBased/InputPath.h" +#include "../../PhraseBased/TargetPhraseImpl.h" +#include "../../PhraseBased/TargetPhrases.h" + +#include "../../SCFG/PhraseImpl.h" +#include "../../SCFG/TargetPhraseImpl.h" +#include "../../SCFG/InputPath.h" +#include "../../SCFG/Stack.h" +#include "../../SCFG/Stacks.h" 
+#include "../../SCFG/Manager.h"
+
+
+using namespace std;
+
+namespace Moses2
+{
+
+
+////////////////////////////////////////////////////////////////////////
+
+/** Both trie roots start out NULL; Load() creates the one that matches the system. */
+MSPT::MSPT(size_t startInd, const std::string &line)
+  :PhraseTable(startInd, line)
+  ,m_rootPb(NULL)
+  ,m_rootSCFG(NULL)
+{
+  ReadParameters();
+}
+
+MSPT::~MSPT()
+{
+  delete m_rootPb;
+  delete m_rootSCFG;
+}
+
+/**
+ * Read the phrase table at m_path into the in-memory trie.
+ *
+ * Each line is split on "|||" into source, target, scores and an optional
+ * alignment field. Depending on system.isPb the rule goes into the
+ * phrase-based or the SCFG trie. Once every line is read, each node's rules
+ * are sorted and pruned to m_tableLimit.
+ *
+ * Throws (UTIL_THROW_IF2) on a line with fewer than 3 fields.
+ */
+void MSPT::Load(System &system)
+{
+  FactorCollection &vocab = system.GetVocab();
+  MemPool &systemPool = system.GetSystemPool();
+  // Source phrases are only needed while loading. The trie nodes keep raw
+  // pointers to them until SortAndPrune() below, so that call has to run
+  // before this pool goes out of scope.
+  MemPool tmpSourcePool;
+
+  if (system.isPb) {
+    m_rootPb = new PBNODE();
+  } else {
+    m_rootSCFG = new SCFGNODE();
+    //cerr << "m_rootSCFG=" << m_rootSCFG << endl;
+  }
+
+  vector<string> toks;
+  size_t lineNum = 0;
+  InputFileStream strme(m_path);
+  string line;
+  while (getline(strme, line)) {
+    if (++lineNum % 1000000 == 0) {
+      cerr << lineNum << " ";
+    }
+    toks.clear();
+    TokenizeMultiCharSeparator(toks, line, "|||");
+    UTIL_THROW_IF2(toks.size() < 3, "Wrong format");
+    //cerr << "line=" << line << endl;
+    //cerr << "system.isPb=" << system.isPb << endl;
+
+    if (system.isPb) {
+      PhraseImpl *source = PhraseImpl::CreateFromString(tmpSourcePool, vocab, system,
+                           toks[0]);
+      //cerr << "created soure" << endl;
+      TargetPhraseImpl *target = TargetPhraseImpl::CreateFromString(systemPool, *this, system,
+                                 toks[1]);
+      //cerr << "created target" << endl;
+      target->GetScores().CreateFromString(toks[2], *this, system, true);
+      //cerr << "created scores:" << *target << endl;
+
+      if (toks.size() >= 4) {
+        //cerr << "alignstr=" << toks[3] << endl;
+        target->SetAlignmentInfo(toks[3]);
+      }
+
+      // properties
+      if (toks.size() == 7) {
+        //target->properties = (char*) system.systemPool.Allocate(toks[6].size() + 1);
+        //strcpy(target->properties, toks[6].c_str());
+      }
+
+      system.featureFunctions.EvaluateInIsolation(systemPool, system, *source,
+          *target);
+      //cerr << "EvaluateInIsolation:" << *target << endl;
+      m_rootPb->AddRule(m_input, *source, target);
+
+      //cerr << "target=" << target->Debug(system) << endl;
+    } else {
+      SCFG::PhraseImpl *source = SCFG::PhraseImpl::CreateFromString(tmpSourcePool, vocab, system,
+                                 toks[0]);
+      //cerr << "created source:" << *source << endl;
+      SCFG::TargetPhraseImpl *target = SCFG::TargetPhraseImpl::CreateFromString(systemPool, *this,
+                                       system, toks[1]);
+
+      //cerr << "created target " << *target << " source=" << *source << endl;
+
+      target->GetScores().CreateFromString(toks[2], *this, system, true);
+      //cerr << "created scores:" << *target << endl;
+
+      //vector<SCORE> scores = Tokenize<SCORE>(toks[2]);
+      //target->sortScore = (scores.size() >= 3) ? TransformScore(scores[2]) : 0;
+
+      // Only 3 fields are guaranteed above, so toks[3] must be guarded exactly
+      // as in the phrase-based branch; reading it unconditionally was an
+      // out-of-range access on lines without an alignment field.
+      if (toks.size() >= 4) {
+        target->SetAlignmentInfo(toks[3]);
+      }
+
+      // properties
+      if (toks.size() == 7) {
+        //target->properties = (char*) system.systemPool.Allocate(toks[6].size() + 1);
+        //strcpy(target->properties, toks[6].c_str());
+      }
+
+      system.featureFunctions.EvaluateInIsolation(systemPool, system, *source,
+          *target);
+      //cerr << "EvaluateInIsolation:" << *target << endl;
+      m_rootSCFG->AddRule(m_input, *source, target);
+    }
+  }
+
+  if (system.isPb) {
+    m_rootPb->SortAndPrune(m_tableLimit, systemPool, system);
+    //cerr << "root=" << &m_rootPb << endl;
+  } else {
+    m_rootSCFG->SortAndPrune(m_tableLimit, systemPool, system);
+    //cerr << "root=" << &m_rootPb << endl;
+  }
+  /*
+  BOOST_FOREACH(const PtMem::Node<Word>::Children::value_type &valPair, m_rootPb.GetChildren()) {
+    const Word &word = valPair.first;
+    cerr << word << " ";
+  }
+  cerr << endl;
+  */
+}
+
+/**
+ * Phrase-based lookup: return the target phrases stored in the phrase-based
+ * trie for the sub-phrase covered by inputPath, or NULL when there are none.
+ */
+TargetPhrases* MSPT::Lookup(const Manager &mgr, MemPool &pool,
+                            InputPath &inputPath) const
+{
+  const SubPhrase<Moses2::Word> &phrase = inputPath.subPhrase;
+  TargetPhrases *tps = m_rootPb->Find(m_input, phrase);
+  return tps;
+}
+
+void MSPT::InitActiveChart(
+  MemPool &pool,
+  const SCFG::Manager &mgr,
+  SCFG::InputPath &path) const
+{
+  size_t ptInd = GetPtInd();
+  ActiveChartEntryMem *chartEntry = new (pool.Allocate<ActiveChartEntryMem>()) ActiveChartEntryMem(pool, *m_rootSCFG);
+  path.AddActiveChartEntry(ptInd, chartEntry);
+  //cerr <<
"InitActiveChart=" << path << endl; +} + +void MSPT::Lookup(MemPool &pool, + const SCFG::Manager &mgr, + size_t maxChartSpan, + const SCFG::Stacks &stacks, + SCFG::InputPath &path) const +{ + if (path.range.GetNumWordsCovered() > maxChartSpan) { + return; + } + + size_t endPos = path.range.GetEndPos(); + + const SCFG::InputPath *prevPath = static_cast(path.prefixPath); + UTIL_THROW_IF2(prevPath == NULL, "prefixPath == NULL"); + + // TERMINAL + const SCFG::Word &lastWord = path.subPhrase.Back(); + + const SCFG::InputPath &subPhrasePath = *mgr.GetInputPaths().GetMatrix().GetValue(endPos, 1); + + //cerr << "BEFORE LookupGivenWord=" << *prevPath << endl; + LookupGivenWord(pool, mgr, *prevPath, lastWord, NULL, subPhrasePath.range, path); + //cerr << "AFTER LookupGivenWord=" << *prevPath << endl; + + // NON-TERMINAL + //const SCFG::InputPath *prefixPath = static_cast(path.prefixPath); + while (prevPath) { + const Range &prevRange = prevPath->range; + //cerr << "prevRange=" << prevRange << endl; + + size_t startPos = prevRange.GetEndPos() + 1; + size_t ntSize = endPos - startPos + 1; + const SCFG::InputPath &subPhrasePath = *mgr.GetInputPaths().GetMatrix().GetValue(startPos, ntSize); + + LookupNT(pool, mgr, subPhrasePath.range, *prevPath, stacks, path); + + prevPath = static_cast(prevPath->prefixPath); + } +} + +void MSPT::LookupGivenNode( + MemPool &pool, + const SCFG::Manager &mgr, + const SCFG::ActiveChartEntry &prevEntry, + const SCFG::Word &wordSought, + const Moses2::Hypotheses *hypos, + const Moses2::Range &subPhraseRange, + SCFG::InputPath &outPath) const +{ + const ActiveChartEntryMem &prevEntryCast = static_cast(prevEntry); + + const SCFGNODE &prevNode = prevEntryCast.node; + UTIL_THROW_IF2(&prevNode == NULL, "node == NULL"); + + size_t ptInd = GetPtInd(); + const SCFGNODE *nextNode = prevNode.Find(m_input, wordSought); + + /* + if (outPath.range.GetStartPos() == 1 || outPath.range.GetStartPos() == 2) { + cerr << "range=" << outPath.range + << " prevEntry=" << 
prevEntry.GetSymbolBind().Debug(mgr.system) + << " wordSought=" << wordSought.Debug(mgr.system) + << " nextNode=" << nextNode + << endl; + } + */ + if (nextNode) { + // new entries + ActiveChartEntryMem *chartEntry = new (pool.Allocate()) ActiveChartEntryMem(pool, *nextNode, prevEntry); + + chartEntry->AddSymbolBindElement(subPhraseRange, wordSought, hypos, *this); + //cerr << "AFTER Add=" << symbolBind << endl; + + outPath.AddActiveChartEntry(ptInd, chartEntry); + + const SCFG::TargetPhrases *tps = nextNode->GetTargetPhrases(); + if (tps) { + // there are some rules + /* + cerr << "outPath=" << outPath.range + << " bind=" << chartEntry->GetSymbolBind().Debug(mgr.system) + << " pt=" << GetPtInd() + << " tps=" << tps->Debug(mgr.system) << endl; + */ + outPath.AddTargetPhrasesToPath(pool, mgr.system, *this, *tps, chartEntry->GetSymbolBind()); + + } + + //cerr << "AFTER outPath=" << outPath << endl; + } +} + +} + diff --git a/mosesdecoder/moses2/TranslationModel/MSPT/MSPT.h b/mosesdecoder/moses2/TranslationModel/MSPT/MSPT.h new file mode 100644 index 0000000000000000000000000000000000000000..d3946d35342bc5df5224bc8d40df67e85c6c0000 --- /dev/null +++ b/mosesdecoder/moses2/TranslationModel/MSPT/MSPT.h @@ -0,0 +1,85 @@ +/* + * MSPT.h + * + * Created on: 28 Oct 2015 + * Author: hieu + */ +#pragma once + +#include "../PhraseTable.h" +#include "../../legacy/Util2.h" +#include "../../SCFG/InputPath.h" +#include "MSNode.h" +#include "../../PhraseBased/PhraseImpl.h" +#include "../../PhraseBased/TargetPhraseImpl.h" +#include "../../PhraseBased/TargetPhrases.h" +#include "../../SCFG/PhraseImpl.h" +#include "../../SCFG/TargetPhraseImpl.h" +#include "../../SCFG/TargetPhrases.h" + +namespace Moses2 +{ + +class MSPT: public PhraseTable +{ + typedef MSPTNS::Node, TargetPhraseImpl, TargetPhrases> PBNODE; + typedef MSPTNS::Node, SCFG::TargetPhraseImpl, SCFG::TargetPhrases> SCFGNODE; + +////////////////////////////////////// + class ActiveChartEntryMem : public SCFG::ActiveChartEntry + 
{ + typedef SCFG::ActiveChartEntry Parent; + public: + const MSPT::SCFGNODE &node; + + ActiveChartEntryMem(MemPool &pool, const MSPT::SCFGNODE &vnode) + :Parent(pool) + ,node(vnode) + {} + + ActiveChartEntryMem( + MemPool &pool, + const MSPT::SCFGNODE &vnode, + const ActiveChartEntry &prevEntry) + :Parent(prevEntry) + ,node(vnode) + {} + }; + + ////////////////////////////////////// +public: + MSPT(size_t startInd, const std::string &line); + virtual ~MSPT(); + + virtual void Load(System &system); + virtual TargetPhrases *Lookup(const Manager &mgr, MemPool &pool, + InputPath &inputPath) const; + + virtual void InitActiveChart( + MemPool &pool, + const SCFG::Manager &mgr, + SCFG::InputPath &path) const; + + void Lookup(MemPool &pool, + const SCFG::Manager &mgr, + size_t maxChartSpan, + const SCFG::Stacks &stacks, + SCFG::InputPath &path) const; + +protected: + PBNODE *m_rootPb; + SCFGNODE *m_rootSCFG; + + void LookupGivenNode( + MemPool &pool, + const SCFG::Manager &mgr, + const SCFG::ActiveChartEntry &prevEntry, + const SCFG::Word &wordSought, + const Moses2::Hypotheses *hypos, + const Moses2::Range &subPhraseRange, + SCFG::InputPath &outPath) const; + +}; + +} + diff --git a/mosesdecoder/moses2/TranslationModel/Memory/Node.h b/mosesdecoder/moses2/TranslationModel/Memory/Node.h new file mode 100644 index 0000000000000000000000000000000000000000..d5a6b879583046e0fa234a5a076eeca212469cb1 --- /dev/null +++ b/mosesdecoder/moses2/TranslationModel/Memory/Node.h @@ -0,0 +1,131 @@ +/* + * Node.h + * + * Created on: 22 Apr 2016 + * Author: hieu + */ +#pragma once +#include +#include +#include "../../PhraseBased/TargetPhrases.h" +#include "../../System.h" +#include "../../Phrase.h" + +namespace Moses2 +{ +class System; + +namespace PtMem +{ + +template +class Node +{ +public: + typedef boost::unordered_map Children; + + Node() + :m_targetPhrases(NULL) + ,m_unsortedTPS(NULL) + {} + + ~Node() + {} + + void AddRule(const std::vector &factors, SP &source, TP *target) { + 
AddRule(factors, source, target, 0); + } + + TPS *Find(const std::vector &factors, const SP &source, size_t pos = 0) const { + assert(source.GetSize()); + if (pos == source.GetSize()) { + return m_targetPhrases; + } else { + const WORD &word = source[pos]; + //cerr << "word=" << word << endl; + typename Children::const_iterator iter = m_children.find(word.hash(factors)); + if (iter == m_children.end()) { + return NULL; + } else { + const Node &child = iter->second; + return child.Find(factors, source, pos + 1); + } + } + } + + const Node *Find(const std::vector &factors, const WORD &word) const { + typename Children::const_iterator iter = m_children.find(word.hash(factors)); + if (iter == m_children.end()) { + return NULL; + } else { + const Node &child = iter->second; + return &child; + } + } + + const TPS *GetTargetPhrases() const { + return m_targetPhrases; + } + + void SortAndPrune(size_t tableLimit, MemPool &pool, System &system) { + BOOST_FOREACH(typename Children::value_type &val, m_children) { + Node &child = val.second; + child.SortAndPrune(tableLimit, pool, system); + } + + // prune target phrases in this node + if (m_unsortedTPS) { + m_targetPhrases = new (pool.Allocate()) TPS(pool, m_unsortedTPS->size()); + + for (size_t i = 0; i < m_unsortedTPS->size(); ++i) { + TP *tp = (*m_unsortedTPS)[i]; + m_targetPhrases->AddTargetPhrase(*tp); + } + + m_targetPhrases->SortAndPrune(tableLimit); + system.featureFunctions.EvaluateAfterTablePruning(system.GetSystemPool(), *m_targetPhrases, *m_source); + + delete m_unsortedTPS; + } + } + + const Children &GetChildren() const { + return m_children; + } + + void Debug(std::ostream &out, const System &system) const { + BOOST_FOREACH(const typename Children::value_type &valPair, m_children) { + const WORD &word = valPair.first; + //std::cerr << word << "(" << word.hash() << ") "; + } + } +protected: + Children m_children; + TPS *m_targetPhrases; + Phrase *m_source; + std::vector *m_unsortedTPS; + + Node &AddRule(const 
std::vector &factors, SP &source, TP *target, size_t pos) { + if (pos == source.GetSize()) { + if (m_unsortedTPS == NULL) { + m_unsortedTPS = new std::vector(); + m_source = &source; + } + + m_unsortedTPS->push_back(target); + return *this; + } else { + const WORD &word = source[pos]; + Node &child = m_children[word.hash(factors)]; + //std::cerr << "added " << word << " " << &child << " from " << this << std::endl; + + return child.AddRule(factors, source, target, pos + 1); + } + } + +}; + + +} +} // namespace + diff --git a/mosesdecoder/moses2/TranslationModel/Memory/PhraseTableMemory.cpp b/mosesdecoder/moses2/TranslationModel/Memory/PhraseTableMemory.cpp new file mode 100644 index 0000000000000000000000000000000000000000..9b231ebd64ad9740f5639543e3b9c091d9c86c38 --- /dev/null +++ b/mosesdecoder/moses2/TranslationModel/Memory/PhraseTableMemory.cpp @@ -0,0 +1,265 @@ +/* + * PhraseTableMemory.cpp + * + * Created on: 28 Oct 2015 + * Author: hieu + */ + +#include +#include +#include "PhraseTableMemory.h" +#include "../../PhraseBased/PhraseImpl.h" +#include "../../Phrase.h" +#include "../../System.h" +#include "../../Scores.h" +#include "../../InputPathsBase.h" +#include "../../legacy/InputFileStream.h" +#include "util/exception.hh" + +#include "../../PhraseBased/InputPath.h" +#include "../../PhraseBased/TargetPhraseImpl.h" +#include "../../PhraseBased/TargetPhrases.h" + +#include "../../SCFG/PhraseImpl.h" +#include "../../SCFG/TargetPhraseImpl.h" +#include "../../SCFG/InputPath.h" +#include "../../SCFG/Stack.h" +#include "../../SCFG/Stacks.h" +#include "../../SCFG/Manager.h" + + +using namespace std; + +namespace Moses2 +{ + + +//////////////////////////////////////////////////////////////////////// + +PhraseTableMemory::PhraseTableMemory(size_t startInd, const std::string &line) + :PhraseTable(startInd, line) + ,m_rootPb(NULL) + ,m_rootSCFG(NULL) +{ + ReadParameters(); +} + +PhraseTableMemory::~PhraseTableMemory() +{ + delete m_rootPb; + delete m_rootSCFG; +} + +void 
PhraseTableMemory::Load(System &system) +{ + FactorCollection &vocab = system.GetVocab(); + MemPool &systemPool = system.GetSystemPool(); + MemPool tmpSourcePool; + + if (system.isPb) { + m_rootPb = new PBNODE(); + } else { + m_rootSCFG = new SCFGNODE(); + //cerr << "m_rootSCFG=" << m_rootSCFG << endl; + } + + vector toks; + size_t lineNum = 0; + InputFileStream strme(m_path); + string line; + while (getline(strme, line)) { + if (++lineNum % 1000000 == 0) { + cerr << lineNum << " "; + } + toks.clear(); + TokenizeMultiCharSeparator(toks, line, "|||"); + UTIL_THROW_IF2(toks.size() < 3, "Wrong format"); + //cerr << "line=" << line << endl; + //cerr << "system.isPb=" << system.isPb << endl; + + if (system.isPb) { + PhraseImpl *source = PhraseImpl::CreateFromString(tmpSourcePool, vocab, system, + toks[0]); + //cerr << "created soure" << endl; + TargetPhraseImpl *target = TargetPhraseImpl::CreateFromString(systemPool, *this, system, + toks[1]); + //cerr << "created target" << endl; + target->GetScores().CreateFromString(toks[2], *this, system, true); + //cerr << "created scores:" << *target << endl; + + if (toks.size() >= 4) { + //cerr << "alignstr=" << toks[3] << endl; + target->SetAlignmentInfo(toks[3]); + } + + // properties + if (toks.size() == 7) { + //target->properties = (char*) system.systemPool.Allocate(toks[6].size() + 1); + //strcpy(target->properties, toks[6].c_str()); + } + + system.featureFunctions.EvaluateInIsolation(systemPool, system, *source, + *target); + //cerr << "EvaluateInIsolation:" << *target << endl; + m_rootPb->AddRule(m_input, *source, target); + + //cerr << "target=" << target->Debug(system) << endl; + } else { + SCFG::PhraseImpl *source = SCFG::PhraseImpl::CreateFromString(tmpSourcePool, vocab, system, + toks[0]); + //cerr << "created source:" << *source << endl; + SCFG::TargetPhraseImpl *target = SCFG::TargetPhraseImpl::CreateFromString(systemPool, *this, + system, toks[1]); + + //cerr << "created target " << *target << " source=" << 
*source << endl; + + target->GetScores().CreateFromString(toks[2], *this, system, true); + //cerr << "created scores:" << *target << endl; + + //vector scores = Tokenize(toks[2]); + //target->sortScore = (scores.size() >= 3) ? TransformScore(scores[2]) : 0; + + target->SetAlignmentInfo(toks[3]); + + // properties + if (toks.size() == 7) { + //target->properties = (char*) system.systemPool.Allocate(toks[6].size() + 1); + //strcpy(target->properties, toks[6].c_str()); + } + + system.featureFunctions.EvaluateInIsolation(systemPool, system, *source, + *target); + //cerr << "EvaluateInIsolation:" << *target << endl; + m_rootSCFG->AddRule(m_input, *source, target); + } + } + + if (system.isPb) { + m_rootPb->SortAndPrune(m_tableLimit, systemPool, system); + //cerr << "root=" << &m_rootPb << endl; + } else { + m_rootSCFG->SortAndPrune(m_tableLimit, systemPool, system); + //cerr << "root=" << &m_rootPb << endl; + } + /* + BOOST_FOREACH(const PtMem::Node::Children::value_type &valPair, m_rootPb.GetChildren()) { + const Word &word = valPair.first; + cerr << word << " "; + } + cerr << endl; + */ +} + +TargetPhrases* PhraseTableMemory::Lookup(const Manager &mgr, MemPool &pool, + InputPath &inputPath) const +{ + const SubPhrase &phrase = inputPath.subPhrase; + TargetPhrases *tps = m_rootPb->Find(m_input, phrase); + return tps; +} + +void PhraseTableMemory::InitActiveChart( + MemPool &pool, + const SCFG::Manager &mgr, + SCFG::InputPath &path) const +{ + size_t ptInd = GetPtInd(); + ActiveChartEntryMem *chartEntry = new (pool.Allocate()) ActiveChartEntryMem(pool, *m_rootSCFG); + path.AddActiveChartEntry(ptInd, chartEntry); + //cerr << "InitActiveChart=" << path << endl; +} + +void PhraseTableMemory::Lookup(MemPool &pool, + const SCFG::Manager &mgr, + size_t maxChartSpan, + const SCFG::Stacks &stacks, + SCFG::InputPath &path) const +{ + if (path.range.GetNumWordsCovered() > maxChartSpan) { + return; + } + + size_t endPos = path.range.GetEndPos(); + + const SCFG::InputPath *prevPath 
= static_cast(path.prefixPath); + UTIL_THROW_IF2(prevPath == NULL, "prefixPath == NULL"); + + // TERMINAL + const SCFG::Word &lastWord = path.subPhrase.Back(); + + const SCFG::InputPath &subPhrasePath = *mgr.GetInputPaths().GetMatrix().GetValue(endPos, 1); + + //cerr << "BEFORE LookupGivenWord=" << *prevPath << endl; + LookupGivenWord(pool, mgr, *prevPath, lastWord, NULL, subPhrasePath.range, path); + //cerr << "AFTER LookupGivenWord=" << *prevPath << endl; + + // NON-TERMINAL + //const SCFG::InputPath *prefixPath = static_cast(path.prefixPath); + while (prevPath) { + const Range &prevRange = prevPath->range; + //cerr << "prevRange=" << prevRange << endl; + + size_t startPos = prevRange.GetEndPos() + 1; + size_t ntSize = endPos - startPos + 1; + const SCFG::InputPath &subPhrasePath = *mgr.GetInputPaths().GetMatrix().GetValue(startPos, ntSize); + + LookupNT(pool, mgr, subPhrasePath.range, *prevPath, stacks, path); + + prevPath = static_cast(prevPath->prefixPath); + } +} + +void PhraseTableMemory::LookupGivenNode( + MemPool &pool, + const SCFG::Manager &mgr, + const SCFG::ActiveChartEntry &prevEntry, + const SCFG::Word &wordSought, + const Moses2::Hypotheses *hypos, + const Moses2::Range &subPhraseRange, + SCFG::InputPath &outPath) const +{ + const ActiveChartEntryMem &prevEntryCast = static_cast(prevEntry); + + const SCFGNODE &prevNode = prevEntryCast.node; + UTIL_THROW_IF2(&prevNode == NULL, "node == NULL"); + + size_t ptInd = GetPtInd(); + const SCFGNODE *nextNode = prevNode.Find(m_input, wordSought); + + /* + if (outPath.range.GetStartPos() == 1 || outPath.range.GetStartPos() == 2) { + cerr << "range=" << outPath.range + << " prevEntry=" << prevEntry.GetSymbolBind().Debug(mgr.system) + << " wordSought=" << wordSought.Debug(mgr.system) + << " nextNode=" << nextNode + << endl; + } + */ + if (nextNode) { + // new entries + ActiveChartEntryMem *chartEntry = new (pool.Allocate()) ActiveChartEntryMem(pool, *nextNode, prevEntry); + + 
chartEntry->AddSymbolBindElement(subPhraseRange, wordSought, hypos, *this); + //cerr << "AFTER Add=" << symbolBind << endl; + + outPath.AddActiveChartEntry(ptInd, chartEntry); + + const SCFG::TargetPhrases *tps = nextNode->GetTargetPhrases(); + if (tps) { + // there are some rules + /* + cerr << "outPath=" << outPath.range + << " bind=" << chartEntry->GetSymbolBind().Debug(mgr.system) + << " pt=" << GetPtInd() + << " tps=" << tps->Debug(mgr.system) << endl; + */ + outPath.AddTargetPhrasesToPath(pool, mgr.system, *this, *tps, chartEntry->GetSymbolBind()); + + } + + //cerr << "AFTER outPath=" << outPath << endl; + } +} + +} + diff --git a/mosesdecoder/moses2/TranslationModel/Memory/PhraseTableMemory.h b/mosesdecoder/moses2/TranslationModel/Memory/PhraseTableMemory.h new file mode 100644 index 0000000000000000000000000000000000000000..07a47c7fffff1f459d85484d6e3f725055088d05 --- /dev/null +++ b/mosesdecoder/moses2/TranslationModel/Memory/PhraseTableMemory.h @@ -0,0 +1,85 @@ +/* + * PhraseTableMemory.h + * + * Created on: 28 Oct 2015 + * Author: hieu + */ +#pragma once + +#include "../PhraseTable.h" +#include "../../legacy/Util2.h" +#include "../../SCFG/InputPath.h" +#include "Node.h" +#include "../../PhraseBased/PhraseImpl.h" +#include "../../PhraseBased/TargetPhraseImpl.h" +#include "../../PhraseBased/TargetPhrases.h" +#include "../../SCFG/PhraseImpl.h" +#include "../../SCFG/TargetPhraseImpl.h" +#include "../../SCFG/TargetPhrases.h" + +namespace Moses2 +{ + +class PhraseTableMemory: public PhraseTable +{ + typedef PtMem::Node, TargetPhraseImpl, TargetPhrases> PBNODE; + typedef PtMem::Node, SCFG::TargetPhraseImpl, SCFG::TargetPhrases> SCFGNODE; + +////////////////////////////////////// + class ActiveChartEntryMem : public SCFG::ActiveChartEntry + { + typedef SCFG::ActiveChartEntry Parent; + public: + const PhraseTableMemory::SCFGNODE &node; + + ActiveChartEntryMem(MemPool &pool, const PhraseTableMemory::SCFGNODE &vnode) + :Parent(pool) + ,node(vnode) + {} + + 
ActiveChartEntryMem( + MemPool &pool, + const PhraseTableMemory::SCFGNODE &vnode, + const ActiveChartEntry &prevEntry) + :Parent(prevEntry) + ,node(vnode) + {} + }; + + ////////////////////////////////////// +public: + PhraseTableMemory(size_t startInd, const std::string &line); + virtual ~PhraseTableMemory(); + + virtual void Load(System &system); + virtual TargetPhrases *Lookup(const Manager &mgr, MemPool &pool, + InputPath &inputPath) const; + + virtual void InitActiveChart( + MemPool &pool, + const SCFG::Manager &mgr, + SCFG::InputPath &path) const; + + void Lookup(MemPool &pool, + const SCFG::Manager &mgr, + size_t maxChartSpan, + const SCFG::Stacks &stacks, + SCFG::InputPath &path) const; + +protected: + PBNODE *m_rootPb; + SCFGNODE *m_rootSCFG; + + void LookupGivenNode( + MemPool &pool, + const SCFG::Manager &mgr, + const SCFG::ActiveChartEntry &prevEntry, + const SCFG::Word &wordSought, + const Moses2::Hypotheses *hypos, + const Moses2::Range &subPhraseRange, + SCFG::InputPath &outPath) const; + +}; + +} + diff --git a/mosesdecoder/moses2/TranslationModel/PhraseTable.cpp b/mosesdecoder/moses2/TranslationModel/PhraseTable.cpp new file mode 100644 index 0000000000000000000000000000000000000000..1f87b8040610d2721ef4027413d5c4521d674b48 --- /dev/null +++ b/mosesdecoder/moses2/TranslationModel/PhraseTable.cpp @@ -0,0 +1,170 @@ +/* + * PhraseTable.cpp + * + * Created on: 23 Oct 2015 + * Author: hieu + */ +#include +#include +#include "PhraseTable.h" +#include "../legacy/Util2.h" +#include "../TypeDef.h" +#include "../InputType.h" +#include "../PhraseBased/Manager.h" +#include "../PhraseBased/InputPath.h" +#include "../SCFG/InputPath.h" +#include "../SCFG/Manager.h" + +using namespace std; + +namespace Moses2 +{ + +//////////////////////////////////////////////////////////////////////////// +PhraseTable::PhraseTable(size_t startInd, const std::string &line) : + StatelessFeatureFunction(startInd, line), m_tableLimit(20) // default + , 
m_maxCacheSize(DEFAULT_MAX_TRANS_OPT_CACHE_SIZE) +{ + m_input.push_back(0); +} + +PhraseTable::~PhraseTable() +{ + // TODO Auto-generated destructor stub +} + +void PhraseTable::SetParameter(const std::string& key, const std::string& value) +{ + if (key == "cache-size") { + m_maxCacheSize = Scan(value); + } else if (key == "path") { + m_path = value; + } else if (key == "input-factor") { + m_input = Tokenize(value, ","); + } else if (key == "output-factor") { + m_output = Tokenize(value, ","); + } else if (key == "table-limit") { + m_tableLimit = Scan(value); + } else { + StatelessFeatureFunction::SetParameter(key, value); + } +} + +bool PhraseTable::SatisfyBackoff(const Manager &mgr, const InputPath &path) const +{ + const InputType &input = mgr.GetInput(); + if ((mgr.system.options.input.xml_policy == XmlExclusive) + && input.XmlOverlap(path.range.GetStartPos(), path.range.GetEndPos())) { + return false; + } + + //cerr << GetName() << "=" << GetPtInd() << "=" << decodeGraphBackoff << endl; + if (decodeGraphBackoff == 0) { + // always lookup + return true; + } else if (decodeGraphBackoff == -1) { + // lookup only if there's no existing rules + return path.GetNumRules() ? false : true; + } else if (path.range.GetNumWordsCovered() <= decodeGraphBackoff) { + return path.GetNumRules() ? 
false : true; + } + + return false; +} + +void PhraseTable::Lookup(const Manager &mgr, InputPathsBase &inputPaths) const +{ + BOOST_FOREACH(InputPathBase *pathBase, inputPaths) { + InputPath *path = static_cast(pathBase); + //cerr << "path=" << path->range << " "; + + if (SatisfyBackoff(mgr, *path)) { + TargetPhrases *tpsPtr = Lookup(mgr, mgr.GetPool(), *path); + //cerr << "tpsPtr=" << tpsPtr << endl; + + path->AddTargetPhrases(*this, tpsPtr); + } + } + +} + +TargetPhrases *PhraseTable::Lookup(const Manager &mgr, MemPool &pool, + InputPath &inputPath) const +{ + UTIL_THROW2("Not implemented"); +} + +void PhraseTable::EvaluateInIsolation(MemPool &pool, const System &system, + const Phrase &source, const TargetPhraseImpl &targetPhrase, Scores &scores, + SCORE &estimatedScore) const +{ +} + +void PhraseTable::EvaluateInIsolation(MemPool &pool, const System &system, const Phrase &source, + const TargetPhrase &targetPhrase, Scores &scores, + SCORE &estimatedScore) const +{ + +} + +// scfg +void PhraseTable::LookupUnary(MemPool &pool, + const SCFG::Manager &mgr, + const SCFG::Stacks &stacks, + SCFG::InputPath &path) const +{ + //cerr << "BEFORE LookupUnary" << path.Debug(mgr.system) << endl; + size_t startPos = path.range.GetStartPos(); + const SCFG::InputPath *prevPath = mgr.GetInputPaths().GetMatrix().GetValue(startPos, 0); + LookupNT(pool, mgr, path.range, *prevPath, stacks, path); + //cerr << "AFTER LookupUnary" << path.Debug(mgr.system) << endl; +} + +void PhraseTable::LookupNT( + MemPool &pool, + const SCFG::Manager &mgr, + const Moses2::Range &subPhraseRange, + const SCFG::InputPath &prevPath, + const SCFG::Stacks &stacks, + SCFG::InputPath &outPath) const +{ + size_t endPos = outPath.range.GetEndPos(); + + const Range &prevRange = prevPath.range; + + size_t startPos = prevRange.GetEndPos() + 1; + size_t ntSize = endPos - startPos + 1; + + const SCFG::Stack &ntStack = stacks.GetStack(startPos, ntSize); + const SCFG::Stack::Coll &stackColl = ntStack.GetColl(); + + 
BOOST_FOREACH (const SCFG::Stack::Coll::value_type &valPair, stackColl) { + const SCFG::Word &ntSought = valPair.first; + const Moses2::HypothesisColl *hypos = valPair.second; + const Moses2::Hypotheses &sortedHypos = hypos->GetSortedAndPrunedHypos(mgr, mgr.arcLists); + //cerr << "ntSought=" << ntSought << ntSought.isNonTerminal << endl; + LookupGivenWord(pool, mgr, prevPath, ntSought, &sortedHypos, subPhraseRange, outPath); + } +} + +void PhraseTable::LookupGivenWord( + MemPool &pool, + const SCFG::Manager &mgr, + const SCFG::InputPath &prevPath, + const SCFG::Word &wordSought, + const Moses2::Hypotheses *hypos, + const Moses2::Range &subPhraseRange, + SCFG::InputPath &outPath) const +{ + size_t ptInd = GetPtInd(); + + + BOOST_FOREACH(const SCFG::ActiveChartEntry *prevEntry, prevPath.GetActiveChart(ptInd).entries) { + //cerr << "BEFORE LookupGivenNode=" << prevPath << endl; + LookupGivenNode(pool, mgr, *prevEntry, wordSought, hypos, subPhraseRange, outPath); + //cerr << "AFTER LookupGivenNode=" << prevPath << endl; + } +} + +} + diff --git a/mosesdecoder/moses2/TranslationModel/PhraseTable.h b/mosesdecoder/moses2/TranslationModel/PhraseTable.h new file mode 100644 index 0000000000000000000000000000000000000000..ef40c06a4190bbb295fb94f8274a3792fcbc5957 --- /dev/null +++ b/mosesdecoder/moses2/TranslationModel/PhraseTable.h @@ -0,0 +1,129 @@ +/* + * PhraseTable.h + * + * Created on: 23 Oct 2015 + * Author: hieu + */ +#pragma once +#include +#include +#include "../Word.h" +#include "../HypothesisColl.h" +#include "../FF/StatelessFeatureFunction.h" +#include "../legacy/Util2.h" + +namespace Moses2 +{ + +class System; +class InputPathsBase; +class InputPath; +class Manager; +class TargetPhrases; +class Range; + +namespace SCFG +{ +class InputPath; +class Stacks; +class Manager; +class ActiveChartEntry; +} + +//////////////////////////////////////////////////////////////////////// +class PhraseTable: public StatelessFeatureFunction +{ +public: + int decodeGraphBackoff; + 
+ PhraseTable(size_t startInd, const std::string &line); + virtual ~PhraseTable(); + + virtual void SetParameter(const std::string& key, const std::string& value); + virtual void Lookup(const Manager &mgr, InputPathsBase &inputPaths) const; + virtual TargetPhrases *Lookup(const Manager &mgr, MemPool &pool, + InputPath &inputPath) const; + + void SetPtInd(size_t ind) { + m_ptInd = ind; + } + + size_t GetPtInd() const { + return m_ptInd; + } + + bool SatisfyBackoff(const Manager &mgr, const InputPath &path) const; + + virtual void + EvaluateInIsolation(MemPool &pool, const System &system, const Phrase &source, + const TargetPhraseImpl &targetPhrase, Scores &scores, + SCORE &estimatedScore) const; + + virtual void + EvaluateInIsolation(MemPool &pool, const System &system, const Phrase &source, + const TargetPhrase &targetPhrase, Scores &scores, + SCORE &estimatedScore) const; + + // scfg + virtual void InitActiveChart( + MemPool &pool, + const SCFG::Manager &mgr, + SCFG::InputPath &path) const = 0; + + virtual void Lookup( + MemPool &pool, + const SCFG::Manager &mgr, + size_t maxChartSpan, + const SCFG::Stacks &stacks, + SCFG::InputPath &path) const = 0; + + virtual void LookupUnary(MemPool &pool, + const SCFG::Manager &mgr, + const SCFG::Stacks &stacks, + SCFG::InputPath &path) const; + +protected: + std::string m_path; + size_t m_ptInd; // in the order that it is list in [feature], NOT order of [mapping] + size_t m_tableLimit; + std::vector m_input, m_output; + + // cache + size_t m_maxCacheSize; // 0 = no caching + + struct CacheCollEntry2 { + TargetPhrases *tpsPtr; + clock_t clock; + }; + + // scfg + virtual void LookupNT( + MemPool &pool, + const SCFG::Manager &mgr, + const Moses2::Range &subPhraseRange, + const SCFG::InputPath &prevPath, + const SCFG::Stacks &stacks, + SCFG::InputPath &outPath) const; + + virtual void LookupGivenWord( + MemPool &pool, + const SCFG::Manager &mgr, + const SCFG::InputPath &prevPath, + const SCFG::Word &wordSought, + const 
Moses2::Hypotheses *hypos, + const Moses2::Range &subPhraseRange, + SCFG::InputPath &outPath) const; + + virtual void LookupGivenNode( + MemPool &pool, + const SCFG::Manager &mgr, + const SCFG::ActiveChartEntry &prevEntry, + const SCFG::Word &wordSought, + const Moses2::Hypotheses *hypos, + const Moses2::Range &subPhraseRange, + SCFG::InputPath &outPath) const = 0; + +}; + +} + diff --git a/mosesdecoder/moses2/TranslationModel/ProbingPT.cpp b/mosesdecoder/moses2/TranslationModel/ProbingPT.cpp new file mode 100644 index 0000000000000000000000000000000000000000..d99137f05ff96ca7d816e97756782a890bf3dba3 --- /dev/null +++ b/mosesdecoder/moses2/TranslationModel/ProbingPT.cpp @@ -0,0 +1,742 @@ +/* + * ProbingPT.cpp + * + * Created on: 3 Nov 2015 + * Author: hieu + */ +#include +#include "ProbingPT.h" +#include "probingpt/querying.h" +#include "probingpt/probing_hash_utils.h" +#include "util/exception.hh" +#include "../System.h" +#include "../Scores.h" +#include "../Phrase.h" +#include "../legacy/InputFileStream.h" +#include "../legacy/FactorCollection.h" +#include "../legacy/Util2.h" +#include "../FF/FeatureFunctions.h" +#include "../PhraseBased/PhraseImpl.h" +#include "../PhraseBased/TargetPhraseImpl.h" +#include "../PhraseBased/Manager.h" +#include "../PhraseBased/TargetPhrases.h" +#include "../SCFG/InputPath.h" +#include "../SCFG/Manager.h" +#include "../SCFG/TargetPhraseImpl.h" +#include "../SCFG/PhraseImpl.h" + +using namespace std; + +namespace Moses2 +{ +ProbingPT::ActiveChartEntryProbing::ActiveChartEntryProbing( + MemPool &pool, + const ActiveChartEntryProbing &prevEntry) + :Parent(prevEntry) + ,m_key(prevEntry.m_key) +{} + +void ProbingPT::ActiveChartEntryProbing::AddSymbolBindElement( + const Range &range, + const SCFG::Word &word, + const Moses2::Hypotheses *hypos, + const Moses2::PhraseTable &pt) +{ + const ProbingPT &probingPt = static_cast(pt); + std::pair key = GetKey(word, probingPt); + UTIL_THROW_IF2(!key.first, "Word should have been in source vocab"); 
+ m_key = key.second; + + ActiveChartEntry::AddSymbolBindElement(range, word, hypos, pt); +} + +std::pair ProbingPT::ActiveChartEntryProbing::GetKey(const SCFG::Word &nextWord, const ProbingPT &pt) const +{ + std::pair ret; + ret.second = m_key; + uint64_t probingId = pt.GetSourceProbingId(nextWord); + if (probingId == pt.GetUnk()) { + ret.first = false; + return ret; + } + + ret.first = true; + size_t phraseSize = m_symbolBind.coll.size(); + ret.second += probingId << phraseSize; + return ret; +} + +//////////////////////////////////////////////////////////////////////////// +ProbingPT::ProbingPT(size_t startInd, const std::string &line) + :PhraseTable(startInd, line) + ,load_method(util::POPULATE_OR_READ) +{ + ReadParameters(); +} + +ProbingPT::~ProbingPT() +{ + delete m_engine; +} + +void ProbingPT::Load(System &system) +{ + m_engine = new probingpt::QueryEngine(m_path.c_str(), load_method); + + m_unkId = 456456546456; + + FactorCollection &vocab = system.GetVocab(); + + // source vocab + const std::map &sourceVocab = + m_engine->getSourceVocab(); + std::map::const_iterator iterSource; + for (iterSource = sourceVocab.begin(); iterSource != sourceVocab.end(); + ++iterSource) { + string wordStr = iterSource->second; + bool isNT; + //cerr << "wordStr=" << wordStr << endl; + ReformatWord(system, wordStr, isNT); + //cerr << "wordStr=" << wordStr << endl; + + const Factor *factor = vocab.AddFactor(wordStr, system, isNT); + + uint64_t probingId = iterSource->first; + size_t factorId = factor->GetId(); + + if (factorId >= m_sourceVocab.size()) { + m_sourceVocab.resize(factorId + 1, m_unkId); + } + m_sourceVocab[factorId] = probingId; + } + + // target vocab + InputFileStream targetVocabStrme(m_path + "/TargetVocab.dat"); + string line; + while (getline(targetVocabStrme, line)) { + vector toks = Tokenize(line, "\t"); + UTIL_THROW_IF2(toks.size() != 2, string("Incorrect format:") + line + "\n"); + + bool isNT; + //cerr << "wordStr=" << toks[0] << endl; + 
ReformatWord(system, toks[0], isNT); + //cerr << "wordStr=" << toks[0] << endl; + + const Factor *factor = vocab.AddFactor(toks[0], system, isNT); + uint32_t probingId = Scan(toks[1]); + + if (probingId >= m_targetVocab.size()) { + m_targetVocab.resize(probingId + 1); + } + + std::pair ele(isNT, factor); + m_targetVocab[probingId] = ele; + } + + // alignments + CreateAlignmentMap(system, m_path + "/Alignments.dat"); + + // cache + CreateCache(system); +} + +void ProbingPT::SetParameter(const std::string& key, const std::string& value) +{ + if (key == "load") { + if (value == "lazy") { + load_method = util::LAZY; + } else if (value == "populate_or_lazy") { + load_method = util::POPULATE_OR_LAZY; + } else if (value == "populate_or_read" || value == "populate") { + load_method = util::POPULATE_OR_READ; + } else if (value == "read") { + load_method = util::READ; + } else if (value == "parallel_read") { + load_method = util::PARALLEL_READ; + } else { + UTIL_THROW2("load method not supported" << value); + } + } else { + PhraseTable::SetParameter(key, value); + } +} + +void ProbingPT::CreateAlignmentMap(System &system, const std::string path) +{ + const std::vector< std::vector > &probingAlignColl = m_engine->getAlignments(); + m_aligns.resize(probingAlignColl.size(), NULL); + + for (size_t i = 0; i < probingAlignColl.size(); ++i) { + AlignmentInfo::CollType aligns; + + const std::vector &probingAligns = probingAlignColl[i]; + for (size_t j = 0; j < probingAligns.size(); j += 2) { + size_t startPos = probingAligns[j]; + size_t endPos = probingAligns[j+1]; + //cerr << "startPos=" << startPos << " " << endPos << endl; + aligns.insert(std::pair(startPos, endPos)); + } + + const AlignmentInfo *align = AlignmentInfoCollection::Instance().Add(aligns); + m_aligns[i] = align; + //cerr << "align=" << align->Debug(system) << endl; + } +} + +void ProbingPT::Lookup(const Manager &mgr, InputPathsBase &inputPaths) const +{ + BOOST_FOREACH(InputPathBase *pathBase, inputPaths) { + 
InputPath *path = static_cast(pathBase); + + if (SatisfyBackoff(mgr, *path)) { + TargetPhrases *tpsPtr; + tpsPtr = Lookup(mgr, mgr.GetPool(), *path); + path->AddTargetPhrases(*this, tpsPtr); + } + } +} + +TargetPhrases* ProbingPT::Lookup(const Manager &mgr, MemPool &pool, + InputPath &inputPath) const +{ + /* + if (inputPath.prefixPath && inputPath.prefixPath->GetTargetPhrases(*this) == NULL) { + // assume all paths have prefixes, except rules with 1 word source + return NULL; + } + else { + const Phrase &sourcePhrase = inputPath.subPhrase; + std::pair tpsAndKey = CreateTargetPhrase(pool, mgr.system, sourcePhrase); + return tpsAndKey.first; + } + */ + const Phrase &sourcePhrase = inputPath.subPhrase; + + // get hash for source phrase + std::pair keyStruct = GetKey(sourcePhrase); + if (!keyStruct.first) { + return NULL; + } + + // check in cache + CachePb::const_iterator iter = m_cachePb.find(keyStruct.second); + if (iter != m_cachePb.end()) { + //cerr << "FOUND IN CACHE " << keyStruct.second << " " << sourcePhrase.Debug(mgr.system) << endl; + TargetPhrases *tps = iter->second; + return tps; + } + + // query pt + TargetPhrases *tps = CreateTargetPhrases(pool, mgr.system, sourcePhrase, + keyStruct.second); + return tps; +} + +std::pair ProbingPT::GetKey(const Phrase &sourcePhrase) const +{ + std::pair ret; + + // create a target phrase from the 1st word of the source, prefix with 'ProbingPT:' + size_t sourceSize = sourcePhrase.GetSize(); + assert(sourceSize); + + uint64_t *probingSource = (uint64_t*) alloca(sourceSize * sizeof(uint64_t)); + GetSourceProbingIds(sourcePhrase, ret.first, probingSource); + if (!ret.first) { + // source phrase contains a word unknown in the pt. 
+ // We know immediately there's no translation for it + } else { + ret.second = m_engine->getKey(probingSource, sourceSize); + } + + return ret; + +} + +TargetPhrases *ProbingPT::CreateTargetPhrases(MemPool &pool, + const System &system, const Phrase &sourcePhrase, uint64_t key) const +{ + TargetPhrases *tps = NULL; + + //Actual lookup + std::pair query_result; // 1st=found, 2nd=target file offset + query_result = m_engine->query(key); + //cerr << "key2=" << query_result.second << endl; + + if (query_result.first) { + const char *offset = m_engine->memTPS + query_result.second; + uint64_t *numTP = (uint64_t*) offset; + + tps = new (pool.Allocate()) TargetPhrases(pool, *numTP); + + offset += sizeof(uint64_t); + for (size_t i = 0; i < *numTP; ++i) { + TargetPhraseImpl *tp = CreateTargetPhrase(pool, system, offset); + assert(tp); + const FeatureFunctions &ffs = system.featureFunctions; + ffs.EvaluateInIsolation(pool, system, sourcePhrase, *tp); + + tps->AddTargetPhrase(*tp); + + } + + tps->SortAndPrune(m_tableLimit); + system.featureFunctions.EvaluateAfterTablePruning(pool, *tps, sourcePhrase); + //cerr << *tps << endl; + } + + return tps; +} + +TargetPhraseImpl *ProbingPT::CreateTargetPhrase( + MemPool &pool, + const System &system, + const char *&offset) const +{ + probingpt::TargetPhraseInfo *tpInfo = (probingpt::TargetPhraseInfo*) offset; + size_t numRealWords = tpInfo->numWords / m_output.size(); + + TargetPhraseImpl *tp = + new (pool.Allocate()) TargetPhraseImpl(pool, *this, + system, numRealWords); + + offset += sizeof(probingpt::TargetPhraseInfo); + + // scores + SCORE *scores = (SCORE*) offset; + + size_t totalNumScores = m_engine->num_scores + m_engine->num_lex_scores; + + if (m_engine->logProb) { + // set pt score for rule + tp->GetScores().PlusEquals(system, *this, scores); + + // save scores for other FF, eg. lex RO. 
Just give the offset + if (m_engine->num_lex_scores) { + tp->scoreProperties = scores + m_engine->num_scores; + } + } else { + // log score 1st + SCORE *logScores = (SCORE*) alloca(totalNumScores * sizeof(SCORE)); + for (size_t i = 0; i < totalNumScores; ++i) { + logScores[i] = FloorScore(TransformScore(scores[i])); + } + + // set pt score for rule + tp->GetScores().PlusEquals(system, *this, logScores); + + // save scores for other FF, eg. lex RO. + tp->scoreProperties = pool.Allocate(m_engine->num_lex_scores); + for (size_t i = 0; i < m_engine->num_lex_scores; ++i) { + tp->scoreProperties[i] = logScores[i + m_engine->num_scores]; + } + } + + offset += sizeof(SCORE) * totalNumScores; + + // words + for (size_t targetPos = 0; targetPos < numRealWords; ++targetPos) { + for (size_t i = 0; i < m_output.size(); ++i) { + FactorType factorType = m_output[i]; + + uint32_t *probingId = (uint32_t*) offset; + + const std::pair *factorPair = GetTargetFactor(*probingId); + assert(factorPair); + assert(!factorPair->first); + + Word &word = (*tp)[targetPos]; + word[factorType] = factorPair->second; + + offset += sizeof(uint32_t); + } + } + + // align + uint32_t alignTerm = tpInfo->alignTerm; + //cerr << "alignTerm=" << alignTerm << endl; + UTIL_THROW_IF2(alignTerm >= m_aligns.size(), "Unknown alignInd"); + tp->Parent::SetAlignTerm(*m_aligns[alignTerm]); + + // properties TODO + + return tp; +} + +void ProbingPT::GetSourceProbingIds(const Phrase &sourcePhrase, + bool &ok, uint64_t probingSource[]) const +{ + + size_t size = sourcePhrase.GetSize(); + for (size_t i = 0; i < size; ++i) { + const Word &word = sourcePhrase[i]; + uint64_t probingId = GetSourceProbingId(word); + if (probingId == m_unkId) { + ok = false; + return; + } else { + probingSource[i] = probingId; + } + } + + ok = true; +} + +uint64_t ProbingPT::GetSourceProbingId(const Word &word) const +{ + uint64_t ret = 0; + + for (size_t i = 0; i < m_input.size(); ++i) { + FactorType factorType = m_input[i]; + const Factor 
*factor = word[factorType]; + + size_t factorId = factor->GetId(); + if (factorId >= m_sourceVocab.size()) { + return m_unkId; + } + ret += m_sourceVocab[factorId]; + } + + return ret; +} + +void ProbingPT::CreateCache(System &system) +{ + if (m_maxCacheSize == 0) { + return; + } + + string filePath = m_path + "/cache"; + InputFileStream strme(filePath); + + string line; + getline(strme, line); + //float totalCount = Scan(line); + + MemPool &pool = system.GetSystemPool(); + FactorCollection &vocab = system.GetVocab(); + + MemPool tmpSourcePool; + + size_t lineCount = 0; + while (getline(strme, line) && lineCount < m_maxCacheSize) { + vector toks = Tokenize(line, "\t"); + assert(toks.size() == 3); + uint64_t key = Scan(toks[1]); + //cerr << "line=" << line << endl; + + if (system.isPb) { + PhraseImpl *sourcePhrase = PhraseImpl::CreateFromString(tmpSourcePool, vocab, system, toks[2]); + + /* + std::pair retStruct = GetKey(*sourcePhrase); + if (!retStruct.first) { + UTIL_THROW2("Unknown cache entry"); + } + cerr << "key=" << retStruct.second << " " << key << endl; + */ + TargetPhrases *tps = CreateTargetPhrases(pool, system, *sourcePhrase, key); + assert(tps); + + m_cachePb[key] = tps; + } else { + // SCFG + SCFG::PhraseImpl *sourcePhrase = SCFG::PhraseImpl::CreateFromString(tmpSourcePool, vocab, system, toks[2], false); + //cerr << "sourcePhrase=" << sourcePhrase->Debug(system) << endl; + + std::pair tpsPair = CreateTargetPhrasesSCFG(pool, system, *sourcePhrase, key); + assert(tpsPair.first && tpsPair.second); + + m_cacheSCFG[key] = tpsPair.second; + } + ++lineCount; + } + +} + +/////////////////////////////////////////////////////////////////////////////// +// SCFG +/////////////////////////////////////////////////////////////////////////////// + +void ProbingPT::ReformatWord(System &system, std::string &wordStr, bool &isNT) +{ + isNT = false; + if (system.isPb) { + return; + } else { + isNT = (wordStr[0] == '[' && wordStr[wordStr.size() - 1] == ']'); + //cerr << 
"nt=" << nt << endl; + + if (isNT) { + size_t startPos = wordStr.find("]["); + if (startPos == string::npos) { + startPos = 1; + } else { + startPos += 2; + } + + wordStr = wordStr.substr(startPos, wordStr.size() - startPos - 1); + //cerr << "wordStr=" << wordStr << endl; + } + } +} + +void ProbingPT::InitActiveChart( + MemPool &pool, + const SCFG::Manager &mgr, + SCFG::InputPath &path) const +{ + //cerr << "InitActiveChart=" << path.Debug(cerr, mgr.system) << endl; + size_t ptInd = GetPtInd(); + ActiveChartEntryProbing *chartEntry = new (pool.Allocate()) ActiveChartEntryProbing(pool); + path.AddActiveChartEntry(ptInd, chartEntry); +} + +void ProbingPT::Lookup(MemPool &pool, + const SCFG::Manager &mgr, + size_t maxChartSpan, + const SCFG::Stacks &stacks, + SCFG::InputPath &path) const +{ + //cerr << "Lookup=" << endl; + if (path.range.GetNumWordsCovered() > maxChartSpan) { + return; + } + + size_t endPos = path.range.GetEndPos(); + + const SCFG::InputPath *prevPath = static_cast(path.prefixPath); + UTIL_THROW_IF2(prevPath == NULL, "prefixPath == NULL"); + + // TERMINAL + const SCFG::Word &lastWord = path.subPhrase.Back(); + + const SCFG::InputPath &subPhrasePath = *mgr.GetInputPaths().GetMatrix().GetValue(endPos, 1); + + //cerr << "BEFORE LookupGivenWord=" << *prevPath << endl; + LookupGivenWord(pool, mgr, *prevPath, lastWord, NULL, subPhrasePath.range, path); + //cerr << "AFTER LookupGivenWord=" << *prevPath << endl; + + // NON-TERMINAL + //const SCFG::InputPath *prefixPath = static_cast(path.prefixPath); + while (prevPath) { + const Range &prevRange = prevPath->range; + //cerr << "prevRange=" << prevRange << endl; + + size_t startPos = prevRange.GetEndPos() + 1; + size_t ntSize = endPos - startPos + 1; + const SCFG::InputPath &subPhrasePath = *mgr.GetInputPaths().GetMatrix().GetValue(startPos, ntSize); + + LookupNT(pool, mgr, subPhrasePath.range, *prevPath, stacks, path); + + prevPath = static_cast(prevPath->prefixPath); + } +} + +void ProbingPT::LookupGivenNode( 
+ MemPool &pool, + const SCFG::Manager &mgr, + const SCFG::ActiveChartEntry &prevEntry, + const SCFG::Word &wordSought, + const Moses2::Hypotheses *hypos, + const Moses2::Range &subPhraseRange, + SCFG::InputPath &outPath) const +{ + const ActiveChartEntryProbing &prevEntryCast = static_cast(prevEntry); + + std::pair key = prevEntryCast.GetKey(wordSought, *this); + + if (!key.first) { + // should only occasionally happen when looking up unary rules + return; + } + + const Phrase &sourcePhrase = outPath.subPhrase; + + // check in cache + CacheSCFG::const_iterator iter = m_cacheSCFG.find(key.second); + if (iter != m_cacheSCFG.end()) { + //cerr << "FOUND IN CACHE " << key.second << " " << sourcePhrase.Debug(mgr.system) << endl; + SCFG::TargetPhrases *tps = iter->second; + + ActiveChartEntryProbing *chartEntry = new (pool.Allocate()) ActiveChartEntryProbing(pool, prevEntryCast); + //cerr << "AFTER chartEntry" << endl; + + chartEntry->AddSymbolBindElement(subPhraseRange, wordSought, hypos, *this); + //cerr << "AFTER AddSymbolBindElement" << endl; + + size_t ptInd = GetPtInd(); + outPath.AddActiveChartEntry(ptInd, chartEntry); + + outPath.AddTargetPhrasesToPath(pool, mgr.system, *this, *tps, chartEntry->GetSymbolBind()); + } else { + // not in cache. 
Lookup + std::pair tpsPair = CreateTargetPhrasesSCFG(pool, mgr.system, sourcePhrase, key.second); + assert(tpsPair.first && tpsPair.second); + + if (tpsPair.first) { + // new entries + ActiveChartEntryProbing *chartEntry = new (pool.Allocate()) ActiveChartEntryProbing(pool, prevEntryCast); + //cerr << "AFTER chartEntry" << endl; + + chartEntry->AddSymbolBindElement(subPhraseRange, wordSought, hypos, *this); + //cerr << "AFTER AddSymbolBindElement" << endl; + + size_t ptInd = GetPtInd(); + outPath.AddActiveChartEntry(ptInd, chartEntry); + //cerr << "AFTER AddActiveChartEntry" << endl; + + if (tpsPair.second) { + // there are some rules + //cerr << "symbolbind=" << chartEntry->GetSymbolBind().Debug(mgr.system) << endl; + outPath.AddTargetPhrasesToPath(pool, mgr.system, *this, *tpsPair.second, chartEntry->GetSymbolBind()); + } + } + } +} + +SCFG::TargetPhraseImpl *ProbingPT::CreateTargetPhraseSCFG( + MemPool &pool, + const System &system, + const char *&offset) const +{ + probingpt::TargetPhraseInfo *tpInfo = (probingpt::TargetPhraseInfo*) offset; + SCFG::TargetPhraseImpl *tp = + new (pool.Allocate()) SCFG::TargetPhraseImpl(pool, *this, + system, tpInfo->numWords - 1); + + offset += sizeof(probingpt::TargetPhraseInfo); + + // scores + SCORE *scores = (SCORE*) offset; + + size_t totalNumScores = m_engine->num_scores + m_engine->num_lex_scores; + + if (m_engine->logProb) { + // set pt score for rule + tp->GetScores().PlusEquals(system, *this, scores); + + // save scores for other FF, eg. lex RO. Just give the offset + if (m_engine->num_lex_scores) { + tp->scoreProperties = scores + m_engine->num_scores; + } + } else { + // log score 1st + SCORE *logScores = (SCORE*) alloca(totalNumScores * sizeof(SCORE)); + for (size_t i = 0; i < totalNumScores; ++i) { + logScores[i] = FloorScore(TransformScore(scores[i])); + } + + // set pt score for rule + tp->GetScores().PlusEquals(system, *this, logScores); + + // save scores for other FF, eg. lex RO. 
+ tp->scoreProperties = pool.Allocate(m_engine->num_lex_scores); + for (size_t i = 0; i < m_engine->num_lex_scores; ++i) { + tp->scoreProperties[i] = logScores[i + m_engine->num_scores]; + } + } + + offset += sizeof(SCORE) * totalNumScores; + + // words + for (size_t i = 0; i < tpInfo->numWords - 1; ++i) { + uint32_t *probingId = (uint32_t*) offset; + + const std::pair *factorPair = GetTargetFactor(*probingId); + assert(factorPair); + + SCFG::Word &word = (*tp)[i]; + word[0] = factorPair->second; + word.isNonTerminal = factorPair->first; + + offset += sizeof(uint32_t); + } + + // lhs + uint32_t *probingId = (uint32_t*) offset; + + const std::pair *factorPair = GetTargetFactor(*probingId); + assert(factorPair); + assert(factorPair->first); + + tp->lhs[0] = factorPair->second; + tp->lhs.isNonTerminal = factorPair->first; + + offset += sizeof(uint32_t); + + // align + uint32_t alignTerm = tpInfo->alignTerm; + //cerr << "alignTerm=" << alignTerm << endl; + UTIL_THROW_IF2(alignTerm >= m_aligns.size(), "Unknown alignInd"); + tp->Parent::SetAlignTerm(*m_aligns[alignTerm]); + + uint32_t alignNonTerm = tpInfo->alignNonTerm; + //cerr << "alignTerm=" << alignTerm << endl; + UTIL_THROW_IF2(alignNonTerm >= m_aligns.size(), "Unknown alignInd"); + tp->SetAlignNonTerm(*m_aligns[alignNonTerm]); + + // properties TODO + + return tp; +} + +std::pair ProbingPT::CreateTargetPhrasesSCFG(MemPool &pool, const System &system, + const Phrase &sourcePhrase, uint64_t key) const +{ + std::pair ret(false, NULL); + + std::pair query_result; // 1st=found, 2nd=target file offset + query_result = m_engine->query(key); + //cerr << "query_result=" << query_result.first << endl; + + /* + if (outPath.range.GetStartPos() == 1 || outPath.range.GetStartPos() == 2) { + cerr << "range=" << outPath.range + << " prevEntry=" << prevEntry.GetSymbolBind().Debug(mgr.system) << " " << prevEntryCast.GetKey() + << " wordSought=" << wordSought.Debug(mgr.system) + << " key=" << key.first << " " << key.second + << " 
query_result=" << query_result.first << " " << (query_result.second == NONE) + << endl; + } + */ + + if (query_result.first) { + ret.first = true; + size_t ptInd = GetPtInd(); + + if (query_result.second != NONE) { + // there are some rules + const FeatureFunctions &ffs = system.featureFunctions; + + const char *offset = m_engine->memTPS + query_result.second; + uint64_t *numTP = (uint64_t*) offset; + //cerr << "numTP=" << *numTP << endl; + + SCFG::TargetPhrases *tps = new (pool.Allocate()) SCFG::TargetPhrases(pool, *numTP); + ret.second = tps; + + offset += sizeof(uint64_t); + for (size_t i = 0; i < *numTP; ++i) { + SCFG::TargetPhraseImpl *tp = CreateTargetPhraseSCFG(pool, system, offset); + assert(tp); + //cerr << "tp=" << tp->Debug(mgr.system) << endl; + + ffs.EvaluateInIsolation(pool, system, sourcePhrase, *tp); + + tps->AddTargetPhrase(*tp); + + } + + tps->SortAndPrune(m_tableLimit); + ffs.EvaluateAfterTablePruning(pool, *tps, sourcePhrase); + //cerr << "tps=" << tps->GetSize() << endl; + + } + } + + return ret; +} + +} // namespace + diff --git a/mosesdecoder/moses2/TranslationModel/ProbingPT.h b/mosesdecoder/moses2/TranslationModel/ProbingPT.h new file mode 100644 index 0000000000000000000000000000000000000000..47d22e1b397f6c9f4c5969f152612308e85ae27d --- /dev/null +++ b/mosesdecoder/moses2/TranslationModel/ProbingPT.h @@ -0,0 +1,164 @@ +/* + * ProbingPT.h + * + * Created on: 3 Nov 2015 + * Author: hieu + */ + +#pragma once + +#include +#include +#include +#include +#include "PhraseTable.h" +#include "../Vector.h" +#include "../Phrase.h" +#include "../SCFG/ActiveChart.h" +#include "util/mmap.hh" + +namespace probingpt +{ +class QueryEngine; +class target_text; +} + +namespace Moses2 +{ +class AlignmentInfo; +class MemPool; +class System; +class RecycleData; + +namespace SCFG +{ +class TargetPhraseImpl; +class TargetPhrases; +} + +class ProbingPT: public Moses2::PhraseTable +{ + ////////////////////////////////////// + class ActiveChartEntryProbing : public 
SCFG::ActiveChartEntry + { + typedef SCFG::ActiveChartEntry Parent; + public: + + ActiveChartEntryProbing(MemPool &pool) + :Parent(pool) + ,m_key(0) + {} + + ActiveChartEntryProbing( + MemPool &pool, + const ActiveChartEntryProbing &prevEntry); + + uint64_t GetKey() const { + return m_key; + } + + std::pair GetKey(const SCFG::Word &nextWord, const ProbingPT &pt) const; + + virtual void AddSymbolBindElement( + const Range &range, + const SCFG::Word &word, + const Moses2::Hypotheses *hypos, + const Moses2::PhraseTable &pt); + + protected: + uint64_t m_key; + }; + ////////////////////////////////////// + +public: + ProbingPT(size_t startInd, const std::string &line); + virtual ~ProbingPT(); + void Load(System &system); + + virtual void SetParameter(const std::string& key, const std::string& value); + void Lookup(const Manager &mgr, InputPathsBase &inputPaths) const; + + uint64_t GetUnk() const { + return m_unkId; + } + + // SCFG + void InitActiveChart( + MemPool &pool, + const SCFG::Manager &mgr, + SCFG::InputPath &path) const; + + virtual void Lookup(MemPool &pool, + const SCFG::Manager &mgr, + size_t maxChartSpan, + const SCFG::Stacks &stacks, + SCFG::InputPath &path) const; + + +protected: + std::vector m_sourceVocab; // factor id -> pt id + std::vector< std::pair > m_targetVocab; // pt id -> factor* + std::vector m_aligns; + util::LoadMethod load_method; + + uint64_t m_unkId; + probingpt::QueryEngine *m_engine; + + void CreateAlignmentMap(System &system, const std::string path); + + TargetPhrases *Lookup(const Manager &mgr, MemPool &pool, + InputPath &inputPath) const; + TargetPhrases *CreateTargetPhrases(MemPool &pool, const System &system, + const Phrase &sourcePhrase, uint64_t key) const; + TargetPhraseImpl *CreateTargetPhrase(MemPool &pool, const System &system, + const char *&offset) const; + + inline const std::pair *GetTargetFactor(uint32_t probingId) const { + if (probingId >= m_targetVocab.size()) { + return NULL; + } + return &m_targetVocab[probingId]; + 
} + + std::pair GetKey(const Phrase &sourcePhrase) const; + + void GetSourceProbingIds(const Phrase &sourcePhrase, bool &ok, + uint64_t probingSource[]) const; + + uint64_t GetSourceProbingId(const Word &word) const; + + // caching + typedef boost::unordered_map CachePb; + CachePb m_cachePb; + + typedef boost::unordered_map CacheSCFG; + CacheSCFG m_cacheSCFG; + + void CreateCache(System &system); + + void ReformatWord(System &system, std::string &wordStr, bool &isNT); + + // SCFG + void LookupGivenNode( + MemPool &pool, + const SCFG::Manager &mgr, + const SCFG::ActiveChartEntry &prevEntry, + const SCFG::Word &wordSought, + const Moses2::Hypotheses *hypos, + const Moses2::Range &subPhraseRange, + SCFG::InputPath &outPath) const; + + std::pair CreateTargetPhrasesSCFG(MemPool &pool, const System &system, + const Phrase &sourcePhrase, uint64_t key) const; + // return value: 1st = there are actual rules, not just a empty cell for prefix + + SCFG::TargetPhraseImpl *CreateTargetPhraseSCFG( + MemPool &pool, + const System &system, + const char *&offset) const; + + +}; + +} + diff --git a/mosesdecoder/moses2/TranslationModel/Transliteration.cpp b/mosesdecoder/moses2/TranslationModel/Transliteration.cpp new file mode 100644 index 0000000000000000000000000000000000000000..13c884508f691be2b19295d1147f1e8d3fe9de1a --- /dev/null +++ b/mosesdecoder/moses2/TranslationModel/Transliteration.cpp @@ -0,0 +1,229 @@ +/* + * Transliteration.cpp + * + * Created on: 28 Oct 2015 + * Author: hieu + */ +#include +#include "Transliteration.h" +#include "../System.h" +#include "../Scores.h" +#include "../InputType.h" +#include "../PhraseBased/Manager.h" +#include "../PhraseBased/TargetPhraseImpl.h" +#include "../PhraseBased/InputPath.h" +#include "../PhraseBased/TargetPhrases.h" +#include "../PhraseBased/Sentence.h" +#include "../SCFG/InputPath.h" +#include "../SCFG/TargetPhraseImpl.h" +#include "../SCFG/Manager.h" +#include "../SCFG/Sentence.h" +#include "../SCFG/ActiveChart.h" +#include 
"util/tempfile.hh" +#include "../legacy/Util2.h" + +using namespace std; + +namespace Moses2 +{ + +Transliteration::Transliteration(size_t startInd, const std::string &line) : + PhraseTable(startInd, line) +{ + ReadParameters(); + UTIL_THROW_IF2(m_mosesDir.empty() || + m_scriptDir.empty() || + m_externalDir.empty() || + m_inputLang.empty() || + m_outputLang.empty(), "Must specify all arguments"); +} + +Transliteration::~Transliteration() +{ + // TODO Auto-generated destructor stub +} + +void +Transliteration:: +SetParameter(const std::string& key, const std::string& value) +{ + if (key == "moses-dir") { + m_mosesDir = value; + } else if (key == "script-dir") { + m_scriptDir = value; + } else if (key == "external-dir") { + m_externalDir = value; + } else if (key == "input-lang") { + m_inputLang = value; + } else if (key == "output-lang") { + m_outputLang = value; + } else { + PhraseTable::SetParameter(key, value); + } +} + +void Transliteration::Lookup(const Manager &mgr, + InputPathsBase &inputPaths) const +{ + BOOST_FOREACH(InputPathBase *pathBase, inputPaths) { + InputPath *path = static_cast(pathBase); + + if (SatisfyBackoff(mgr, *path)) { + const SubPhrase &phrase = path->subPhrase; + + TargetPhrases *tps = Lookup(mgr, mgr.GetPool(), *path); + path->AddTargetPhrases(*this, tps); + } + } + +} + +TargetPhrases *Transliteration::Lookup(const Manager &mgr, MemPool &pool, + InputPath &inputPath) const +{ + const SubPhrase &sourcePhrase = inputPath.subPhrase; + size_t hash = sourcePhrase.hash(); + + // TRANSLITERATE + const util::temp_file inFile; + const util::temp_dir outDir; + + ofstream inStream(inFile.path().c_str()); + inStream << sourcePhrase.Debug(mgr.system) << endl; + inStream.close(); + + string cmd = m_scriptDir + "/Transliteration/prepare-transliteration-phrase-table.pl" + + " --transliteration-model-dir " + m_filePath + + " --moses-src-dir " + m_mosesDir + + " --external-bin-dir " + m_externalDir + + " --input-extension " + m_inputLang + + " 
--output-extension " + m_outputLang + + " --oov-file " + inFile.path() + + " --out-dir " + outDir.path(); + + int ret = system(cmd.c_str()); + UTIL_THROW_IF2(ret != 0, "Transliteration script error"); + + TargetPhrases *tps = NULL; + tps = new (pool.Allocate()) TargetPhrases(pool, 1); + + vector targetPhrases + = CreateTargetPhrases(mgr, pool, sourcePhrase, outDir.path()); + + vector::const_iterator iter; + for (iter = targetPhrases.begin(); iter != targetPhrases.end(); ++iter) { + TargetPhraseImpl *tp = *iter; + tps->AddTargetPhrase(*tp); + } + mgr.system.featureFunctions.EvaluateAfterTablePruning(pool, *tps, sourcePhrase); + + inputPath.AddTargetPhrases(*this, tps); +} + +std::vector Transliteration::CreateTargetPhrases( + const Manager &mgr, + MemPool &pool, + const SubPhrase &sourcePhrase, + const std::string &outDir) const +{ + std::vector ret; + + string outPath = outDir + "/out.txt"; + ifstream outStream(outPath.c_str()); + + string line; + while (getline(outStream, line)) { + vector toks = Moses2::Tokenize(line, "\t"); + UTIL_THROW_IF2(toks.size() != 2, "Error in transliteration output file. 
Expecting word\tscore"); + + TargetPhraseImpl *tp = + new (pool.Allocate()) TargetPhraseImpl(pool, *this, mgr.system, 1); + Moses2::Word &word = (*tp)[0]; + word.CreateFromString(mgr.system.GetVocab(), mgr.system, toks[0]); + + float score = Scan(toks[1]); + tp->GetScores().PlusEquals(mgr.system, *this, score); + + // score of all other ff when this rule is being loaded + mgr.system.featureFunctions.EvaluateInIsolation(pool, mgr.system, sourcePhrase, *tp); + + ret.push_back(tp); + } + + outStream.close(); + + return ret; + +} + + +void Transliteration::EvaluateInIsolation(const System &system, + const Phrase &source, const TargetPhraseImpl &targetPhrase, Scores &scores, + SCORE &estimatedScore) const +{ + UTIL_THROW2("Not implemented"); +} + +// SCFG /////////////////////////////////////////////////////////////////////////////////////////// +void Transliteration::InitActiveChart( + MemPool &pool, + const SCFG::Manager &mgr, + SCFG::InputPath &path) const +{ + UTIL_THROW2("Not implemented"); +} + +void Transliteration::Lookup(MemPool &pool, + const SCFG::Manager &mgr, + size_t maxChartSpan, + const SCFG::Stacks &stacks, + SCFG::InputPath &path) const +{ + UTIL_THROW2("Not implemented"); +} + +void Transliteration::LookupUnary(MemPool &pool, + const SCFG::Manager &mgr, + const SCFG::Stacks &stacks, + SCFG::InputPath &path) const +{ + UTIL_THROW2("Not implemented"); +} + +void Transliteration::LookupNT( + MemPool &pool, + const SCFG::Manager &mgr, + const Moses2::Range &subPhraseRange, + const SCFG::InputPath &prevPath, + const SCFG::Stacks &stacks, + SCFG::InputPath &outPath) const +{ + UTIL_THROW2("Not implemented"); +} + +void Transliteration::LookupGivenWord( + MemPool &pool, + const SCFG::Manager &mgr, + const SCFG::InputPath &prevPath, + const SCFG::Word &wordSought, + const Moses2::Hypotheses *hypos, + const Moses2::Range &subPhraseRange, + SCFG::InputPath &outPath) const +{ + UTIL_THROW2("Not implemented"); +} + +void Transliteration::LookupGivenNode( + 
MemPool &pool, + const SCFG::Manager &mgr, + const SCFG::ActiveChartEntry &prevEntry, + const SCFG::Word &wordSought, + const Moses2::Hypotheses *hypos, + const Moses2::Range &subPhraseRange, + SCFG::InputPath &outPath) const +{ + UTIL_THROW2("Not implemented"); +} + +} + diff --git a/mosesdecoder/moses2/TranslationModel/Transliteration.h b/mosesdecoder/moses2/TranslationModel/Transliteration.h new file mode 100644 index 0000000000000000000000000000000000000000..593677d6013b696ba91c530415de4dd33a7ce95b --- /dev/null +++ b/mosesdecoder/moses2/TranslationModel/Transliteration.h @@ -0,0 +1,91 @@ +/* + * Transliteration.h + * + * Created on: 28 Oct 2015 + * Author: hieu + */ + +#pragma once + +#include "PhraseTable.h" + +namespace Moses2 +{ +class Sentence; +class InputPaths; +class Range; + +class Transliteration: public PhraseTable +{ +public: + Transliteration(size_t startInd, const std::string &line); + virtual ~Transliteration(); + + void Lookup(const Manager &mgr, InputPathsBase &inputPaths) const; + virtual TargetPhrases *Lookup(const Manager &mgr, MemPool &pool, + InputPath &inputPath) const; + + virtual void + EvaluateInIsolation(const System &system, const Phrase &source, + const TargetPhraseImpl &targetPhrase, Scores &scores, + SCORE &estimatedScore) const; + + virtual void InitActiveChart( + MemPool &pool, + const SCFG::Manager &mgr, + SCFG::InputPath &path) const; + + void Lookup(MemPool &pool, + const SCFG::Manager &mgr, + size_t maxChartSpan, + const SCFG::Stacks &stacks, + SCFG::InputPath &path) const; + + void LookupUnary(MemPool &pool, + const SCFG::Manager &mgr, + const SCFG::Stacks &stacks, + SCFG::InputPath &path) const; + +protected: + virtual void LookupNT( + MemPool &pool, + const SCFG::Manager &mgr, + const Moses2::Range &subPhraseRange, + const SCFG::InputPath &prevPath, + const SCFG::Stacks &stacks, + SCFG::InputPath &outPath) const; + + virtual void LookupGivenWord( + MemPool &pool, + const SCFG::Manager &mgr, + const SCFG::InputPath 
&prevPath, + const SCFG::Word &wordSought, + const Moses2::Hypotheses *hypos, + const Moses2::Range &subPhraseRange, + SCFG::InputPath &outPath) const; + + virtual void LookupGivenNode( + MemPool &pool, + const SCFG::Manager &mgr, + const SCFG::ActiveChartEntry &prevEntry, + const SCFG::Word &wordSought, + const Moses2::Hypotheses *hypos, + const Moses2::Range &subPhraseRange, + SCFG::InputPath &outPath) const; + + void SetParameter(const std::string& key, const std::string& value); + +protected: + std::string m_filePath; + std::string m_mosesDir, m_scriptDir, m_externalDir, m_inputLang, m_outputLang; + + std::vector CreateTargetPhrases( + const Manager &mgr, + MemPool &pool, + const SubPhrase &sourcePhrase, + const std::string &outDir) const; + +}; + +} + diff --git a/mosesdecoder/moses2/TranslationModel/UnknownWordPenalty.cpp b/mosesdecoder/moses2/TranslationModel/UnknownWordPenalty.cpp new file mode 100644 index 0000000000000000000000000000000000000000..e165e7e020338ab2247ade1180567de0e7a3a092 --- /dev/null +++ b/mosesdecoder/moses2/TranslationModel/UnknownWordPenalty.cpp @@ -0,0 +1,281 @@ +/* + * UnknownWordPenalty.cpp + * + * Created on: 28 Oct 2015 + * Author: hieu + */ +#include +#include "UnknownWordPenalty.h" +#include "../System.h" +#include "../Scores.h" +#include "../InputType.h" +#include "../PhraseBased/Manager.h" +#include "../PhraseBased/TargetPhraseImpl.h" +#include "../PhraseBased/InputPath.h" +#include "../PhraseBased/TargetPhrases.h" +#include "../PhraseBased/Sentence.h" +#include "../SCFG/InputPath.h" +#include "../SCFG/TargetPhraseImpl.h" +#include "../SCFG/Manager.h" +#include "../SCFG/Sentence.h" +#include "../SCFG/ActiveChart.h" + +using namespace std; + +namespace Moses2 +{ + +UnknownWordPenalty::UnknownWordPenalty(size_t startInd, const std::string &line) + :PhraseTable(startInd, line) + ,m_drop(false) +{ + m_tuneable = false; + ReadParameters(); +} + +UnknownWordPenalty::~UnknownWordPenalty() +{ + // TODO Auto-generated destructor stub 
+} + +void UnknownWordPenalty::SetParameter(const std::string& key, const std::string& value) +{ + if (key == "drop") { + m_drop = Scan(value); + } else if (key == "prefix") { + m_prefix = value; + } else if (key == "suffix") { + m_suffix = value; + } else { + PhraseTable::SetParameter(key, value); + } +} + +void UnknownWordPenalty::ProcessXML( + const Manager &mgr, + MemPool &pool, + const Sentence &sentence, + InputPaths &inputPaths) const +{ + const Vector &xmlOptions = sentence.GetXMLOptions(); + BOOST_FOREACH(const InputType::XMLOption *xmlOption, xmlOptions) { + TargetPhraseImpl *target = TargetPhraseImpl::CreateFromString(pool, *this, mgr.system, xmlOption->GetTranslation()); + + if (xmlOption->prob) { + Scores &scores = target->GetScores(); + scores.PlusEquals(mgr.system, *this, Moses2::TransformScore(xmlOption->prob)); + } + + InputPath *path = inputPaths.GetMatrix().GetValue(xmlOption->startPos, xmlOption->phraseSize - 1); + const SubPhrase &source = path->subPhrase; + + mgr.system.featureFunctions.EvaluateInIsolation(pool, mgr.system, source, *target); + + TargetPhrases *tps = new (pool.Allocate()) TargetPhrases(pool, 1); + + tps->AddTargetPhrase(*target); + mgr.system.featureFunctions.EvaluateAfterTablePruning(pool, *tps, source); + + path->AddTargetPhrases(*this, tps); + } +} + +void UnknownWordPenalty::Lookup(const Manager &mgr, + InputPathsBase &inputPaths) const +{ + BOOST_FOREACH(InputPathBase *pathBase, inputPaths) { + InputPath *path = static_cast(pathBase); + + if (SatisfyBackoff(mgr, *path)) { + const SubPhrase &phrase = path->subPhrase; + + TargetPhrases *tps = Lookup(mgr, mgr.GetPool(), *path); + path->AddTargetPhrases(*this, tps); + } + } + +} + +TargetPhrases *UnknownWordPenalty::Lookup(const Manager &mgr, MemPool &pool, + InputPath &inputPath) const +{ + const System &system = mgr.system; + TargetPhrases *tps = NULL; + + // any other pt translate this? 
+ size_t numPt = mgr.system.mappings.size(); + const TargetPhrases **allTPS = + static_cast(inputPath).targetPhrases; + for (size_t i = 0; i < numPt; ++i) { + const TargetPhrases *otherTps = allTPS[i]; + + if (otherTps && otherTps->GetSize()) { + return tps; + } + } + + const SubPhrase &source = inputPath.subPhrase; + const Moses2::Word &sourceWord = source[0]; + const Factor *factor = sourceWord[0]; + + tps = new (pool.Allocate()) TargetPhrases(pool, 1); + + size_t numWords = m_drop ? 0 : 1; + + TargetPhraseImpl *target = + new (pool.Allocate()) TargetPhraseImpl(pool, *this, + system, numWords); + + if (!m_drop) { + Moses2::Word &word = (*target)[0]; + + if (m_prefix.empty() && m_suffix.empty()) { + word[0] = factor; + } else { + stringstream strm; + if (!m_prefix.empty()) { + strm << m_prefix; + } + strm << factor->GetString(); + if (!m_suffix.empty()) { + strm << m_suffix; + } + + FactorCollection &fc = system.GetVocab(); + const Factor *targetFactor = fc.AddFactor(strm.str(), system, false); + word[0] = targetFactor; + } + } + + Scores &scores = target->GetScores(); + scores.PlusEquals(mgr.system, *this, -100); + + MemPool &memPool = mgr.GetPool(); + system.featureFunctions.EvaluateInIsolation(memPool, system, source, *target); + + tps->AddTargetPhrase(*target); + system.featureFunctions.EvaluateAfterTablePruning(memPool, *tps, source); + + return tps; +} + +void UnknownWordPenalty::EvaluateInIsolation(const System &system, + const Phrase &source, const TargetPhraseImpl &targetPhrase, Scores &scores, + SCORE &estimatedScore) const +{ + +} + +// SCFG /////////////////////////////////////////////////////////////////////////////////////////// +void UnknownWordPenalty::InitActiveChart( + MemPool &pool, + const SCFG::Manager &mgr, + SCFG::InputPath &path) const +{ +} + +void UnknownWordPenalty::Lookup(MemPool &pool, + const SCFG::Manager &mgr, + size_t maxChartSpan, + const SCFG::Stacks &stacks, + SCFG::InputPath &path) const +{ + const System &system = mgr.system; 
+ + size_t numWords = path.range.GetNumWordsCovered(); + if (numWords > 1) { + // only create 1 word phrases + return; + } + + if (path.GetNumRules()) { + // only create rules if no other rules + return; + } + + // don't do 1st if 1st word + if (path.range.GetStartPos() == 0) { + return; + } + + // don't do 1st if last word + const SCFG::Sentence &sentence = static_cast(mgr.GetInput()); + if (path.range.GetStartPos() + 1 == sentence.GetSize()) { + return; + } + + // terminal + const SCFG::Word &lastWord = path.subPhrase.Back(); + //cerr << "UnknownWordPenalty lastWord=" << lastWord << endl; + + const Factor *factor = lastWord[0]; + SCFG::TargetPhraseImpl *tp = new (pool.Allocate()) SCFG::TargetPhraseImpl(pool, *this, system, 1); + SCFG::Word &word = (*tp)[0]; + word.CreateFromString(system.GetVocab(), system, factor->GetString().as_string()); + + tp->lhs.CreateFromString(system.GetVocab(), system, "[X]"); + + size_t endPos = path.range.GetEndPos(); + const SCFG::InputPath &subPhrasePath = *mgr.GetInputPaths().GetMatrix().GetValue(endPos, 1); + + SCFG::ActiveChartEntry *chartEntry = new (pool.Allocate()) SCFG::ActiveChartEntry(pool); + chartEntry->AddSymbolBindElement(subPhrasePath.range, lastWord, NULL, *this); + path.AddActiveChartEntry(GetPtInd(), chartEntry); + + Scores &scores = tp->GetScores(); + scores.PlusEquals(mgr.system, *this, -100); + + MemPool &memPool = mgr.GetPool(); + const SubPhrase &source = path.subPhrase; + system.featureFunctions.EvaluateInIsolation(memPool, system, source, *tp); + + SCFG::TargetPhrases *tps = new (pool.Allocate()) SCFG::TargetPhrases(pool); + tps->AddTargetPhrase(*tp); + + path.AddTargetPhrasesToPath(pool, mgr.system, *this, *tps, chartEntry->GetSymbolBind()); +} + +void UnknownWordPenalty::LookupUnary(MemPool &pool, + const SCFG::Manager &mgr, + const SCFG::Stacks &stacks, + SCFG::InputPath &path) const +{ +} + +void UnknownWordPenalty::LookupNT( + MemPool &pool, + const SCFG::Manager &mgr, + const Moses2::Range 
&subPhraseRange, + const SCFG::InputPath &prevPath, + const SCFG::Stacks &stacks, + SCFG::InputPath &outPath) const +{ + UTIL_THROW2("Not implemented"); +} + +void UnknownWordPenalty::LookupGivenWord( + MemPool &pool, + const SCFG::Manager &mgr, + const SCFG::InputPath &prevPath, + const SCFG::Word &wordSought, + const Moses2::Hypotheses *hypos, + const Moses2::Range &subPhraseRange, + SCFG::InputPath &outPath) const +{ + UTIL_THROW2("Not implemented"); +} + +void UnknownWordPenalty::LookupGivenNode( + MemPool &pool, + const SCFG::Manager &mgr, + const SCFG::ActiveChartEntry &prevEntry, + const SCFG::Word &wordSought, + const Moses2::Hypotheses *hypos, + const Moses2::Range &subPhraseRange, + SCFG::InputPath &outPath) const +{ + UTIL_THROW2("Not implemented"); +} + +} + diff --git a/mosesdecoder/moses2/TranslationModel/UnknownWordPenalty.h b/mosesdecoder/moses2/TranslationModel/UnknownWordPenalty.h new file mode 100644 index 0000000000000000000000000000000000000000..112f0b6cfd151c77fce88c082db67d86b382b019 --- /dev/null +++ b/mosesdecoder/moses2/TranslationModel/UnknownWordPenalty.h @@ -0,0 +1,89 @@ +/* + * UnknownWordPenalty.h + * + * Created on: 28 Oct 2015 + * Author: hieu + */ + +#pragma once + +#include "PhraseTable.h" + +namespace Moses2 +{ +class Sentence; +class InputPaths; +class Range; + +class UnknownWordPenalty: public PhraseTable +{ +public: + UnknownWordPenalty(size_t startInd, const std::string &line); + virtual ~UnknownWordPenalty(); + + virtual void SetParameter(const std::string& key, const std::string& value); + + void Lookup(const Manager &mgr, InputPathsBase &inputPaths) const; + virtual TargetPhrases *Lookup(const Manager &mgr, MemPool &pool, + InputPath &inputPath) const; + + void ProcessXML( + const Manager &mgr, + MemPool &pool, + const Sentence &sentence, + InputPaths &inputPaths) const; + + virtual void + EvaluateInIsolation(const System &system, const Phrase &source, + const TargetPhraseImpl &targetPhrase, Scores &scores, + SCORE 
&estimatedScore) const; + + virtual void InitActiveChart( + MemPool &pool, + const SCFG::Manager &mgr, + SCFG::InputPath &path) const; + + void Lookup(MemPool &pool, + const SCFG::Manager &mgr, + size_t maxChartSpan, + const SCFG::Stacks &stacks, + SCFG::InputPath &path) const; + + void LookupUnary(MemPool &pool, + const SCFG::Manager &mgr, + const SCFG::Stacks &stacks, + SCFG::InputPath &path) const; + +protected: + virtual void LookupNT( + MemPool &pool, + const SCFG::Manager &mgr, + const Moses2::Range &subPhraseRange, + const SCFG::InputPath &prevPath, + const SCFG::Stacks &stacks, + SCFG::InputPath &outPath) const; + + virtual void LookupGivenWord( + MemPool &pool, + const SCFG::Manager &mgr, + const SCFG::InputPath &prevPath, + const SCFG::Word &wordSought, + const Moses2::Hypotheses *hypos, + const Moses2::Range &subPhraseRange, + SCFG::InputPath &outPath) const; + + virtual void LookupGivenNode( + MemPool &pool, + const SCFG::Manager &mgr, + const SCFG::ActiveChartEntry &prevEntry, + const SCFG::Word &wordSought, + const Moses2::Hypotheses *hypos, + const Moses2::Range &subPhraseRange, + SCFG::InputPath &outPath) const; +protected: + bool m_drop; + std::string m_prefix, m_suffix; +}; + +} + diff --git a/mosesdecoder/moses2/TranslationTask.cpp b/mosesdecoder/moses2/TranslationTask.cpp new file mode 100644 index 0000000000000000000000000000000000000000..0644283b548116b708038b25fccc1b84e1a660ba --- /dev/null +++ b/mosesdecoder/moses2/TranslationTask.cpp @@ -0,0 +1,58 @@ +#include "TranslationTask.h" +#include "System.h" +#include "InputType.h" +#include "PhraseBased/Manager.h" +#include "SCFG/Manager.h" + +using namespace std; + +namespace Moses2 +{ + +TranslationTask::TranslationTask(System &system, + const std::string &line, + long translationId) +{ + if (system.isPb) { + m_mgr = new Manager(system, *this, line, translationId); + } else { + m_mgr = new SCFG::Manager(system, *this, line, translationId); + } +} + +TranslationTask::~TranslationTask() +{ +} 
+std::string TranslationTask::ReturnTranslation() const +{ + m_mgr->Decode(); + string out; + out = m_mgr->OutputBest() + "\n"; + delete m_mgr; + return out; +} +void TranslationTask::Run() +{ + + m_mgr->Decode(); + + string out; + + out = m_mgr->OutputBest() + "\n"; + m_mgr->system.bestCollector->Write(m_mgr->GetTranslationId(), out); + + if (m_mgr->system.options.nbest.nbest_size) { + out = m_mgr->OutputNBest(); + m_mgr->system.nbestCollector->Write(m_mgr->GetTranslationId(), out); + } + + if (!m_mgr->system.options.output.detailed_transrep_filepath.empty()) { + out = m_mgr->OutputTransOpt(); + m_mgr->system.detailedTranslationCollector->Write(m_mgr->GetTranslationId(), out); + } + + delete m_mgr; +} + +} + diff --git a/mosesdecoder/moses2/TranslationTask.h b/mosesdecoder/moses2/TranslationTask.h new file mode 100644 index 0000000000000000000000000000000000000000..86e1766e171587b5a92b0ebfc8c198e7b79688de --- /dev/null +++ b/mosesdecoder/moses2/TranslationTask.h @@ -0,0 +1,26 @@ +#pragma once +#include +#include "legacy/ThreadPool.h" + +namespace Moses2 +{ + +class System; +class ManagerBase; +class Manager; + +class TranslationTask: public Task +{ +public: + + TranslationTask(System &system, const std::string &line, long translationId); + virtual ~TranslationTask(); + virtual void Run(); + virtual std::string ReturnTranslation() const; + +protected: + ManagerBase *m_mgr; +}; + +} + diff --git a/mosesdecoder/moses2/TrellisPaths.cpp b/mosesdecoder/moses2/TrellisPaths.cpp new file mode 100644 index 0000000000000000000000000000000000000000..814da45211235ad584c87883bf6779855abb38fc --- /dev/null +++ b/mosesdecoder/moses2/TrellisPaths.cpp @@ -0,0 +1,14 @@ +/* + * TrellisPaths.cpp + * + * Created on: 16 Mar 2016 + * Author: hieu + */ +#include "TrellisPaths.h" +#include "legacy/Util2.h" + +namespace Moses2 +{ + + +} /* namespace Moses2 */ diff --git a/mosesdecoder/moses2/TrellisPaths.h b/mosesdecoder/moses2/TrellisPaths.h new file mode 100644 index 
0000000000000000000000000000000000000000..6a6a59c1a64abd681e5c1f11aef7d4a5988b5741 --- /dev/null +++ b/mosesdecoder/moses2/TrellisPaths.h @@ -0,0 +1,64 @@ +/* + * TrellisPaths.h + * + * Created on: 16 Mar 2016 + * Author: hieu + */ +#pragma once + +#include +#include +#include "PhraseBased/TrellisPath.h" + +namespace Moses2 +{ + +template +struct CompareTrellisPath { + bool operator()(const T* pathA, const T* pathB) const { + return (pathA->GetFutureScore() < pathB->GetFutureScore()); + } +}; + +template +class TrellisPaths +{ +public: + TrellisPaths() {} + + virtual ~TrellisPaths() { + while (!empty()) { + T *path = Get(); + delete path; + } + } + + bool empty() const { + return m_coll.empty(); + } + + //! add a new entry into collection + void Add(T *trellisPath) { + m_coll.push(trellisPath); + } + + T *Get() { + T *top = m_coll.top(); + + // Detach + m_coll.pop(); + return top; + } + + size_t GetSize() const { + return m_coll.size(); + } + +protected: + typedef std::priority_queue, + CompareTrellisPath > CollectionType; + CollectionType m_coll; +}; + +} /* namespace Moses2 */ + diff --git a/mosesdecoder/moses2/TypeDef.cpp b/mosesdecoder/moses2/TypeDef.cpp new file mode 100644 index 0000000000000000000000000000000000000000..b8b79c59c642eed8413b8f2a177216287cee3f16 --- /dev/null +++ b/mosesdecoder/moses2/TypeDef.cpp @@ -0,0 +1,11 @@ +#include "TypeDef.h" +#include "util/exception.hh" +#include + + +namespace Moses2 +{ + + + +} diff --git a/mosesdecoder/moses2/TypeDef.h b/mosesdecoder/moses2/TypeDef.h new file mode 100644 index 0000000000000000000000000000000000000000..d96257ac29e3cca13167ebc61cfae65735d6fa0f --- /dev/null +++ b/mosesdecoder/moses2/TypeDef.h @@ -0,0 +1,125 @@ +/* + * TypeDef.h + * + * Created on: 23 Oct 2015 + * Author: hieu + */ +#pragma once + +#include +#include +#include +#include "Vector.h" + +namespace Moses2 +{ + +class HypothesisBase; + +#define NOT_FOUND std::numeric_limits::max() +const size_t DEFAULT_MAX_PHRASE_LENGTH = 20; +const size_t 
DEFAULT_MAX_CHART_SPAN = 20; +const size_t DEFAULT_MAX_HYPOSTACK_SIZE = 200; +const size_t DEFAULT_CUBE_PRUNING_POP_LIMIT = 1000; +const size_t DEFAULT_CUBE_PRUNING_DIVERSITY = 0; +const size_t DEFAULT_MAX_TRANS_OPT_SIZE = 5000; + +const size_t DEFAULT_MAX_PART_TRANS_OPT_SIZE = 10000; +const size_t DEFAULT_MAX_TRANS_OPT_CACHE_SIZE = 10000; +const float LOWEST_SCORE = -100.0f; + +const float DEFAULT_BEAM_WIDTH = 0.00001f; +const float DEFAULT_EARLY_DISCARDING_THRESHOLD = 0.0f; +const float DEFAULT_TRANSLATION_OPTION_THRESHOLD = 0.0f; + +#ifndef BOS_ +#define BOS_ "" //Beginning of sentence symbol +#endif +#ifndef EOS_ +#define EOS_ "" //End of sentence symbol +#endif + +typedef size_t FactorType; +typedef float SCORE; +typedef std::vector FactorList; + +// Note: StaticData uses SearchAlgorithm to determine whether the translation +// model is phrase-based or syntax-based. If you add a syntax-based search +// algorithm here then you should also update StaticData::IsSyntax(). +enum SearchAlgorithm { + Normal = 0, CubePruning = 1, + //,CubeGrowing = 2 + CYKPlus = 3, + NormalBatch = 4, + ChartIncremental = 5, + SyntaxS2T = 6, + SyntaxT2S = 7, + SyntaxT2S_SCFG = 8, + SyntaxF2S = 9, + CubePruningPerMiniStack = 10, + CubePruningPerBitmap = 11, + CubePruningCardinalStack = 12, + CubePruningBitmapStack = 13, + CubePruningMiniStack = 14, + DefaultSearchAlgorithm = 777 // means: use StaticData.m_searchAlgorithm +}; + +enum InputTypeEnum { + SentenceInput = 0, + ConfusionNetworkInput = 1, + WordLatticeInput = 2, + TreeInputType = 3, + //,WordLatticeInput2 = 4, + TabbedSentenceInput = 5, + ForestInputType = 6, + SentenceInputWithCandidates = 7, +}; + +enum XmlInputType { + XmlPassThrough = 0, + XmlIgnore = 1, + XmlExclusive = 2, + XmlInclusive = 3, + XmlConstraint = 4 +}; + +enum WordAlignmentSort { + NoSort = 0, + TargetOrder = 1 +}; + +enum S2TParsingAlgorithm { + RecursiveCYKPlus, + Scope3 +}; + +enum SourceLabelOverlap { + SourceLabelOverlapAdd = 0, + 
SourceLabelOverlapReplace = 1, + SourceLabelOverlapDiscard = 2 +}; + +///////////////////////// +// MOSES2 only + +class StackAdd +{ +public: + bool added; + HypothesisBase *other; + + StackAdd() { + } + StackAdd(bool vadded, HypothesisBase *vOther) : + added(vadded), other(vOther) { + } +}; + +class Hypothesis; +typedef Vector Batch; + +class Factor; +typedef std::vector Context; + +} + diff --git a/mosesdecoder/moses2/Vector.cpp b/mosesdecoder/moses2/Vector.cpp new file mode 100644 index 0000000000000000000000000000000000000000..46af0f7934b7c0b749e40811d65680148a44a3e2 --- /dev/null +++ b/mosesdecoder/moses2/Vector.cpp @@ -0,0 +1,14 @@ +/* + * Vector.cpp + * + * Created on: 7 Dec 2015 + * Author: hieu + */ + +#include "Vector.h" + +namespace Moses2 +{ + +} + diff --git a/mosesdecoder/moses2/Vector.h b/mosesdecoder/moses2/Vector.h new file mode 100644 index 0000000000000000000000000000000000000000..404d76dd33c6cf811467a0115d847fd5c15fe1ea --- /dev/null +++ b/mosesdecoder/moses2/Vector.h @@ -0,0 +1,34 @@ +/* + * Vector.h + * + * Created on: 7 Dec 2015 + * Author: hieu + */ + +#pragma once +#include +#include "MemPoolAllocator.h" + +namespace Moses2 +{ + +template +class Vector: public std::vector > +{ + typedef std::vector > Parent; + +public: + Vector(MemPool &pool, size_t size = 0, const T &val = T()) : + Parent(size, val, MemPoolAllocator(pool)) { + } + + Vector(const Vector ©) : + Parent(copy) { + } + +protected: +}; + + +} + diff --git a/mosesdecoder/moses2/Weights.cpp b/mosesdecoder/moses2/Weights.cpp new file mode 100644 index 0000000000000000000000000000000000000000..e31a0fd3ba56a871cbbfd2fb606f881b096dcc45 --- /dev/null +++ b/mosesdecoder/moses2/Weights.cpp @@ -0,0 +1,61 @@ +/* + * Weights.cpp + * + * Created on: 24 Oct 2015 + * Author: hieu + */ +#include +#include +#include +#include "FF/FeatureFunction.h" +#include "FF/FeatureFunctions.h" +#include "Weights.h" +#include "System.h" +#include "legacy/Util2.h" + +using namespace std; + +namespace Moses2 +{ 
+ +Weights::Weights() +{ + // TODO Auto-generated constructor stub + +} + +Weights::~Weights() +{ + // TODO Auto-generated destructor stub +} + +void Weights::Init(const FeatureFunctions &ffs) +{ + size_t totalNumScores = ffs.GetNumScores(); + //cerr << "totalNumScores=" << totalNumScores << endl; + m_weights.resize(totalNumScores, 1); +} + +std::vector Weights::GetWeights(const FeatureFunction &ff) const +{ + std::vector ret(m_weights.begin() + ff.GetStartInd(), m_weights.begin() + ff.GetStartInd() + ff.GetNumScores()); + return ret; +} + +void Weights::SetWeights(const FeatureFunctions &ffs, const std::string &ffName, const std::vector &weights) +{ + const FeatureFunction *ff = ffs.FindFeatureFunction(ffName); + UTIL_THROW_IF2(ff == NULL, "Feature function not found:" << ffName); + + size_t startInd = ff->GetStartInd(); + size_t numScores = ff->GetNumScores(); + UTIL_THROW_IF2(weights.size() != numScores, "Wrong number of weights. " << weights.size() << "!=" << numScores); + + for (size_t i = 0; i < numScores; ++i) { + SCORE weight = weights[i]; + m_weights[startInd + i] = weight; + } +} + +} + diff --git a/mosesdecoder/moses2/Weights.h b/mosesdecoder/moses2/Weights.h new file mode 100644 index 0000000000000000000000000000000000000000..96fdb5a71a20cc360d1d618a02d5abcdbb4810fa --- /dev/null +++ b/mosesdecoder/moses2/Weights.h @@ -0,0 +1,38 @@ +/* + * Weights.h + * + * Created on: 24 Oct 2015 + * Author: hieu + */ +#pragma once + +#include +#include +#include "TypeDef.h" + +namespace Moses2 +{ + +class FeatureFunctions; + +class Weights +{ +public: + Weights(); + virtual ~Weights(); + void Init(const FeatureFunctions &ffs); + + SCORE operator[](size_t ind) const { + return m_weights[ind]; + } + + std::vector GetWeights(const FeatureFunction &ff) const; + + void SetWeights(const FeatureFunctions &ffs, const std::string &ffName, const std::vector &weights); + +protected: + std::vector m_weights; +}; + +} + diff --git a/mosesdecoder/moses2/Word.cpp 
b/mosesdecoder/moses2/Word.cpp new file mode 100644 index 0000000000000000000000000000000000000000..f272f7cdcc135aa10a42ea11939b94157ddb54ec --- /dev/null +++ b/mosesdecoder/moses2/Word.cpp @@ -0,0 +1,136 @@ +/* + * Word.cpp + * + * Created on: 23 Oct 2015 + * Author: hieu + */ +#include +#include +#include +#include "Word.h" +#include "System.h" +#include "legacy/Util2.h" +#include "util/murmur_hash.hh" + +using namespace std; + +namespace Moses2 +{ + +Word::Word() +{ + Init(m_factors, MAX_NUM_FACTORS, NULL); +} + +Word::Word(const Word ©) +{ + memcpy(m_factors, copy.m_factors, sizeof(const Factor *) * MAX_NUM_FACTORS); +} + +Word::~Word() +{ + // TODO Auto-generated destructor stub +} + +void Word::CreateFromString(FactorCollection &vocab, const System &system, + const std::string &str) +{ + vector toks = Tokenize(str, "|"); + for (size_t i = 0; i < toks.size(); ++i) { + const string &tok = toks[i]; + //cerr << "tok=" << tok << endl; + const Factor *factor = vocab.AddFactor(tok, system, false); + m_factors[i] = factor; + } + + // null the rest + for (size_t i = toks.size(); i < MAX_NUM_FACTORS; ++i) { + m_factors[i] = NULL; + } +} + +size_t Word::hash() const +{ + uint64_t seed = 0; + size_t ret = util::MurmurHashNative(m_factors, + sizeof(Factor*) * MAX_NUM_FACTORS, seed); + return ret; +} + +size_t Word::hash(const std::vector &factors) const +{ + size_t seed = 0; + for (size_t i = 0; i < factors.size(); ++i) { + FactorType factorType = factors[i]; + const Factor *factor = m_factors[factorType]; + boost::hash_combine(seed, factor); + } + return seed; +} + + +int Word::Compare(const Word &compare) const +{ + + int cmp = memcmp(m_factors, compare.m_factors, + sizeof(Factor*) * MAX_NUM_FACTORS); + return cmp; + + /* + int ret = m_factors[0]->GetString().compare(compare.m_factors[0]->GetString()); + return ret; + */ +} + +bool Word::operator<(const Word &compare) const +{ + int cmp = Compare(compare); + return (cmp < 0); +} + +std::string Word::Debug(const System 
&system) const +{ + stringstream out; + bool outputAlready = false; + for (size_t i = 0; i < MAX_NUM_FACTORS; ++i) { + const Factor *factor = m_factors[i]; + if (factor) { + if (outputAlready) { + out << "|"; + } + out << *factor; + outputAlready = true; + } + } + + return out.str(); +} + +void Word::OutputToStream(const System &system, std::ostream &out) const +{ + const std::vector &factorTypes = system.options.output.factor_order; + out << *m_factors[ factorTypes[0] ]; + + for (size_t i = 1; i < factorTypes.size(); ++i) { + FactorType factorType = factorTypes[i]; + const Factor *factor = m_factors[factorType]; + + out << "|" << *factor; + } +} + +std::string Word::GetString(const FactorList &factorTypes) const +{ + assert(factorTypes.size()); + std::stringstream ret; + + ret << m_factors[factorTypes[0]]->GetString(); + for (size_t i = 1; i < factorTypes.size(); ++i) { + FactorType factorType = factorTypes[i]; + ret << "|" << m_factors[factorType]; + } + return ret.str(); +} + +} + diff --git a/mosesdecoder/moses2/Word.h b/mosesdecoder/moses2/Word.h new file mode 100644 index 0000000000000000000000000000000000000000..9d742eece61ed26bf7b4ff64dae481175be7d069 --- /dev/null +++ b/mosesdecoder/moses2/Word.h @@ -0,0 +1,63 @@ +/* + * Word.h + * + * Created on: 23 Oct 2015 + * Author: hieu + */ + +#pragma once + +#include +#include "TypeDef.h" +#include "legacy/Factor.h" +#include "legacy/FactorCollection.h" + +namespace Moses2 +{ + +class Word +{ +public: + explicit Word(); + Word(const Word ©); + + virtual ~Word(); + + void CreateFromString(FactorCollection &vocab, const System &system, + const std::string &str); + + virtual size_t hash() const; + virtual size_t hash(const std::vector &factors) const; + + int Compare(const Word &compare) const; + + virtual bool operator==(const Word &compare) const { + int cmp = Compare(compare); + return cmp == 0; + } + + virtual bool operator!=(const Word &compare) const { + return !((*this) == compare); + } + + virtual bool 
operator<(const Word &compare) const; + + const Factor* operator[](size_t ind) const { + return m_factors[ind]; + } + + const Factor*& operator[](size_t ind) { + return m_factors[ind]; + } + + virtual void OutputToStream(const System &system, std::ostream &out) const; + virtual std::string Debug(const System &system) const; + + std::string GetString(const FactorList &factorTypes) const; +protected: + const Factor *m_factors[MAX_NUM_FACTORS]; + +}; + +} + diff --git a/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/AlignmentInfo.o b/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/AlignmentInfo.o new file mode 100644 index 0000000000000000000000000000000000000000..5b5e6ad19f94190c797f523aeb11ee3304a29cc0 Binary files /dev/null and b/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/AlignmentInfo.o differ diff --git a/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/AlignmentInfoCollection.o b/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/AlignmentInfoCollection.o new file mode 100644 index 0000000000000000000000000000000000000000..6533dd53174b872d9f1defeb92838f5357c6a02b Binary files /dev/null and b/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/AlignmentInfoCollection.o differ diff --git a/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/ArcLists.o b/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/ArcLists.o new file mode 100644 index 0000000000000000000000000000000000000000..caa38a27342c0bbb5459f167350690179c91c5b8 Binary files /dev/null and b/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/ArcLists.o differ diff --git a/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/DLLEntryApi.o b/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/DLLEntryApi.o new file mode 100644 index 0000000000000000000000000000000000000000..72b154bbcc9e17970c913da71880088e171198fc Binary files 
/dev/null and b/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/DLLEntryApi.o differ diff --git a/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/EstimatedScores.o b/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/EstimatedScores.o new file mode 100644 index 0000000000000000000000000000000000000000..13f68a72ffe4dde004f03a8fbadccd120fb6bca9 Binary files /dev/null and b/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/EstimatedScores.o differ diff --git a/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/FF/Distortion.o b/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/FF/Distortion.o new file mode 100644 index 0000000000000000000000000000000000000000..2d0babb483607ed335606707f65daae35f1443dc Binary files /dev/null and b/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/FF/Distortion.o differ diff --git a/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/FF/ExampleStatefulFF.o b/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/FF/ExampleStatefulFF.o new file mode 100644 index 0000000000000000000000000000000000000000..3fe09c061e64b94e42941f6e5b008fa191bce899 Binary files /dev/null and b/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/FF/ExampleStatefulFF.o differ diff --git a/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/FF/ExampleStatelessFF.o b/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/FF/ExampleStatelessFF.o new file mode 100644 index 0000000000000000000000000000000000000000..ea2480e526512063cf2d60b32f6f1435f8729f68 Binary files /dev/null and b/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/FF/ExampleStatelessFF.o differ diff --git a/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/FF/FeatureFunction.o b/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/FF/FeatureFunction.o new file mode 100644 index 
0000000000000000000000000000000000000000..3bed59d68b9a1af7079c610c4b847d7a7549e9ed Binary files /dev/null and b/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/FF/FeatureFunction.o differ diff --git a/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/FF/FeatureFunctions.o b/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/FF/FeatureFunctions.o new file mode 100644 index 0000000000000000000000000000000000000000..39de868168161f0b1744bb94cc5cb0e8a5d0ff3c Binary files /dev/null and b/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/FF/FeatureFunctions.o differ diff --git a/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/FF/FeatureRegistry.o b/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/FF/FeatureRegistry.o new file mode 100644 index 0000000000000000000000000000000000000000..196ba38082953bd5143ea7dfba9a059ef1a17347 Binary files /dev/null and b/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/FF/FeatureRegistry.o differ diff --git a/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/FF/LexicalReordering/BidirectionalReorderingState.o b/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/FF/LexicalReordering/BidirectionalReorderingState.o new file mode 100644 index 0000000000000000000000000000000000000000..b19a4c661e8b88dc6487a14389de15bd7aeb7fe4 Binary files /dev/null and b/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/FF/LexicalReordering/BidirectionalReorderingState.o differ diff --git a/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/FF/LexicalReordering/HReorderingBackwardState.o b/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/FF/LexicalReordering/HReorderingBackwardState.o new file mode 100644 index 0000000000000000000000000000000000000000..0220e0147b9c1159fbd65af7ef301f359bf56b30 Binary files /dev/null and 
b/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/FF/LexicalReordering/HReorderingBackwardState.o differ diff --git a/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/FF/LexicalReordering/HReorderingForwardState.o b/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/FF/LexicalReordering/HReorderingForwardState.o new file mode 100644 index 0000000000000000000000000000000000000000..45a01303a6060191e046d19e96ef12c1abb1a95b Binary files /dev/null and b/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/FF/LexicalReordering/HReorderingForwardState.o differ diff --git a/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/FF/LexicalReordering/LRModel.o b/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/FF/LexicalReordering/LRModel.o new file mode 100644 index 0000000000000000000000000000000000000000..ad7e4a6ccd7393483bc2f3ee1326f2c4df78133a Binary files /dev/null and b/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/FF/LexicalReordering/LRModel.o differ diff --git a/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/FF/LexicalReordering/LRState.o b/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/FF/LexicalReordering/LRState.o new file mode 100644 index 0000000000000000000000000000000000000000..2cca4db5190a8ce8201391a6e5e763974e8f1ab9 Binary files /dev/null and b/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/FF/LexicalReordering/LRState.o differ diff --git a/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/FF/LexicalReordering/LexicalReordering.o b/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/FF/LexicalReordering/LexicalReordering.o new file mode 100644 index 0000000000000000000000000000000000000000..f12ba5d588f7f1941a8cc2563752526cdc6d1c0d Binary files /dev/null and 
b/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/FF/LexicalReordering/LexicalReordering.o differ diff --git a/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/FF/LexicalReordering/PhraseBasedReorderingState.o b/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/FF/LexicalReordering/PhraseBasedReorderingState.o new file mode 100644 index 0000000000000000000000000000000000000000..136e81fe97e3f5d134e0845563a89fa253f41eb6 Binary files /dev/null and b/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/FF/LexicalReordering/PhraseBasedReorderingState.o differ diff --git a/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/FF/LexicalReordering/ReorderingStack.o b/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/FF/LexicalReordering/ReorderingStack.o new file mode 100644 index 0000000000000000000000000000000000000000..e37ceed392b626c413dcb3d8cc0c150bd1a7ce1e Binary files /dev/null and b/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/FF/LexicalReordering/ReorderingStack.o differ diff --git a/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/FF/OSM/KenOSM.o b/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/FF/OSM/KenOSM.o new file mode 100644 index 0000000000000000000000000000000000000000..8b1984455522a25b4cca5c8b97ed22880ddad792 Binary files /dev/null and b/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/FF/OSM/KenOSM.o differ diff --git a/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/FF/OSM/OpSequenceModel.o b/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/FF/OSM/OpSequenceModel.o new file mode 100644 index 0000000000000000000000000000000000000000..ac8dab6c3bb3145abb4d84368acac9d96a9baa3c Binary files /dev/null and b/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/FF/OSM/OpSequenceModel.o differ diff --git 
a/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/FF/OSM/osmHyp.o b/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/FF/OSM/osmHyp.o new file mode 100644 index 0000000000000000000000000000000000000000..a37d4159dbcb04c1b94631a18cb1f970389eab8e Binary files /dev/null and b/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/FF/OSM/osmHyp.o differ diff --git a/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/FF/PhrasePenalty.o b/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/FF/PhrasePenalty.o new file mode 100644 index 0000000000000000000000000000000000000000..9781930747f22128a87d0515898345f7039a8901 Binary files /dev/null and b/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/FF/PhrasePenalty.o differ diff --git a/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/FF/StatefulFeatureFunction.o b/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/FF/StatefulFeatureFunction.o new file mode 100644 index 0000000000000000000000000000000000000000..0b40f6d781dde9fca71935f0709eaa6407d19b50 Binary files /dev/null and b/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/FF/StatefulFeatureFunction.o differ diff --git a/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/FF/StatelessFeatureFunction.o b/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/FF/StatelessFeatureFunction.o new file mode 100644 index 0000000000000000000000000000000000000000..480ed4f79dd4666b40729f62dd7eda11eda446d1 Binary files /dev/null and b/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/FF/StatelessFeatureFunction.o differ diff --git a/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/FF/WordPenalty.o b/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/FF/WordPenalty.o new file mode 100644 index 0000000000000000000000000000000000000000..36c466b2bdafaa9c7fccc59bf0b12c14d062c444 
Binary files /dev/null and b/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/FF/WordPenalty.o differ diff --git a/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/HypothesisBase.o b/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/HypothesisBase.o new file mode 100644 index 0000000000000000000000000000000000000000..078e994985d41b47d6bc09fb2c6145879eaf7983 Binary files /dev/null and b/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/HypothesisBase.o differ diff --git a/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/HypothesisColl.o b/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/HypothesisColl.o new file mode 100644 index 0000000000000000000000000000000000000000..0672c0c69eaf742b8453dfe47dc6bbe71a5b9652 Binary files /dev/null and b/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/HypothesisColl.o differ diff --git a/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/InputPathBase.o b/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/InputPathBase.o new file mode 100644 index 0000000000000000000000000000000000000000..e133877b090f4382f084d8f36233ac5814289c8c Binary files /dev/null and b/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/InputPathBase.o differ diff --git a/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/InputPathsBase.o b/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/InputPathsBase.o new file mode 100644 index 0000000000000000000000000000000000000000..02b95c051859ac1d097f74e88aa0b9248b487a7c Binary files /dev/null and b/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/InputPathsBase.o differ diff --git a/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/InputType.o b/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/InputType.o new file mode 100644 index 
0000000000000000000000000000000000000000..0eb85de9393a9a3d2367355443a9f54116d9a22b Binary files /dev/null and b/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/InputType.o differ diff --git a/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/LM/GPULM.o b/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/LM/GPULM.o new file mode 100644 index 0000000000000000000000000000000000000000..5fb154b98ace2488ffee574214f24203082d0d95 Binary files /dev/null and b/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/LM/GPULM.o differ diff --git a/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/LM/KENLM.o b/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/LM/KENLM.o new file mode 100644 index 0000000000000000000000000000000000000000..12becd5d1fd91813a9b0df6dfc12d3bc01fee2b2 Binary files /dev/null and b/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/LM/KENLM.o differ diff --git a/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/LM/KENLMBatch.o b/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/LM/KENLMBatch.o new file mode 100644 index 0000000000000000000000000000000000000000..0b6925a2b5799f4218cfc87bb11b5d23fab54120 Binary files /dev/null and b/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/LM/KENLMBatch.o differ diff --git a/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/LM/LanguageModel.o b/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/LM/LanguageModel.o new file mode 100644 index 0000000000000000000000000000000000000000..d805691498165944814f11f727af76841a747dee Binary files /dev/null and b/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/LM/LanguageModel.o differ diff --git a/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/Main.o b/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/Main.o new file mode 100644 index 
0000000000000000000000000000000000000000..8d2d3543816bb302dad65c1beb03a1c583f3e69b Binary files /dev/null and b/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/Main.o differ diff --git a/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/ManagerBase.o b/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/ManagerBase.o new file mode 100644 index 0000000000000000000000000000000000000000..0c3ba133374f4164480b820ac6f2cc054d7e1616 Binary files /dev/null and b/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/ManagerBase.o differ diff --git a/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/MemPool.o b/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/MemPool.o new file mode 100644 index 0000000000000000000000000000000000000000..81f9668db6fcd532d0baad793bf7edfc92802799 Binary files /dev/null and b/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/MemPool.o differ diff --git a/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/Moses2Wrapper.o b/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/Moses2Wrapper.o new file mode 100644 index 0000000000000000000000000000000000000000..c915550c5c589e07e23ff3e57932f90c1f06d217 Binary files /dev/null and b/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/Moses2Wrapper.o differ diff --git a/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/Phrase.o b/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/Phrase.o new file mode 100644 index 0000000000000000000000000000000000000000..99a05e59e125e3a218a97adfe213ca10c7c1c38b Binary files /dev/null and b/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/Phrase.o differ diff --git a/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/PhraseBased/CubePruningMiniStack/Misc.o 
b/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/PhraseBased/CubePruningMiniStack/Misc.o new file mode 100644 index 0000000000000000000000000000000000000000..8d9425f78f25e8d35e3f72ec9812d8f045880610 Binary files /dev/null and b/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/PhraseBased/CubePruningMiniStack/Misc.o differ diff --git a/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/PhraseBased/CubePruningMiniStack/Search.o b/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/PhraseBased/CubePruningMiniStack/Search.o new file mode 100644 index 0000000000000000000000000000000000000000..0a497ed3831c7cbfd9216d6de076e50d7e906591 Binary files /dev/null and b/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/PhraseBased/CubePruningMiniStack/Search.o differ diff --git a/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/PhraseBased/CubePruningMiniStack/Stack.o b/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/PhraseBased/CubePruningMiniStack/Stack.o new file mode 100644 index 0000000000000000000000000000000000000000..c1b57f7c191c563a96a90af8d577e84d6e394b2a Binary files /dev/null and b/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/PhraseBased/CubePruningMiniStack/Stack.o differ diff --git a/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/PhraseBased/Hypothesis.o b/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/PhraseBased/Hypothesis.o new file mode 100644 index 0000000000000000000000000000000000000000..b8e24b8a351914513d6bf7405b4befeaba35a299 Binary files /dev/null and b/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/PhraseBased/Hypothesis.o differ diff --git a/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/PhraseBased/InputPath.o b/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/PhraseBased/InputPath.o new file mode 100644 index 
0000000000000000000000000000000000000000..c79f3000cd91e3efd45359a596d46318bc2db125 Binary files /dev/null and b/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/PhraseBased/InputPath.o differ diff --git a/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/PhraseBased/InputPaths.o b/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/PhraseBased/InputPaths.o new file mode 100644 index 0000000000000000000000000000000000000000..76f1aed4eefd56629ff985ac2f12d2260a58f0ea Binary files /dev/null and b/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/PhraseBased/InputPaths.o differ diff --git a/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/PhraseBased/Manager.o b/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/PhraseBased/Manager.o new file mode 100644 index 0000000000000000000000000000000000000000..6df521e490d82fb22f58bf94693d3581f5dc3d54 Binary files /dev/null and b/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/PhraseBased/Manager.o differ diff --git a/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/PhraseBased/Normal/Search.o b/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/PhraseBased/Normal/Search.o new file mode 100644 index 0000000000000000000000000000000000000000..41f0481a7f99e0de494bed129d663c79d8d844d8 Binary files /dev/null and b/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/PhraseBased/Normal/Search.o differ diff --git a/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/PhraseBased/Normal/Stack.o b/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/PhraseBased/Normal/Stack.o new file mode 100644 index 0000000000000000000000000000000000000000..5a624c39628fd868adff1fa987c9ae6b99b1897e Binary files /dev/null and b/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/PhraseBased/Normal/Stack.o differ diff --git 
a/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/PhraseBased/Normal/Stacks.o b/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/PhraseBased/Normal/Stacks.o new file mode 100644 index 0000000000000000000000000000000000000000..10eb211015790e81ce251487f7885c4c28e57ed8 Binary files /dev/null and b/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/PhraseBased/Normal/Stacks.o differ diff --git a/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/PhraseBased/PhraseImpl.o b/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/PhraseBased/PhraseImpl.o new file mode 100644 index 0000000000000000000000000000000000000000..fee13f9eb9700e4ef5e37e9c9746c437a6153ff0 Binary files /dev/null and b/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/PhraseBased/PhraseImpl.o differ diff --git a/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/PhraseBased/ReorderingConstraint.o b/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/PhraseBased/ReorderingConstraint.o new file mode 100644 index 0000000000000000000000000000000000000000..c9c79090b63ed5d64ab293e6f25db58e1d804b12 Binary files /dev/null and b/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/PhraseBased/ReorderingConstraint.o differ diff --git a/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/PhraseBased/Search.o b/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/PhraseBased/Search.o new file mode 100644 index 0000000000000000000000000000000000000000..4028b387aa70e012a1fbd375178e7933e0762c20 Binary files /dev/null and b/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/PhraseBased/Search.o differ diff --git a/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/PhraseBased/Sentence.o b/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/PhraseBased/Sentence.o new file mode 100644 index 
0000000000000000000000000000000000000000..bc7584271a1ca9f832b19cb3f6e9ce7c9803334b Binary files /dev/null and b/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/PhraseBased/Sentence.o differ diff --git a/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/PhraseBased/SentenceWithCandidates.o b/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/PhraseBased/SentenceWithCandidates.o new file mode 100644 index 0000000000000000000000000000000000000000..85af67e32d828796a98184e94aeaba482bf0c9e4 Binary files /dev/null and b/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/PhraseBased/SentenceWithCandidates.o differ diff --git a/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/PhraseBased/TargetPhraseImpl.o b/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/PhraseBased/TargetPhraseImpl.o new file mode 100644 index 0000000000000000000000000000000000000000..1f6ba5a1fbbf8d7c5478aad11836decca9d335bf Binary files /dev/null and b/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/PhraseBased/TargetPhraseImpl.o differ diff --git a/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/PhraseBased/TargetPhrases.o b/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/PhraseBased/TargetPhrases.o new file mode 100644 index 0000000000000000000000000000000000000000..0e00bd47c028a140a497a42d1bf9b1d248aef3ce Binary files /dev/null and b/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/PhraseBased/TargetPhrases.o differ diff --git a/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/PhraseBased/TrellisPath.o b/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/PhraseBased/TrellisPath.o new file mode 100644 index 0000000000000000000000000000000000000000..eaedd76ff86cd89948845b861a2507490c0b4acc Binary files /dev/null and 
b/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/PhraseBased/TrellisPath.o differ diff --git a/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/SCFG/ActiveChart.o b/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/SCFG/ActiveChart.o new file mode 100644 index 0000000000000000000000000000000000000000..7c43191b8b58f0b8f8f3407463e6f6e544e820ec Binary files /dev/null and b/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/SCFG/ActiveChart.o differ diff --git a/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/SCFG/Hypothesis.o b/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/SCFG/Hypothesis.o new file mode 100644 index 0000000000000000000000000000000000000000..d44abf90f9d017b087f1a98f1697d21e50a5499d Binary files /dev/null and b/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/SCFG/Hypothesis.o differ diff --git a/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/SCFG/InputPath.o b/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/SCFG/InputPath.o new file mode 100644 index 0000000000000000000000000000000000000000..62d070a19d26f37cf09b1394e9edf4b5e36e4fe3 Binary files /dev/null and b/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/SCFG/InputPath.o differ diff --git a/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/SCFG/InputPaths.o b/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/SCFG/InputPaths.o new file mode 100644 index 0000000000000000000000000000000000000000..2bd8d5e8063348fbefd8c95d32e3c17f26cd9d6a Binary files /dev/null and b/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/SCFG/InputPaths.o differ diff --git a/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/SCFG/Manager.o b/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/SCFG/Manager.o new file mode 100644 index 
0000000000000000000000000000000000000000..75d8da440b57d4b2a15d5d8eb96907aa2758d67e Binary files /dev/null and b/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/SCFG/Manager.o differ diff --git a/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/SCFG/Misc.o b/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/SCFG/Misc.o new file mode 100644 index 0000000000000000000000000000000000000000..5eae0e595620141e0f4dc0b91d31eafab9227f05 Binary files /dev/null and b/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/SCFG/Misc.o differ diff --git a/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/SCFG/PhraseImpl.o b/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/SCFG/PhraseImpl.o new file mode 100644 index 0000000000000000000000000000000000000000..c90023b46dc2ec6685505c5a8c99b91da75278c6 Binary files /dev/null and b/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/SCFG/PhraseImpl.o differ diff --git a/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/SCFG/Sentence.o b/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/SCFG/Sentence.o new file mode 100644 index 0000000000000000000000000000000000000000..c60500b1e74ec91933d0d1fa46aa2158e7fcf13b Binary files /dev/null and b/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/SCFG/Sentence.o differ diff --git a/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/SCFG/Stack.o b/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/SCFG/Stack.o new file mode 100644 index 0000000000000000000000000000000000000000..04d057b139f98b826b7b56a2ce471e214c6db19b Binary files /dev/null and b/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/SCFG/Stack.o differ diff --git a/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/SCFG/Stacks.o b/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/SCFG/Stacks.o new file 
mode 100644 index 0000000000000000000000000000000000000000..d290370dde2ed514bda214b82c1faf7045a62e99 Binary files /dev/null and b/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/SCFG/Stacks.o differ diff --git a/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/SCFG/TargetPhraseImpl.o b/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/SCFG/TargetPhraseImpl.o new file mode 100644 index 0000000000000000000000000000000000000000..ca8629c2e9046d5294935851257ddc068d7cb64a Binary files /dev/null and b/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/SCFG/TargetPhraseImpl.o differ diff --git a/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/SCFG/TargetPhrases.o b/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/SCFG/TargetPhrases.o new file mode 100644 index 0000000000000000000000000000000000000000..25371c076617a628384c03c5cec63fa7cf843cf3 Binary files /dev/null and b/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/SCFG/TargetPhrases.o differ diff --git a/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/SCFG/Word.o b/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/SCFG/Word.o new file mode 100644 index 0000000000000000000000000000000000000000..92ec56501385b7a73e38f1dc3abecb3c27bec2a6 Binary files /dev/null and b/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/SCFG/Word.o differ diff --git a/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/SCFG/nbest/KBestExtractor.o b/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/SCFG/nbest/KBestExtractor.o new file mode 100644 index 0000000000000000000000000000000000000000..c609f220229c7dd6d226ecbccacca6838b120d6a Binary files /dev/null and b/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/SCFG/nbest/KBestExtractor.o differ diff --git 
a/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/SCFG/nbest/NBest.o b/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/SCFG/nbest/NBest.o new file mode 100644 index 0000000000000000000000000000000000000000..6bc394441636c5793a7c0ccd12f67835d3f5740f Binary files /dev/null and b/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/SCFG/nbest/NBest.o differ diff --git a/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/SCFG/nbest/NBestColl.o b/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/SCFG/nbest/NBestColl.o new file mode 100644 index 0000000000000000000000000000000000000000..51b6f094f2ae3bb6a3e118931336da2da78f9d95 Binary files /dev/null and b/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/SCFG/nbest/NBestColl.o differ diff --git a/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/SCFG/nbest/NBests.o b/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/SCFG/nbest/NBests.o new file mode 100644 index 0000000000000000000000000000000000000000..0e2147fb5b471ef921a713e1fb8520325e512dfe Binary files /dev/null and b/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/SCFG/nbest/NBests.o differ diff --git a/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/Scores.o b/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/Scores.o new file mode 100644 index 0000000000000000000000000000000000000000..f7e63188f0ac1420a16b836b9b59711f03a7e5e8 Binary files /dev/null and b/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/Scores.o differ diff --git a/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/SubPhrase.o b/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/SubPhrase.o new file mode 100644 index 0000000000000000000000000000000000000000..6d1dfbc86f537bf314cb27de8664d9b07836d528 Binary files /dev/null and 
b/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/SubPhrase.o differ diff --git a/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/System.o b/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/System.o new file mode 100644 index 0000000000000000000000000000000000000000..cf11a232e07c6463b97ca0d6b0a1a6994336206a Binary files /dev/null and b/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/System.o differ diff --git a/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/TargetPhrase.o b/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/TargetPhrase.o new file mode 100644 index 0000000000000000000000000000000000000000..31b75f319a66667df4542b05fac9ecf576d08b87 Binary files /dev/null and b/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/TargetPhrase.o differ diff --git a/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/TranslationModel/CompactPT/BlockHashIndex.o b/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/TranslationModel/CompactPT/BlockHashIndex.o new file mode 100644 index 0000000000000000000000000000000000000000..0290dc8c8b3633a453afce7c1b92a9669669b8f2 Binary files /dev/null and b/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/TranslationModel/CompactPT/BlockHashIndex.o differ diff --git a/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/TranslationModel/CompactPT/CmphStringVectorAdapter.o b/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/TranslationModel/CompactPT/CmphStringVectorAdapter.o new file mode 100644 index 0000000000000000000000000000000000000000..31ecf2ede7b5345bbb7864a41ae5b6aec102de93 Binary files /dev/null and b/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/TranslationModel/CompactPT/CmphStringVectorAdapter.o differ diff --git 
a/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/TranslationModel/CompactPT/LexicalReorderingTableCompact.o b/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/TranslationModel/CompactPT/LexicalReorderingTableCompact.o new file mode 100644 index 0000000000000000000000000000000000000000..da841e8fa75fa56a7d1c03e08002e89aa58ec928 Binary files /dev/null and b/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/TranslationModel/CompactPT/LexicalReorderingTableCompact.o differ diff --git a/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/TranslationModel/CompactPT/MurmurHash3.o b/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/TranslationModel/CompactPT/MurmurHash3.o new file mode 100644 index 0000000000000000000000000000000000000000..4e7744877565617afa1fb7f34aa06c95bfc2d9d1 Binary files /dev/null and b/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/TranslationModel/CompactPT/MurmurHash3.o differ diff --git a/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/TranslationModel/CompactPT/TargetPhraseCollectionCache.o b/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/TranslationModel/CompactPT/TargetPhraseCollectionCache.o new file mode 100644 index 0000000000000000000000000000000000000000..04ab3f61b533aa8d335c094a825ffeeceefffb8e Binary files /dev/null and b/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/TranslationModel/CompactPT/TargetPhraseCollectionCache.o differ diff --git a/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/TranslationModel/CompactPT/ThrowingFwrite.o b/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/TranslationModel/CompactPT/ThrowingFwrite.o new file mode 100644 index 0000000000000000000000000000000000000000..66b2d7e7fe09799b2c9272d33d7d8f6120a6b590 Binary files /dev/null and 
b/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/TranslationModel/CompactPT/ThrowingFwrite.o differ diff --git a/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/TranslationModel/Dynamic/DynamicPhraseTable.o b/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/TranslationModel/Dynamic/DynamicPhraseTable.o new file mode 100644 index 0000000000000000000000000000000000000000..e5b9eb9cdf5d198431365b90ffea8bd6b8913e03 Binary files /dev/null and b/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/TranslationModel/Dynamic/DynamicPhraseTable.o differ diff --git a/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/TranslationModel/Memory/PhraseTableMemory.o b/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/TranslationModel/Memory/PhraseTableMemory.o new file mode 100644 index 0000000000000000000000000000000000000000..39777c37be0d124914aeff5e92bda1c870f70dc2 Binary files /dev/null and b/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/TranslationModel/Memory/PhraseTableMemory.o differ diff --git a/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/TranslationModel/PhraseTable.o b/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/TranslationModel/PhraseTable.o new file mode 100644 index 0000000000000000000000000000000000000000..0be230cfa5ea6ff2767322af67815434fefa311a Binary files /dev/null and b/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/TranslationModel/PhraseTable.o differ diff --git a/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/TranslationModel/ProbingPT.o b/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/TranslationModel/ProbingPT.o new file mode 100644 index 0000000000000000000000000000000000000000..aed1e9e059fddf79b51f141299575ca7d43eefb0 Binary files /dev/null and b/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/TranslationModel/ProbingPT.o 
differ diff --git a/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/TranslationModel/Transliteration.o b/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/TranslationModel/Transliteration.o new file mode 100644 index 0000000000000000000000000000000000000000..402f50594008b1a823481607039530e64ca68925 Binary files /dev/null and b/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/TranslationModel/Transliteration.o differ diff --git a/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/TranslationModel/UnknownWordPenalty.o b/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/TranslationModel/UnknownWordPenalty.o new file mode 100644 index 0000000000000000000000000000000000000000..68532e21fbb71901d82b911b1d5f01eaf808a598 Binary files /dev/null and b/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/TranslationModel/UnknownWordPenalty.o differ diff --git a/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/TranslationTask.o b/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/TranslationTask.o new file mode 100644 index 0000000000000000000000000000000000000000..c4f5a613d1e643a7eac313d4ead91eaec8a33d60 Binary files /dev/null and b/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/TranslationTask.o differ diff --git a/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/TrellisPaths.o b/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/TrellisPaths.o new file mode 100644 index 0000000000000000000000000000000000000000..e09bcc745be02b31f57ff6f2ae044a6049b7515d Binary files /dev/null and b/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/TrellisPaths.o differ diff --git a/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/TypeDef.o b/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/TypeDef.o new file mode 100644 index 
0000000000000000000000000000000000000000..02904c342a16ac2a123346cf4c844a15e72303cc Binary files /dev/null and b/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/TypeDef.o differ diff --git a/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/Vector.o b/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/Vector.o new file mode 100644 index 0000000000000000000000000000000000000000..30fc965659bde7ace1aa21a759ccddefc530a8b1 Binary files /dev/null and b/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/Vector.o differ diff --git a/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/Weights.o b/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/Weights.o new file mode 100644 index 0000000000000000000000000000000000000000..1f61ad0e19ead52fe1b17f3ffaa56610fd7de8fe Binary files /dev/null and b/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/Weights.o differ diff --git a/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/Word.o b/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/Word.o new file mode 100644 index 0000000000000000000000000000000000000000..777aaf33b86eabb73ee8690bdee4411c98cce627 Binary files /dev/null and b/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/Word.o differ diff --git a/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/legacy/Bitmap.o b/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/legacy/Bitmap.o new file mode 100644 index 0000000000000000000000000000000000000000..aa0be86175f712d69c946bd45412d9d07f8f8074 Binary files /dev/null and b/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/legacy/Bitmap.o differ diff --git a/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/legacy/Bitmaps.o b/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/legacy/Bitmaps.o new file mode 100644 index 
0000000000000000000000000000000000000000..83ad57688283656effce19376c2f1a1abdd62ac9 Binary files /dev/null and b/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/legacy/Bitmaps.o differ diff --git a/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/legacy/Factor.o b/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/legacy/Factor.o new file mode 100644 index 0000000000000000000000000000000000000000..8eeb39e6be36c01cea5f0d83fe3afa71781f1f1d Binary files /dev/null and b/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/legacy/Factor.o differ diff --git a/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/legacy/FactorCollection.o b/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/legacy/FactorCollection.o new file mode 100644 index 0000000000000000000000000000000000000000..cfecd5cbcd7e37a1006fddbed99e23e26a5873b6 Binary files /dev/null and b/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/legacy/FactorCollection.o differ diff --git a/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/legacy/InputFileStream.o b/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/legacy/InputFileStream.o new file mode 100644 index 0000000000000000000000000000000000000000..0ad852327a74e01a787b22eb024cf2a01c9db092 Binary files /dev/null and b/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/legacy/InputFileStream.o differ diff --git a/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/legacy/Matrix.o b/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/legacy/Matrix.o new file mode 100644 index 0000000000000000000000000000000000000000..fffe2c6b7c170c7526bd013f20e917d138f0b689 Binary files /dev/null and b/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/legacy/Matrix.o differ diff --git a/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/legacy/OutputFileStream.o 
b/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/legacy/OutputFileStream.o new file mode 100644 index 0000000000000000000000000000000000000000..fb660c4afa0cce4040322586fa89e6f3b2c11c2c Binary files /dev/null and b/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/legacy/OutputFileStream.o differ diff --git a/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/legacy/Parameter.o b/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/legacy/Parameter.o new file mode 100644 index 0000000000000000000000000000000000000000..5a5a55ca38e7eb630d6bddb71ff3b1224c8b99fb Binary files /dev/null and b/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/legacy/Parameter.o differ diff --git a/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/legacy/Range.o b/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/legacy/Range.o new file mode 100644 index 0000000000000000000000000000000000000000..e37877d08c82784ad5141ec18ae04cd6ecca76a0 Binary files /dev/null and b/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/legacy/Range.o differ diff --git a/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/legacy/ThreadPool.o b/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/legacy/ThreadPool.o new file mode 100644 index 0000000000000000000000000000000000000000..8297e946f53bfd6927f295fbf81b283e68f466fc Binary files /dev/null and b/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/legacy/ThreadPool.o differ diff --git a/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/legacy/Timer.o b/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/legacy/Timer.o new file mode 100644 index 0000000000000000000000000000000000000000..86299db616186f9b72376d72a12e038388c37029 Binary files /dev/null and b/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/legacy/Timer.o differ diff --git 
a/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/legacy/Util2.o b/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/legacy/Util2.o new file mode 100644 index 0000000000000000000000000000000000000000..2352d88c389c5e37983cede218d52d1a78cdbf03 Binary files /dev/null and b/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/legacy/Util2.o differ diff --git a/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/libmoses2_lib.a b/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/libmoses2_lib.a new file mode 100644 index 0000000000000000000000000000000000000000..7c2bdc325e27e5a26757cc4177e81b85e9d96c80 --- /dev/null +++ b/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/libmoses2_lib.a @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:207dcf0e82681048abc8e09385cbb1708dea0ab12fe94c03ea70eb2c8fed0848 +size 18703816 diff --git a/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/libmoses2decoder.a b/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/libmoses2decoder.a new file mode 100644 index 0000000000000000000000000000000000000000..c8d291e5797903c2d19c22a44cffbefeebb6e45f --- /dev/null +++ b/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/libmoses2decoder.a @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f432243c9244fc307b512349deb1bbce2ff32e0d1d0ca4607e76f3036d762d0e +size 2522782 diff --git a/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/moses2 b/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/moses2 new file mode 100644 index 0000000000000000000000000000000000000000..9afd1b08469606ec60e8bd66d3114eb0d2a21fc2 --- /dev/null +++ b/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/moses2 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:eadc728d0c4403d6b149ea887743791bbf49cba9ec2785015a155f3b103baf4a +size 
6563456 diff --git a/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/parameters/AllOptions.o b/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/parameters/AllOptions.o new file mode 100644 index 0000000000000000000000000000000000000000..df21fbadc4071906c94e12ea57228ec2fec62cec Binary files /dev/null and b/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/parameters/AllOptions.o differ diff --git a/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/parameters/BookkeepingOptions.o b/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/parameters/BookkeepingOptions.o new file mode 100644 index 0000000000000000000000000000000000000000..8b2edf5106d195dca32a85adcb4499a51b3aa609 Binary files /dev/null and b/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/parameters/BookkeepingOptions.o differ diff --git a/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/parameters/ContextParameters.o b/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/parameters/ContextParameters.o new file mode 100644 index 0000000000000000000000000000000000000000..e10bcca836a57fc6ebb1ab0fe6342d511f3362b1 Binary files /dev/null and b/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/parameters/ContextParameters.o differ diff --git a/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/parameters/CubePruningOptions.o b/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/parameters/CubePruningOptions.o new file mode 100644 index 0000000000000000000000000000000000000000..cf6e91eb2c4538f7001d94a754a261a61ab679b1 Binary files /dev/null and b/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/parameters/CubePruningOptions.o differ diff --git a/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/parameters/InputOptions.o b/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/parameters/InputOptions.o 
new file mode 100644 index 0000000000000000000000000000000000000000..b4fe1551660992245d89b1561409513f2e3c5136 Binary files /dev/null and b/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/parameters/InputOptions.o differ diff --git a/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/parameters/LMBR_Options.o b/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/parameters/LMBR_Options.o new file mode 100644 index 0000000000000000000000000000000000000000..2b1c14e69b2969e4150d8f9fb2b9dcf71a87388f Binary files /dev/null and b/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/parameters/LMBR_Options.o differ diff --git a/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/parameters/MBR_Options.o b/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/parameters/MBR_Options.o new file mode 100644 index 0000000000000000000000000000000000000000..b9dbbd773e9a7ab1cb23b9033669ce7c21e6cb29 Binary files /dev/null and b/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/parameters/MBR_Options.o differ diff --git a/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/parameters/NBestOptions.o b/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/parameters/NBestOptions.o new file mode 100644 index 0000000000000000000000000000000000000000..a4174c1e1a83ccf0307b87a5494e251112d66e4f Binary files /dev/null and b/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/parameters/NBestOptions.o differ diff --git a/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/parameters/OOVHandlingOptions.o b/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/parameters/OOVHandlingOptions.o new file mode 100644 index 0000000000000000000000000000000000000000..e29ed320edbde6af9c4868634c697df085c51e21 Binary files /dev/null and b/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/parameters/OOVHandlingOptions.o 
differ diff --git a/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/parameters/OptionsBaseClass.o b/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/parameters/OptionsBaseClass.o new file mode 100644 index 0000000000000000000000000000000000000000..92f9dd95efb96c2a6e0a5b6341ea85d6f4d94597 Binary files /dev/null and b/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/parameters/OptionsBaseClass.o differ diff --git a/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/parameters/ReorderingOptions.o b/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/parameters/ReorderingOptions.o new file mode 100644 index 0000000000000000000000000000000000000000..bf013597c0bc54fbd0e49f83c166ff8b00be3744 Binary files /dev/null and b/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/parameters/ReorderingOptions.o differ diff --git a/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/parameters/ReportingOptions.o b/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/parameters/ReportingOptions.o new file mode 100644 index 0000000000000000000000000000000000000000..8bb068fe8f028df22d117a7e1d07cb7eb9913ff4 Binary files /dev/null and b/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/parameters/ReportingOptions.o differ diff --git a/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/parameters/SearchOptions.o b/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/parameters/SearchOptions.o new file mode 100644 index 0000000000000000000000000000000000000000..108df68a76b6297be66867cdcb899acdbacf3679 Binary files /dev/null and b/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/parameters/SearchOptions.o differ diff --git a/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/parameters/ServerOptions.o b/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/parameters/ServerOptions.o 
new file mode 100644 index 0000000000000000000000000000000000000000..63ae8e0893785b27b3b12815bfb472b5f804186e Binary files /dev/null and b/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/parameters/ServerOptions.o differ diff --git a/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/parameters/SyntaxOptions.o b/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/parameters/SyntaxOptions.o new file mode 100644 index 0000000000000000000000000000000000000000..8e56039be009db876beecbd31e5585b5c856aef1 Binary files /dev/null and b/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/parameters/SyntaxOptions.o differ diff --git a/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/pugixml.o b/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/pugixml.o new file mode 100644 index 0000000000000000000000000000000000000000..bd92e433596db45498fd145d432f0719809e6cd7 Binary files /dev/null and b/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/pugixml.o differ diff --git a/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/server/Server.o b/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/server/Server.o new file mode 100644 index 0000000000000000000000000000000000000000..1f756b51233d22f877f55cdd8b112b822663bdf1 Binary files /dev/null and b/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/server/Server.o differ diff --git a/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/server/TranslationRequest.o b/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/server/TranslationRequest.o new file mode 100644 index 0000000000000000000000000000000000000000..98e6d550b71eadd7f22e754c2b46a689c3c60e8c Binary files /dev/null and b/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/server/TranslationRequest.o differ diff --git 
a/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/server/Translator.o b/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/server/Translator.o new file mode 100644 index 0000000000000000000000000000000000000000..822c94cae86268e2334392ff84a9a2361a3f0ba3 Binary files /dev/null and b/mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/server/Translator.o differ diff --git a/mosesdecoder/moses2/defer/CubePruningBitmapStack/Misc.cpp b/mosesdecoder/moses2/defer/CubePruningBitmapStack/Misc.cpp new file mode 100644 index 0000000000000000000000000000000000000000..9f994ba8b674453285d5983ce1941cf6a13412cb --- /dev/null +++ b/mosesdecoder/moses2/defer/CubePruningBitmapStack/Misc.cpp @@ -0,0 +1,159 @@ +/* + * CubePruning.cpp + * + * Created on: 27 Nov 2015 + * Author: hieu + */ + +#include "Misc.h" +#include "Stack.h" +#include "../Manager.h" +#include "../../MemPool.h" +#include "../../System.h" + +using namespace std; + +namespace Moses2 +{ + +namespace NSCubePruningBitmapStack +{ + +//////////////////////////////////////////////////////////////////////// +QueueItem *QueueItem::Create(QueueItem *currItem, + Manager &mgr, + CubeEdge &edge, + size_t hypoIndex, + size_t tpIndex, + std::deque &queueItemRecycler) +{ + QueueItem *ret; + if (currItem) { + // reuse incoming queue item to create new item + ret = currItem; + ret->Init(mgr, edge, hypoIndex, tpIndex); + } else if (!queueItemRecycler.empty()) { + // use item from recycle bin + ret = queueItemRecycler.back(); + ret->Init(mgr, edge, hypoIndex, tpIndex); + queueItemRecycler.pop_back(); + } else { + // create new item + ret = new (mgr.GetPool().Allocate()) QueueItem(mgr, edge, hypoIndex, tpIndex); + } + + return ret; +} + +QueueItem::QueueItem(Manager &mgr, CubeEdge &edge, size_t hypoIndex, size_t tpIndex) + :edge(&edge) + ,hypoIndex(hypoIndex) + ,tpIndex(tpIndex) +{ + CreateHypothesis(mgr); +} + +void QueueItem::Init(Manager &mgr, CubeEdge &edge, size_t hypoIndex, size_t 
tpIndex) +{ + this->edge = &edge; + this->hypoIndex = hypoIndex; + this->tpIndex = tpIndex; + + CreateHypothesis(mgr); +} + +void QueueItem::CreateHypothesis(Manager &mgr) +{ + const Hypothesis *prevHypo = edge->hypos[hypoIndex]; + const TargetPhrase &tp = edge->tps[tpIndex]; + + //cerr << "hypoIndex=" << hypoIndex << endl; + //cerr << "edge.hypos=" << edge.hypos.size() << endl; + //cerr << prevHypo << endl; + //cerr << *prevHypo << endl; + + hypo = Hypothesis::Create(mgr.GetSystemPool(), mgr); + hypo->Init(mgr, *prevHypo, edge->path, tp, edge->newBitmap, edge->estimatedScore); + hypo->EvaluateWhenApplied(); +} + +//////////////////////////////////////////////////////////////////////// +CubeEdge::CubeEdge( + Manager &mgr, + const Hypotheses &hypos, + const InputPath &path, + const TargetPhrases &tps, + const Bitmap &newBitmap) + :hypos(hypos) + ,path(path) + ,tps(tps) + ,newBitmap(newBitmap) +{ + estimatedScore = mgr.GetEstimatedScores().CalcEstimatedScore(newBitmap); +} + +std::ostream& operator<<(std::ostream &out, const CubeEdge &obj) +{ + out << obj.newBitmap; + return out; +} + +bool +CubeEdge::SetSeenPosition(const size_t x, const size_t y, SeenPositions &seenPositions) const +{ + //UTIL_THROW_IF2(x >= (1<<17), "Error"); + //UTIL_THROW_IF2(y >= (1<<17), "Error"); + + SeenPositionItem val(this, (x<<16) + y); + std::pair pairRet = seenPositions.insert(val); + return pairRet.second; +} + +void CubeEdge::CreateFirst(Manager &mgr, + Queue &queue, + SeenPositions &seenPositions, + std::deque &queueItemRecycler) +{ + assert(hypos.size()); + assert(tps.GetSize()); + + QueueItem *item = QueueItem::Create(NULL, mgr, *this, 0, 0, queueItemRecycler); + queue.push(item); + bool setSeen = SetSeenPosition(0, 0, seenPositions); + assert(setSeen); +} + +void CubeEdge::CreateNext(Manager &mgr, + QueueItem *item, + Queue &queue, + SeenPositions &seenPositions, + std::deque &queueItemRecycler) +{ + size_t hypoIndex = item->hypoIndex; + size_t tpIndex = item->tpIndex; + + if 
(hypoIndex + 1 < hypos.size() && SetSeenPosition(hypoIndex + 1, tpIndex, seenPositions)) { + // reuse incoming queue item to create new item + QueueItem *newItem = QueueItem::Create(item, mgr, *this, hypoIndex + 1, tpIndex, queueItemRecycler); + assert(newItem == item); + queue.push(newItem); + item = NULL; + } + + if (tpIndex + 1 < tps.GetSize() && SetSeenPosition(hypoIndex, tpIndex + 1, seenPositions)) { + QueueItem *newItem = QueueItem::Create(item, mgr, *this, hypoIndex, tpIndex + 1, queueItemRecycler); + queue.push(newItem); + item = NULL; + } + + if (item) { + // recycle unused queue item + queueItemRecycler.push_back(item); + } +} + +} + +} + + diff --git a/mosesdecoder/moses2/defer/CubePruningBitmapStack/Misc.h b/mosesdecoder/moses2/defer/CubePruningBitmapStack/Misc.h new file mode 100644 index 0000000000000000000000000000000000000000..355f8f4c2535789eb2b064cd86ce5e0cfee679da --- /dev/null +++ b/mosesdecoder/moses2/defer/CubePruningBitmapStack/Misc.h @@ -0,0 +1,111 @@ +/* + * CubePruning.h + * + * Created on: 27 Nov 2015 + * Author: hieu + */ +#pragma once +#include +#include +#include +#include +#include +#include "../../legacy/Range.h" +#include "../Hypothesis.h" +#include "../../TypeDef.h" +#include "../../Vector.h" +#include "Stack.h" + +namespace Moses2 +{ + +class Manager; +class InputPath; +class TargetPhrases; +class Bitmap; + +namespace NSCubePruningBitmapStack +{ +class CubeEdge; + +/////////////////////////////////////////// +class QueueItem +{ + ~QueueItem(); // NOT IMPLEMENTED. 
Use MemPool +public: + static QueueItem *Create(QueueItem *currItem, + Manager &mgr, + CubeEdge &edge, + size_t hypoIndex, + size_t tpIndex, + std::deque &queueItemRecycler); + QueueItem(Manager &mgr, CubeEdge &edge, size_t hypoIndex, size_t tpIndex); + + void Init(Manager &mgr, CubeEdge &edge, size_t hypoIndex, size_t tpIndex); + + CubeEdge *edge; + size_t hypoIndex, tpIndex; + Hypothesis *hypo; + +protected: + void CreateHypothesis(Manager &mgr); +}; + +/////////////////////////////////////////// +class QueueItemOrderer +{ +public: + bool operator()(QueueItem* itemA, QueueItem* itemB) const { + HypothesisFutureScoreOrderer orderer; + return !orderer(itemA->hypo, itemB->hypo); + } +}; + +/////////////////////////////////////////// +class CubeEdge +{ + friend std::ostream& operator<<(std::ostream &, const CubeEdge &); + +public: + typedef std::priority_queue, + QueueItemOrderer> Queue; + + typedef std::pair SeenPositionItem; + typedef boost::unordered_set, + std::equal_to > SeenPositions; + + const Hypotheses &hypos; + const InputPath &path; + const TargetPhrases &tps; + const Bitmap &newBitmap; + SCORE estimatedScore; + + CubeEdge(Manager &mgr, + const Hypotheses &hypos, + const InputPath &path, + const TargetPhrases &tps, + const Bitmap &newBitmap); + + bool SetSeenPosition(const size_t x, const size_t y, SeenPositions &seenPositions) const; + + void CreateFirst(Manager &mgr, + Queue &queue, + SeenPositions &seenPositions, + std::deque &queueItemRecycler); + void CreateNext(Manager &mgr, + QueueItem *item, + Queue &queue, + SeenPositions &seenPositions, + std::deque &queueItemRecycler); + +protected: + +}; + +} + +} + + diff --git a/mosesdecoder/moses2/defer/CubePruningBitmapStack/Search.cpp b/mosesdecoder/moses2/defer/CubePruningBitmapStack/Search.cpp new file mode 100644 index 0000000000000000000000000000000000000000..8c06f134028e066af3a17be22378966e159d4a5d --- /dev/null +++ b/mosesdecoder/moses2/defer/CubePruningBitmapStack/Search.cpp @@ -0,0 +1,206 @@ +/* + 
* Search.cpp + * + * Created on: 16 Nov 2015 + * Author: hieu + */ +#include +#include "Search.h" +#include "Stack.h" +#include "../Manager.h" +#include "../Hypothesis.h" +#include "../../InputPaths.h" +#include "../../InputPath.h" +#include "../../System.h" +#include "../../Sentence.h" +#include "../../TranslationTask.h" +#include "../../legacy/Util2.h" + +using namespace std; + +namespace Moses2 +{ + +namespace NSCubePruningBitmapStack +{ + +//////////////////////////////////////////////////////////////////////// +Search::Search(Manager &mgr) + :Moses2::Search(mgr) + ,m_stack(mgr) + + ,m_queue(QueueItemOrderer(), std::vector() ) + + ,m_seenPositions() +{ +} + +Search::~Search() +{ +} + +void Search::Decode() +{ + // init cue edges + m_cubeEdges.resize(mgr.GetInput().GetSize() + 1); + for (size_t i = 0; i < m_cubeEdges.size(); ++i) { + m_cubeEdges[i] = new (mgr.GetPool().Allocate()) CubeEdges(); + } + + const Bitmap &initBitmap = mgr.GetBitmaps().GetInitialBitmap(); + Hypothesis *initHypo = Hypothesis::Create(mgr.GetSystemPool(), mgr); + initHypo->Init(mgr, mgr.GetInputPaths().GetBlank(), mgr.GetInitPhrase(), initBitmap); + initHypo->EmptyHypothesisState(mgr.GetInput()); + + m_stack.Add(initHypo, mgr.GetHypoRecycle()); + PostDecode(0); + + for (size_t stackInd = 1; stackInd < mgr.GetInput().GetSize() + 1; ++stackInd) { + //cerr << "stackInd=" << stackInd << endl; + m_stack.Clear(); + Decode(stackInd); + PostDecode(stackInd); + + //m_stack.DebugCounts(); + //cerr << m_stacks << endl; + } + +} + +void Search::Decode(size_t stackInd) +{ + Recycler &hypoRecycler = mgr.GetHypoRecycle(); + + // reuse queue from previous stack. 
Clear it first + std::vector &container = Container(m_queue); + //cerr << "container=" << container.size() << endl; + BOOST_FOREACH(QueueItem *item, container) { + // recycle unused hypos from queue + Hypothesis *hypo = item->hypo; + hypoRecycler.Recycle(hypo); + + // recycle queue item + m_queueItemRecycler.push_back(item); + } + container.clear(); + + m_seenPositions.clear(); + + // add top hypo from every edge into queue + CubeEdges &edges = *m_cubeEdges[stackInd]; + + BOOST_FOREACH(CubeEdge *edge, edges) { + //cerr << *edge << " "; + edge->CreateFirst(mgr, m_queue, m_seenPositions, m_queueItemRecycler); + } + + /* + cerr << "edges: "; + boost::unordered_set uniqueBM; + BOOST_FOREACH(CubeEdge *edge, edges) { + uniqueBM.insert(&edge->newBitmap); + //cerr << *edge << " "; + } + cerr << edges.size() << " " << uniqueBM.size(); + cerr << endl; + */ + + size_t pops = 0; + while (!m_queue.empty() && pops < mgr.system.popLimit) { + // get best hypo from queue, add to stack + //cerr << "queue=" << queue.size() << endl; + QueueItem *item = m_queue.top(); + m_queue.pop(); + + CubeEdge *edge = item->edge; + + // add hypo to stack + Hypothesis *hypo = item->hypo; + //cerr << "hypo=" << *hypo << " " << hypo->GetBitmap() << endl; + m_stack.Add(hypo, hypoRecycler); + + edge->CreateNext(mgr, item, m_queue, m_seenPositions, m_queueItemRecycler); + + ++pops; + } + + /* + // create hypo from every edge. 
Increase diversity + while (!m_queue.empty()) { + QueueItem *item = m_queue.top(); + m_queue.pop(); + + if (item->hypoIndex == 0 && item->tpIndex == 0) { + CubeEdge &edge = item->edge; + + // add hypo to stack + Hypothesis *hypo = item->hypo; + //cerr << "hypo=" << *hypo << " " << hypo->GetBitmap() << endl; + m_stacks.Add(hypo, mgr.GetHypoRecycle()); + } + } + */ +} + +void Search::PostDecode(size_t stackInd) +{ + MemPool &pool = mgr.GetPool(); + + Stack::SortedHypos sortedHypos = m_stack.GetSortedAndPruneHypos(mgr); + + BOOST_FOREACH(const Stack::SortedHypos::value_type &val, sortedHypos) { + const Bitmap &hypoBitmap = *val.first.first; + size_t hypoEndPos = val.first.second; + //cerr << "key=" << hypoBitmap << " " << hypoEndPos << endl; + + // create edges to next hypos from existing hypos + const InputPaths &paths = mgr.GetInputPaths(); + + BOOST_FOREACH(const InputPath *path, paths) { + const Range &pathRange = path->range; + //cerr << "pathRange=" << pathRange << endl; + + if (!path->IsUsed()) { + continue; + } + if (!CanExtend(hypoBitmap, hypoEndPos, pathRange)) { + continue; + } + + const Bitmap &newBitmap = mgr.GetBitmaps().GetBitmap(hypoBitmap, pathRange); + size_t numWords = newBitmap.GetNumWordsCovered(); + + CubeEdges &edges = *m_cubeEdges[numWords]; + + // sort hypo for a particular bitmap and hypoEndPos + Hypotheses &sortedHypos = *val.second; + + size_t numPt = mgr.system.mappings.size(); + for (size_t i = 0; i < numPt; ++i) { + const TargetPhrases *tps = path->targetPhrases[i]; + if (tps && tps->GetSize()) { + CubeEdge *edge = new (pool.Allocate()) CubeEdge(mgr, sortedHypos, *path, *tps, newBitmap); + edges.push_back(edge); + } + } + } + } + +} + +const Hypothesis *Search::GetBestHypo() const +{ + std::vector sortedHypos = m_stack.GetBestHypos(1); + + const Hypothesis *best = NULL; + if (sortedHypos.size()) { + best = sortedHypos[0]; + } + return best; +} + +} + +} + + diff --git a/mosesdecoder/moses2/defer/CubePruningBitmapStack/Search.h 
b/mosesdecoder/moses2/defer/CubePruningBitmapStack/Search.h new file mode 100644 index 0000000000000000000000000000000000000000..1ff0477c6b3c1e9740c0d7065a4835ae61e65aa3 --- /dev/null +++ b/mosesdecoder/moses2/defer/CubePruningBitmapStack/Search.h @@ -0,0 +1,57 @@ +/* + * Search.h + * + * Created on: 16 Nov 2015 + * Author: hieu + */ + +#pragma once +#include +#include "../Search.h" +#include "Misc.h" +#include "Stack.h" +#include "../../legacy/Range.h" + +namespace Moses2 +{ + +class Bitmap; +class Hypothesis; +class InputPath; +class TargetPhrases; + +namespace NSCubePruningBitmapStack +{ + +class Search : public Moses2::Search +{ +public: + Search(Manager &mgr); + virtual ~Search(); + + virtual void Decode(); + const Hypothesis *GetBestHypo() const; + +protected: + Stack m_stack; + + CubeEdge::Queue m_queue; + CubeEdge::SeenPositions m_seenPositions; + + // CUBE PRUNING VARIABLES + // setup + typedef std::vector CubeEdges; + std::vector m_cubeEdges; + + std::deque m_queueItemRecycler; + + // CUBE PRUNING + // decoding + void Decode(size_t stackInd); + void PostDecode(size_t stackInd); +}; + +} + +} + diff --git a/mosesdecoder/moses2/defer/CubePruningBitmapStack/Stack.cpp b/mosesdecoder/moses2/defer/CubePruningBitmapStack/Stack.cpp new file mode 100644 index 0000000000000000000000000000000000000000..f6abd203820f0bfabd0090aa6c6b7aaab9f308a0 --- /dev/null +++ b/mosesdecoder/moses2/defer/CubePruningBitmapStack/Stack.cpp @@ -0,0 +1,299 @@ +/* + * Stack.cpp + * + * Created on: 24 Oct 2015 + * Author: hieu + */ +#include +#include +#include "Stack.h" +#include "../Hypothesis.h" +#include "../Manager.h" +#include "../../Scores.h" +#include "../../System.h" + +using namespace std; + +namespace Moses2 +{ + +namespace NSCubePruningBitmapStack +{ +MiniStack::MiniStack(const Manager &mgr) + :m_coll() + ,m_sortedHypos(NULL) +{} + +StackAdd MiniStack::Add(const Hypothesis *hypo) +{ + std::pair<_HCType::iterator, bool> addRet = m_coll.insert(hypo); + + // CHECK RECOMBINATION + 
if (addRet.second) { + // equiv hypo doesn't exists + return StackAdd(true, NULL); + } else { + const Hypothesis *hypoExisting = *addRet.first; + if (hypo->GetScores().GetTotalScore() > hypoExisting->GetScores().GetTotalScore()) { + // incoming hypo is better than the one we have + const Hypothesis *const &hypoExisting1 = *addRet.first; + const Hypothesis *&hypoExisting2 = const_cast(hypoExisting1); + hypoExisting2 = hypo; + + return StackAdd(true, const_cast(hypoExisting)); + } else { + // already storing the best hypo. discard incoming hypo + return StackAdd(false, const_cast(hypo)); + } + } + + assert(false); +} + +Hypotheses &MiniStack::GetSortedAndPruneHypos(const Manager &mgr) const +{ + if (m_sortedHypos == NULL) { + // create sortedHypos first + MemPool &pool = mgr.GetPool(); + m_sortedHypos = new (pool.Allocate< Vector >()) Vector(pool, m_coll.size()); + + size_t ind = 0; + BOOST_FOREACH(const Hypothesis *hypo, m_coll) { + (*m_sortedHypos)[ind] = hypo; + ++ind; + } + + SortAndPruneHypos(mgr); + } + + return *m_sortedHypos; +} + +void MiniStack::SortAndPruneHypos(const Manager &mgr) const +{ + size_t stackSize = mgr.system.stackSize; + Recycler &recycler = mgr.GetHypoRecycle(); + + /* + cerr << "UNSORTED hypos:" << endl; + for (size_t i = 0; i < hypos.size(); ++i) { + const Hypothesis *hypo = hypos[i]; + cerr << *hypo << endl; + } + cerr << endl; + */ + Hypotheses::iterator iterMiddle; + iterMiddle = (stackSize == 0 || m_sortedHypos->size() < stackSize) + ? 
m_sortedHypos->end() + : m_sortedHypos->begin() + stackSize; + + std::partial_sort(m_sortedHypos->begin(), iterMiddle, m_sortedHypos->end(), + HypothesisFutureScoreOrderer()); + + // prune + if (stackSize && m_sortedHypos->size() > stackSize) { + for (size_t i = stackSize; i < m_sortedHypos->size(); ++i) { + Hypothesis *hypo = const_cast((*m_sortedHypos)[i]); + recycler.Recycle(hypo); + } + m_sortedHypos->resize(stackSize); + } + + /* + cerr << "sorted hypos:" << endl; + for (size_t i = 0; i < hypos.size(); ++i) { + const Hypothesis *hypo = hypos[i]; + cerr << hypo << " " << *hypo << endl; + } + cerr << endl; + */ + +} + +void MiniStack::Clear() +{ + m_sortedHypos = NULL; + m_coll.clear(); +} + +/////////////////////////////////////////////////////////////// +Stack::Stack(const Manager &mgr) + :m_mgr(mgr) + ,m_coll() + ,m_miniStackRecycler() +{ +} + +Stack::~Stack() +{ + // TODO Auto-generated destructor stub +} + +void Stack::Add(const Hypothesis *hypo, Recycler &hypoRecycle) +{ + HypoCoverageInternal key = &hypo->GetBitmap(); + StackAdd added = GetMiniStack(key).Add(hypo); + + if (added.toBeDeleted) { + hypoRecycle.Recycle(added.toBeDeleted); + } +} + +std::vector Stack::GetBestHypos(size_t num) const +{ + std::vector ret; + BOOST_FOREACH(const Coll::value_type &val, m_coll) { + const MiniStack::_HCType &hypos = val.second->GetColl(); + ret.insert(ret.end(), hypos.begin(), hypos.end()); + } + + std::vector::iterator iterMiddle; + iterMiddle = (num == 0 || ret.size() < num) + ? 
ret.end() + : ret.begin()+num; + + std::partial_sort(ret.begin(), iterMiddle, ret.end(), + HypothesisFutureScoreOrderer()); + + return ret; +} + +size_t Stack::GetHypoSize() const +{ + size_t ret = 0; + BOOST_FOREACH(const Coll::value_type &val, m_coll) { + const MiniStack::_HCType &hypos = val.second->GetColl(); + ret += hypos.size(); + } + return ret; +} + +MiniStack &Stack::GetMiniStack(const HypoCoverageInternal &key) +{ + MiniStack *ret; + Coll::iterator iter = m_coll.find(key); + if (iter == m_coll.end()) { + if (m_miniStackRecycler.empty()) { + ret = new (m_mgr.GetPool().Allocate()) MiniStack(m_mgr); + } else { + ret = m_miniStackRecycler.back(); + ret->Clear(); + m_miniStackRecycler.pop_back(); + } + + m_coll[key] = ret; + } else { + ret = iter->second; + } + return *ret; +} + +void Stack::Clear() +{ + BOOST_FOREACH(const Coll::value_type &val, m_coll) { + MiniStack *miniStack = val.second; + m_miniStackRecycler.push_back(miniStack); + } + + m_coll.clear(); +} + +Stack::SortedHypos Stack::GetSortedAndPruneHypos(const Manager &mgr) const +{ + SortedHypos ret; + + MemPool &pool = mgr.GetPool(); + + // prune and sort + Hypotheses *allHypos = new (pool.Allocate()) Hypotheses(pool, GetHypoSize()); + size_t i = 0; + + BOOST_FOREACH(const Coll::value_type &val, m_coll) { + const MiniStack *miniStack = val.second; + const MiniStack::MiniStack::_HCType &hypos = miniStack->GetColl(); + + BOOST_FOREACH(const Hypothesis *hypo, hypos) { + (*allHypos)[i++] = hypo; + } + } + + SortAndPruneHypos(mgr, *allHypos); + + // divide hypos by [bitmap, last end pos] + BOOST_FOREACH(const Hypothesis *hypo, *allHypos) { + HypoCoverage key(&hypo->GetBitmap(), hypo->GetInputPath().range.GetEndPos()); + + Hypotheses *hypos; + SortedHypos::iterator iter; + iter = ret.find(key); + if (iter == ret.end()) { + hypos = new (pool.Allocate()) Hypotheses(pool); + ret[key] = hypos; + } else { + hypos = iter->second; + } + hypos->push_back(hypo); + } + + return ret; +} + +void 
Stack::SortAndPruneHypos(const Manager &mgr, Hypotheses &hypos) const +{ + size_t stackSize = mgr.system.stackSize; + Recycler &recycler = mgr.GetHypoRecycle(); + + /* + cerr << "UNSORTED hypos:" << endl; + for (size_t i = 0; i < hypos.size(); ++i) { + const Hypothesis *hypo = hypos[i]; + cerr << *hypo << endl; + } + cerr << endl; + */ + Hypotheses::iterator iterMiddle; + iterMiddle = (stackSize == 0 || hypos.size() < stackSize) + ? hypos.end() + : hypos.begin() + stackSize; + + std::partial_sort(hypos.begin(), iterMiddle, hypos.end(), + HypothesisFutureScoreOrderer()); + + // prune + if (stackSize && hypos.size() > stackSize) { + for (size_t i = stackSize; i < hypos.size(); ++i) { + Hypothesis *hypo = const_cast(hypos[i]); + recycler.Recycle(hypo); + } + hypos.resize(stackSize); + } + + /* + cerr << "sorted hypos:" << endl; + for (size_t i = 0; i < hypos.size(); ++i) { + const Hypothesis *hypo = hypos[i]; + cerr << hypo << " " << *hypo << endl; + } + cerr << endl; + */ + +} + + +void Stack::DebugCounts() +{ + /* + cerr << "counts="; + BOOST_FOREACH(const Coll::value_type &val, GetColl()) { + const NSCubePruning::MiniStack &miniStack = *val.second; + size_t count = miniStack.GetColl().size(); + cerr << count << " "; + } + cerr << endl; + */ +} + +} + +} + diff --git a/mosesdecoder/moses2/defer/CubePruningBitmapStack/Stack.h b/mosesdecoder/moses2/defer/CubePruningBitmapStack/Stack.h new file mode 100644 index 0000000000000000000000000000000000000000..f052fab4285e9a4e4d3b28b25610a80db612ebb7 --- /dev/null +++ b/mosesdecoder/moses2/defer/CubePruningBitmapStack/Stack.h @@ -0,0 +1,114 @@ +/* + * Stack.h + * + * Created on: 24 Oct 2015 + * Author: hieu + */ +#pragma once +#include +#include +#include +#include "../Hypothesis.h" +#include "../../TypeDef.h" +#include "../../Vector.h" +#include "../../MemPool.h" +#include "../../Recycler.h" +#include "../../legacy/Util2.h" + +namespace Moses2 +{ + +class Manager; + +namespace NSCubePruningBitmapStack +{ +typedef Vector 
Hypotheses; + +class MiniStack +{ +public: + typedef boost::unordered_set, + UnorderedComparer + > _HCType; + + MiniStack(const Manager &mgr); + + StackAdd Add(const Hypothesis *hypo); + + _HCType &GetColl() { + return m_coll; + } + + const _HCType &GetColl() const { + return m_coll; + } + + void Clear(); + + Hypotheses &GetSortedAndPruneHypos(const Manager &mgr) const; + +protected: + _HCType m_coll; + mutable Hypotheses *m_sortedHypos; + + void SortAndPruneHypos(const Manager &mgr) const; + +}; + +///////////////////////////////////////////// +class Stack +{ +protected: + + +public: + typedef std::pair HypoCoverage; + // bitmap and current endPos of hypos + typedef boost::unordered_map SortedHypos; + + typedef const Bitmap* HypoCoverageInternal; + typedef boost::unordered_map + ,std::equal_to + > Coll; + + + Stack(const Manager &mgr); + virtual ~Stack(); + + size_t GetHypoSize() const; + + Coll &GetColl() { + return m_coll; + } + const Coll &GetColl() const { + return m_coll; + } + + void Add(const Hypothesis *hypo, Recycler &hypoRecycle); + + MiniStack &GetMiniStack(const HypoCoverageInternal &key); + + std::vector GetBestHypos(size_t num) const; + void Clear(); + + SortedHypos GetSortedAndPruneHypos(const Manager &mgr) const; + void SortAndPruneHypos(const Manager &mgr, Hypotheses &hypos) const; + + void DebugCounts(); + +protected: + const Manager &m_mgr; + Coll m_coll; + + std::deque m_miniStackRecycler; + + +}; + +} + +} + + diff --git a/mosesdecoder/moses2/defer/CubePruningCardinalStack/Misc.cpp b/mosesdecoder/moses2/defer/CubePruningCardinalStack/Misc.cpp new file mode 100644 index 0000000000000000000000000000000000000000..197dc108a5eeb8306c25e5f9ba53f14cb4f2c4bc --- /dev/null +++ b/mosesdecoder/moses2/defer/CubePruningCardinalStack/Misc.cpp @@ -0,0 +1,159 @@ +/* + * CubePruning.cpp + * + * Created on: 27 Nov 2015 + * Author: hieu + */ + +#include "Misc.h" +#include "Stack.h" +#include "../Manager.h" +#include "../../MemPool.h" +#include "../../System.h" + 
+using namespace std; + +namespace Moses2 +{ + +namespace NSCubePruningCardinalStack +{ + +//////////////////////////////////////////////////////////////////////// +QueueItem *QueueItem::Create(QueueItem *currItem, + Manager &mgr, + CubeEdge &edge, + size_t hypoIndex, + size_t tpIndex, + std::deque &queueItemRecycler) +{ + QueueItem *ret; + if (currItem) { + // reuse incoming queue item to create new item + ret = currItem; + ret->Init(mgr, edge, hypoIndex, tpIndex); + } else if (!queueItemRecycler.empty()) { + // use item from recycle bin + ret = queueItemRecycler.back(); + ret->Init(mgr, edge, hypoIndex, tpIndex); + queueItemRecycler.pop_back(); + } else { + // create new item + ret = new (mgr.GetPool().Allocate()) QueueItem(mgr, edge, hypoIndex, tpIndex); + } + + return ret; +} + +QueueItem::QueueItem(Manager &mgr, CubeEdge &edge, size_t hypoIndex, size_t tpIndex) + :edge(&edge) + ,hypoIndex(hypoIndex) + ,tpIndex(tpIndex) +{ + CreateHypothesis(mgr); +} + +void QueueItem::Init(Manager &mgr, CubeEdge &edge, size_t hypoIndex, size_t tpIndex) +{ + this->edge = &edge; + this->hypoIndex = hypoIndex; + this->tpIndex = tpIndex; + + CreateHypothesis(mgr); +} + +void QueueItem::CreateHypothesis(Manager &mgr) +{ + const Hypothesis *prevHypo = edge->hypos[hypoIndex]; + const TargetPhrase &tp = edge->tps[tpIndex]; + + //cerr << "hypoIndex=" << hypoIndex << endl; + //cerr << "edge.hypos=" << edge.hypos.size() << endl; + //cerr << prevHypo << endl; + //cerr << *prevHypo << endl; + + hypo = Hypothesis::Create(mgr.GetSystemPool(), mgr); + hypo->Init(mgr, *prevHypo, edge->path, tp, edge->newBitmap, edge->estimatedScore); + hypo->EvaluateWhenApplied(); +} + +//////////////////////////////////////////////////////////////////////// +CubeEdge::CubeEdge( + Manager &mgr, + const Hypotheses &hypos, + const InputPath &path, + const TargetPhrases &tps, + const Bitmap &newBitmap) + :hypos(hypos) + ,path(path) + ,tps(tps) + ,newBitmap(newBitmap) +{ + estimatedScore = 
mgr.GetEstimatedScores().CalcEstimatedScore(newBitmap); +} + +std::ostream& operator<<(std::ostream &out, const CubeEdge &obj) +{ + out << obj.newBitmap; + return out; +} + +bool +CubeEdge::SetSeenPosition(const size_t x, const size_t y, SeenPositions &seenPositions) const +{ + //UTIL_THROW_IF2(x >= (1<<17), "Error"); + //UTIL_THROW_IF2(y >= (1<<17), "Error"); + + SeenPositionItem val(this, (x<<16) + y); + std::pair pairRet = seenPositions.insert(val); + return pairRet.second; +} + +void CubeEdge::CreateFirst(Manager &mgr, + Queue &queue, + SeenPositions &seenPositions, + std::deque &queueItemRecycler) +{ + assert(hypos.size()); + assert(tps.GetSize()); + + QueueItem *item = QueueItem::Create(NULL, mgr, *this, 0, 0, queueItemRecycler); + queue.push(item); + bool setSeen = SetSeenPosition(0, 0, seenPositions); + assert(setSeen); +} + +void CubeEdge::CreateNext(Manager &mgr, + QueueItem *item, + Queue &queue, + SeenPositions &seenPositions, + std::deque &queueItemRecycler) +{ + size_t hypoIndex = item->hypoIndex; + size_t tpIndex = item->tpIndex; + + if (hypoIndex + 1 < hypos.size() && SetSeenPosition(hypoIndex + 1, tpIndex, seenPositions)) { + // reuse incoming queue item to create new item + QueueItem *newItem = QueueItem::Create(item, mgr, *this, hypoIndex + 1, tpIndex, queueItemRecycler); + assert(newItem == item); + queue.push(newItem); + item = NULL; + } + + if (tpIndex + 1 < tps.GetSize() && SetSeenPosition(hypoIndex, tpIndex + 1, seenPositions)) { + QueueItem *newItem = QueueItem::Create(item, mgr, *this, hypoIndex, tpIndex + 1, queueItemRecycler); + queue.push(newItem); + item = NULL; + } + + if (item) { + // recycle unused queue item + queueItemRecycler.push_back(item); + } +} + +} + +} + + diff --git a/mosesdecoder/moses2/defer/CubePruningCardinalStack/Misc.h b/mosesdecoder/moses2/defer/CubePruningCardinalStack/Misc.h new file mode 100644 index 0000000000000000000000000000000000000000..9f5d28f1e7c54bf6ce0a4de734e958f94088a0c3 --- /dev/null +++ 
b/mosesdecoder/moses2/defer/CubePruningCardinalStack/Misc.h @@ -0,0 +1,112 @@ +/* + * CubePruning.h + * + * Created on: 27 Nov 2015 + * Author: hieu + */ +#pragma once +#include +#include +#include +#include +#include +#include "../../legacy/Range.h" +#include "../Hypothesis.h" +#include "../../TypeDef.h" +#include "../../Vector.h" +#include "Stack.h" + +namespace Moses2 +{ + +class Manager; +class InputPath; +class TargetPhrases; +class Bitmap; + +namespace NSCubePruningCardinalStack +{ +class CubeEdge; + +/////////////////////////////////////////// +class QueueItem +{ + ~QueueItem(); // NOT IMPLEMENTED. Use MemPool +public: + static QueueItem *Create(QueueItem *currItem, + Manager &mgr, + CubeEdge &edge, + size_t hypoIndex, + size_t tpIndex, + std::deque &queueItemRecycler); + QueueItem(Manager &mgr, CubeEdge &edge, size_t hypoIndex, size_t tpIndex); + + void Init(Manager &mgr, CubeEdge &edge, size_t hypoIndex, size_t tpIndex); + + CubeEdge *edge; + size_t hypoIndex, tpIndex; + Hypothesis *hypo; + +protected: + void CreateHypothesis(Manager &mgr); +}; + +/////////////////////////////////////////// +class QueueItemOrderer +{ +public: + bool operator()(QueueItem* itemA, QueueItem* itemB) const { + HypothesisFutureScoreOrderer orderer; + return !orderer(itemA->hypo, itemB->hypo); + } +}; + +/////////////////////////////////////////// +class CubeEdge +{ + friend std::ostream& operator<<(std::ostream &, const CubeEdge &); + +public: + typedef std::priority_queue, + QueueItemOrderer> Queue; + + typedef std::pair SeenPositionItem; + typedef boost::unordered_set, + std::equal_to + > SeenPositions; + + const Hypotheses &hypos; + const InputPath &path; + const TargetPhrases &tps; + const Bitmap &newBitmap; + SCORE estimatedScore; + + CubeEdge(Manager &mgr, + const Hypotheses &hypos, + const InputPath &path, + const TargetPhrases &tps, + const Bitmap &newBitmap); + + bool SetSeenPosition(const size_t x, const size_t y, SeenPositions &seenPositions) const; + + void 
CreateFirst(Manager &mgr, + Queue &queue, + SeenPositions &seenPositions, + std::deque &queueItemRecycler); + void CreateNext(Manager &mgr, + QueueItem *item, + Queue &queue, + SeenPositions &seenPositions, + std::deque &queueItemRecycler); + +protected: + +}; + +} + +} + + diff --git a/mosesdecoder/moses2/defer/CubePruningCardinalStack/Search.cpp b/mosesdecoder/moses2/defer/CubePruningCardinalStack/Search.cpp new file mode 100644 index 0000000000000000000000000000000000000000..23cae74ebc7e07a003c2e9b8ddf76c954f0572b6 --- /dev/null +++ b/mosesdecoder/moses2/defer/CubePruningCardinalStack/Search.cpp @@ -0,0 +1,206 @@ +/* + * Search.cpp + * + * Created on: 16 Nov 2015 + * Author: hieu + */ +#include +#include "Search.h" +#include "Stack.h" +#include "../Manager.h" +#include "../Hypothesis.h" +#include "../../InputPaths.h" +#include "../../InputPath.h" +#include "../../System.h" +#include "../../Sentence.h" +#include "../../TranslationTask.h" +#include "../../legacy/Util2.h" + +using namespace std; + +namespace Moses2 +{ + +namespace NSCubePruningCardinalStack +{ + +//////////////////////////////////////////////////////////////////////// +Search::Search(Manager &mgr) + :Moses2::Search(mgr) + ,m_stack(mgr) + + ,m_queue(QueueItemOrderer(), std::vector() ) + + ,m_seenPositions() +{ +} + +Search::~Search() +{ +} + +void Search::Decode() +{ + // init cue edges + m_cubeEdges.resize(mgr.GetInput().GetSize() + 1); + for (size_t i = 0; i < m_cubeEdges.size(); ++i) { + m_cubeEdges[i] = new (mgr.GetPool().Allocate()) CubeEdges(); + } + + const Bitmap &initBitmap = mgr.GetBitmaps().GetInitialBitmap(); + Hypothesis *initHypo = Hypothesis::Create(mgr.GetSystemPool(), mgr); + initHypo->Init(mgr, mgr.GetInputPaths().GetBlank(), mgr.GetInitPhrase(), initBitmap); + initHypo->EmptyHypothesisState(mgr.GetInput()); + + m_stack.Add(initHypo, mgr.GetHypoRecycle()); + PostDecode(0); + + for (size_t stackInd = 1; stackInd < mgr.GetInput().GetSize() + 1; ++stackInd) { + //cerr << "stackInd=" 
<< stackInd << endl; + m_stack.Clear(); + Decode(stackInd); + PostDecode(stackInd); + + //m_stack.DebugCounts(); + //cerr << m_stacks << endl; + } + +} + +void Search::Decode(size_t stackInd) +{ + Recycler &hypoRecycler = mgr.GetHypoRecycle(); + + // reuse queue from previous stack. Clear it first + std::vector &container = Container(m_queue); + //cerr << "container=" << container.size() << endl; + BOOST_FOREACH(QueueItem *item, container) { + // recycle unused hypos from queue + Hypothesis *hypo = item->hypo; + hypoRecycler.Recycle(hypo); + + // recycle queue item + m_queueItemRecycler.push_back(item); + } + container.clear(); + + m_seenPositions.clear(); + + // add top hypo from every edge into queue + CubeEdges &edges = *m_cubeEdges[stackInd]; + + BOOST_FOREACH(CubeEdge *edge, edges) { + //cerr << *edge << " "; + edge->CreateFirst(mgr, m_queue, m_seenPositions, m_queueItemRecycler); + } + + /* + cerr << "edges: "; + boost::unordered_set uniqueBM; + BOOST_FOREACH(CubeEdge *edge, edges) { + uniqueBM.insert(&edge->newBitmap); + //cerr << *edge << " "; + } + cerr << edges.size() << " " << uniqueBM.size(); + cerr << endl; + */ + + size_t pops = 0; + while (!m_queue.empty() && pops < mgr.system.popLimit) { + // get best hypo from queue, add to stack + //cerr << "queue=" << queue.size() << endl; + QueueItem *item = m_queue.top(); + m_queue.pop(); + + CubeEdge *edge = item->edge; + + // add hypo to stack + Hypothesis *hypo = item->hypo; + //cerr << "hypo=" << *hypo << " " << hypo->GetBitmap() << endl; + m_stack.Add(hypo, hypoRecycler); + + edge->CreateNext(mgr, item, m_queue, m_seenPositions, m_queueItemRecycler); + + ++pops; + } + + /* + // create hypo from every edge. 
Increase diversity + while (!m_queue.empty()) { + QueueItem *item = m_queue.top(); + m_queue.pop(); + + if (item->hypoIndex == 0 && item->tpIndex == 0) { + CubeEdge &edge = item->edge; + + // add hypo to stack + Hypothesis *hypo = item->hypo; + //cerr << "hypo=" << *hypo << " " << hypo->GetBitmap() << endl; + m_stacks.Add(hypo, mgr.GetHypoRecycle()); + } + } + */ +} + +void Search::PostDecode(size_t stackInd) +{ + MemPool &pool = mgr.GetPool(); + + Stack::SortedHypos sortedHypos = m_stack.GetSortedAndPruneHypos(mgr); + + BOOST_FOREACH(const Stack::SortedHypos::value_type &val, sortedHypos) { + const Bitmap &hypoBitmap = *val.first.first; + size_t hypoEndPos = val.first.second; + //cerr << "key=" << hypoBitmap << " " << hypoEndPos << endl; + + // create edges to next hypos from existing hypos + const InputPaths &paths = mgr.GetInputPaths(); + + BOOST_FOREACH(const InputPath *path, paths) { + const Range &pathRange = path->range; + //cerr << "pathRange=" << pathRange << endl; + + if (!path->IsUsed()) { + continue; + } + if (!CanExtend(hypoBitmap, hypoEndPos, pathRange)) { + continue; + } + + const Bitmap &newBitmap = mgr.GetBitmaps().GetBitmap(hypoBitmap, pathRange); + size_t numWords = newBitmap.GetNumWordsCovered(); + + CubeEdges &edges = *m_cubeEdges[numWords]; + + // sort hypo for a particular bitmap and hypoEndPos + Hypotheses &sortedHypos = *val.second; + + size_t numPt = mgr.system.mappings.size(); + for (size_t i = 0; i < numPt; ++i) { + const TargetPhrases *tps = path->targetPhrases[i]; + if (tps && tps->GetSize()) { + CubeEdge *edge = new (pool.Allocate()) CubeEdge(mgr, sortedHypos, *path, *tps, newBitmap); + edges.push_back(edge); + } + } + } + } + +} + +const Hypothesis *Search::GetBestHypo() const +{ + std::vector sortedHypos = m_stack.GetBestHypos(1); + + const Hypothesis *best = NULL; + if (sortedHypos.size()) { + best = sortedHypos[0]; + } + return best; +} + +} + +} + + diff --git a/mosesdecoder/moses2/defer/CubePruningCardinalStack/Search.h 
b/mosesdecoder/moses2/defer/CubePruningCardinalStack/Search.h new file mode 100644 index 0000000000000000000000000000000000000000..f641c87d7a3352377cbff863d3e0da2e64e887b7 --- /dev/null +++ b/mosesdecoder/moses2/defer/CubePruningCardinalStack/Search.h @@ -0,0 +1,57 @@ +/* + * Search.h + * + * Created on: 16 Nov 2015 + * Author: hieu + */ + +#pragma once +#include +#include "../Search.h" +#include "Misc.h" +#include "Stack.h" +#include "../../legacy/Range.h" + +namespace Moses2 +{ + +class Bitmap; +class Hypothesis; +class InputPath; +class TargetPhrases; + +namespace NSCubePruningCardinalStack +{ + +class Search : public Moses2::Search +{ +public: + Search(Manager &mgr); + virtual ~Search(); + + virtual void Decode(); + const Hypothesis *GetBestHypo() const; + +protected: + Stack m_stack; + + CubeEdge::Queue m_queue; + CubeEdge::SeenPositions m_seenPositions; + + // CUBE PRUNING VARIABLES + // setup + typedef std::vector CubeEdges; + std::vector m_cubeEdges; + + std::deque m_queueItemRecycler; + + // CUBE PRUNING + // decoding + void Decode(size_t stackInd); + void PostDecode(size_t stackInd); +}; + +} + +} + diff --git a/mosesdecoder/moses2/defer/CubePruningCardinalStack/Stack.cpp b/mosesdecoder/moses2/defer/CubePruningCardinalStack/Stack.cpp new file mode 100644 index 0000000000000000000000000000000000000000..60a3fe1e8af39cd3bd2a13cee0b2dd1c23504041 --- /dev/null +++ b/mosesdecoder/moses2/defer/CubePruningCardinalStack/Stack.cpp @@ -0,0 +1,198 @@ +/* + * Stack.cpp + * + * Created on: 24 Oct 2015 + * Author: hieu + */ +#include +#include +#include "Stack.h" +#include "../Hypothesis.h" +#include "../Manager.h" +#include "../../Scores.h" +#include "../../System.h" + +using namespace std; + +namespace Moses2 +{ + +namespace NSCubePruningCardinalStack +{ + +/////////////////////////////////////////////////////////////// +Stack::Stack(const Manager &mgr) + :m_mgr(mgr) + ,m_coll() +{ +} + +Stack::~Stack() +{ + // TODO Auto-generated destructor stub +} + +void 
Stack::Add(const Hypothesis *hypo, Recycler &hypoRecycle) +{ + std::pair<_HCType::iterator, bool> addRet = m_coll.insert(hypo); + + // CHECK RECOMBINATION + if (addRet.second) { + // equiv hypo doesn't exists + } else { + const Hypothesis *hypoExisting = *addRet.first; + if (hypo->GetScores().GetTotalScore() > hypoExisting->GetScores().GetTotalScore()) { + // incoming hypo is better than the one we have + const Hypothesis *const &hypoExisting1 = *addRet.first; + const Hypothesis *&hypoExisting2 = const_cast(hypoExisting1); + hypoExisting2 = hypo; + + Hypothesis *hypoToBeDeleted = const_cast(hypoExisting); + hypoRecycle.Recycle(hypoToBeDeleted); + } else { + // already storing the best hypo. discard incoming hypo + Hypothesis *hypoToBeDeleted = const_cast(hypo); + hypoRecycle.Recycle(hypoToBeDeleted); + } + } +} + +std::vector Stack::GetBestHypos(size_t num) const +{ + std::vector ret; + ret.insert(ret.end(), m_coll.begin(), m_coll.end()); + + std::vector::iterator iterMiddle; + iterMiddle = (num == 0 || ret.size() < num) + ? 
ret.end() + : ret.begin()+num; + + std::partial_sort(ret.begin(), iterMiddle, ret.end(), + HypothesisFutureScoreOrderer()); + + return ret; +} + +size_t Stack::GetHypoSize() const +{ + return m_coll.size(); +} + +void Stack::Clear() +{ + + m_coll.clear(); +} + +Stack::SortedHypos Stack::GetSortedAndPruneHypos(const Manager &mgr) const +{ + SortedHypos ret; + + MemPool &pool = mgr.GetPool(); + + // prune and sort + Hypotheses *allHypos = new (pool.Allocate()) Hypotheses(pool, GetHypoSize()); + size_t i = 0; + BOOST_FOREACH(const Hypothesis *hypo, m_coll) { + (*allHypos)[i++] = hypo; + } + SortAndPruneHypos(mgr, *allHypos); + + // divide hypos by [bitmap, last end pos] + BOOST_FOREACH(const Hypothesis *hypo, *allHypos) { + HypoCoverage key(&hypo->GetBitmap(), hypo->GetInputPath().range.GetEndPos()); + + Hypotheses *hypos; + SortedHypos::iterator iter; + iter = ret.find(key); + if (iter == ret.end()) { + hypos = new (pool.Allocate()) Hypotheses(pool); + ret[key] = hypos; + } else { + hypos = iter->second; + } + hypos->push_back(hypo); + } + + return ret; +} + + +//Stack::SortedHypos Stack::GetSortedAndPruneHypos(const Manager &mgr) const +//{ +// SortedHypos ret; +// +// MemPool &pool = mgr.GetPool(); +// +// // divide hypos by [bitmap, last end pos] +// BOOST_FOREACH(const Hypothesis *hypo, m_coll) { +// HypoCoverage key(&hypo->GetBitmap(), hypo->GetInputPath().range.GetEndPos()); +// +// Hypotheses *hypos; +// SortedHypos::iterator iter; +// iter = ret.find(key); +// if (iter == ret.end()) { +// hypos = new (pool.Allocate()) Hypotheses(pool); +// ret[key] = hypos; +// } +// else { +// hypos = iter->second; +// } +// hypos->push_back(hypo); +// } +// +// // put into real return variable and sort +// BOOST_FOREACH(SortedHypos::value_type &val, ret) { +// Hypotheses &hypos = *val.second; +// SortAndPruneHypos(mgr, hypos); +// } +// +// return ret; +//} + +void Stack::SortAndPruneHypos(const Manager &mgr, Hypotheses &hypos) const +{ + size_t stackSize = 
mgr.system.stackSize; + Recycler &recycler = mgr.GetHypoRecycle(); + + /* + cerr << "UNSORTED hypos:" << endl; + for (size_t i = 0; i < hypos.size(); ++i) { + const Hypothesis *hypo = hypos[i]; + cerr << *hypo << endl; + } + cerr << endl; + */ + Hypotheses::iterator iterMiddle; + iterMiddle = (stackSize == 0 || hypos.size() < stackSize) + ? hypos.end() + : hypos.begin() + stackSize; + + std::partial_sort(hypos.begin(), iterMiddle, hypos.end(), + HypothesisFutureScoreOrderer()); + + // prune + if (stackSize && hypos.size() > stackSize) { + for (size_t i = stackSize; i < hypos.size(); ++i) { + Hypothesis *hypo = const_cast(hypos[i]); + recycler.Recycle(hypo); + } + hypos.resize(stackSize); + } + + /* + cerr << "sorted hypos:" << endl; + for (size_t i = 0; i < hypos.size(); ++i) { + const Hypothesis *hypo = hypos[i]; + cerr << hypo << " " << *hypo << endl; + } + cerr << endl; + */ + +} + + +} + +} + diff --git a/mosesdecoder/moses2/defer/CubePruningCardinalStack/Stack.h b/mosesdecoder/moses2/defer/CubePruningCardinalStack/Stack.h new file mode 100644 index 0000000000000000000000000000000000000000..94e987b7b91b464af92f97767ac456adb66a08ca --- /dev/null +++ b/mosesdecoder/moses2/defer/CubePruningCardinalStack/Stack.h @@ -0,0 +1,71 @@ +/* + * Stack.h + * + * Created on: 24 Oct 2015 + * Author: hieu + */ +#pragma once +#include +#include +#include +#include "../Hypothesis.h" +#include "../../TypeDef.h" +#include "../../Vector.h" +#include "../../MemPool.h" +#include "../../Recycler.h" +#include "../../legacy/Util2.h" + +namespace Moses2 +{ + +class Manager; + +namespace NSCubePruningCardinalStack +{ +typedef Vector Hypotheses; + + +///////////////////////////////////////////// +class Stack +{ +protected: + typedef boost::unordered_set, + UnorderedComparer + > _HCType; + +public: + typedef std::pair HypoCoverage; + typedef boost::unordered_map SortedHypos; + + Stack(const Manager &mgr); + virtual ~Stack(); + + size_t GetHypoSize() const; + + _HCType &GetColl() { + return 
m_coll; + } + const _HCType &GetColl() const { + return m_coll; + } + + void Add(const Hypothesis *hypo, Recycler &hypoRecycle); + + std::vector GetBestHypos(size_t num) const; + void Clear(); + + SortedHypos GetSortedAndPruneHypos(const Manager &mgr) const; + void SortAndPruneHypos(const Manager &mgr, Hypotheses &hypos) const; + +protected: + const Manager &m_mgr; + _HCType m_coll; + +}; + +} + +} + + diff --git a/mosesdecoder/moses2/defer/CubePruningPerBitmap/Misc.cpp b/mosesdecoder/moses2/defer/CubePruningPerBitmap/Misc.cpp new file mode 100644 index 0000000000000000000000000000000000000000..8e94dac5d4801dd7a6ee3df0e23e3c9cafb6b9a9 --- /dev/null +++ b/mosesdecoder/moses2/defer/CubePruningPerBitmap/Misc.cpp @@ -0,0 +1,159 @@ +/* + * CubePruning.cpp + * + * Created on: 27 Nov 2015 + * Author: hieu + */ + +#include "Misc.h" +#include "../Manager.h" +#include "../../MemPool.h" +#include "../../System.h" + +using namespace std; + +namespace Moses2 +{ + +namespace NSCubePruningPerBitmap +{ + +//////////////////////////////////////////////////////////////////////// +QueueItem *QueueItem::Create(QueueItem *currItem, + Manager &mgr, + CubeEdge &edge, + size_t hypoIndex, + size_t tpIndex, + std::deque &queueItemRecycler) +{ + QueueItem *ret; + if (currItem) { + // reuse incoming queue item to create new item + ret = currItem; + ret->Init(mgr, edge, hypoIndex, tpIndex); + } else if (!queueItemRecycler.empty()) { + // use item from recycle bin + ret = queueItemRecycler.back(); + ret->Init(mgr, edge, hypoIndex, tpIndex); + queueItemRecycler.pop_back(); + } else { + // create new item + ret = new (mgr.GetPool().Allocate()) QueueItem(mgr, edge, hypoIndex, tpIndex); + } + + return ret; +} + +QueueItem::QueueItem(Manager &mgr, CubeEdge &edge, size_t hypoIndex, size_t tpIndex) + :edge(&edge) + ,hypoIndex(hypoIndex) + ,tpIndex(tpIndex) +{ + CreateHypothesis(mgr); +} + +void QueueItem::Init(Manager &mgr, CubeEdge &edge, size_t hypoIndex, size_t tpIndex) +{ + this->edge = &edge; + 
this->hypoIndex = hypoIndex; + this->tpIndex = tpIndex; + + CreateHypothesis(mgr); +} + +void QueueItem::CreateHypothesis(Manager &mgr) +{ + const Hypothesis *prevHypo = edge->miniStack.GetSortedAndPruneHypos(mgr)[hypoIndex]; + const TargetPhrase &tp = edge->tps[tpIndex]; + + //cerr << "hypoIndex=" << hypoIndex << endl; + //cerr << "edge.hypos=" << edge.hypos.size() << endl; + //cerr << prevHypo << endl; + //cerr << *prevHypo << endl; + + hypo = Hypothesis::Create(mgr.GetSystemPool(), mgr); + hypo->Init(mgr, *prevHypo, edge->path, tp, edge->newBitmap, edge->estimatedScore); + hypo->EvaluateWhenApplied(); +} + +//////////////////////////////////////////////////////////////////////// +CubeEdge::CubeEdge( + Manager &mgr, + const NSCubePruningMiniStack::MiniStack &miniStack, + const InputPath &path, + const TargetPhrases &tps, + const Bitmap &newBitmap) + :miniStack(miniStack) + ,path(path) + ,tps(tps) + ,newBitmap(newBitmap) +{ + estimatedScore = mgr.GetEstimatedScores().CalcEstimatedScore(newBitmap); +} + +std::ostream& operator<<(std::ostream &out, const CubeEdge &obj) +{ + out << obj.newBitmap; + return out; +} + +bool +CubeEdge::SetSeenPosition(const size_t x, const size_t y, SeenPositions &seenPositions) const +{ + //UTIL_THROW_IF2(x >= (1<<17), "Error"); + //UTIL_THROW_IF2(y >= (1<<17), "Error"); + + SeenPositionItem val(this, (x<<16) + y); + std::pair pairRet = seenPositions.insert(val); + return pairRet.second; +} + +void CubeEdge::CreateFirst(Manager &mgr, + Queue &queue, + SeenPositions &seenPositions, + std::deque &queueItemRecycler) +{ + if (miniStack.GetSortedAndPruneHypos(mgr).size()) { + assert(tps.GetSize()); + + QueueItem *item = QueueItem::Create(NULL, mgr, *this, 0, 0, queueItemRecycler); + queue.push(item); + bool setSeen = SetSeenPosition(0, 0, seenPositions); + assert(setSeen); + } +} + +void CubeEdge::CreateNext(Manager &mgr, + QueueItem *item, + Queue &queue, + SeenPositions &seenPositions, + std::deque &queueItemRecycler) +{ + size_t hypoIndex 
= item->hypoIndex; + size_t tpIndex = item->tpIndex; + + if (hypoIndex + 1 < miniStack.GetSortedAndPruneHypos(mgr).size() && SetSeenPosition(hypoIndex + 1, tpIndex, seenPositions)) { + // reuse incoming queue item to create new item + QueueItem *newItem = QueueItem::Create(item, mgr, *this, hypoIndex + 1, tpIndex, queueItemRecycler); + assert(newItem == item); + queue.push(newItem); + item = NULL; + } + + if (tpIndex + 1 < tps.GetSize() && SetSeenPosition(hypoIndex, tpIndex + 1, seenPositions)) { + QueueItem *newItem = QueueItem::Create(item, mgr, *this, hypoIndex, tpIndex + 1, queueItemRecycler); + queue.push(newItem); + item = NULL; + } + + if (item) { + // recycle unused queue item + queueItemRecycler.push_back(item); + } +} + +} + +} + + diff --git a/mosesdecoder/moses2/defer/CubePruningPerBitmap/Misc.h b/mosesdecoder/moses2/defer/CubePruningPerBitmap/Misc.h new file mode 100644 index 0000000000000000000000000000000000000000..3fa22f9a6b6aeab522718c50c8047d89c6ec15dc --- /dev/null +++ b/mosesdecoder/moses2/defer/CubePruningPerBitmap/Misc.h @@ -0,0 +1,113 @@ +/* + * CubePruning.h + * + * Created on: 27 Nov 2015 + * Author: hieu + */ +#pragma once +#include +#include +#include +#include +#include +#include "../../legacy/Range.h" +#include "../Hypothesis.h" +#include "../../TypeDef.h" +#include "../../Vector.h" +#include "../CubePruningMiniStack/Stack.h" + +namespace Moses2 +{ + +class Manager; +class InputPath; +class TargetPhrases; +class Bitmap; + +namespace NSCubePruningPerBitmap +{ +class CubeEdge; + +/////////////////////////////////////////// +class QueueItem +{ + ~QueueItem(); // NOT IMPLEMENTED. 
Use MemPool +public: + static QueueItem *Create(QueueItem *currItem, + Manager &mgr, + CubeEdge &edge, + size_t hypoIndex, + size_t tpIndex, + std::deque &queueItemRecycler); + QueueItem(Manager &mgr, CubeEdge &edge, size_t hypoIndex, size_t tpIndex); + + void Init(Manager &mgr, CubeEdge &edge, size_t hypoIndex, size_t tpIndex); + + CubeEdge *edge; + size_t hypoIndex, tpIndex; + Hypothesis *hypo; + +protected: + void CreateHypothesis(Manager &mgr); +}; + +/////////////////////////////////////////// +class QueueItemOrderer +{ +public: + bool operator()(QueueItem* itemA, QueueItem* itemB) const { + HypothesisFutureScoreOrderer orderer; + return !orderer(itemA->hypo, itemB->hypo); + } +}; + +/////////////////////////////////////////// +class CubeEdge +{ + friend std::ostream& operator<<(std::ostream &, const CubeEdge &); + +public: + typedef std::priority_queue, + QueueItemOrderer> Queue; + + typedef std::pair SeenPositionItem; + typedef boost::unordered_set, + std::equal_to + > SeenPositions; + + const NSCubePruningMiniStack::MiniStack &miniStack; + const InputPath &path; + const TargetPhrases &tps; + const Bitmap &newBitmap; + SCORE estimatedScore; + + CubeEdge(Manager &mgr, + const NSCubePruningMiniStack::MiniStack &miniStack, + const InputPath &path, + const TargetPhrases &tps, + const Bitmap &newBitmap); + + bool SetSeenPosition(const size_t x, const size_t y, SeenPositions &seenPositions) const; + + void CreateFirst(Manager &mgr, + Queue &queue, + SeenPositions &seenPositions, + std::deque &queueItemRecycler); + void CreateNext(Manager &mgr, + QueueItem *item, + Queue &queue, + SeenPositions &seenPositions, + std::deque &queueItemRecycler); + + +protected: + +}; + +} + +} + + diff --git a/mosesdecoder/moses2/defer/CubePruningPerBitmap/Search.cpp b/mosesdecoder/moses2/defer/CubePruningPerBitmap/Search.cpp new file mode 100644 index 0000000000000000000000000000000000000000..d07b28a7298e99a2d46656e82cb7eb793be1defd --- /dev/null +++ 
b/mosesdecoder/moses2/defer/CubePruningPerBitmap/Search.cpp @@ -0,0 +1,271 @@ +/* + * Search.cpp + * + * Created on: 16 Nov 2015 + * Author: hieu + */ +#include +#include "Search.h" +#include "../Manager.h" +#include "../Hypothesis.h" +#include "../../InputPaths.h" +#include "../../InputPath.h" +#include "../../System.h" +#include "../../Sentence.h" +#include "../../TranslationTask.h" +#include "../../legacy/Util2.h" + +using namespace std; + +namespace Moses2 +{ + +namespace NSCubePruningPerBitmap +{ + +//////////////////////////////////////////////////////////////////////// +Search::Search(Manager &mgr) + :Moses2::Search(mgr) + ,m_stacks(mgr) + + ,m_queue(QueueItemOrderer(), + std::vector() ) + + ,m_seenPositions() +{ +} + +Search::~Search() +{ +} + +void Search::Decode() +{ + // init stacks + m_stacks.Init(mgr.GetInput().GetSize() + 1); + + const Bitmap &initBitmap = mgr.GetBitmaps().GetInitialBitmap(); + Hypothesis *initHypo = Hypothesis::Create(mgr.GetSystemPool(), mgr); + initHypo->Init(mgr, mgr.GetInputPaths().GetBlank(), mgr.GetInitPhrase(), initBitmap); + initHypo->EmptyHypothesisState(mgr.GetInput()); + + m_stacks.Add(initHypo, mgr.GetHypoRecycle()); + + for (size_t stackInd = 0; stackInd < m_stacks.GetSize() - 1; ++stackInd) { + CreateSearchGraph(stackInd); + } + + for (size_t stackInd = 1; stackInd < m_stacks.GetSize(); ++stackInd) { + //cerr << "stackInd=" << stackInd << endl; + Decode(stackInd); + + //cerr << m_stacks << endl; + } + + //DebugCounts(); +} + +void Search::Decode(size_t stackInd) +{ + NSCubePruningMiniStack::Stack &stack = m_stacks[stackInd]; + + // FOR EACH BITMAP IN EACH STACK + boost::unordered_map > uniqueBM; + + BOOST_FOREACH(NSCubePruningMiniStack::Stack::Coll::value_type &val, stack.GetColl()) { + NSCubePruningMiniStack::MiniStack &miniStack = *val.second; + + const Bitmap *bitmap = val.first.first; + uniqueBM[bitmap].push_back(&miniStack); + } + + // decode each bitmap + boost::unordered_map >::iterator iter; + for (iter = 
uniqueBM.begin(); iter != uniqueBM.end(); ++iter) { + const vector &miniStacks = iter->second; + Decode(miniStacks); + } + + /* + // FOR EACH STACK + vector miniStacks; + BOOST_FOREACH(NSCubePruningMiniStack::Stack::Coll::value_type &val, stack.GetColl()) { + NSCubePruningMiniStack::MiniStack &miniStack = *val.second; + + miniStacks.push_back(&miniStack); + } + Decode(miniStacks); + */ +} + +void Search::Decode(const vector &miniStacks) +{ + Recycler &hypoRecycler = mgr.GetHypoRecycle(); + + // reuse queue from previous stack. Clear it first + std::vector &container = Container(m_queue); + //cerr << "container=" << container.size() << endl; + BOOST_FOREACH(QueueItem *item, container) { + // recycle unused hypos from queue + Hypothesis *hypo = item->hypo; + hypoRecycler.Recycle(hypo); + + // recycle queue item + m_queueItemRecycler.push_back(item); + } + container.clear(); + + m_seenPositions.clear(); + + BOOST_FOREACH(NSCubePruningMiniStack::MiniStack *miniStack, miniStacks) { + // add top hypo from every edge into queue + CubeEdges &edges = *m_cubeEdges[miniStack]; + + BOOST_FOREACH(CubeEdge *edge, edges) { + //cerr << "edge=" << *edge << endl; + edge->CreateFirst(mgr, m_queue, m_seenPositions, m_queueItemRecycler); + } + } + + size_t pops = 0; + while (!m_queue.empty() && pops < mgr.system.popLimit) { + // get best hypo from queue, add to stack + //cerr << "queue=" << queue.size() << endl; + QueueItem *item = m_queue.top(); + m_queue.pop(); + + CubeEdge *edge = item->edge; + + // add hypo to stack + Hypothesis *hypo = item->hypo; + //cerr << "hypo=" << *hypo << " " << hypo->GetBitmap() << endl; + m_stacks.Add(hypo, hypoRecycler); + + edge->CreateNext(mgr, item, m_queue, m_seenPositions, m_queueItemRecycler); + + ++pops; + } + + /* + // create hypo from every edge. 
Increase diversity + while (!m_queue.empty()) { + QueueItem *item = m_queue.top(); + m_queue.pop(); + + if (item->hypoIndex == 0 && item->tpIndex == 0) { + CubeEdge &edge = item->edge; + + // add hypo to stack + Hypothesis *hypo = item->hypo; + //cerr << "hypo=" << *hypo << " " << hypo->GetBitmap() << endl; + m_stacks.Add(hypo, mgr.GetHypoRecycle()); + } + } + */ +} + + +void Search::CreateSearchGraph(size_t stackInd) +{ + NSCubePruningMiniStack::Stack &stack = m_stacks[stackInd]; + MemPool &pool = mgr.GetPool(); + + BOOST_FOREACH(const NSCubePruningMiniStack::Stack::Coll::value_type &val, stack.GetColl()) { + const Bitmap &hypoBitmap = *val.first.first; + size_t hypoEndPos = val.first.second; + //cerr << "key=" << hypoBitmap << " " << hypoEndPos << endl; + + // create edges to next hypos from existing hypos + const InputPaths &paths = mgr.GetInputPaths(); + + BOOST_FOREACH(const InputPath *path, paths) { + const Range &pathRange = path->range; + //cerr << "pathRange=" << pathRange << endl; + + if (!path->IsUsed()) { + continue; + } + if (!CanExtend(hypoBitmap, hypoEndPos, pathRange)) { + continue; + } + + const Bitmap &newBitmap = mgr.GetBitmaps().GetBitmap(hypoBitmap, pathRange); + + // sort hypo for a particular bitmap and hypoEndPos + const NSCubePruningMiniStack::MiniStack &miniStack = *val.second; + + + // add cube edge + size_t numPt = mgr.system.mappings.size(); + for (size_t i = 0; i < numPt; ++i) { + const TargetPhrases *tps = path->targetPhrases[i]; + if (tps && tps->GetSize()) { + // create next mini stack + NSCubePruningMiniStack::MiniStack &nextMiniStack = m_stacks.GetMiniStack(newBitmap, pathRange); + + CubeEdge *edge = new (pool.Allocate()) CubeEdge(mgr, miniStack, *path, *tps, newBitmap); + + CubeEdges *edges; + boost::unordered_map::iterator iter = m_cubeEdges.find(&nextMiniStack); + if (iter == m_cubeEdges.end()) { + edges = new (pool.Allocate()) CubeEdges(); + m_cubeEdges[&nextMiniStack] = edges; + } else { + edges = iter->second; + } + + 
edges->push_back(edge); + } + } + } + } + +} + + +const Hypothesis *Search::GetBestHypo() const +{ + const NSCubePruningMiniStack::Stack &lastStack = m_stacks.Back(); + std::vector sortedHypos = lastStack.GetBestHypos(1); + + const Hypothesis *best = NULL; + if (sortedHypos.size()) { + best = sortedHypos[0]; + } + return best; +} + +void Search::DebugCounts() +{ + std::map counts; + + for (size_t stackInd = 0; stackInd < m_stacks.GetSize(); ++stackInd) { + //cerr << "stackInd=" << stackInd << endl; + const NSCubePruningMiniStack::Stack &stack = m_stacks[stackInd]; + BOOST_FOREACH(const NSCubePruningMiniStack::Stack::Coll::value_type &val, stack.GetColl()) { + const NSCubePruningMiniStack::MiniStack &miniStack = *val.second; + size_t count = miniStack.GetColl().size(); + + if (counts.find(count) == counts.end()) { + counts[count] = 0; + } else { + ++counts[count]; + } + } + //cerr << m_stacks << endl; + } + + std::map::const_iterator iter; + for (iter = counts.begin(); iter != counts.end(); ++iter) { + cerr << iter->first << "=" << iter->second << " "; + } + cerr << endl; +} + + + +} + +} + + diff --git a/mosesdecoder/moses2/defer/CubePruningPerBitmap/Search.h b/mosesdecoder/moses2/defer/CubePruningPerBitmap/Search.h new file mode 100644 index 0000000000000000000000000000000000000000..cb2164074ba5cf3511929d57615c963e9c389572 --- /dev/null +++ b/mosesdecoder/moses2/defer/CubePruningPerBitmap/Search.h @@ -0,0 +1,66 @@ +/* + * Search.h + * + * Created on: 16 Nov 2015 + * Author: hieu + */ + +#pragma once +#include +#include +#include "../Search.h" +#include "Misc.h" +#include "Stacks.h" +#include "../../legacy/Range.h" + +namespace Moses2 +{ + +class Bitmap; +class Hypothesis; +class InputPath; +class TargetPhrases; + +namespace NSCubePruningMiniStack +{ +class MiniStack; +} + +namespace NSCubePruningPerBitmap +{ + +class Search : public Moses2::Search +{ +public: + Search(Manager &mgr); + virtual ~Search(); + + virtual void Decode(); + const Hypothesis *GetBestHypo() 
const; + +protected: + Stacks m_stacks; + + CubeEdge::Queue m_queue; + CubeEdge::SeenPositions m_seenPositions; + + // CUBE PRUNING VARIABLES + // setup + typedef std::vector CubeEdges; + boost::unordered_map m_cubeEdges; + + std::deque m_queueItemRecycler; + + // CUBE PRUNING + // decoding + void CreateSearchGraph(size_t stackInd); + void Decode(size_t stackInd); + void Decode(const std::vector &miniStacks); + + void DebugCounts(); +}; + +} + +} + diff --git a/mosesdecoder/moses2/defer/CubePruningPerBitmap/Stacks.cpp b/mosesdecoder/moses2/defer/CubePruningPerBitmap/Stacks.cpp new file mode 100644 index 0000000000000000000000000000000000000000..9930f575e858e5c53127258f70f2595b937c816e --- /dev/null +++ b/mosesdecoder/moses2/defer/CubePruningPerBitmap/Stacks.cpp @@ -0,0 +1,72 @@ +/* + * Stacks.cpp + * + * Created on: 6 Nov 2015 + * Author: hieu + */ + +#include "Stacks.h" +#include "../../System.h" +#include "../Manager.h" + +using namespace std; + +namespace Moses2 +{ + +namespace NSCubePruningPerBitmap +{ + +Stacks::Stacks(const Manager &mgr) + :m_mgr(mgr) +{ +} + +Stacks::~Stacks() +{ +} + +void Stacks::Init(size_t numStacks) +{ + m_stacks.resize(numStacks); + for (size_t i = 0; i < m_stacks.size(); ++i) { + m_stacks[i] = new (m_mgr.GetPool().Allocate()) NSCubePruningMiniStack::Stack(m_mgr); + } +} + + +std::ostream& operator<<(std::ostream &out, const Stacks &obj) +{ + for (size_t i = 0; i < obj.GetSize(); ++i) { + const NSCubePruningMiniStack::Stack &stack = *obj.m_stacks[i]; + out << stack.GetHypoSize() << " "; + } + + return out; +} + +void Stacks::Add(const Hypothesis *hypo, Recycler &hypoRecycle) +{ + size_t numWordsCovered = hypo->GetBitmap().GetNumWordsCovered(); + //cerr << "numWordsCovered=" << numWordsCovered << endl; + NSCubePruningMiniStack::Stack &stack = *m_stacks[numWordsCovered]; + stack.Add(hypo, hypoRecycle); + +} + +NSCubePruningMiniStack::MiniStack &Stacks::GetMiniStack(const Bitmap &newBitmap, const Range &pathRange) +{ + size_t 
numWordsCovered = newBitmap.GetNumWordsCovered(); + //cerr << "numWordsCovered=" << numWordsCovered << endl; + NSCubePruningMiniStack::Stack &stack = *m_stacks[numWordsCovered]; + + NSCubePruningMiniStack::Stack::HypoCoverage key(&newBitmap, pathRange.GetEndPos()); + stack.GetMiniStack(key); + +} + +} + +} + + diff --git a/mosesdecoder/moses2/defer/CubePruningPerBitmap/Stacks.h b/mosesdecoder/moses2/defer/CubePruningPerBitmap/Stacks.h new file mode 100644 index 0000000000000000000000000000000000000000..28d93988579a99db27b05b8263092baf85dc26d7 --- /dev/null +++ b/mosesdecoder/moses2/defer/CubePruningPerBitmap/Stacks.h @@ -0,0 +1,55 @@ +/* + * Stacks.h + * + * Created on: 6 Nov 2015 + * Author: hieu + */ + +#pragma once + +#include +#include "../CubePruningMiniStack/Stack.h" +#include "../../Recycler.h" + +namespace Moses2 +{ +class Manager; + +namespace NSCubePruningPerBitmap +{ + +class Stacks +{ + friend std::ostream& operator<<(std::ostream &, const Stacks &); +public: + Stacks(const Manager &mgr); + virtual ~Stacks(); + + void Init(size_t numStacks); + + size_t GetSize() const { + return m_stacks.size(); + } + + const NSCubePruningMiniStack::Stack &Back() const { + return *m_stacks.back(); + } + + NSCubePruningMiniStack::Stack &operator[](size_t ind) { + return *m_stacks[ind]; + } + + void Add(const Hypothesis *hypo, Recycler &hypoRecycle); + NSCubePruningMiniStack::MiniStack &GetMiniStack(const Bitmap &newBitmap, const Range &pathRange); + +protected: + const Manager &m_mgr; + std::vector m_stacks; +}; + + +} + +} + + diff --git a/mosesdecoder/moses2/defer/CubePruningPerMiniStack/Misc.cpp b/mosesdecoder/moses2/defer/CubePruningPerMiniStack/Misc.cpp new file mode 100644 index 0000000000000000000000000000000000000000..de8971362d53958160ee232514280e89e0623b61 --- /dev/null +++ b/mosesdecoder/moses2/defer/CubePruningPerMiniStack/Misc.cpp @@ -0,0 +1,159 @@ +/* + * CubePruning.cpp + * + * Created on: 27 Nov 2015 + * Author: hieu + */ + +#include "Misc.h" +#include 
"../Manager.h" +#include "../../MemPool.h" +#include "../../System.h" + +using namespace std; + +namespace Moses2 +{ + +namespace NSCubePruningPerMiniStack +{ + +//////////////////////////////////////////////////////////////////////// +QueueItem *QueueItem::Create(QueueItem *currItem, + Manager &mgr, + CubeEdge &edge, + size_t hypoIndex, + size_t tpIndex, + std::deque &queueItemRecycler) +{ + QueueItem *ret; + if (currItem) { + // reuse incoming queue item to create new item + ret = currItem; + ret->Init(mgr, edge, hypoIndex, tpIndex); + } else if (!queueItemRecycler.empty()) { + // use item from recycle bin + ret = queueItemRecycler.back(); + ret->Init(mgr, edge, hypoIndex, tpIndex); + queueItemRecycler.pop_back(); + } else { + // create new item + ret = new (mgr.GetPool().Allocate()) QueueItem(mgr, edge, hypoIndex, tpIndex); + } + + return ret; +} + +QueueItem::QueueItem(Manager &mgr, CubeEdge &edge, size_t hypoIndex, size_t tpIndex) + :edge(&edge) + ,hypoIndex(hypoIndex) + ,tpIndex(tpIndex) +{ + CreateHypothesis(mgr); +} + +void QueueItem::Init(Manager &mgr, CubeEdge &edge, size_t hypoIndex, size_t tpIndex) +{ + this->edge = &edge; + this->hypoIndex = hypoIndex; + this->tpIndex = tpIndex; + + CreateHypothesis(mgr); +} + +void QueueItem::CreateHypothesis(Manager &mgr) +{ + const Hypothesis *prevHypo = edge->miniStack.GetSortedAndPruneHypos(mgr)[hypoIndex]; + const TargetPhrase &tp = edge->tps[tpIndex]; + + //cerr << "hypoIndex=" << hypoIndex << endl; + //cerr << "edge.hypos=" << edge.hypos.size() << endl; + //cerr << prevHypo << endl; + //cerr << *prevHypo << endl; + + hypo = Hypothesis::Create(mgr.GetSystemPool(), mgr); + hypo->Init(mgr, *prevHypo, edge->path, tp, edge->newBitmap, edge->estimatedScore); + hypo->EvaluateWhenApplied(); +} + +//////////////////////////////////////////////////////////////////////// +CubeEdge::CubeEdge( + Manager &mgr, + const NSCubePruningMiniStack::MiniStack &miniStack, + const InputPath &path, + const TargetPhrases &tps, + const 
Bitmap &newBitmap) + :miniStack(miniStack) + ,path(path) + ,tps(tps) + ,newBitmap(newBitmap) +{ + estimatedScore = mgr.GetEstimatedScores().CalcEstimatedScore(newBitmap); +} + +std::ostream& operator<<(std::ostream &out, const CubeEdge &obj) +{ + out << obj.newBitmap; + return out; +} + +bool +CubeEdge::SetSeenPosition(const size_t x, const size_t y, SeenPositions &seenPositions) const +{ + //UTIL_THROW_IF2(x >= (1<<17), "Error"); + //UTIL_THROW_IF2(y >= (1<<17), "Error"); + + SeenPositionItem val(this, (x<<16) + y); + std::pair pairRet = seenPositions.insert(val); + return pairRet.second; +} + +void CubeEdge::CreateFirst(Manager &mgr, + Queue &queue, + SeenPositions &seenPositions, + std::deque &queueItemRecycler) +{ + if (miniStack.GetSortedAndPruneHypos(mgr).size()) { + assert(tps.GetSize()); + + QueueItem *item = QueueItem::Create(NULL, mgr, *this, 0, 0, queueItemRecycler); + queue.push(item); + bool setSeen = SetSeenPosition(0, 0, seenPositions); + assert(setSeen); + } +} + +void CubeEdge::CreateNext(Manager &mgr, + QueueItem *item, + Queue &queue, + SeenPositions &seenPositions, + std::deque &queueItemRecycler) +{ + size_t hypoIndex = item->hypoIndex; + size_t tpIndex = item->tpIndex; + + if (hypoIndex + 1 < miniStack.GetSortedAndPruneHypos(mgr).size() && SetSeenPosition(hypoIndex + 1, tpIndex, seenPositions)) { + // reuse incoming queue item to create new item + QueueItem *newItem = QueueItem::Create(item, mgr, *this, hypoIndex + 1, tpIndex, queueItemRecycler); + assert(newItem == item); + queue.push(newItem); + item = NULL; + } + + if (tpIndex + 1 < tps.GetSize() && SetSeenPosition(hypoIndex, tpIndex + 1, seenPositions)) { + QueueItem *newItem = QueueItem::Create(item, mgr, *this, hypoIndex, tpIndex + 1, queueItemRecycler); + queue.push(newItem); + item = NULL; + } + + if (item) { + // recycle unused queue item + queueItemRecycler.push_back(item); + } +} + +} + +} + + diff --git a/mosesdecoder/moses2/defer/CubePruningPerMiniStack/Misc.h 
b/mosesdecoder/moses2/defer/CubePruningPerMiniStack/Misc.h new file mode 100644 index 0000000000000000000000000000000000000000..511fd42f57916180f053fc813f6891c7b0963af0 --- /dev/null +++ b/mosesdecoder/moses2/defer/CubePruningPerMiniStack/Misc.h @@ -0,0 +1,113 @@ +/* + * CubePruning.h + * + * Created on: 27 Nov 2015 + * Author: hieu + */ +#pragma once +#include +#include +#include +#include +#include +#include "../../legacy/Range.h" +#include "../Hypothesis.h" +#include "../../TypeDef.h" +#include "../../Vector.h" +#include "../CubePruningMiniStack/Stack.h" + +namespace Moses2 +{ + +class Manager; +class InputPath; +class TargetPhrases; +class Bitmap; + +namespace NSCubePruningPerMiniStack +{ +class CubeEdge; + +/////////////////////////////////////////// +class QueueItem +{ + ~QueueItem(); // NOT IMPLEMENTED. Use MemPool +public: + static QueueItem *Create(QueueItem *currItem, + Manager &mgr, + CubeEdge &edge, + size_t hypoIndex, + size_t tpIndex, + std::deque &queueItemRecycler); + QueueItem(Manager &mgr, CubeEdge &edge, size_t hypoIndex, size_t tpIndex); + + void Init(Manager &mgr, CubeEdge &edge, size_t hypoIndex, size_t tpIndex); + + CubeEdge *edge; + size_t hypoIndex, tpIndex; + Hypothesis *hypo; + +protected: + void CreateHypothesis(Manager &mgr); +}; + +/////////////////////////////////////////// +class QueueItemOrderer +{ +public: + bool operator()(QueueItem* itemA, QueueItem* itemB) const { + HypothesisFutureScoreOrderer orderer; + return !orderer(itemA->hypo, itemB->hypo); + } +}; + +/////////////////////////////////////////// +class CubeEdge +{ + friend std::ostream& operator<<(std::ostream &, const CubeEdge &); + +public: + typedef std::priority_queue, + QueueItemOrderer> Queue; + + typedef std::pair SeenPositionItem; + typedef boost::unordered_set, + std::equal_to + > SeenPositions; + + const NSCubePruningMiniStack::MiniStack &miniStack; + const InputPath &path; + const TargetPhrases &tps; + const Bitmap &newBitmap; + SCORE estimatedScore; + + 
CubeEdge(Manager &mgr, + const NSCubePruningMiniStack::MiniStack &miniStack, + const InputPath &path, + const TargetPhrases &tps, + const Bitmap &newBitmap); + + bool SetSeenPosition(const size_t x, const size_t y, SeenPositions &seenPositions) const; + + void CreateFirst(Manager &mgr, + Queue &queue, + SeenPositions &seenPositions, + std::deque &queueItemRecycler); + void CreateNext(Manager &mgr, + QueueItem *item, + Queue &queue, + SeenPositions &seenPositions, + std::deque &queueItemRecycler); + + +protected: + +}; + +} + +} + + diff --git a/mosesdecoder/moses2/defer/CubePruningPerMiniStack/Search.cpp b/mosesdecoder/moses2/defer/CubePruningPerMiniStack/Search.cpp new file mode 100644 index 0000000000000000000000000000000000000000..1de52cb3d7c680abcdd777c4b0e276c23ff9ded9 --- /dev/null +++ b/mosesdecoder/moses2/defer/CubePruningPerMiniStack/Search.cpp @@ -0,0 +1,246 @@ +/* + * Search.cpp + * + * Created on: 16 Nov 2015 + * Author: hieu + */ +#include +#include "Search.h" +#include "../Manager.h" +#include "../Hypothesis.h" +#include "../../InputPaths.h" +#include "../../InputPath.h" +#include "../../System.h" +#include "../../Sentence.h" +#include "../../TranslationTask.h" +#include "../../legacy/Util2.h" + +using namespace std; + +namespace Moses2 +{ + +namespace NSCubePruningPerMiniStack +{ + +//////////////////////////////////////////////////////////////////////// +Search::Search(Manager &mgr) + :Moses2::Search(mgr) + ,m_stacks(mgr) + + ,m_queue(QueueItemOrderer(), + std::vector() ) + + ,m_seenPositions() +{ +} + +Search::~Search() +{ +} + +void Search::Decode() +{ + // init stacks + m_stacks.Init(mgr.GetInput().GetSize() + 1); + + const Bitmap &initBitmap = mgr.GetBitmaps().GetInitialBitmap(); + Hypothesis *initHypo = Hypothesis::Create(mgr.GetSystemPool(), mgr); + initHypo->Init(mgr, mgr.GetInputPaths().GetBlank(), mgr.GetInitPhrase(), initBitmap); + initHypo->EmptyHypothesisState(mgr.GetInput()); + + m_stacks.Add(initHypo, mgr.GetHypoRecycle()); + + for 
(size_t stackInd = 0; stackInd < m_stacks.GetSize() - 1; ++stackInd) { + CreateSearchGraph(stackInd); + } + + for (size_t stackInd = 1; stackInd < m_stacks.GetSize(); ++stackInd) { + //cerr << "stackInd=" << stackInd << endl; + Decode(stackInd); + + //cerr << m_stacks << endl; + } + + //DebugCounts(); +} + +void Search::Decode(size_t stackInd) +{ + NSCubePruningMiniStack::Stack &stack = m_stacks[stackInd]; + BOOST_FOREACH(NSCubePruningMiniStack::Stack::Coll::value_type &val, stack.GetColl()) { + NSCubePruningMiniStack::MiniStack &miniStack = *val.second; + Decode(miniStack); + } + +} + +void Search::Decode(NSCubePruningMiniStack::MiniStack &miniStack) +{ + Recycler &hypoRecycler = mgr.GetHypoRecycle(); + + // reuse queue from previous stack. Clear it first + std::vector &container = Container(m_queue); + //cerr << "container=" << container.size() << endl; + BOOST_FOREACH(QueueItem *item, container) { + // recycle unused hypos from queue + Hypothesis *hypo = item->hypo; + hypoRecycler.Recycle(hypo); + + // recycle queue item + m_queueItemRecycler.push_back(item); + } + container.clear(); + + m_seenPositions.clear(); + + // add top hypo from every edge into queue + CubeEdges &edges = *m_cubeEdges[&miniStack]; + + BOOST_FOREACH(CubeEdge *edge, edges) { + //cerr << "edge=" << *edge << endl; + edge->CreateFirst(mgr, m_queue, m_seenPositions, m_queueItemRecycler); + } + + size_t pops = 0; + while (!m_queue.empty() && pops < mgr.system.popLimit) { + // get best hypo from queue, add to stack + //cerr << "queue=" << queue.size() << endl; + QueueItem *item = m_queue.top(); + m_queue.pop(); + + CubeEdge *edge = item->edge; + + // add hypo to stack + Hypothesis *hypo = item->hypo; + //cerr << "hypo=" << *hypo << " " << hypo->GetBitmap() << endl; + m_stacks.Add(hypo, hypoRecycler); + + edge->CreateNext(mgr, item, m_queue, m_seenPositions, m_queueItemRecycler); + + ++pops; + } + + /* + // create hypo from every edge. 
Increase diversity + while (!m_queue.empty()) { + QueueItem *item = m_queue.top(); + m_queue.pop(); + + if (item->hypoIndex == 0 && item->tpIndex == 0) { + CubeEdge &edge = item->edge; + + // add hypo to stack + Hypothesis *hypo = item->hypo; + //cerr << "hypo=" << *hypo << " " << hypo->GetBitmap() << endl; + m_stacks.Add(hypo, mgr.GetHypoRecycle()); + } + } + */ +} + + +void Search::CreateSearchGraph(size_t stackInd) +{ + NSCubePruningMiniStack::Stack &stack = m_stacks[stackInd]; + MemPool &pool = mgr.GetPool(); + + BOOST_FOREACH(const NSCubePruningMiniStack::Stack::Coll::value_type &val, stack.GetColl()) { + const Bitmap &hypoBitmap = *val.first.first; + size_t hypoEndPos = val.first.second; + //cerr << "key=" << hypoBitmap << " " << hypoEndPos << endl; + + // create edges to next hypos from existing hypos + const InputPaths &paths = mgr.GetInputPaths(); + + BOOST_FOREACH(const InputPath *path, paths) { + const Range &pathRange = path->range; + //cerr << "pathRange=" << pathRange << endl; + + if (!path->IsUsed()) { + continue; + } + if (!CanExtend(hypoBitmap, hypoEndPos, pathRange)) { + continue; + } + + const Bitmap &newBitmap = mgr.GetBitmaps().GetBitmap(hypoBitmap, pathRange); + + // sort hypo for a particular bitmap and hypoEndPos + const NSCubePruningMiniStack::MiniStack &miniStack = *val.second; + + + // add cube edge + size_t numPt = mgr.system.mappings.size(); + for (size_t i = 0; i < numPt; ++i) { + const TargetPhrases *tps = path->targetPhrases[i]; + if (tps && tps->GetSize()) { + // create next mini stack + NSCubePruningMiniStack::MiniStack &nextMiniStack = m_stacks.GetMiniStack(newBitmap, pathRange); + + CubeEdge *edge = new (pool.Allocate()) CubeEdge(mgr, miniStack, *path, *tps, newBitmap); + + CubeEdges *edges; + boost::unordered_map::iterator iter = m_cubeEdges.find(&nextMiniStack); + if (iter == m_cubeEdges.end()) { + edges = new (pool.Allocate()) CubeEdges(); + m_cubeEdges[&nextMiniStack] = edges; + } else { + edges = iter->second; + } + + 
edges->push_back(edge); + } + } + } + } + +} + + +const Hypothesis *Search::GetBestHypo() const +{ + const NSCubePruningMiniStack::Stack &lastStack = m_stacks.Back(); + std::vector sortedHypos = lastStack.GetBestHypos(1); + + const Hypothesis *best = NULL; + if (sortedHypos.size()) { + best = sortedHypos[0]; + } + return best; +} + +void Search::DebugCounts() +{ + std::map counts; + + for (size_t stackInd = 0; stackInd < m_stacks.GetSize(); ++stackInd) { + //cerr << "stackInd=" << stackInd << endl; + const NSCubePruningMiniStack::Stack &stack = m_stacks[stackInd]; + BOOST_FOREACH(const NSCubePruningMiniStack::Stack::Coll::value_type &val, stack.GetColl()) { + const NSCubePruningMiniStack::MiniStack &miniStack = *val.second; + size_t count = miniStack.GetColl().size(); + + if (counts.find(count) == counts.end()) { + counts[count] = 0; + } else { + ++counts[count]; + } + } + //cerr << m_stacks << endl; + } + + std::map::const_iterator iter; + for (iter = counts.begin(); iter != counts.end(); ++iter) { + cerr << iter->first << "=" << iter->second << " "; + } + cerr << endl; +} + + + +} + +} + + diff --git a/mosesdecoder/moses2/defer/CubePruningPerMiniStack/Search.h b/mosesdecoder/moses2/defer/CubePruningPerMiniStack/Search.h new file mode 100644 index 0000000000000000000000000000000000000000..2adb9631c9e2ac64c7bb0c9cbb30d498ff24b7bb --- /dev/null +++ b/mosesdecoder/moses2/defer/CubePruningPerMiniStack/Search.h @@ -0,0 +1,66 @@ +/* + * Search.h + * + * Created on: 16 Nov 2015 + * Author: hieu + */ + +#pragma once +#include +#include +#include "../Search.h" +#include "Misc.h" +#include "Stacks.h" +#include "../../legacy/Range.h" + +namespace Moses2 +{ + +class Bitmap; +class Hypothesis; +class InputPath; +class TargetPhrases; + +namespace NSCubePruningMiniStack +{ +class MiniStack; +} + +namespace NSCubePruningPerMiniStack +{ + +class Search : public Moses2::Search +{ +public: + Search(Manager &mgr); + virtual ~Search(); + + virtual void Decode(); + const Hypothesis 
*GetBestHypo() const; + +protected: + Stacks m_stacks; + + CubeEdge::Queue m_queue; + CubeEdge::SeenPositions m_seenPositions; + + // CUBE PRUNING VARIABLES + // setup + typedef std::vector CubeEdges; + boost::unordered_map m_cubeEdges; + + std::deque m_queueItemRecycler; + + // CUBE PRUNING + // decoding + void CreateSearchGraph(size_t stackInd); + void Decode(size_t stackInd); + void Decode(NSCubePruningMiniStack::MiniStack &miniStack); + + void DebugCounts(); +}; + +} + +} + diff --git a/mosesdecoder/moses2/defer/CubePruningPerMiniStack/Stacks.cpp b/mosesdecoder/moses2/defer/CubePruningPerMiniStack/Stacks.cpp new file mode 100644 index 0000000000000000000000000000000000000000..4e81e8e481fc7d4430a05501eed930d544d2838e --- /dev/null +++ b/mosesdecoder/moses2/defer/CubePruningPerMiniStack/Stacks.cpp @@ -0,0 +1,72 @@ +/* + * Stacks.cpp + * + * Created on: 6 Nov 2015 + * Author: hieu + */ + +#include "Stacks.h" +#include "../../System.h" +#include "../Manager.h" + +using namespace std; + +namespace Moses2 +{ + +namespace NSCubePruningPerMiniStack +{ + +Stacks::Stacks(const Manager &mgr) + :m_mgr(mgr) +{ +} + +Stacks::~Stacks() +{ +} + +void Stacks::Init(size_t numStacks) +{ + m_stacks.resize(numStacks); + for (size_t i = 0; i < m_stacks.size(); ++i) { + m_stacks[i] = new (m_mgr.GetPool().Allocate()) NSCubePruningMiniStack::Stack(m_mgr); + } +} + + +std::ostream& operator<<(std::ostream &out, const Stacks &obj) +{ + for (size_t i = 0; i < obj.GetSize(); ++i) { + const NSCubePruningMiniStack::Stack &stack = *obj.m_stacks[i]; + out << stack.GetHypoSize() << " "; + } + + return out; +} + +void Stacks::Add(const Hypothesis *hypo, Recycler &hypoRecycle) +{ + size_t numWordsCovered = hypo->GetBitmap().GetNumWordsCovered(); + //cerr << "numWordsCovered=" << numWordsCovered << endl; + NSCubePruningMiniStack::Stack &stack = *m_stacks[numWordsCovered]; + stack.Add(hypo, hypoRecycle); + +} + +NSCubePruningMiniStack::MiniStack &Stacks::GetMiniStack(const Bitmap &newBitmap, const 
Range &pathRange) +{ + size_t numWordsCovered = newBitmap.GetNumWordsCovered(); + //cerr << "numWordsCovered=" << numWordsCovered << endl; + NSCubePruningMiniStack::Stack &stack = *m_stacks[numWordsCovered]; + + NSCubePruningMiniStack::Stack::HypoCoverage key(&newBitmap, pathRange.GetEndPos()); + stack.GetMiniStack(key); + +} + +} + +} + + diff --git a/mosesdecoder/moses2/defer/CubePruningPerMiniStack/Stacks.h b/mosesdecoder/moses2/defer/CubePruningPerMiniStack/Stacks.h new file mode 100644 index 0000000000000000000000000000000000000000..74469b7674e460604611b4952216f3780c269b93 --- /dev/null +++ b/mosesdecoder/moses2/defer/CubePruningPerMiniStack/Stacks.h @@ -0,0 +1,55 @@ +/* + * Stacks.h + * + * Created on: 6 Nov 2015 + * Author: hieu + */ + +#pragma once + +#include +#include "../CubePruningMiniStack/Stack.h" +#include "../../Recycler.h" + +namespace Moses2 +{ +class Manager; + +namespace NSCubePruningPerMiniStack +{ + +class Stacks +{ + friend std::ostream& operator<<(std::ostream &, const Stacks &); +public: + Stacks(const Manager &mgr); + virtual ~Stacks(); + + void Init(size_t numStacks); + + size_t GetSize() const { + return m_stacks.size(); + } + + const NSCubePruningMiniStack::Stack &Back() const { + return *m_stacks.back(); + } + + NSCubePruningMiniStack::Stack &operator[](size_t ind) { + return *m_stacks[ind]; + } + + void Add(const Hypothesis *hypo, Recycler &hypoRecycle); + NSCubePruningMiniStack::MiniStack &GetMiniStack(const Bitmap &newBitmap, const Range &pathRange); + +protected: + const Manager &m_mgr; + std::vector m_stacks; +}; + + +} + +} + + diff --git a/mosesdecoder/moses2/legacy/Bitmap.cpp b/mosesdecoder/moses2/legacy/Bitmap.cpp new file mode 100644 index 0000000000000000000000000000000000000000..ed5ccd7502258db3cbfa59991dd6b202f95c329d --- /dev/null +++ b/mosesdecoder/moses2/legacy/Bitmap.cpp @@ -0,0 +1,87 @@ +// $Id$ + +/*********************************************************************** + Moses - factored phrase-based language decoder 
+ Copyright (C) 2006 University of Edinburgh + + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + ***********************************************************************/ + +#include +#include "Bitmap.h" + +namespace Moses2 +{ + +Bitmap::Bitmap(MemPool &pool, size_t size) : + m_bitmap(pool, size) +{ +} + +void Bitmap::Init(const std::vector& initializer) +{ + + for (size_t i = 0; i < initializer.size(); ++i) { + m_bitmap[i] = initializer[i]; + } + + // The initializer may not be of the same length. Change to the desired + // length. If we need to add any elements, initialize them to false. + for (size_t i = initializer.size(); i < m_bitmap.size(); ++i) { + m_bitmap[i] = false; + } + + m_numWordsCovered = std::count(m_bitmap.begin(), m_bitmap.end(), true); + + // Find the first gap, and cache it. + Array::const_iterator first_gap = std::find(m_bitmap.begin(), + m_bitmap.end(), false); + m_firstGap = ((first_gap == m_bitmap.end()) ? 
+ NOT_FOUND: first_gap - m_bitmap.begin()); +} + +void Bitmap::Init(const Bitmap ©, const Range &range) +{ + m_firstGap = copy.m_firstGap; + m_numWordsCovered = copy.m_numWordsCovered; + for (size_t i = 0; i < m_bitmap.size(); ++i) { + m_bitmap[i] = copy.m_bitmap[i]; + } + SetValueNonOverlap(range); +} + +// for unordered_set in stack +size_t Bitmap::hash() const +{ + size_t ret = m_bitmap.hash(); + return ret; +} + +bool Bitmap::operator==(const Bitmap& other) const +{ + return m_bitmap == other.m_bitmap; +} + +// friend +std::ostream& operator<<(std::ostream& out, const Bitmap& bitmap) +{ + for (size_t i = 0; i < bitmap.m_bitmap.size(); i++) { + out << int(bitmap.GetValue(i)); + } + return out; +} + +} + diff --git a/mosesdecoder/moses2/legacy/Bitmap.h b/mosesdecoder/moses2/legacy/Bitmap.h new file mode 100644 index 0000000000000000000000000000000000000000..3ceb9b01d8f8d9d28298c2b57e2f7eb685853b1d --- /dev/null +++ b/mosesdecoder/moses2/legacy/Bitmap.h @@ -0,0 +1,240 @@ +// $Id$ + +/*********************************************************************** + Moses - factored phrase-based language decoder + Copyright (C) 2006 University of Edinburgh + + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. 
+ + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + ***********************************************************************/ + +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include "Range.h" +#include "../Array.h" + +namespace Moses2 +{ +class MemPool; + +typedef unsigned long WordsBitmapID; + +/** Vector of boolean to represent whether a word has been translated or not. + * + * Implemented using a vector of char, which is usually the same representation + * for the elements that a C array of bool would use. A vector of bool, or a + * Boost dynamic_bitset, could be much more efficient in theory. Unfortunately + * algorithms like std::find() are not optimized for vector on gcc or + * clang, and dynamic_bitset lacks all the optimized search operations we want. + * Only benchmarking will tell what works best. Perhaps dynamic_bitset could + * still be a dramatic improvement, if we flip the meaning of the bits around + * so we can use its find_first() and find_next() for the most common searches. + */ +class Bitmap +{ + friend std::ostream& operator<<(std::ostream& out, const Bitmap& bitmap); +private: + Array m_bitmap; //! Ticks of words in sentence that have been done. + size_t m_firstGap; //! Cached position of first gap, or NOT_FOUND. 
+ size_t m_numWordsCovered; + + Bitmap(); // not implemented + Bitmap& operator=(const Bitmap& other); + + /** Update the first gap, when bits are flipped */ + void UpdateFirstGap(size_t startPos, size_t endPos, bool value) { + if (value) { + //may remove gap + if (startPos <= m_firstGap && m_firstGap <= endPos) { + m_firstGap = NOT_FOUND; + for (size_t i = endPos + 1; i < m_bitmap.size(); ++i) { + if (!m_bitmap[i]) { + m_firstGap = i; + break; + } + } + } + + } else { + //setting positions to false, may add new gap + if (startPos < m_firstGap) { + m_firstGap = startPos; + } + } + } + + //! set value between 2 positions, inclusive + void + SetValueNonOverlap(Range const& range) { + size_t startPos = range.GetStartPos(); + size_t endPos = range.GetEndPos(); + + for(size_t pos = startPos; pos <= endPos; pos++) { + m_bitmap[pos] = true; + } + + m_numWordsCovered += range.GetNumWordsCovered(); + UpdateFirstGap(startPos, endPos, true); + } + +public: + //! Create Bitmap of length size, and initialise with vector. + explicit Bitmap(MemPool &pool, size_t size); + + void Init(const std::vector& initializer); + void Init(const Bitmap ©, const Range &range); + + //! Count of words translated. + size_t GetNumWordsCovered() const { + return m_numWordsCovered; + } + + //! position of 1st word not yet translated, or NOT_FOUND if everything already translated + size_t GetFirstGapPos() const { + return m_firstGap; + } + + //! position of last word not yet translated, or NOT_FOUND if everything already translated + size_t GetLastGapPos() const { + for (int pos = int(m_bitmap.size()) - 1; pos >= 0; pos--) { + if (!m_bitmap[pos]) { + return pos; + } + } + // no starting pos + return NOT_FOUND; + } + + //! position of last translated word + size_t GetLastPos() const { + for (int pos = int(m_bitmap.size()) - 1; pos >= 0; pos--) { + if (m_bitmap[pos]) { + return pos; + } + } + // no starting pos + return NOT_FOUND; + } + + //! 
whether a word has been translated at a particular position + bool GetValue(size_t pos) const { + return bool(m_bitmap[pos]); + } + //! set value at a particular position + void SetValue( size_t pos, bool value ) { + bool origValue = m_bitmap[pos]; + if (origValue == value) { + // do nothing + } else { + m_bitmap[pos] = value; + UpdateFirstGap(pos, pos, value); + if (value) { + ++m_numWordsCovered; + } else { + --m_numWordsCovered; + } + } + } + + //! whether every word has been translated + bool IsComplete() const { + return GetSize() == GetNumWordsCovered(); + } + //! whether the wordrange overlaps with any translated word in this bitmap + bool Overlap(const Range &compare) const { + for (size_t pos = compare.GetStartPos(); pos <= compare.GetEndPos(); pos++) { + if (m_bitmap[pos]) + return true; + } + return false; + } + //! number of elements + size_t GetSize() const { + return m_bitmap.size(); + } + + inline size_t GetEdgeToTheLeftOf(size_t l) const { + if (l == 0) return l; + while (l && !m_bitmap[l-1]) { + --l; + } + return l; + } + + inline size_t GetEdgeToTheRightOf(size_t r) const { + if (r+1 == m_bitmap.size()) return r; + return ( + std::find(m_bitmap.begin() + r + 1, m_bitmap.end(), true) - + m_bitmap.begin() + ) - 1; + } + + //! converts bitmap into an integer ID: it consists of two parts: the first 16 bit are the pattern between the first gap and the last word-1, the second 16 bit are the number of filled positions. enforces a sentence length limit of 65535 and a max distortion of 16 + WordsBitmapID GetID() const { + assert(m_bitmap.size() < (1<<16)); + + size_t start = GetFirstGapPos(); + if (start == NOT_FOUND) start = m_bitmap.size(); // nothing left + + size_t end = GetLastPos(); + if (end == NOT_FOUND) end = 0;// nothing translated yet + + assert(end < start || end-start <= 16); + WordsBitmapID id = 0; + for(size_t pos = end; pos > start; pos--) { + id = id*2 + (int) GetValue(pos); + } + return id + (1<<16) * start; + } + + //! 
converts bitmap into an integer ID, with an additional span covered + WordsBitmapID GetIDPlus( size_t startPos, size_t endPos ) const { + assert(m_bitmap.size() < (1<<16)); + + size_t start = GetFirstGapPos(); + if (start == NOT_FOUND) start = m_bitmap.size(); // nothing left + + size_t end = GetLastPos(); + if (end == NOT_FOUND) end = 0;// nothing translated yet + + if (start == startPos) start = endPos+1; + if (end < endPos) end = endPos; + + assert(end < start || end-start <= 16); + WordsBitmapID id = 0; + for(size_t pos = end; pos > start; pos--) { + id = id*2; + if (GetValue(pos) || (startPos<=pos && pos<=endPos)) + id++; + } + return id + (1<<16) * start; + } + + // for unordered_set in stack + size_t hash() const; + bool operator==(const Bitmap& other) const; + bool operator!=(const Bitmap& other) const { + return !(*this == other); + } + +}; + +} diff --git a/mosesdecoder/moses2/legacy/Bitmaps.cpp b/mosesdecoder/moses2/legacy/Bitmaps.cpp new file mode 100644 index 0000000000000000000000000000000000000000..b1fee5ea625e2e4a9c63855108cc31aefebf2fd1 --- /dev/null +++ b/mosesdecoder/moses2/legacy/Bitmaps.cpp @@ -0,0 +1,71 @@ +#include +#include "Bitmaps.h" +#include "Util2.h" + +using namespace std; + +namespace Moses2 +{ + +Bitmaps::Bitmaps(MemPool &pool) : + m_pool(pool) +{ +} + +Bitmaps::~Bitmaps() +{ +} + +void Bitmaps::Init(size_t inputSize, + const std::vector &initSourceCompleted) +{ + m_initBitmap = new (m_pool.Allocate()) Bitmap(m_pool, inputSize); + m_initBitmap->Init(initSourceCompleted); + m_coll[m_initBitmap]; +} + +const Bitmap &Bitmaps::GetNextBitmap(const Bitmap &bm, const Range &range) +{ + Bitmap *newBM; + if (m_recycler.empty()) { + newBM = new (m_pool.Allocate()) Bitmap(m_pool, bm.GetSize()); + } else { + newBM = m_recycler.top(); + m_recycler.pop(); + } + + newBM->Init(bm, range); + + Coll::const_iterator iter = m_coll.find(newBM); + if (iter == m_coll.end()) { + m_coll[newBM] = NextBitmaps(); + return *newBM; + } else { + 
m_recycler.push(newBM); + + return *iter->first; + } +} + +const Bitmap &Bitmaps::GetBitmap(const Bitmap &bm, const Range &range) +{ + Coll::iterator iter = m_coll.find(&bm); + assert(iter != m_coll.end()); + + const Bitmap *newBM; + NextBitmaps &next = iter->second; + NextBitmaps::const_iterator iterNext = next.find(&range); + if (iterNext == next.end()) { + // not seen the link yet. + newBM = &GetNextBitmap(bm, range); + next[&range] = newBM; + } else { + // link exist + //std::cerr << "link exists" << endl; + newBM = iterNext->second; + } + return *newBM; +} + +} + diff --git a/mosesdecoder/moses2/legacy/Bitmaps.h b/mosesdecoder/moses2/legacy/Bitmaps.h new file mode 100644 index 0000000000000000000000000000000000000000..c6061033dae1b1b515b3a52cea2d44ef5bd4981f --- /dev/null +++ b/mosesdecoder/moses2/legacy/Bitmaps.h @@ -0,0 +1,39 @@ +#pragma once + +#include +#include +#include +#include +#include "Bitmap.h" +#include "Util2.h" + +namespace Moses2 +{ +class MemPool; + +class Bitmaps +{ + typedef boost::unordered_map NextBitmaps; + typedef boost::unordered_map, UnorderedComparer > Coll; + //typedef std::set > Coll; + Coll m_coll; + Bitmap *m_initBitmap; + + MemPool &m_pool; + std::stack m_recycler; + + const Bitmap &GetNextBitmap(const Bitmap &bm, const Range &range); +public: + Bitmaps(MemPool &pool); + virtual ~Bitmaps(); + void Init(size_t inputSize, const std::vector &initSourceCompleted); + + const Bitmap &GetInitialBitmap() const { + return *m_initBitmap; + } + const Bitmap &GetBitmap(const Bitmap &bm, const Range &range); +}; + +} + diff --git a/mosesdecoder/moses2/legacy/Factor.cpp b/mosesdecoder/moses2/legacy/Factor.cpp new file mode 100644 index 0000000000000000000000000000000000000000..be9bad2c11b5ed58d78a31083122374b44f34ff2 --- /dev/null +++ b/mosesdecoder/moses2/legacy/Factor.cpp @@ -0,0 +1,45 @@ +// $Id$ + +/*********************************************************************** + Moses - factored phrase-based language decoder + Copyright (C) 2006 
University of Edinburgh + + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + ***********************************************************************/ + +#include "Factor.h" + +#include + +using namespace std; + +namespace Moses2 +{ + +// friend +ostream& operator<<(ostream& out, const Factor& factor) +{ + out << factor.GetString(); + return out; +} + +size_t hash_value(const Factor& f) +{ + boost::hash hasher; + return hasher(f.GetId()); +} + +} + diff --git a/mosesdecoder/moses2/legacy/Factor.h b/mosesdecoder/moses2/legacy/Factor.h new file mode 100644 index 0000000000000000000000000000000000000000..541f2364a32fc048b7613cd99be2c669508fccfc --- /dev/null +++ b/mosesdecoder/moses2/legacy/Factor.h @@ -0,0 +1,97 @@ +// $Id$ + +/*********************************************************************** + Moses - factored phrase-based language decoder + Copyright (C) 2006 University of Edinburgh + + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. 
+ + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + ***********************************************************************/ + +#pragma once + +#include +#include +#include +#include "util/string_piece.hh" + +namespace Moses2 +{ + +struct FactorFriend; +class FactorCollection; + +/** Represents a factor (word, POS, etc). + * A Factor has a contiguous identifier and string value. + */ +class Factor +{ + friend std::ostream& operator<<(std::ostream&, const Factor&); + + // only these classes are allowed to instantiate this class + friend class FactorCollection; + friend struct FactorFriend; + + // FactorCollection writes here. + // This is mutable so the pointer can be changed to pool-backed memory. + mutable StringPiece m_string; + size_t m_id; + + //! protected constructor. only friend class, FactorCollection, is allowed to create Factor objects + Factor() { + } + + // Needed for STL containers. They'll delegate through FactorFriend, which is never exposed publicly. + Factor(const Factor &factor) : + m_string(factor.m_string), m_id(factor.m_id) { + } + + // Not implemented. Shouldn't be called. + Factor &operator=(const Factor &factor); + +public: + //! original string representation of the factor + StringPiece GetString() const { + return m_string; + } + //! contiguous ID + inline size_t GetId() const { + return m_id; + } + + /** transitive comparison between 2 factors. 
+ * -1 = less than + * +1 = more than + * 0 = same + */ + inline int Compare(const Factor &compare) const { + if (this < &compare) return -1; + if (this > &compare) return 1; + return 0; + } + //! transitive comparison used for adding objects into FactorCollection + inline bool operator<(const Factor &compare) const { + return this < &compare; + } + + // quick equality comparison. Not used + inline bool operator==(const Factor &compare) const { + return this == &compare; + } +}; + +size_t hash_value(const Factor &f); + +} + diff --git a/mosesdecoder/moses2/legacy/FactorCollection.cpp b/mosesdecoder/moses2/legacy/FactorCollection.cpp new file mode 100644 index 0000000000000000000000000000000000000000..80081bab97096fc20b666c6080751df42460c450 --- /dev/null +++ b/mosesdecoder/moses2/legacy/FactorCollection.cpp @@ -0,0 +1,110 @@ +// $Id$ + +/*********************************************************************** + Moses - factored phrase-based language decoder + Copyright (C) 2006 University of Edinburgh + + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. 
+ + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + ***********************************************************************/ + +#include +#ifdef WITH_THREADS +#include +#endif +#include +#include +#include "FactorCollection.h" +#include "util/pool.hh" +#include "util/exception.hh" +#include "../System.h" + +using namespace std; + +namespace Moses2 +{ + +const Factor *FactorCollection::AddFactor(const StringPiece &factorString, + const System &system, bool isNonTerminal) +{ + FactorFriend to_ins; + to_ins.in.m_string = factorString; + to_ins.in.m_id = (isNonTerminal) ? m_factorIdNonTerminal : m_factorId; + Set & set = (isNonTerminal) ? m_set : m_setNonTerminal; + // If we're threaded, hope a read-only lock is sufficient. +#ifdef WITH_THREADS + { + // read=lock scope + boost::shared_lock read_lock(m_accessLock); + Set::const_iterator i = set.find(to_ins); + if (i != set.end()) return &i->in; + } + boost::unique_lock lock(m_accessLock); +#endif // WITH_THREADS + std::pair ret(set.insert(to_ins)); + if (ret.second) { + ret.first->in.m_string.set( + memcpy(m_string_backing.Allocate(factorString.size()), + factorString.data(), factorString.size()), factorString.size()); + if (isNonTerminal) { + m_factorIdNonTerminal++; + UTIL_THROW_IF2(m_factorIdNonTerminal >= moses_MaxNumNonterminals, + "Number of non-terminals exceeds maximum size reserved. Adjust parameter moses_MaxNumNonterminals, then recompile"); + } else { + m_factorId++; + } + } + + const Factor *factor = &ret.first->in; + + return factor; +} + +const Factor *FactorCollection::GetFactor(const StringPiece &factorString, + bool isNonTerminal) +{ + FactorFriend to_find; + to_find.in.m_string = factorString; + to_find.in.m_id = (isNonTerminal) ? m_factorIdNonTerminal : m_factorId; + Set & set = (isNonTerminal) ? 
m_set : m_setNonTerminal; + { + // read=lock scope +#ifdef WITH_THREADS + boost::shared_lock read_lock(m_accessLock); +#endif // WITH_THREADS + Set::const_iterator i = set.find(to_find); + if (i != set.end()) return &i->in; + } + return NULL; +} + +FactorCollection::~FactorCollection() +{ +} + +// friend +ostream& operator<<(ostream& out, const FactorCollection& factorCollection) +{ +#ifdef WITH_THREADS + boost::shared_lock lock(factorCollection.m_accessLock); +#endif + for (FactorCollection::Set::const_iterator i = factorCollection.m_set.begin(); + i != factorCollection.m_set.end(); ++i) { + out << i->in; + } + return out; +} + +} + diff --git a/mosesdecoder/moses2/legacy/FactorCollection.h b/mosesdecoder/moses2/legacy/FactorCollection.h new file mode 100644 index 0000000000000000000000000000000000000000..1b29dee699db3eaa67b67acaaf082d669f77a780 --- /dev/null +++ b/mosesdecoder/moses2/legacy/FactorCollection.h @@ -0,0 +1,123 @@ +// $Id$ + +/*********************************************************************** + Moses - factored phrase-based language decoder + Copyright (C) 2006 University of Edinburgh + + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. 
+ + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + ***********************************************************************/ + +#pragma once + +// reserve space for non-terminal symbols (ensuring consecutive numbering, and allowing quick lookup by ID) +#ifndef moses_MaxNumNonterminals +#define moses_MaxNumNonterminals 10000 +#endif + +#ifdef WITH_THREADS +#include +#endif + +#include "util/murmur_hash.hh" +#include + +#include +#include + +#include "util/string_piece.hh" +#include "util/pool.hh" +#include "Factor.h" + +namespace Moses2 +{ + +class System; + +/** We don't want Factor to be copyable by anybody. But we also want to store + * it in an STL container. The solution is that Factor's copy constructor is + * private and friended to FactorFriend. The STL containers can delegate + * copying, so friending the container isn't sufficient. STL containers see + * FactorFriend's public copy constructor and everybody else sees Factor's + * private copy constructor. + */ +struct FactorFriend { + Factor in; +}; + +/** collection of factors + * + * All Factors in moses are accessed and created by a FactorCollection. + * By enforcing this strict creation processes (ie, forbidding factors + * from being created on the stack, etc), their memory addresses can + * be used as keys to uniquely identify them. + * Only 1 FactorCollection object should be created. 
+ */ +class FactorCollection +{ + friend std::ostream& operator<<(std::ostream&, const FactorCollection&); + friend class System; + + struct HashFactor: public std::unary_function { + std::size_t operator()(const FactorFriend &factor) const { + return util::MurmurHashNative(factor.in.m_string.data(), + factor.in.m_string.size()); + } + }; + struct EqualsFactor: public std::binary_function { + bool operator()(const FactorFriend &left, const FactorFriend &right) const { + return left.in.GetString() == right.in.GetString(); + } + }; + typedef boost::unordered_set Set; + Set m_set; + Set m_setNonTerminal; + + util::Pool m_string_backing; + +#ifdef WITH_THREADS + //reader-writer lock + mutable boost::shared_mutex m_accessLock; +#endif + + size_t m_factorIdNonTerminal; /**< unique, contiguous ids, starting from 0, for each non-terminal factor */ + size_t m_factorId; /**< unique, contiguous ids, starting from moses_MaxNumNonterminals, for each terminal factor */ + + //! constructor. only the 1 static variable can be created + FactorCollection() : + m_factorIdNonTerminal(0), m_factorId(moses_MaxNumNonterminals) { + } + +public: + ~FactorCollection(); + + /** returns a factor with the same direction, factorType and factorString. 
+ * If a factor already exist in the collection, return the existing factor, if not create a new 1 + */ + const Factor *AddFactor(const StringPiece &factorString, const System &system, + bool isNonTerminal); + + size_t GetNumNonTerminals() { + return m_factorIdNonTerminal; + } + + const Factor *GetFactor(const StringPiece &factorString, bool isNonTerminal = + false); + +}; + +} + diff --git a/mosesdecoder/moses2/legacy/InputFileStream.cpp b/mosesdecoder/moses2/legacy/InputFileStream.cpp new file mode 100644 index 0000000000000000000000000000000000000000..25bb156fe89482ea699b6de361736419dee646d8 --- /dev/null +++ b/mosesdecoder/moses2/legacy/InputFileStream.cpp @@ -0,0 +1,59 @@ +// $Id$ + +/*********************************************************************** + Moses - factored phrase-based language decoder + Copyright (C) 2006 University of Edinburgh + + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. 
+ + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + ***********************************************************************/ + +#include "InputFileStream.h" +#include "gzfilebuf.h" +#include + +using namespace std; + +namespace Moses2 +{ + +InputFileStream::InputFileStream(const std::string &filePath) : + std::istream(NULL), m_streambuf(NULL) +{ + if (filePath.size() > 3 && filePath.substr(filePath.size() - 3, 3) == ".gz") { + m_streambuf = new gzfilebuf(filePath.c_str()); + } else { + std::filebuf* fb = new std::filebuf(); + fb = fb->open(filePath.c_str(), std::ios::in); + if (!fb) { + cerr << "Can't read " << filePath.c_str() << endl; + exit(1); + } + m_streambuf = fb; + } + this->init(m_streambuf); +} + +InputFileStream::~InputFileStream() +{ + delete m_streambuf; + m_streambuf = NULL; +} + +void InputFileStream::Close() +{ +} + +} + diff --git a/mosesdecoder/moses2/legacy/InputFileStream.h b/mosesdecoder/moses2/legacy/InputFileStream.h new file mode 100644 index 0000000000000000000000000000000000000000..d8f78848c3c6dd75490918d21c655108d23a5802 --- /dev/null +++ b/mosesdecoder/moses2/legacy/InputFileStream.h @@ -0,0 +1,46 @@ +// $Id$ + +/*********************************************************************** + Moses - factored phrase-based language decoder + Copyright (C) 2006 University of Edinburgh + + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + ***********************************************************************/ + +#pragma once + +#include +#include +#include + +namespace Moses2 +{ + +/** Used in place of std::istream, can read zipped files if it ends in .gz + */ +class InputFileStream: public std::istream +{ +protected: + std::streambuf *m_streambuf; +public: + + explicit InputFileStream(const std::string &filePath); + ~InputFileStream(); + + void Close(); +}; + +} + diff --git a/mosesdecoder/moses2/legacy/Matrix.cpp b/mosesdecoder/moses2/legacy/Matrix.cpp new file mode 100644 index 0000000000000000000000000000000000000000..9d2abc8ab819526a7be951bfe0c5f37600425ff1 --- /dev/null +++ b/mosesdecoder/moses2/legacy/Matrix.cpp @@ -0,0 +1,34 @@ +// $Id$ +// vim:tabstop=2 + +/*********************************************************************** + Moses - factored phrase-based language decoder + Copyright (C) 2006 University of Edinburgh + + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. 
+ + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + ***********************************************************************/ + +#include +#include +#include "Matrix.h" +#include "Util2.h" + +using namespace std; + +namespace Moses2 +{ + +} + diff --git a/mosesdecoder/moses2/legacy/Matrix.h b/mosesdecoder/moses2/legacy/Matrix.h new file mode 100644 index 0000000000000000000000000000000000000000..e2dbbba2c0b07bee69905fdf2cc865bb3ae624d1 --- /dev/null +++ b/mosesdecoder/moses2/legacy/Matrix.h @@ -0,0 +1,97 @@ +// $Id$ + +/*********************************************************************** + Moses - factored phrase-based language decoder + Copyright (C) 2006 University of Edinburgh + + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. 
+ + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + ***********************************************************************/ + +#pragma once + +#include +#include "Util2.h" +#include "../MemPool.h" + +namespace Moses2 +{ +template +class Matrix +{ +protected: + size_t m_rows, m_cols; /**< length of the square (sentence length) */ + T *m_array; /**< two-dimensional array to store floats */ + + Matrix(); // not implemented + Matrix(const Matrix ©); // not implemented + +public: + Matrix(MemPool &pool, size_t rows, size_t cols) : + m_rows(rows), m_cols(cols) { + m_array = pool.Allocate(rows * cols); + } + + //~Matrix(); // not implemented + + // set upper triangle + void InitTriangle(const T &val) { + assert(m_rows == m_cols); + for (size_t row = 0; row < m_rows; row++) { + for (size_t col = row; col < m_cols; col++) { + SetValue(row, col, val); + } + } + } + + // everything + void Init(const T &val) { + for (size_t row = 0; row < m_rows; row++) { + for (size_t col = 0; col < m_cols; col++) { + SetValue(row, col, val); + } + } + } + + /** Returns length of the square: typically the sentence length */ + inline size_t GetSize() const { + assert(m_rows == m_cols); + return m_rows; + } + + inline size_t GetRows() const { + return m_rows; + } + + inline size_t GetCols() const { + return m_cols; + } + + /** Get a future cost score for a span */ + inline const T &GetValue(size_t row, size_t col) const { + return m_array[row * m_cols + col]; + } + + inline T &GetValue(size_t row, size_t col) { + return m_array[row * m_cols + col]; + } + + /** Set a future cost score for a span */ + inline void SetValue(size_t row, size_t col, const T &value) { + m_array[row * m_cols + col] = value; + } +}; + +} + diff --git a/mosesdecoder/moses2/legacy/OutputCollector.h b/mosesdecoder/moses2/legacy/OutputCollector.h new file 
mode 100644 index 0000000000000000000000000000000000000000..fdd54c5a2f84416c3cbeb4f18337c9bd43add317 --- /dev/null +++ b/mosesdecoder/moses2/legacy/OutputCollector.h @@ -0,0 +1,152 @@ +/*********************************************************************** + Moses - factored phrase-based language decoder + Copyright (C) 2011 University of Edinburgh + + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + ***********************************************************************/ + +#pragma once + +#ifdef WITH_THREADS +#include +#endif + +#ifdef BOOST_HAS_PTHREADS +#include +#endif + +#include +#include +#include +#include +#include +#include "util/exception.hh" + +namespace Moses2 +{ +/** + * Makes sure output goes in the correct order when multi-threading + **/ +class OutputCollector +{ +public: + OutputCollector(std::ostream* outStream = &std::cout, + std::ostream* debugStream = &std::cerr) : + m_nextOutput(0), m_outStream(outStream), m_debugStream(debugStream), m_isHoldingOutputStream( + false), m_isHoldingDebugStream(false) { + } + + OutputCollector(std::string xout, std::string xerr = "") : + m_nextOutput(0) { + // TO DO open magic streams instead of regular ofstreams! 
[UG] + + if (xout == "/dev/stderr") { + m_outStream = &std::cerr; + m_isHoldingOutputStream = false; + } else if (xout.size() && xout != "/dev/stdout" && xout != "-") { + m_outStream = new std::ofstream(xout.c_str()); + UTIL_THROW_IF2(!m_outStream->good(), + "Failed to open output file" << xout); + m_isHoldingOutputStream = true; + } else { + m_outStream = &std::cout; + m_isHoldingOutputStream = false; + } + + if (xerr == "/dev/stdout") { + m_debugStream = &std::cout; + m_isHoldingDebugStream = false; + } else if (xerr.size() && xerr != "/dev/stderr") { + m_debugStream = new std::ofstream(xerr.c_str()); + UTIL_THROW_IF2(!m_debugStream->good(), + "Failed to open debug stream" << xerr); + m_isHoldingDebugStream = true; + } else { + m_debugStream = &std::cerr; + m_isHoldingDebugStream = false; + } + } + + ~OutputCollector() { + if (m_isHoldingOutputStream) delete m_outStream; + if (m_isHoldingDebugStream) delete m_debugStream; + } + + void HoldOutputStream() { + m_isHoldingOutputStream = true; + } + + void HoldDebugStream() { + m_isHoldingDebugStream = true; + } + + bool OutputIsCout() const { + return (m_outStream == &std::cout); + } + + /** + * Write or cache the output, as appropriate. 
+ **/ + void Write(int sourceId, const std::string& output, const std::string& debug = + "") { +#ifdef WITH_THREADS + boost::mutex::scoped_lock lock(m_mutex); +#endif + if (sourceId == m_nextOutput) { + //This is the one we were expecting + *m_outStream << output << std::flush; + *m_debugStream << debug << std::flush; + ++m_nextOutput; + //see if there's any more + std::map::iterator iter; + while ((iter = m_outputs.find(m_nextOutput)) != m_outputs.end()) { + *m_outStream << iter->second << std::flush; + ++m_nextOutput; + std::map::iterator debugIter = m_debugs.find( + iter->first); + m_outputs.erase(iter); + if (debugIter != m_debugs.end()) { + *m_debugStream << debugIter->second << std::flush; + m_debugs.erase(debugIter); + } + } + } else { + //save for later + m_outputs[sourceId] = output; + m_debugs[sourceId] = debug; + } + } + +private: + std::map m_outputs; + std::map m_debugs; + int m_nextOutput; + std::ostream* m_outStream; + std::ostream* m_debugStream; + bool m_isHoldingOutputStream; + bool m_isHoldingDebugStream; +#ifdef WITH_THREADS + boost::mutex m_mutex; +#endif + +public: + void SetOutputStream(std::ostream* outStream) { + m_outStream = outStream; + } + +}; + +} // namespace Moses + diff --git a/mosesdecoder/moses2/legacy/OutputFileStream.cpp b/mosesdecoder/moses2/legacy/OutputFileStream.cpp new file mode 100644 index 0000000000000000000000000000000000000000..81047ffe1a0f842c1244b962279b14cef69bbd7f --- /dev/null +++ b/mosesdecoder/moses2/legacy/OutputFileStream.cpp @@ -0,0 +1,87 @@ +// $Id: OutputFileStream.cpp 2780 2010-01-29 17:11:17Z bojar $ + +/*********************************************************************** + Moses - factored phrase-based language decoder + Copyright (C) 2006 University of Edinburgh + + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) 
any later version. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + ***********************************************************************/ + +#include +#include +#include +#include "OutputFileStream.h" +#include "gzfilebuf.h" + +using namespace std; +using namespace boost::algorithm; + +namespace Moses2 +{ +OutputFileStream::OutputFileStream() : + boost::iostreams::filtering_ostream(), m_outFile(NULL), m_open(false) +{ +} + +OutputFileStream::OutputFileStream(const std::string &filePath) : + m_outFile(NULL), m_open(false) +{ + Open(filePath); +} + +OutputFileStream::~OutputFileStream() +{ + Close(); +} + +bool OutputFileStream::Open(const std::string &filePath) +{ + assert(!m_open); + if (filePath == std::string("-")) { + // Write to standard output. Leave m_outFile null. 
+ this->push(std::cout); + } else { + m_outFile = new ofstream(filePath.c_str(), + ios_base::out | ios_base::binary); + if (m_outFile->fail()) { + return false; + } + + if (ends_with(filePath, ".gz")) { + this->push(boost::iostreams::gzip_compressor()); + } + this->push(*m_outFile); + } + + m_open = true; + return true; +} + +void OutputFileStream::Close() +{ + if (!m_open) return; + this->flush(); + if (m_outFile) { + this->pop(); // file + + m_outFile->close(); + delete m_outFile; + m_outFile = NULL; + } + m_open = false; +} + +} + diff --git a/mosesdecoder/moses2/legacy/OutputFileStream.h b/mosesdecoder/moses2/legacy/OutputFileStream.h new file mode 100644 index 0000000000000000000000000000000000000000..27c0b453905c0c3f182f8437dac8f0197d0146ca --- /dev/null +++ b/mosesdecoder/moses2/legacy/OutputFileStream.h @@ -0,0 +1,81 @@ +// $Id: InputFileStream.h 2939 2010-02-24 11:15:44Z jfouet $ + +/*********************************************************************** + Moses - factored phrase-based language decoder + Copyright (C) 2006 University of Edinburgh + + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. 
/** Version of std::ostream with transparent compression.
 *
 * Transparently compresses output when writing to a file whose name ends in
 * ".gz".  Or, writes to stdout instead of a file when given a filename
 * consisting of just a dash ("-").
 *
 * Built on boost::iostreams::filtering_ostream: opening pushes the optional
 * compressor and the target onto the filter chain.
 */
class OutputFileStream: public boost::iostreams::filtering_ostream
{
private:
  /** File that needs flushing & closing when we close this stream.
   *
   * Is NULL when no file is opened, e.g. when writing to standard output.
   * Held as an owning raw pointer.
   */
  std::ofstream *m_outFile;

  /// Is this stream open?
  bool m_open;

public:
  /** Create an unopened OutputFileStream.
   *
   * Until it's been opened, nothing can be done with this stream.
   */
  OutputFileStream();

  /// Create an OutputFileStream, and open it by calling Open().
  /// The constructor in OutputFileStream.cpp discards Open()'s return value,
  /// so a failed open is not reported to the caller.
  OutputFileStream(const std::string &filePath);
  virtual ~OutputFileStream();

  // TODO: Can we please just always throw an exception when this fails?
  /** Open stream.
   *
   * If filePath is "-" (just a dash), this opens the stream for writing to
   * standard output.  Otherwise, it opens the given file.  If the filename
   * has the ".gz" suffix, output will be transparently compressed.
   *
   * Must not be called on a stream that is already open.
   *
   * Call Close() to close the file.
   *
   * Returns whether opening the file was successful.  It may also throw an
   * exception on failure.
   */
  bool Open(const std::string &filePath);

  /// Flush and close stream.  After this, the stream can be opened again.
  /// Does nothing if the stream is not open.
  void Close();
};
+ + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + ***********************************************************************/ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "Parameter.h" +#include "InputFileStream.h" +#include "../FF/FeatureRegistry.h" +#include "util/string_stream.hh" +#include "util/exception.hh" +#include "util/random.hh" + +using namespace std; +using namespace boost::algorithm; +namespace po = boost::program_options; + +namespace Moses2 +{ + +/** define allowed parameters */ +Parameter::Parameter() +{ + /////////////////////////////////////////////////////////////////////////////////////// + // general options + po::options_description main_opts("Main Options"); + AddParam(main_opts, "config", "f", "location of the configuration file"); + AddParam(main_opts, "input-file", "i", + "location of the input file to be translated"); + + AddParam(main_opts, "verbose", "v", "verbosity level of the logging"); + AddParam(main_opts, "show-weights", "print feature weights and exit"); + //AddParam(main_opts, "time-out", + // "seconds after which is interrupted (-1=no time-out, default is -1)"); + + /////////////////////////////////////////////////////////////////////////////////////// + // factorization options + po::options_description factor_opts("General Factorization Options"); + //AddParam(factor_opts, "factor-delimiter", "fd", + // "specify a different factor delimiter than the default"); + // one should be able to specify different factor delimiters for intput and output + AddParam(factor_opts, "mapping", "description of decoding steps"); // whatever that means ... + AddParam(factor_opts, "placeholder-factor", + "Which source factor to use to store the original text for placeholders. 
The factor must not be used by a translation or gen model"); + + /////////////////////////////////////////////////////////////////////////////////////// + // general search options + po::options_description search_opts("Search Options"); + string desc = "Which search algorithm to use.\n"; + desc += "0=normal stack (default)\n"; + desc += "1=cube pruning\n"; + desc += "3=chart (with cube pruning)\n"; + desc += "4=stack with batched lm requests\n"; + desc += "5=chart (with incremental search)\n"; + desc += "6=string-to-tree\n"; + desc += "7=tree-to-string\n"; + desc += "8=tree-to-string (SCFG-based)\n"; + desc += "9=forest-to-string"; + AddParam(search_opts, "search-algorithm", desc); + AddParam(search_opts, "beam-threshold", "b", + "threshold for threshold pruning"); + //AddParam(search_opts, "early-discarding-threshold", "edt", + // "threshold for constructing hypotheses based on estimate cost"); + AddParam(search_opts, "stack", "s", + "maximum stack size for histogram pruning. 0 = unlimited stack size"); + //AddParam(search_opts, "stack-diversity", "sd", + // "minimum number of hypothesis of each coverage in stack (default 0)"); + + // feature weight-related options + //AddParam(search_opts, "weight-file", "wf", + // "feature weights file. Do *not* put weights for 'core' features in here - they go in moses.ini"); + AddParam(search_opts, "weight", + "weights for ALL models, 1 per line 'WeightName value'. Weight names can be repeated"); + + AddParam(search_opts, "feature-overwrite", + "Override arguments in a particular feature function with a particular key. Format: -feature-overwrite \"FeatureName key=value\""); + + po::options_description tune_opts("Options used in tuning."); + AddParam(tune_opts, "weight-overwrite", + "special parameter for mert. All on 1 line. Overrides weights specified in 'weights' argument"); + AddParam(tune_opts, "feature-add", + "Add a feature function on the command line. 
Used by mira to add BLEU feature"); + AddParam(tune_opts, "weight-add", + "Add weight for FF if it doesn't exist, i.e weights here are added 1st, and can be override by the ini file or on the command line. Used to specify initial weights for FF that was also specified on the copmmand line"); + + // phrase table limitations: + //AddParam(search_opts, "max-partial-trans-opt", + // "maximum number of partial translation options per input span (during mapping steps)"); + //AddParam(search_opts, "max-trans-opt-per-coverage", + // "maximum number of translation options per input span (after applying mapping steps)"); + AddParam(search_opts, "max-phrase-length", + "maximum phrase length (default 20)"); + //AddParam(search_opts, "translation-option-threshold", "tot", + // "threshold for translation options relative to best for input phrase"); + + // miscellaneous search options + //AddParam(search_opts, "disable-discarding", "dd", + // "disable hypothesis discarding"); // ??? memory management? UG + //AddParam(search_opts, "phrase-drop-allowed", "da", + // "if present, allow dropping of source words"); //da = drop any (word); see -du for comparison + AddParam(search_opts, "threads", "th", + "number of threads to use in decoding (defaults to single-threaded)"); + + // distortion options + po::options_description disto_opts("Distortion options"); + AddParam(disto_opts, "distortion-limit", "dl", + "distortion (reordering) limit in maximum number of words (0 = monotone, -1 = unlimited)"); + AddParam(disto_opts, "monotone-at-punctuation", "mp", + "do not reorder over punctuation"); + //AddParam(disto_opts, "early-distortion-cost", "edc", + // "include estimate of distortion cost yet to be incurred in the score [Moore & Quirk 2007]. Default is no"); + //AddParam(disto_opts, "distortion", + // "configurations for each factorized/lexicalized reordering model."); // zombie parameter? 
+ + // cube pruning + po::options_description cube_opts("Cube pruning options."); + AddParam(cube_opts, "cube-pruning-pop-limit", "cbp", + "How many hypotheses should be popped for each stack. (default = 1000)"); + AddParam(cube_opts, "cube-pruning-diversity", "cbd", + "How many hypotheses should be created for each coverage. (default = 0)"); + AddParam(cube_opts, "cube-pruning-lazy-scoring", "cbls", + "Don't fully score a hypothesis until it is popped"); + //AddParam(cube_opts, "cube-pruning-deterministic-search", "cbds", + // "Break ties deterministically during search"); + + /////////////////////////////////////////////////////////////////////////////////////// + // minimum bayes risk decoding + po::options_description mbr_opts( + "Minimum Bayes Risk (MBR), Lattice MBR, and Consensus decoding"); + + //AddParam(mbr_opts, "minimum-bayes-risk", "mbr", + // "use miminum Bayes risk to determine best translation"); + //AddParam(mbr_opts, "mbr-size", + // "number of translation candidates considered in MBR decoding (default 200)"); + //AddParam(mbr_opts, "mbr-scale", + // "scaling factor to convert log linear score probability in MBR decoding (default 1.0)"); + + //AddParam(mbr_opts, "lminimum-bayes-risk", "lmbr", + // "use lattice miminum Bayes risk to determine best translation"); + //AddParam(mbr_opts, "consensus-decoding", "con", + // "use consensus decoding (De Nero et. al. 
2009)"); + + po::options_description lmbr_opts("Options specific to Lattic MBR"); + //AddParam(lmbr_opts, "lmbr-p", "unigram precision value for lattice mbr"); + //AddParam(lmbr_opts, "lmbr-r", "ngram precision decay value for lattice mbr"); + //AddParam(lmbr_opts, "lmbr-thetas", "theta(s) for lattice mbr calculation"); + //AddParam(mbr_opts, "lmbr-map-weight", + // "weight given to map solution when doing lattice MBR (default 0)"); + //AddParam(mbr_opts, "lmbr-pruning-factor", + // "average number of nodes/word wanted in pruned lattice"); + //AddParam(mbr_opts, "lattice-hypo-set", + // "to use lattice as hypo set during lattice MBR"); + + /////////////////////////////////////////////////////////////////////////////////////// + // OOV handling options + po::options_description oov_opts("OOV Handling Options"); + AddParam(oov_opts, "drop-unknown", "du", + "drop unknown words instead of copying them"); + AddParam(oov_opts, "mark-unknown", "mu", "mark unknown words in output"); + AddParam(oov_opts, "unknown-word-prefix", + "prefix to unknwon word when marked (default: 'UNK')"); + AddParam(oov_opts, "unknown-word-suffix", + "suffix to unknwon word when marked (default: '')"); + //AddParam(oov_opts, "lmodel-oov-feature", + // "add language model oov feature, one per model"); + //AddParam(oov_opts, "output-unknowns", + // "Output the unknown (OOV) words to the given file, one line per sentence"); + //AddParam(oov_opts, "always-create-direct-transopt", + // "Always create a translation that translates the source word ad-verbatim"); + + /////////////////////////////////////////////////////////////////////////////////////// + // input options + po::options_description input_opts("Input Format Options"); + AddParam(input_opts, "input-factors", "list of factors in the input"); + AddParam(input_opts, "inputtype", + "text (0), confusion network (1), word lattice (2), tree (3) (default = 0)"); + AddParam(input_opts, "xml-input", "xi", + "allows markup of input with desired 
translations and probabilities. values can be 'pass-through' (default), 'inclusive', 'exclusive', 'constraint', 'ignore'"); + //AddParam(input_opts, "xml-brackets", "xb", + // "specify strings to be used as xml tags opening and closing, e.g. \"{{ }}\" (default \"< >\"). Avoid square brackets because of configuration file format. Valid only with text input mode"); + //AddParam(input_opts, "start-translation-id", "Id of 1st input. Default = 0"); + //AddParam(input_opts, "alternate-weight-setting", "aws", + // "alternate set of weights to used per xml specification"); + + /////////////////////////////////////////////////////////////////////////////////////// + // output options + po::options_description output_opts("Output Options"); + //AddParam(output_opts, "report-all-factors", + // "report all factors in output, not just first"); + AddParam(output_opts, "output-factors", "list if factors in the output"); + //AddParam(output_opts, "print-id", + // "prefix translations with id. Default if false"); + //AddParam(output_opts, "print-passthrough", + // "output the sgml tag without any computation on that. Default is false"); + //AddParam(output_opts, "print-passthrough-in-n-best", + // "output the sgml tag without any computation on that in each entry of the n-best-list. Default is false"); + //AddParam(output_opts, "print-all-derivations", + // "to print all derivations in search graph"); + AddParam(output_opts, "translation-details", "T", + "for each best hypothesis, report translation details to the given file"); + + AddParam(output_opts, "output-hypo-score", + "Output the hypo score to stdout with the output string. For search error analysis. Default is false"); + //AddParam(output_opts, "output-word-graph", "owg", + // "Output stack info as word graph. 
Takes filename, 0=only hypos in stack, 1=stack + nbest hypos"); + //AddParam(output_opts, "tree-translation-details", "Ttree", + // "for each hypothesis, report translation details with tree fragment info to given file"); + //AddParam(output_opts, "print-alignment-info", + // "Output word-to-word alignment to standard out, separated from translation by |||. Word-to-word alignments are takne from the phrase table if any. Default is false"); + //AddParam(output_opts, "alignment-output-file", + // "print output word alignments into given file"); + //AddParam(output_opts, "sort-word-alignment", + // "Sort word alignments for more consistent display. 0=no sort (default), 1=target order"); + AddParam(output_opts, "report-segmentation", "t", + "report phrase segmentation in the output"); + AddParam(output_opts, "report-segmentation-enriched", "tt", + "report phrase segmentation in the output with additional information"); + + // translation-all-details was introduced in the context of DIMwid: Decoder Inspection for Moses (using Widgets) + // see here: https://ufal.mff.cuni.cz/pbml/100/art-kurtz-seemann-braune-maletti.pdf + //AddParam(output_opts, "translation-all-details", "Tall", + // "for all hypotheses, report translation details to the given file"); + + po::options_description osg_opts("Options for outputting search graphs"); + //AddParam(osg_opts, "output-search-graph", "osg", + // "Output connected hypotheses of search into specified filename"); + //AddParam(osg_opts, "output-search-graph-extended", "osgx", + // "Output connected hypotheses of search into specified filename, in extended format"); + //AddParam(osg_opts, "unpruned-search-graph", "usg", + // "When outputting chart search graph, do not exclude dead ends. 
Note: stack pruning may have eliminated some hypotheses"); + //AddParam(osg_opts, "output-search-graph-slf", "slf", + // "Output connected hypotheses of search into specified directory, one file per sentence, in HTK standard lattice format (SLF) - the flag should be followed by a directory name, which must exist"); + //AddParam(output_opts, "include-lhs-in-search-graph", "lhssg", + // "When outputting chart search graph, include the label of the LHS of the rule (useful when using syntax)"); +#ifdef HAVE_PROTOBUF + //AddParam(osg_opts,"output-search-graph-pb", "pb", "Write phrase lattice to protocol buffer objects in the specified path."); +#endif + //AddParam(osg_opts, "output-search-graph-hypergraph", + // "DEPRECATED! Output connected hypotheses of search into specified directory, one file per sentence, in a hypergraph format (see Kenneth Heafield's lazy hypergraph decoder). This flag is followed by 3 values: 'true (gz|txt|bz) directory-name'"); + + /////////////////////////////////////////////////////////////////////////////////////// + // nbest-options + po::options_description nbest_opts("N-best Options"); + AddParam(nbest_opts, "n-best-list", + "file and size of n-best-list to be generated; specify - as the file in order to write to STDOUT"); + // AddParam(nbest_opts,"n-best-list-file", "file of n-best-list to be generated; specify - as the file in order to write to STDOUT"); + // AddParam(nbest_opts,"n-best-list-size", "size of n-best-list to be generated; specify - as the file in order to write to STDOUT"); + //AddParam(nbest_opts, "labeled-n-best-list", + // "print out labels for each weight type in n-best list. default is true"); + //AddParam(nbest_opts, "n-best-trees", + // "Write n-best target-side trees to n-best-list"); + AddParam(nbest_opts, "n-best-factor", + "factor to compute the maximum number of contenders (=factor*nbest-size). value 0 means infinity, i.e. no threshold. 
default is 0"); + //AddParam(nbest_opts, "report-all-factors-in-n-best", + // "Report all factors in n-best-lists. Default is false"); + //AddParam(nbest_opts, "lattice-samples", + // "generate samples from lattice, in same format as nbest list. Uses the file and size arguments, as in n-best-list"); + //AddParam(nbest_opts, "include-segmentation-in-n-best", + // "include phrasal segmentation in the n-best list. default is false"); + //AddParam(nbest_opts, "print-alignment-info-in-n-best", + // "Include word-to-word alignment in the n-best list. Word-to-word alignments are taken from the phrase table if any. Default is false"); + + /////////////////////////////////////////////////////////////////////////////////////// + // server options + po::options_description server_opts("Moses Server Options"); + AddParam(server_opts, "server", "Run moses as a translation server."); + AddParam(server_opts, "server-port", "Port for moses server"); + AddParam(server_opts, "server-log", "Log destination for moses server"); + //AddParam(server_opts, "session-timeout", + // "Timeout for sessions, e.g. '2h30m' or 1d (=24h)"); + //AddParam(server_opts, "session-cache-size", + // string("Max. number of sessions cached.") + // + "Least recently used session is dumped first."); + AddParam(server_opts, "serial", + "Run server in serial mode, processing only one request at a time."); + + AddParam(server_opts,"server-maxconn", + "Max. No of simultaneous HTTP transactions allowed by the server."); + AddParam(server_opts,"server-maxconn-backlog", + "Max. No. of requests the OS will queue if the server is busy."); + AddParam(server_opts,"server-keepalive-maxconn", + "Max. No. of requests the server will accept on a single TCP connection."); + AddParam(server_opts,"server-keepalive-timeout", + "Max. number of seconds the server will keep a persistent connection alive."); + AddParam(server_opts,"server-timeout", + "Max. 
number of seconds the server will wait for a client to submit a request once a connection has been established."); + + po::options_description irstlm_opts("IRSTLM Options"); + //AddParam(irstlm_opts, "clean-lm-cache", + // "clean language model caches after N translations (default N=1)"); + + po::options_description chart_opts("Chart Decoding Options"); + AddParam(chart_opts, "max-chart-span", + "maximum num. of source word chart rules can consume (default 10)"); + AddParam(chart_opts, "non-terminals", + "list of non-term symbols, space separated"); + //AddParam(chart_opts, "rule-limit", + // "a little like table limit. But for chart decoding rules. Default is DEFAULT_MAX_TRANS_OPT_SIZE"); + //AddParam(chart_opts, "source-label-overlap", + // "What happens if a span already has a label. 0=add more. 1=replace. 2=discard. Default is 0"); + //AddParam(chart_opts, "unknown-lhs", + // "file containing target lhs of unknown words. 1 per line: LHS prob"); + + po::options_description misc_opts("Miscellaneous Options"); + //AddParam(misc_opts, "mira", "do mira training"); + //AddParam(misc_opts, "description", + // "Source language, target language, description"); + //AddParam(misc_opts, "no-cache", + // "Disable all phrase-table caching. Default = false (ie. enable caching)"); + //AddParam(misc_opts, "default-non-term-for-empty-range-only", + // "Don't add [X] to all ranges, just ranges where there isn't a source non-term. Default = false (ie. add [X] everywhere)"); + //AddParam(misc_opts, "s2t-parsing-algorithm", + // "Which S2T parsing algorithm to use. 
0=recursive CYK+, 1=scope-3 (default = 0)"); + + //AddParam(o,"continue-partial-translation", "cpt", "start from nonempty hypothesis"); + AddParam(misc_opts, "decoding-graph-backoff", "dpb", + "only use subsequent decoding paths for unknown spans of given length"); + //AddParam(misc_opts, "references", + // "Reference file(s) - used for bleu score feature"); + //AddParam(misc_opts, "recover-input-path", "r", + // "(conf net/word lattice only) - recover input path corresponding to the best translation"); + //AddParam(misc_opts, "link-param-count", + // "Number of parameters on word links when using confusion networks or lattices (default = 1)"); + //AddParam(misc_opts, "feature-name-overwrite", + // "Override feature name (NOT arguments). Eg. SRILM-->KENLM, PhraseDictionaryMemory-->PhraseDictionaryScope3"); + + AddParam(misc_opts, "feature", "All the feature functions should be here"); + //AddParam(misc_opts, "context-string", + // "A (tokenized) string containing context words for context-sensitive translation."); + //AddParam(misc_opts, "context-weights", + // "A key-value map for context-sensitive translation."); + //AddParam(misc_opts, "context-window", + // "Context window (in words) for context-sensitive translation: {+|-|+-}."); + AddParam(misc_opts, "cpu-affinity-offset", "CPU Affinity. Default = -1 (no affinity)"); + AddParam(misc_opts, "cpu-affinity-increment", + "Set to 1 (default) to put each thread on different cores. 0 to run all threads on one core"); + + // Compact phrase table and reordering table. + po::options_description cpt_opts( + "Options when using compact phrase and reordering tables."); + //AddParam(cpt_opts, "minphr-memory", + // "Load phrase table in minphr format into memory"); + //AddParam(cpt_opts, "minlexr-memory", + // "Load lexical reordering table in minlexr format into memory"); + + po::options_description spe_opts("Simulated Post-editing Options"); + //AddParam(spe_opts, "spe-src", "Simulated post-editing. 
Source filename"); + //AddParam(spe_opts, "spe-trg", "Simulated post-editing. Target filename"); + //AddParam(spe_opts, "spe-aln", "Simulated post-editing. Alignment filename"); + + /////////////////////////////////////////////////////////////////////////////////////// + // DEPRECATED options + po::options_description deprec_opts("Deprecated Options"); + AddParam(deprec_opts, "text-type", + "DEPRECATED. DO NOT USE. should be one of dev/devtest/test, used for domain adaptation features"); + + /* + AddParam(deprec_opts, "link-param-count", + "DEPRECATED. DO NOT USE. Number of parameters on word links when using confusion networks or lattices (default = 1)"); + AddParam(deprec_opts, "weight-slm", "slm", + "DEPRECATED. DO NOT USE. weight(s) for syntactic language model"); + AddParam(deprec_opts, "weight-bl", "bl", + "DEPRECATED. DO NOT USE. weight for bleu score feature"); + AddParam(deprec_opts, "weight-d", "d", + "DEPRECATED. DO NOT USE. weight(s) for distortion (reordering components)"); + AddParam(deprec_opts, "weight-dlm", "dlm", + "DEPRECATED. DO NOT USE. weight for discriminative LM feature function (on top of sparse weights)"); + AddParam(deprec_opts, "weight-lr", "lr", + "DEPRECATED. DO NOT USE. weight(s) for lexicalized reordering, if not included in weight-d"); + AddParam(deprec_opts, "weight-generation", "g", + "DEPRECATED. DO NOT USE. weight(s) for generation components"); + AddParam(deprec_opts, "weight-i", "I", + "DEPRECATED. DO NOT USE. weight(s) for word insertion - used for parameters from confusion network and lattice input links"); + AddParam(deprec_opts, "weight-l", "lm", + "DEPRECATED. DO NOT USE. weight(s) for language models"); + AddParam(deprec_opts, "weight-lex", "lex", + "DEPRECATED. DO NOT USE. weight for global lexical model"); + AddParam(deprec_opts, "weight-glm", "glm", + "DEPRECATED. DO NOT USE. weight for global lexical feature, sparse producer"); + AddParam(deprec_opts, "weight-wt", "wt", + "DEPRECATED. DO NOT USE. 
weight for word translation feature"); + AddParam(deprec_opts, "weight-pp", "pp", + "DEPRECATED. DO NOT USE. weight for phrase pair feature"); + AddParam(deprec_opts, "weight-pb", "pb", + "DEPRECATED. DO NOT USE. weight for phrase boundary feature"); + AddParam(deprec_opts, "weight-t", "tm", + "DEPRECATED. DO NOT USE. weights for translation model components"); + AddParam(deprec_opts, "weight-p", "w", + "DEPRECATED. DO NOT USE. weight for phrase penalty"); + AddParam(deprec_opts, "weight-w", "w", + "DEPRECATED. DO NOT USE. weight for word penalty"); + AddParam(deprec_opts, "weight-u", "u", + "DEPRECATED. DO NOT USE. weight for unknown word penalty"); + AddParam(deprec_opts, "weight-e", "e", + "DEPRECATED. DO NOT USE. weight for word deletion"); + AddParam(deprec_opts, "input-scores", + "DEPRECATED. DO NOT USE. 2 numbers on 2 lines - [1] of scores on each edge of a confusion network or lattice input (default=1). [2] Number of 'real' word scores (0 or 1. default=0)"); + AddParam(deprec_opts, "dlm-model", + "DEPRECATED. DO NOT USE. Order, factor and vocabulary file for discriminative LM. Use * for filename to indicate unlimited vocabulary."); + AddParam(deprec_opts, "generation-file", + "DEPRECATED. DO NOT USE. location and properties of the generation table"); + AddParam(deprec_opts, "global-lexical-file", "gl", + "DEPRECATED. DO NOT USE. discriminatively trained global lexical translation model file"); + AddParam(deprec_opts, "glm-feature", + "DEPRECATED. DO NOT USE. discriminatively trained global lexical translation feature, sparse producer"); + AddParam(deprec_opts, "lmodel-file", + "DEPRECATED. DO NOT USE. location and properties of the language models"); + AddParam(deprec_opts, "lmodel-dub", + "DEPRECATED. DO NOT USE. dictionary upper bounds of language models"); + #ifdef HAVE_SYNLM + AddParam(deprec_opts,"slmodel-file", "DEPRECATED. DO NOT USE. location of the syntactic language model file(s)"); + AddParam(deprec_opts,"slmodel-factor", "DEPRECATED. DO NOT USE. 
factor to use with syntactic language model"); + AddParam(deprec_opts,"slmodel-beam", "DEPRECATED. DO NOT USE. beam width to use with syntactic language model's parser"); + #endif + AddParam(deprec_opts, "ttable-file", + "DEPRECATED. DO NOT USE. location and properties of the translation tables"); + AddParam(deprec_opts, "phrase-pair-feature", + "DEPRECATED. DO NOT USE. Source and target factors for phrase pair feature"); + AddParam(deprec_opts, "phrase-boundary-source-feature", + "DEPRECATED. DO NOT USE. Source factors for phrase boundary feature"); + AddParam(deprec_opts, "phrase-boundary-target-feature", + "DEPRECATED. DO NOT USE. Target factors for phrase boundary feature"); + AddParam(deprec_opts, "phrase-length-feature", + "DEPRECATED. DO NOT USE. Count features for source length, target length, both of each phrase"); + AddParam(deprec_opts, "target-word-insertion-feature", + "DEPRECATED. DO NOT USE. Count feature for each unaligned target word"); + AddParam(deprec_opts, "source-word-deletion-feature", + "DEPRECATED. DO NOT USE. Count feature for each unaligned source word"); + AddParam(deprec_opts, "word-translation-feature", + "DEPRECATED. DO NOT USE. 
Count feature for word translation according to word alignment"); + */ + + po::options_description zombie_opts("Zombie Options"); + //AddParam(zombie_opts, "distortion-file", + // "source factors (0 if table independent of source), target factors, location of the factorized/lexicalized reordering tables"); + + //mbr_opts.add(lmbr_opts); + search_opts.add(cube_opts); + //search_opts.add(mbr_opts); + search_opts.add(disto_opts); + search_opts.add(chart_opts); + + //input_opts.add(spe_opts); + + output_opts.add(nbest_opts); + //output_opts.add(osg_opts); + + m_options.add(main_opts); + m_options.add(server_opts); + m_options.add(input_opts); + m_options.add(search_opts); + m_options.add(output_opts); + m_options.add(oov_opts); + m_options.add(factor_opts); + //m_options.add(cpt_opts); + //m_options.add(irstlm_opts); + m_options.add(tune_opts); + m_options.add(misc_opts); + //m_options.add(deprec_opts); + //m_options.add(zombie_opts); + +} + +Parameter::~Parameter() +{ +} + +const PARAM_VEC *Parameter::GetParam(const std::string ¶mName) const +{ + PARAM_MAP::const_iterator iter = m_setting.find(paramName); + if (iter == m_setting.end()) { + return NULL; + } else { + return &iter->second; + } + +} + +/** initialize a parameter, sub of constructor */ +void Parameter::AddParam(po::options_description& optgroup, + string const& paramName, string const& description) +{ + m_valid[paramName] = true; + m_description[paramName] = description; + optgroup.add_options()(paramName.c_str(), description.c_str()); +} + +/** initialize a parameter (including abbreviation), sub of constructor */ +void Parameter::AddParam(po::options_description& optgroup, + string const& paramName, string const& abbrevName, + string const& description) +{ + m_valid[paramName] = true; + m_valid[abbrevName] = true; + m_abbreviation[paramName] = abbrevName; + m_fullname[abbrevName] = paramName; + m_description[paramName] = description; + string optname = paramName; + if (abbrevName.size() == 1) { + optname 
+= string(",") + abbrevName; + // m_confusable[abbrevName[0]].insert(paramName); + } + optgroup.add_options()(optname.c_str(), description.c_str()); +} + +/** print descriptions of all parameters */ +void Parameter::Explain() +{ + cerr << "Usage:" << endl; + cerr << m_options << endl; + // for(PARAM_STRING::const_iterator iterParam = m_description.begin(); + // iterParam != m_description.end(); iterParam++) + // { + // const string paramName = iterParam->first; + // const string paramDescription = iterParam->second; + // cerr << "\t-" << paramName; + // PARAM_STRING::const_iterator iterAbbr = m_abbreviation.find( paramName ); + // if ( iterAbbr != m_abbreviation.end() ) + // cerr << " (" << iterAbbr->second << ")"; + // cerr << ": " << paramDescription << endl; + // } +} + +/** check whether an item on the command line is a switch or a value + * \param token token on the command line to checked **/ + +bool Parameter::isOption(const char* token) +{ + if (!token) return false; + std::string tokenString(token); + size_t length = tokenString.size(); + if (length <= 1) return false; + if (!starts_with(tokenString, "-")) return false; + if (tokenString.substr(1, 1).find_first_not_of("0123456789") == 0) return true; + return false; +} + +/** load all parameters from the configuration file and the command line switches */ +bool Parameter::LoadParam(const string &filePath) +{ + const char *argv[] = { "executable", "-f", filePath.c_str() }; + return LoadParam(3, (char**) argv); +} + +/** load all parameters from the configuration file and the command line switches */ +bool Parameter::LoadParam(int argc, char* xargv[]) +{ + // legacy parameter handling: all parameters are expected + // to start with a single dash + char **argv = (char**) alloca(argc * sizeof(char*)); + + for (int i = 0; i < argc; ++i) { + argv[i] = xargv[i]; + if (strlen(argv[i]) > 2 && argv[i][0] == '-' && argv[i][1] == '-') ++argv[i]; + } + + // config file (-f) arg mandatory + string configPath; + if 
((configPath = FindParam("-f", argc, argv)) == "" && (configPath = + FindParam("-config", argc, argv)) == "") { + PrintCredit(); + Explain(); + FeatureRegistry::Instance().PrintFF(); + + cerr << endl; + cerr << "No configuration file was specified. Use -config or -f"; + cerr << endl; + return false; + } else { + if (!ReadConfigFile(configPath)) { + std::cerr << "Could not read " << configPath; + return false; + } + } + + // overwrite parameters with values from switches + for (PARAM_STRING::const_iterator iterParam = m_description.begin(); + iterParam != m_description.end(); iterParam++) { + const string paramName = iterParam->first; + OverwriteParam("-" + paramName, paramName, argc, argv); + } + + // ... also shortcuts + for (PARAM_STRING::const_iterator iterParam = m_abbreviation.begin(); + iterParam != m_abbreviation.end(); iterParam++) { + const string paramName = iterParam->first; + const string paramShortName = iterParam->second; + OverwriteParam("-" + paramShortName, paramName, argc, argv); + } + + AddFeaturesCmd(); + + // logging of parameters that were set in either config or switch + int verbose = 1; + if (m_setting.find("verbose") != m_setting.end() + && m_setting["verbose"].size() > 0) verbose = Scan( + m_setting["verbose"][0]); + if (verbose >= 1) { // only if verbose + cerr << "Defined parameters (per moses.ini or switch):" << endl; + for (PARAM_MAP::const_iterator iterParam = m_setting.begin(); + iterParam != m_setting.end(); iterParam++) { + cerr << "\t" << iterParam->first << ": "; + for (size_t i = 0; i < iterParam->second.size(); i++) + cerr << iterParam->second[i] << " "; + cerr << endl; + } + } + + // don't mix old and new format + if ((GetParam("feature") || GetParam("weight")) + && (GetParam("weight-slm") || GetParam("weight-bl") + || GetParam("weight-d") || GetParam("weight-dlm") + || GetParam("weight-lrl") || GetParam("weight-generation") + || GetParam("weight-i") || GetParam("weight-l") + || GetParam("weight-lex") || GetParam("weight-glm") 
+ || GetParam("weight-wt") || GetParam("weight-pp") + || GetParam("weight-pb") || GetParam("weight-t") + || GetParam("weight-w") || GetParam("weight-p") + || GetParam("weight-u") || GetParam("weight-e") + || GetParam("dlm-mode") || GetParam("generation-file") + || GetParam("global-lexical-file") || GetParam("glm-feature") + || GetParam("lmodel-file") || GetParam("lmodel-dub") + || GetParam("slmodel-file") || GetParam("slmodel-factor") + || GetParam("slmodel-beam") || GetParam("ttable-file") + || GetParam("phrase-pair-feature") + || GetParam("phrase-boundary-source-feature") + || GetParam("phrase-boundary-target-feature") + || GetParam("phrase-length-feature") + || GetParam("target-word-insertion-feature") + || GetParam("source-word-deletion-feature") + || GetParam("word-translation-feature"))) { + UTIL_THROW(util::Exception, "Don't mix old and new ini file format"); + } + + // convert old weights args to new format + if (GetParam("feature") == NULL) { + ConvertWeightArgs(); + } + CreateWeightsMap(); + WeightOverwrite(); + + // check for illegal parameters + bool noErrorFlag = true; + for (int i = 0; i < argc; i++) { + if (isOption(argv[i])) { + string paramSwitch = (string) argv[i]; + string paramName = paramSwitch.substr(1); + if (m_valid.find(paramName) == m_valid.end()) { + std::cerr << "illegal switch: " << paramSwitch; + noErrorFlag = false; + } + } + } + + //Save("/tmp/moses.ini.new"); + + // check if parameters make sense + return Validate() && noErrorFlag; +} + +void Parameter::AddFeaturesCmd() +{ + const PARAM_VEC *params = GetParam("feature-add"); + if (params) { + PARAM_VEC::const_iterator iter; + for (iter = params->begin(); iter != params->end(); ++iter) { + const string &line = *iter; + AddFeature(line); + } + + m_setting.erase("feature-add"); + } +} + +std::vector Parameter::GetWeights(const std::string &name) +{ + std::vector ret = m_weights[name]; + + // cerr << "WEIGHT " << name << "="; + // for (size_t i = 0; i < ret.size(); ++i) { + // cerr << 
ret[i] << ","; + // } + // cerr << endl; + return ret; +} + +void Parameter::SetWeight(const std::string &name, size_t ind, float weight) +{ + PARAM_VEC &newWeights = m_setting["weight"]; + string line = name + SPrint(ind) + "= " + SPrint(weight); + newWeights.push_back(line); +} + +void Parameter::SetWeight(const std::string &name, size_t ind, + const vector &weights) +{ + PARAM_VEC &newWeights = m_setting["weight"]; + string line = name + SPrint(ind) + "="; + + for (size_t i = 0; i < weights.size(); ++i) { + line += " " + SPrint(weights[i]); + } + newWeights.push_back(line); +} + +void Parameter::AddWeight(const std::string &name, size_t ind, + const std::vector &weights) +{ + PARAM_VEC &newWeights = m_setting["weight"]; + + string sought = name + SPrint(ind) + "="; + for (size_t i = 0; i < newWeights.size(); ++i) { + string &line = newWeights[i]; + if (line.find(sought) == 0) { + // found existing weight, most likely to be input weights. Append to this line + for (size_t i = 0; i < weights.size(); ++i) { + line += " " + SPrint(weights[i]); + } + return; + } + } + + // nothing found. 
Just set + SetWeight(name, ind, weights); +} + +void Parameter::ConvertWeightArgsSingleWeight(const string &oldWeightName, + const string &newWeightName) +{ + size_t ind = 0; + PARAM_MAP::iterator iterMap; + + iterMap = m_setting.find(oldWeightName); + if (iterMap != m_setting.end()) { + const PARAM_VEC &weights = iterMap->second; + for (size_t i = 0; i < weights.size(); ++i) { + SetWeight(newWeightName, ind, Scan(weights[i])); + } + + m_setting.erase(iterMap); + } +} + +void Parameter::ConvertWeightArgsPhraseModel(const string &oldWeightName) +{ + const PARAM_VEC *params; + + // process input weights 1st + params = GetParam("weight-i"); + if (params) { + vector inputWeights = Scan(*params); + PARAM_VEC &numInputScores = m_setting["input-scores"]; + if (inputWeights.size() == 1) { + UTIL_THROW_IF2(numInputScores.size() != 0, + "No [input-scores] section allowed"); + numInputScores.push_back("1"); + numInputScores.push_back("0"); + } else if (inputWeights.size() == 2) { + UTIL_THROW_IF2(numInputScores.size() != 0, + "No [input-scores] section allowed"); + numInputScores.push_back("1"); + numInputScores.push_back("1"); + } + + SetWeight("PhraseDictionaryBinary", 0, inputWeights); + } + + // convert actually pt feature + cerr << "Creating phrase table features" << endl; + + size_t numInputScores = 0; + size_t numRealWordsInInput = 0; + map ptIndices; + + params = GetParam("input-scores"); + if (params) { + numInputScores = Scan(params->at(0)); + + if (params->size() > 1) { + numRealWordsInInput = Scan(params->at(1)); + } + } + + // load phrase translation tables + params = GetParam("ttable-file"); + if (params) { + // weights + const vector translationVector = *params; + + vector maxTargetPhrase; + params = GetParam("ttable-limit"); + if (params) { + maxTargetPhrase = Scan(*params); + } + + if (maxTargetPhrase.size() == 1 && translationVector.size() > 1) { + cerr << "Using uniform ttable-limit of " << maxTargetPhrase[0] + << " for all translation tables." 
<< endl; + for (size_t i = 1; i < translationVector.size(); i++) + maxTargetPhrase.push_back(maxTargetPhrase[0]); + } else if (maxTargetPhrase.size() != 1 + && maxTargetPhrase.size() < translationVector.size()) { + std::cerr << "You specified " << translationVector.size() + << " translation tables, but only " << maxTargetPhrase.size() + << " ttable-limits."; + return; + } + + // MAIN LOOP + const PARAM_VEC &oldWeights = m_setting[oldWeightName]; + + size_t currOldInd = 0; + for (size_t currDict = 0; currDict < translationVector.size(); currDict++) { + util::StringStream ptLine; + + vector token = Tokenize(translationVector[currDict]); + + if (currDict == 0 && token.size() == 4) { + std::cerr + << "Phrase table specification in old 4-field format. No longer supported"; + return; + } + UTIL_THROW_IF2(token.size() < 5, + "Phrase table must have at least 5 scores"); + + int implementation = Scan(token[0]); + + string ptType; + switch (implementation) { + case 0: // Memory + ptType = "PhraseDictionaryMemory"; + break; + case 1: // Binary + ptType = "PhraseDictionaryBinary"; + break; + case 2: // OnDisk + ptType = "PhraseDictionaryOnDisk"; + break; + case 6: // SCFG + ptType = "PhraseDictionaryMemory"; + break; + case 12: // Compact + ptType = "PhraseDictionaryCompact"; + break; + case 8: // SuffixArray + ptType = "PhraseDictionarySuffixArray"; + break; + case 14: // DSuffixArray + ptType = "PhraseDictionaryDynSuffixArray"; + break; + case 15: // DCacheBased: + ptType = "PhraseDictionaryDynamicCacheBased"; + break; + default: + break; + } + + size_t ptInd; + if (ptIndices.find(ptType) == ptIndices.end()) { + ptIndices[ptType] = 0; + ptInd = 0; + } else { + ptInd = ++ptIndices[ptType]; + } + + // weights + size_t numFFInd = (token.size() == 4) ? 
2 : 3; + size_t numFF = Scan(token[numFFInd]); + + vector weights(numFF); + for (size_t currFF = 0; currFF < numFF; ++currFF) { + UTIL_THROW_IF2(currOldInd >= oldWeights.size(), + "Errors converting old phrase-table weights to new weights"); + float weight = Scan(oldWeights[currOldInd]); + weights[currFF] = weight; + + ++currOldInd; + } + + // cerr << weights.size() << " PHRASE TABLE WEIGHTS " + // << __FILE__ << ":" << __LINE__ << endl; + AddWeight(ptType, ptInd, weights); + + // actual pt + ptLine << ptType << " "; + ptLine << "input-factor=" << token[1] << " "; + ptLine << "output-factor=" << token[2] << " "; + ptLine << "path=" << token[4] << " "; + + //characteristics of the phrase table + + vector input = Tokenize(token[1], ","), output = + Tokenize(token[2], ","); + size_t numScoreComponent = Scan(token[3]); + string filePath = token[4]; + + if (currDict == 0) { + // only the 1st pt. THis is shit + // TODO. find what the assumptions made by confusion network about phrase table output which makes + // it only work with binary file. 
This is a hack + numScoreComponent += numInputScores + numRealWordsInInput; + } + + ptLine << "num-features=" << numScoreComponent << " "; + ptLine << "table-limit=" << maxTargetPhrase[currDict] << " "; + + if (implementation == 8 || implementation == 14) { + ptLine << "target-path=" << token[5] << " "; + ptLine << "alignment-path=" << token[6] << " "; + } + + AddFeature(ptLine.str()); + } // for(size_t currDict = 0 ; currDict < translationVector.size(); currDict++) { + } // if (GetParam("ttable-file").size() > 0) { + + m_setting.erase("weight-i"); + m_setting.erase(oldWeightName); + m_setting.erase("ttable-file"); + m_setting.erase("ttable-limit"); + +} + +void Parameter::AddFeature(const std::string &line) +{ + PARAM_VEC &features = m_setting["feature"]; + features.push_back(line); +} + +void Parameter::ConvertWeightArgsDistortion() +{ + const string oldWeightName = "weight-d"; + const string oldLexReordingName = "distortion-file"; + + // distortion / lex distortion + const PARAM_VEC *oldWeights = GetParam(oldWeightName); + + if (oldWeights) { + const PARAM_VEC *searchAlgo = GetParam("search-algorithm"); + if (searchAlgo == NULL + || (searchAlgo->size() > 0 + && (Trim(searchAlgo->at(0)) == "0" || Trim(searchAlgo->at(0)) == "1"))) { + // phrase-based. 
Add distance distortion to list of features + AddFeature("Distortion"); + SetWeight("Distortion", 0, Scan(oldWeights->at(0))); + } + + // everything but the last is lex reordering model + + size_t currOldInd = 1; + const PARAM_VEC *lextable = GetParam(oldLexReordingName); + + for (size_t indTable = 0; lextable && indTable < lextable->size(); + ++indTable) { + const string &line = lextable->at(indTable); + vector toks = Tokenize(line); + + size_t numFF = Scan(toks[2]); + + vector weights(numFF); + for (size_t currFF = 0; currFF < numFF; ++currFF) { + UTIL_THROW_IF2(oldWeights && currOldInd >= oldWeights->size(), + "Errors converting old distortion weights to new weights"); + float weight = Scan(oldWeights->at(currOldInd)); + weights[currFF] = weight; + + ++currOldInd; + } + SetWeight("LexicalReordering", indTable, weights); + + util::StringStream strme; + strme << "LexicalReordering " << "type=" << toks[1] << " "; + + vector factors = Tokenize(toks[0], "-"); + UTIL_THROW_IF2(factors.size() != 2, + "Error in old factor specification for lexicalized reordering model: " << toks[0]); + strme << "input-factor=" << factors[0] << " output-factor=" << factors[1] + << " "; + + strme << "num-features=" << toks[2] << " "; + strme << "path=" << toks[3]; + + AddFeature(strme.str()); + } + } + + m_setting.erase(oldWeightName); + m_setting.erase(oldLexReordingName); + +} + +void Parameter::ConvertWeightArgsLM() +{ + const string oldWeightName = "weight-l"; + const string oldFeatureName = "lmodel-file"; + const PARAM_VEC *params; + + bool isChartDecoding = true; + + params = GetParam("search-algorithm"); + if (params == NULL + || (params->size() > 0 + && (Trim(params->at(0)) == "0" || Trim(params->at(0)) == "1"))) { + isChartDecoding = false; + } + + vector oovWeights; + params = GetParam("lmodel-oov-feature"); + if (params) { + oovWeights = Scan(*params); + } + + PARAM_MAP::iterator iterMap; + + iterMap = m_setting.find(oldWeightName); + if (iterMap != m_setting.end()) { + + 
size_t currOldInd = 0; + const PARAM_VEC &weights = iterMap->second; + const PARAM_VEC &models = m_setting[oldFeatureName]; + for (size_t lmIndex = 0; lmIndex < models.size(); ++lmIndex) { + const string &line = models[lmIndex]; + vector modelToks = Tokenize(line); + + int lmType = Scan(modelToks[0]); + + string newFeatureName; + switch (lmType) { + case 0: + newFeatureName = "SRILM"; + break; + case 1: + newFeatureName = "IRSTLM"; + break; + case 8: + case 9: + newFeatureName = "KENLM"; + break; + default: + UTIL_THROW2("Unkown language model type id:" << lmType) + ; + } + + size_t numFF = 1; + if (oovWeights.size() > lmIndex) numFF += oovWeights[lmIndex]; + + vector weightsLM(numFF); + for (size_t currFF = 0; currFF < numFF; ++currFF) { + UTIL_THROW_IF2(currOldInd >= weights.size(), + "Errors converting old LM weights to new weights"); + weightsLM[currFF] = Scan(weights[currOldInd]); + if (isChartDecoding) { + weightsLM[currFF] = UntransformLMScore(weightsLM[currFF]); + } + + ++currOldInd; + } + + SetWeight(newFeatureName, lmIndex, weightsLM); + + string featureLine = newFeatureName + " " + "factor=" + modelToks[1] + " " // factor + + "order=" + modelToks[2] + " " // order + + "num-features=" + SPrint(numFF) + " "; + if (lmType == 9) { + featureLine += "lazyken=1 "; + } else if (lmType == 8) { + featureLine += "lazyken=0 "; + } + + featureLine += "path=" + modelToks[3]; // file + + AddFeature(featureLine); + } // for (size_t lmIndex = 0; lmIndex < models.size(); ++lmIndex) { + + m_setting.erase(iterMap); + } + + m_setting.erase(oldFeatureName); +} + +void Parameter::ConvertWeightArgsGeneration(const std::string &oldWeightName, + const std::string &newWeightName) +{ + string oldFeatureName = "generation-file"; + + // distortion / lex distortion + PARAM_VEC &oldWeights = m_setting[oldWeightName]; + + if (oldWeights.size() > 0) { + size_t currOldInd = 0; + PARAM_VEC &models = m_setting[oldFeatureName]; + + for (size_t indTable = 0; indTable < models.size(); 
++indTable) { + string &line = models[indTable]; + vector modelToks = Tokenize(line); + + size_t numFF = Scan(modelToks[2]); + + vector weights(numFF); + for (size_t currFF = 0; currFF < numFF; ++currFF) { + UTIL_THROW_IF2(currOldInd >= oldWeights.size(), + "Errors converting old generation weights to new weights"); + float weight = Scan(oldWeights[currOldInd]); + weights[currFF] = weight; + + ++currOldInd; + } + SetWeight(newWeightName, indTable, weights); + + util::StringStream strme; + strme << "Generation " << "input-factor=" << modelToks[0] << " " + << "output-factor=" << modelToks[1] << " " << "num-features=" + << modelToks[2] << " " << "path=" << modelToks[3]; + AddFeature(strme.str()); + } + } + + m_setting.erase(oldWeightName); + m_setting.erase(oldFeatureName); +} + +void Parameter::ConvertWeightArgsWordPenalty() +{ + const std::string oldWeightName = "weight-w"; + const std::string newWeightName = "WordPenalty"; + + bool isChartDecoding = true; + const PARAM_VEC *searchAlgo = GetParam("search-algorithm"); + if (searchAlgo == NULL + || (searchAlgo->size() > 0 + && (Trim(searchAlgo->at(0)) == "0" || Trim(searchAlgo->at(0)) == "1"))) { + isChartDecoding = false; + } + + PARAM_MAP::iterator iterMap; + + iterMap = m_setting.find(oldWeightName); + if (iterMap != m_setting.end()) { + const PARAM_VEC &weights = iterMap->second; + for (size_t i = 0; i < weights.size(); ++i) { + float weight = Scan(weights[i]); + if (isChartDecoding) { + weight *= 0.434294482; + } + SetWeight(newWeightName, i, weight); + } + + m_setting.erase(iterMap); + } + +} + +void Parameter::ConvertPhrasePenalty() +{ + string oldWeightName = "weight-p"; + const PARAM_VEC *params = GetParam(oldWeightName); + if (params) { + UTIL_THROW_IF2(params->size() != 1, + "There should be only 1 phrase-penalty weight"); + float weight = Scan(params->at(0)); + AddFeature("PhrasePenalty"); + SetWeight("PhrasePenalty", 0, weight); + + m_setting.erase(oldWeightName); + } +} + +void 
Parameter::ConvertWeightArgs() +{ + // can't handle discr LM. must do it manually 'cos of bigram/n-gram split + UTIL_THROW_IF2(m_setting.count("weight-dlm") != 0, + "Can't handle discr LM. must do it manually 'cos of bigram/n-gram split"); + + // check that old & new format aren't mixed + if (m_setting.count("weight") + && (m_setting.count("weight-i") || m_setting.count("weight-t") + || m_setting.count("weight-w") || m_setting.count("weight-l") + || m_setting.count("weight-u") || m_setting.count("weight-lex") + || m_setting.count("weight-generation") + || m_setting.count("weight-lr") || m_setting.count("weight-d"))) { + cerr << "Do not mix old and new format for specify weights"; + } + + ConvertWeightArgsWordPenalty(); + ConvertWeightArgsLM(); + ConvertWeightArgsSingleWeight("weight-slm", "SyntacticLM"); + ConvertWeightArgsSingleWeight("weight-u", "UnknownWordPenalty"); + ConvertWeightArgsGeneration("weight-generation", "Generation"); + ConvertWeightArgsDistortion(); + + // don't know or can't be bothered converting these weights + ConvertWeightArgsSingleWeight("weight-lr", "LexicalReordering"); + ConvertWeightArgsSingleWeight("weight-bl", "BleuScoreFeature"); + ConvertWeightArgsSingleWeight("weight-glm", "GlobalLexicalModel"); + ConvertWeightArgsSingleWeight("weight-wt", "WordTranslationFeature"); + ConvertWeightArgsSingleWeight("weight-pp", "PhrasePairFeature"); + ConvertWeightArgsSingleWeight("weight-pb", "PhraseBoundaryFeature"); + + ConvertWeightArgsSingleWeight("weight-e", "WordDeletion"); // TODO Can't find real name + ConvertWeightArgsSingleWeight("weight-lex", "GlobalLexicalReordering"); // TODO Can't find real name + + ConvertPhrasePenalty(); + + AddFeature("WordPenalty"); + AddFeature("UnknownWordPenalty"); + + ConvertWeightArgsPhraseModel("weight-t"); + +} + +void Parameter::CreateWeightsMap() +{ + CreateWeightsMap(m_setting["weight-add"]); + CreateWeightsMap(m_setting["weight"]); +} + +void Parameter::CreateWeightsMap(const PARAM_VEC &vec) +{ + for 
(size_t i = 0; i < vec.size(); ++i) { + const string &line = vec[i]; + vector toks = Tokenize(line); + UTIL_THROW_IF2(toks.size() < 2, "Error in format of weights: " << line); + + string name = toks[0]; + name = name.substr(0, name.size() - 1); + + vector weights(toks.size() - 1); + for (size_t i = 1; i < toks.size(); ++i) { + float weight = Scan(toks[i]); + weights[i - 1] = weight; + } + m_weights[name] = weights; + } +} + +void Parameter::WeightOverwrite() +{ + PARAM_VEC &vec = m_setting["weight-overwrite"]; + + if (vec.size() == 0) return; + + // should only be on 1 line + UTIL_THROW_IF2(vec.size() != 1, "weight-overwrite should only be on 1 line"); + + string name(""); + vector weights; + vector toks = Tokenize(vec[0]); + size_t cnt = 0; + const std::vector* oldWeights = NULL; + for (size_t i = 0; i < toks.size(); ++i) { + const string &tok = toks[i]; + + if (ends_with(tok, "=")) { + // start of new feature + + if (name != "") { + // save previous ff + m_weights[name] = weights; + weights.clear(); + } + + name = tok.substr(0, tok.size() - 1); + std::map >::const_iterator found = + m_weights.find(name); + if (found != m_weights.end()) { + oldWeights = &(found->second); + } else { + oldWeights = NULL; + } + cnt = 0; + } else { + // a weight for curr ff + if (toks[i] == "x") { + UTIL_THROW_IF2(!oldWeights || cnt >= oldWeights->size(), + "Keeping previous weight failed in weight-overwrite"); + weights.push_back(oldWeights->at(cnt)); + } else { + float weight = Scan(toks[i]); + weights.push_back(weight); + } + ++cnt; + } + } + + if (name != "") { + m_weights[name] = weights; + } + +} + +/** check that parameter settings make sense */ +bool Parameter::Validate() +{ + bool noErrorFlag = true; + + PARAM_MAP::const_iterator iterParams; + for (iterParams = m_setting.begin(); iterParams != m_setting.end(); + ++iterParams) { + const std::string &key = iterParams->first; + + if (m_valid.find(key) == m_valid.end()) { + std::cerr << "Unknown parameter " << key; + noErrorFlag 
= false; + } + } + + if (m_setting["lmodel-dub"].size() > 0) { + if (m_setting["lmodel-file"].size() != m_setting["lmodel-dub"].size()) { + std::cerr << "Config and parameters specify " + << static_cast(m_setting["lmodel-file"].size()) + << " language model files (lmodel-file), but " + << static_cast(m_setting["lmodel-dub"].size()) + << " LM upperbounds (lmodel-dub)" << endl; + noErrorFlag = false; + } + } + + // do files exist? + + // input file + if (noErrorFlag && m_setting["input-file"].size() == 1) { + noErrorFlag = FileExists(m_setting["input-file"][0]); + if (!noErrorFlag) { + std::cerr << endl << "Input file " << m_setting["input-file"][0] + << " does not exist"; + } + } + // generation tables + if (noErrorFlag) { + std::vector ext; + //raw tables in either un compressed or compressed form + ext.push_back(""); + ext.push_back(".gz"); + noErrorFlag = FilesExist("generation-file", 3, ext); + } + // distortion + if (noErrorFlag) { + std::vector ext; + //raw tables in either un compressed or compressed form + ext.push_back(""); + ext.push_back(".gz"); + //prefix tree format + ext.push_back(".binlexr.idx"); + //prefix tree format + ext.push_back(".minlexr"); + noErrorFlag = FilesExist("distortion-file", 3, ext); + } + return noErrorFlag; +} + +/** check whether a file exists */ +bool Parameter::FilesExist(const string ¶mName, int fieldNo, + std::vector const& extensions) +{ + typedef std::vector StringVec; + StringVec::const_iterator iter; + + PARAM_MAP::const_iterator iterParam = m_setting.find(paramName); + if (iterParam == m_setting.end()) { + // no param. 
therefore nothing to check + return true; + } + const StringVec &pathVec = (*iterParam).second; + for (iter = pathVec.begin(); iter != pathVec.end(); ++iter) { + StringVec vec = Tokenize(*iter); + + size_t tokenizeIndex; + if (fieldNo == -1) tokenizeIndex = vec.size() - 1; + else tokenizeIndex = static_cast(fieldNo); + + if (tokenizeIndex >= vec.size()) { + std::cerr << "Expected at least " << (tokenizeIndex + 1) + << " tokens per entry in '" << paramName << "', but only found " + << vec.size(); + return false; + } + const string &pathStr = vec[tokenizeIndex]; + + bool fileFound = 0; + for (size_t i = 0; i < extensions.size() && !fileFound; ++i) { + fileFound |= FileExists(pathStr + extensions[i]); + } + if (!fileFound) { + std::cerr << "File " << pathStr << " does not exist"; + return false; + } + } + return true; +} + +/** look for a switch in arg, update parameter */ +// TODO arg parsing like this does not belong in the library, it belongs +// in moses-cmd +string Parameter::FindParam(const string ¶mSwitch, int argc, char* argv[]) +{ + for (int i = 0; i < argc; i++) { + if (string(argv[i]) == paramSwitch) { + if (i + 1 < argc) { + return argv[i + 1]; + } else { + std::cerr << "Option " << paramSwitch << " requires a parameter!"; + // TODO return some sort of error, not the empty string + } + } + } + return ""; +} + +/** update parameter settings with command line switches + * \param paramSwitch (potentially short) name of switch + * \param paramName full name of parameter + * \param argc number of arguments on command line + * \param argv values of paramters on command line */ +void Parameter::OverwriteParam(const string ¶mSwitch, + const string ¶mName, int argc, char* argv[]) +{ + int startPos = -1; + for (int i = 0; i < argc; i++) { + if (string(argv[i]) == paramSwitch) { + startPos = i + 1; + break; + } + } + if (startPos < 0) return; + + int index = 0; + m_setting[paramName]; // defines the parameter, important for boolean switches + while (startPos < argc 
&& (!isOption(argv[startPos]))) { + if (m_setting[paramName].size() > (size_t) index) m_setting[paramName][index] = + argv[startPos]; + else m_setting[paramName].push_back(argv[startPos]); + index++; + startPos++; + } +} + +/** read parameters from a configuration file */ +bool Parameter::ReadConfigFile(const string &filePath) +{ + InputFileStream inFile(filePath); + string line, paramName; + while (getline(inFile, line)) { + // comments + size_t comPos = line.find_first_of("#"); + if (comPos != string::npos) line = line.substr(0, comPos); + // trim leading and trailing spaces/tabs + line = Trim(line); + + if (line.size() == 0) { + // blank line. do nothing. + } else if (line[0] == '[') { + // new parameter + for (size_t currPos = 0; currPos < line.size(); currPos++) { + if (line[currPos] == ']') { + paramName = line.substr(1, currPos - 1); + break; + } + } + } else { + // add value to parameter + m_setting[paramName].push_back(line); + } + } + return true; +} + +struct Credit { + string name, contact, currentPursuits, areaResponsibility; + int sortId; + + Credit(string name, string contact, string currentPursuits, + string areaResponsibility) { + this->name = name; + this->contact = contact; + this->currentPursuits = currentPursuits; + this->areaResponsibility = areaResponsibility; + this->sortId = util::rand_excl(1000); + } + + bool operator<(const Credit &other) const { + /* + if (areaResponsibility.size() != 0 && other.areaResponsibility.size() ==0) + return true; + if (areaResponsibility.size() == 0 && other.areaResponsibility.size() !=0) + return false; + + return name < other.name; + */ + return sortId < other.sortId; + } + +}; + +std::ostream& operator<<(std::ostream &os, const Credit &credit) +{ + os << credit.name; + if (credit.contact != "") os << "\t contact: " << credit.contact; + if (credit.currentPursuits != "") os << " " << credit.currentPursuits; + if (credit.areaResponsibility != "") os << " I'll answer question on: " + << 
credit.areaResponsibility; + return os; +} + +void Parameter::PrintCredit() +{ + vector everyone; + srand(time(NULL)); + + everyone.push_back( + Credit("Nicola Bertoldi", "911", "", "scripts & other stuff")); + everyone.push_back(Credit("Ondrej Bojar", "", "czech this out!", "")); + everyone.push_back( + Credit("Chris Callison-Burch", "anytime, anywhere", + "international playboy", "")); + everyone.push_back(Credit("Alexandra Constantin", "", "eu sunt varza", "")); + everyone.push_back( + Credit("Brooke Cowan", "brooke@csail.mit.edu", + "if you're going to san francisco, be sure to wear a flower in your hair", + "")); + everyone.push_back( + Credit("Chris Dyer", "can't. i'll be out driving my mustang", + "driving my mustang", "")); + everyone.push_back( + Credit("Marcello Federico", "federico at itc at it", + "Researcher at ITC-irst, Trento, Italy", "IRST language model")); + everyone.push_back( + Credit("Evan Herbst", "Small college in upstate New York", "", "")); + everyone.push_back( + Credit("Philipp Koehn", "only between 2 and 4am", "", + "Nothing fazes this dude")); + everyone.push_back( + Credit("Christine Moran", "weird building at MIT", "", "")); + everyone.push_back( + Credit("Wade Shen", "via morse code", "buying another laptop", "")); + everyone.push_back( + Credit("Richard Zens", "richard at aachen dot de", "", + "ambiguous source input, confusion networks, confusing source code")); + everyone.push_back( + Credit("Hieu Hoang", "http://www.hoang.co.uk/hieu/", + "phd student at Edinburgh Uni. 
Original Moses developer", + "general queries/ flames on Moses.")); + + sort(everyone.begin(), everyone.end()); + + cerr + << "Moses - A beam search decoder for phrase-based statistical machine translation models" + << endl << "Copyright (C) 2006 University of Edinburgh" << endl << endl + + << "This library is free software; you can redistribute it and/or" << endl + << "modify it under the terms of the GNU Lesser General Public" << endl + << "License as published by the Free Software Foundation; either" << endl + << "version 2.1 of the License, or (at your option) any later version." + << endl << endl + + << "This library is distributed in the hope that it will be useful," + << endl + << "but WITHOUT ANY WARRANTY; without even the implied warranty of" + << endl + << "MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU" + << endl << "Lesser General Public License for more details." << endl + << endl + + << "You should have received a copy of the GNU Lesser General Public" + << endl + << "License along with this library; if not, write to the Free Software" + << endl + << "Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA" + << endl << endl + << "***********************************************************************" + << endl << endl << "Built on " << __DATE__ << " at " __TIME__ << endl + << endl << "WHO'S FAULT IS THIS GODDAM SOFTWARE:" << endl; + + ostream_iterator out(cerr, "\n"); + copy(everyone.begin(), everyone.end(), out); + cerr << endl << endl; +} + +/** update parameter settings with command line switches + * \param paramName full name of parameter + * \param values inew values for paramName */ +void Parameter::OverwriteParam(const string ¶mName, PARAM_VEC values) +{ + cerr << "Overwriting parameter " << paramName; + + m_setting[paramName]; // defines the parameter, important for boolean switches + if (m_setting[paramName].size() > 1) { + cerr << " (the parameter had " << m_setting[paramName].size() + << " previous 
values)"; + UTIL_THROW_IF2(m_setting[paramName].size() != values.size(), + "Number of weight override for " << paramName << " is not the same as the original number of weights"); + } else { + cerr << " (the parameter does not have previous values)"; + m_setting[paramName].resize(values.size()); + } + cerr << " with the following values:"; + int i = 0; + for (PARAM_VEC::iterator iter = values.begin(); iter != values.end(); + iter++, i++) { + m_setting[paramName][i] = *iter; + cerr << " " << *iter; + } + cerr << std::endl; +} + +std::set Parameter::GetWeightNames() const +{ + std::set ret; + std::map >::const_iterator iter; + for (iter = m_weights.begin(); iter != m_weights.end(); ++iter) { + const string &key = iter->first; + ret.insert(key); + } + return ret; +} + +void Parameter::Save(const std::string path) +{ + ofstream file; + file.open(path.c_str()); + + PARAM_MAP::const_iterator iterOuter; + for (iterOuter = m_setting.begin(); iterOuter != m_setting.end(); + ++iterOuter) { + const std::string §ionName = iterOuter->first; + file << "[" << sectionName << "]" << endl; + + const PARAM_VEC &values = iterOuter->second; + + PARAM_VEC::const_iterator iterInner; + for (iterInner = values.begin(); iterInner != values.end(); ++iterInner) { + const std::string &value = *iterInner; + file << value << endl; + } + + file << endl; + } + + file.close(); +} + +template<> +void Parameter::SetParameter(bool ¶meter, + std::string const& parameterName, bool const& defaultValue) const +{ + const PARAM_VEC *params = GetParam(parameterName); + + // default value if nothing is specified + parameter = defaultValue; + if (params == NULL) { + return; + } + + // if parameter is just specified as, e.g. 
"-parameter" set it true + if (params->size() == 0) { + parameter = true; + } + // if paramter is specified "-parameter true" or "-parameter false" + else if (params->size() == 1) { + parameter = Scan(params->at(0)); + } +} + +void Parameter::SetParameter(bool& var, std::string const& name) +{ + SetParameter(var, name, false); +} + +} + diff --git a/mosesdecoder/moses2/legacy/Parameter.h b/mosesdecoder/moses2/legacy/Parameter.h new file mode 100644 index 0000000000000000000000000000000000000000..501f35e9955f3e8b396ed19ecdf5d2515e3f7afc --- /dev/null +++ b/mosesdecoder/moses2/legacy/Parameter.h @@ -0,0 +1,170 @@ +/// $Id$ + +/*********************************************************************** + Moses - factored phrase-based language decoder + Copyright (C) 2006 University of Edinburgh + + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + ***********************************************************************/ + +#pragma once + +#include +#include +#include +#include +#include +#include "Util2.h" + +namespace Moses2 +{ + +typedef std::vector PARAM_VEC; +typedef std::map PARAM_MAP; +typedef std::map PARAM_BOOL; +typedef std::map PARAM_STRING; + +/** Handles parameter values set in config file or on command line. 
+ * Process raw parameter data (names and values as strings) for StaticData + * to parse; to get useful values, see StaticData. + */ +class Parameter +{ + typedef boost::program_options::options_description options_description; + typedef boost::program_options::value_semantic value_semantic; +protected: + PARAM_MAP m_setting; + PARAM_BOOL m_valid; + PARAM_STRING m_abbreviation; + PARAM_STRING m_description; + PARAM_STRING m_fullname; + // std::map > m_confusable; + // stores long parameter names that start with a letter that is also a short option. + options_description m_options; + + std::map > m_weights; + + std::string FindParam(const std::string ¶mSwitch, int argc, char* argv[]); + void OverwriteParam(const std::string ¶mSwitch, + const std::string ¶mName, int argc, char* argv[]); + bool ReadConfigFile(const std::string &filePath); + bool FilesExist(const std::string ¶mName, int fieldNo, + std::vector const& fileExtension = std::vector( + 1, "")); + bool isOption(const char* token); + bool Validate(); + + void + AddParam(options_description& optgroup, value_semantic const* optvalue, + std::string const& paramName, std::string const& description); + + void + AddParam(options_description& optgroup, std::string const ¶mName, + std::string const &description); + + void + AddParam(options_description& optgroup, value_semantic const* optvalue, + std::string const& paramName, std::string const& abbrevName, + std::string const& description); + + void + AddParam(options_description& optgroup, std::string const& paramName, + std::string const& abbrevName, std::string const& description); + + void PrintCredit(); + + void SetWeight(const std::string &name, size_t ind, float weight); + void SetWeight(const std::string &name, size_t ind, + const std::vector &weights); + void AddWeight(const std::string &name, size_t ind, + const std::vector &weights); + void ConvertWeightArgs(); + void ConvertWeightArgsSingleWeight(const std::string &oldWeightName, + const std::string 
&newWeightName); + void ConvertWeightArgsPhraseModel(const std::string &oldWeightName); + void ConvertWeightArgsLM(); + void ConvertWeightArgsDistortion(); + void ConvertWeightArgsGeneration(const std::string &oldWeightName, + const std::string &newWeightName); + void ConvertWeightArgsPhrasePenalty(); + void ConvertWeightArgsWordPenalty(); + void ConvertPhrasePenalty(); + void CreateWeightsMap(); + void CreateWeightsMap(const PARAM_VEC &vec); + void WeightOverwrite(); + void AddFeature(const std::string &line); + void AddFeaturesCmd(); + +public: + Parameter(); + ~Parameter(); + bool LoadParam(int argc, char* argv[]); + bool LoadParam(const std::string &filePath); + void Explain(); + + /** return a vector of strings holding the whitespace-delimited values on the ini-file line corresponding to the given parameter name */ + const PARAM_VEC *GetParam(const std::string ¶mName) const; + + /** check if parameter is defined (either in moses.ini or as switch) */ + bool isParamSpecified(const std::string ¶mName) const { + return m_setting.find(paramName) != m_setting.end(); + } + + void OverwriteParam(const std::string ¶mName, PARAM_VEC values); + + std::vector GetWeights(const std::string &name); + const std::map > &GetAllWeights() const { + return m_weights; + } + std::set GetWeightNames() const; + + const PARAM_MAP &GetParams() const { + return m_setting; + } + + void Save(const std::string path); + + template + void SetParameter(T &var, const std::string &name, + const T &defaultValue) const { + const PARAM_VEC *params = GetParam(name); + if (params && params->size()) { + var = Scan(params->at(0)); + } else { + var = defaultValue; + } + } + + void SetParameter(bool& var, std::string const& name); + + bool SetBooleanSwitch(bool& val, std::string const name) { + // issues a warning if format is wrong + const PARAM_VEC *params = GetParam(name); + val = (params && params->size()); + if (val && params->size() != 1) { + std::cerr << "ERROR: wrong format for switch -" << name; 
+ return false; + } + return true; + } + +}; + +template<> +void Parameter::SetParameter(bool &var, const std::string &name, + const bool &defaultValue) const; + +} + diff --git a/mosesdecoder/moses2/legacy/Range.cpp b/mosesdecoder/moses2/legacy/Range.cpp new file mode 100644 index 0000000000000000000000000000000000000000..7186e4265575aed2b9198a3600d982a748d22702 --- /dev/null +++ b/mosesdecoder/moses2/legacy/Range.cpp @@ -0,0 +1,32 @@ +/*********************************************************************** + Moses - factored phrase-based language decoder + Copyright (C) 2006 University of Edinburgh + + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + ***********************************************************************/ + +#include "Range.h" + +namespace Moses2 +{ + +std::ostream& operator <<(std::ostream& out, const Range& range) +{ + out << "[" << range.m_startPos << ".." 
<< range.m_endPos << "]"; + return out; +} + +} + diff --git a/mosesdecoder/moses2/legacy/Range.h b/mosesdecoder/moses2/legacy/Range.h new file mode 100644 index 0000000000000000000000000000000000000000..9acfba45d537940ce410b8e8f2028e4cb1cb81bc --- /dev/null +++ b/mosesdecoder/moses2/legacy/Range.h @@ -0,0 +1,115 @@ +// $Id$ + +/*********************************************************************** + Moses - factored phrase-based language decoder + Copyright (C) 2006 University of Edinburgh + + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. 
+ + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + ***********************************************************************/ + +#pragma once + +#include +#include +#include +#include "Util2.h" +#include "util/exception.hh" + +#ifdef WIN32 +#undef max +#endif + +namespace Moses2 +{ + +/*** + * Efficient version of Bitmap for contiguous ranges + */ +class Range +{ + friend std::ostream& operator <<(std::ostream& out, const Range& range); + + // m_endPos is inclusive + size_t m_startPos, m_endPos; +public: + inline explicit Range() { + } + inline Range(size_t startPos, size_t endPos) : + m_startPos(startPos), m_endPos(endPos) { + } + inline Range(const Range ©) : + m_startPos(copy.GetStartPos()), m_endPos(copy.GetEndPos()) { + } + + inline size_t GetStartPos() const { + return m_startPos; + } + inline size_t GetEndPos() const { + return m_endPos; + } + + inline void SetStartPos(size_t val) { + m_startPos = val; + } + inline void SetEndPos(size_t val) { + m_endPos = val; + } + + //! count of words translated + inline size_t GetNumWordsCovered() const { + assert( + (m_startPos == NOT_FOUND && m_endPos == NOT_FOUND) || (m_startPos != NOT_FOUND && m_endPos != NOT_FOUND)); + return (m_startPos == NOT_FOUND) ? 0 : m_endPos - m_startPos + 1; + } + + //! 
transitive comparison + inline bool operator<(const Range& x) const { + return (m_startPos m_endPos) return false; + + return true; + } + + inline size_t GetNumWordsBetween(const Range& x) const { + UTIL_THROW_IF2(Overlap(x), "Overlapping ranges"); + + if (x.m_endPos < m_startPos) { + return m_startPos - x.m_endPos - 1; + } + + return x.m_startPos - m_endPos - 1; + } + +}; + +inline size_t hash_value(const Range& range) +{ + size_t seed = range.GetStartPos(); + boost::hash_combine(seed, range.GetEndPos()); + return seed; +} + +} + diff --git a/mosesdecoder/moses2/legacy/ThreadPool.cpp b/mosesdecoder/moses2/legacy/ThreadPool.cpp new file mode 100644 index 0000000000000000000000000000000000000000..861d95030d5f2c3ccb25338551497e1e96745264 --- /dev/null +++ b/mosesdecoder/moses2/legacy/ThreadPool.cpp @@ -0,0 +1,157 @@ +// $Id: ThreadPool.cpp 3045 2010-04-05 13:07:29Z hieuhoang1972 $ + +/*********************************************************************** + Moses - factored phrase-based language decoder + Copyright (C) 2009 University of Edinburgh + + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. 
+ + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + ***********************************************************************/ +#include +#ifdef __linux +#include +#include +#endif +#include +#include +#include +#include + +#include "ThreadPool.h" + +using namespace std; + +namespace Moses2 +{ + +#define handle_error_en(en, msg) \ + do { errno = en; perror(msg); exit(EXIT_FAILURE); } while (0) + +ThreadPool::ThreadPool(size_t numThreads, int cpuAffinityOffset, + int cpuAffinityIncr) : + m_stopped(false), m_stopping(false), m_queueLimit(numThreads*2) +{ +#if defined(_WIN32) || defined(_WIN64) + size_t numCPU = std::thread::hardware_concurrency(); +#else + size_t numCPU = sysconf(_SC_NPROCESSORS_ONLN); +#endif + //cerr << "numCPU=" << numCPU << endl; + + int cpuInd = cpuAffinityOffset % numCPU; + + for (size_t i = 0; i < numThreads; ++i) { + boost::thread *thread = m_threads.create_thread( + boost::bind(&ThreadPool::Execute, this)); + +#ifdef __linux + if (cpuAffinityOffset >= 0) { + int s; + + boost::thread::native_handle_type handle = thread->native_handle(); + + //cerr << "numCPU=" << numCPU << endl; + cpu_set_t cpuset; + CPU_ZERO(&cpuset); + + CPU_SET(cpuInd, &cpuset); + cpuInd += cpuAffinityIncr; + cpuInd = cpuInd % numCPU; + + s = pthread_setaffinity_np(handle, sizeof(cpu_set_t), &cpuset); + if (s != 0) { + handle_error_en(s, "pthread_setaffinity_np"); + //cerr << "affinity error with thread " << i << endl; + } + + // get affinity + CPU_ZERO(&cpuset); + s = pthread_getaffinity_np(handle, sizeof(cpu_set_t), &cpuset); + cerr << "Set returned by pthread_getaffinity_np() contained:\n"; + for (int j = 0; j < CPU_SETSIZE; j++) { + if (CPU_ISSET(j, &cpuset)) { + cerr << " CPU " << j << "\n"; + } + } + } +#endif + } +} + +void ThreadPool::Execute() +{ + do { + boost::shared_ptr task; + { + // Find a job to 
perform + boost::mutex::scoped_lock lock(m_mutex); + if (m_tasks.empty() && !m_stopped) { + m_threadNeeded.wait(lock); + } + if (!m_stopped && !m_tasks.empty()) { + task = m_tasks.front(); + m_tasks.pop(); + } + } + //Execute job + if (task) { + // must read from task before run. otherwise task may be deleted by main thread + // race condition + task->DeleteAfterExecution(); + task->Run(); + } + m_threadAvailable.notify_all(); + } while (!m_stopped); +} + +void ThreadPool::Submit(boost::shared_ptr task) +{ + boost::mutex::scoped_lock lock(m_mutex); + if (m_stopping) { + throw runtime_error("ThreadPool stopping - unable to accept new jobs"); + } + while (m_queueLimit > 0 && m_tasks.size() >= m_queueLimit) { + m_threadAvailable.wait(lock); + } + m_tasks.push(task); + m_threadNeeded.notify_all(); +} + +void ThreadPool::Stop(bool processRemainingJobs) +{ + { + //prevent more jobs from being added to the queue + boost::mutex::scoped_lock lock(m_mutex); + if (m_stopped) return; + m_stopping = true; + } + if (processRemainingJobs) { + boost::mutex::scoped_lock lock(m_mutex); + //wait for queue to drain. 
+ while (!m_tasks.empty() && !m_stopped) { + m_threadAvailable.wait(lock); + } + } + //tell all threads to stop + { + boost::mutex::scoped_lock lock(m_mutex); + m_stopped = true; + } + m_threadNeeded.notify_all(); + + m_threads.join_all(); +} + +} + diff --git a/mosesdecoder/moses2/legacy/ThreadPool.h b/mosesdecoder/moses2/legacy/ThreadPool.h new file mode 100644 index 0000000000000000000000000000000000000000..e2cfac4a84691bde277dfda743567bfa9c1d82f1 --- /dev/null +++ b/mosesdecoder/moses2/legacy/ThreadPool.h @@ -0,0 +1,133 @@ +// $Id: ThreadPool.h 3045 2010-04-05 13:07:29Z hieuhoang1972 $ + +/*********************************************************************** + Moses - factored phrase-based language decoder + Copyright (C) 2009 University of Edinburgh + + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + ***********************************************************************/ + +#pragma once + +#include +#include +#include + +#include + +#ifdef WITH_THREADS +#include +#include +#endif + +#ifdef BOOST_HAS_PTHREADS +#include +#endif + +//#include "Util.h" + +namespace Moses2 +{ + +/** + * Classes to implement a ThreadPool. 
+ **/ + +/** A task to be executed by the ThreadPool + */ +class Task +{ +public: + virtual void Run() = 0; + virtual bool DeleteAfterExecution() { + return true; + } + virtual ~Task() { + } +}; + +class ThreadPool +{ +public: + /** + * Construct a thread pool of a fixed size. + **/ + explicit ThreadPool(size_t numThreads, int cpuAffinityOffset = -1, + int cpuAffinityIncr = 1); + + ~ThreadPool() { + Stop(); + } + + /** + * Add a job to the threadpool. + **/ + void Submit(boost::shared_ptr task); + + /** + * Wait until all queued jobs have completed, and shut down + * the ThreadPool. + **/ + void Stop(bool processRemainingJobs = false); + + /** + * Set maximum number of queued threads (otherwise Submit blocks) + **/ + void SetQueueLimit(size_t limit) { + m_queueLimit = limit; + } + +private: + /** + * The main loop executed by each thread. + **/ + void Execute(); + + std::queue > m_tasks; + boost::thread_group m_threads; + boost::mutex m_mutex; + boost::condition_variable m_threadNeeded; + boost::condition_variable m_threadAvailable; + bool m_stopped; + bool m_stopping; + size_t m_queueLimit; +}; + +class TestTask: public Task +{ +public: + TestTask(int id) : + m_id(id) { + } + + virtual void Run() { +#ifdef BOOST_HAS_PTHREADS + pthread_t tid = pthread_self(); +#else + typedef void * pthread_t; + pthread_t tid = 0; +#endif + std::cerr << "Executing " << m_id << " in thread id " << tid << std::endl; + } + + virtual ~TestTask() { + } + +private: + int m_id; +}; + +} + diff --git a/mosesdecoder/moses2/legacy/Timer.cpp b/mosesdecoder/moses2/legacy/Timer.cpp new file mode 100644 index 0000000000000000000000000000000000000000..81858e2fc809e449a1042d470f694b84607a505b --- /dev/null +++ b/mosesdecoder/moses2/legacy/Timer.cpp @@ -0,0 +1,103 @@ +#include +#include +#include "Timer.h" + +#include "util/usage.hh" + +namespace Moses2 +{ + +Timer::Timer() : + running(false), stopped(false) +{ + start_time = 0; +} + +/*** + * Return the total wall time that the timer has been in 
the "running" + * state since it was first "started". + */ +double Timer::get_elapsed_time() const +{ + if (stopped) { + return stop_time - start_time; + } + if (running) { + return util::WallTime() - start_time; + } + return 0; +} + +/*** + * Start a timer. If it is already running, let it continue running. + * Print an optional message. + */ +void Timer::start(const char* msg) +{ + // Print an optional message, something like "Starting timer t"; + if (msg) { + std::cerr << msg << std::endl; + } + + // Return immediately if the timer is already running + if (running && !stopped) return; + + // If stopped, recompute start time + if (stopped) { + start_time = util::WallTime() - (stop_time - start_time); + stopped = false; + } else { + start_time = util::WallTime(); + running = true; + } +} + +/*** + * Stop a timer. + * Print an optional message. + */ +void Timer::stop(const char* msg) +{ + // Print an optional message, something like "Stopping timer t"; + if (msg) { + std::cerr << msg << std::endl; + } + + // Return immediately if the timer is not running + if (stopped || !running) return; + + // Record stopped time + stop_time = util::WallTime(); + + // Change timer status to running + stopped = true; +} + +/*** + * Print out an optional message followed by the current timer timing. + */ +void Timer::check(const char* msg) +{ + // Print an optional message, something like "Checking timer t"; + if (msg) { + std::cerr << msg << " : "; + } + +// VERBOSE(1, "[" << std::setiosflags(std::ios::fixed) << std::setprecision(2) << (running ? elapsed_time() : 0) << "] seconds\n"); + std::cerr << "[" << (running ? get_elapsed_time() : 0) << "] seconds\n"; +} + +/*** + * Allow timers to be printed to ostreams using the syntax 'os << t' + * for an ostream 'os' and a timer 't'. For example, "cout << t" will + * print out the total amount of time 't' has been "running". 
+ */ +std::ostream& operator<<(std::ostream& os, Timer& t) +{ + //os << std::setprecision(2) << std::setiosflags(std::ios::fixed) << (t.running ? t.elapsed_time() : 0); + os << (t.running ? t.get_elapsed_time() : 0); + return os; +} + +} + diff --git a/mosesdecoder/moses2/legacy/Timer.h b/mosesdecoder/moses2/legacy/Timer.h new file mode 100644 index 0000000000000000000000000000000000000000..3f44ef4b9e858dd16ce5e3b9ab2182981372a57f --- /dev/null +++ b/mosesdecoder/moses2/legacy/Timer.h @@ -0,0 +1,39 @@ +#pragma once + +#include +#include +#include + +namespace Moses2 +{ + +/** Wrapper around time_t to time how long things have been running + * according to walltime. We avoid CPU time since it is less reliable + * in a multi-threaded environment and can spuriously include clock cycles + * used by other threads in the same process. + */ +class Timer +{ + friend std::ostream& operator<<(std::ostream& os, Timer& t); + +private: + bool running; + bool stopped; + double start_time; + double stop_time; + +public: + /*** + * 'running' is initially false. 
A timer needs to be explicitly started + * using 'start' + */ + Timer(); + + void start(const char* msg = 0); + void stop(const char* msg = 0); + void check(const char* msg = 0); + double get_elapsed_time() const; +}; + +} + diff --git a/mosesdecoder/moses2/legacy/Util2.cpp b/mosesdecoder/moses2/legacy/Util2.cpp new file mode 100644 index 0000000000000000000000000000000000000000..9b4ff217cf3fa7f9acb387b11d0a038771f04c54 --- /dev/null +++ b/mosesdecoder/moses2/legacy/Util2.cpp @@ -0,0 +1,30 @@ +#include "Util2.h" +#include "util/exception.hh" + +namespace Moses2 +{ + +class BoolValueException: public util::Exception +{ +}; + +template<> +bool Scan(const std::string &input) +{ + std::string lc = ToLower(input); + if (lc == "yes" || lc == "y" || lc == "true" || lc == "1") return true; + if (lc == "no" || lc == "n" || lc == "false" || lc == "0") return false; + UTIL_THROW(BoolValueException, + "Could not interpret " << input << " as a boolean. After lowercasing, valid values are yes, y, true, 1, no, n, false, and 0."); +} + +const std::string ToLower(const std::string& str) +{ + std::string lc(str); + std::transform(lc.begin(), lc.end(), lc.begin(), (int (*)(int))std::tolower); + return + lc ; +} + +} + diff --git a/mosesdecoder/moses2/legacy/Util2.h b/mosesdecoder/moses2/legacy/Util2.h new file mode 100644 index 0000000000000000000000000000000000000000..3a5ad47190c05bf797347b6b8b1c3327a48a1833 --- /dev/null +++ b/mosesdecoder/moses2/legacy/Util2.h @@ -0,0 +1,336 @@ +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "../TypeDef.h" +#include "util/exception.hh" + +namespace Moses2 +{ + +#ifdef TRACE_ERR +#undef TRACE_ERR +#endif +#ifdef TRACE_ENABLE +#define TRACE_ERR(str) do { std::cerr << str; } while (false) +#else +#define TRACE_ERR(str) do {} while (false) +#endif + +//////////////////////////////////////////////////// + +template +class UnorderedComparer +{ +public: + size_t 
operator()(const T* obj) const { + return obj->hash(); + } + + bool operator()(const T* a, const T* b) const { + return a->hash() == b->hash(); + } + +}; + +//////////////////////////////////////////////////// + + +template +void Init(T arr[], size_t size, const T &val) +{ + for (size_t i = 0; i < size; ++i) { + arr[i] = val; + } +} + +//! delete white spaces at beginning and end of string +inline std::string Trim(const std::string& str, const std::string dropChars = + " \t\n\r") +{ + std::string res = str; + res.erase(str.find_last_not_of(dropChars) + 1); + return res.erase(0, res.find_first_not_of(dropChars)); +} + +//! convert string to variable of type T. Used to reading floats, int etc from files +template +inline T Scan(const std::string &input) +{ + std::stringstream stream(input); + T ret; + stream >> ret; + return ret; +} + +//! just return input +template<> +inline std::string Scan(const std::string &input) +{ + return input; +} + +template<> +inline SCORE Scan(const std::string &input) +{ + SCORE ret = atof(input.c_str()); + return ret; +} + +//! 
Specialisation to understand yes/no y/n true/false 0/1 +template<> +bool Scan(const std::string &input); + +template<> +inline S2TParsingAlgorithm Scan(const std::string &input) +{ + return (S2TParsingAlgorithm) Scan(input); +} + +template<> +inline SourceLabelOverlap Scan(const std::string &input) +{ + return (SourceLabelOverlap) Scan(input); +} + +template<> +inline SearchAlgorithm Scan(const std::string &input) +{ + return (SearchAlgorithm) Scan(input); +} + +template<> +inline XmlInputType Scan(const std::string &input) +{ + XmlInputType ret; + if (input=="exclusive") ret = XmlExclusive; + else if (input=="inclusive") ret = XmlInclusive; + else if (input=="constraint") ret = XmlConstraint; + else if (input=="ignore") ret = XmlIgnore; + else if (input=="pass-through") ret = XmlPassThrough; + else { + UTIL_THROW2("Unknown XML input type"); + } + + return ret; +} + +template<> +inline InputTypeEnum Scan(const std::string &input) +{ + return (InputTypeEnum) Scan(input); +} + +template<> +inline WordAlignmentSort Scan(const std::string &input) +{ + return (WordAlignmentSort) Scan(input); +} + +//! convert vectors of string to vectors of type T variables +template +inline std::vector Scan(const std::vector &input) +{ + std::vector output(input.size()); + for (size_t i = 0; i < input.size(); i++) { + output[i] = Scan(input[i]); + } + return output; +} + +//! speeded up version of above +template +inline void Scan(std::vector &output, const std::vector &input) +{ + output.resize(input.size()); + for (size_t i = 0; i < input.size(); i++) { + output[i] = Scan(input[i]); + } +} + +/** tokenise input string to vector of string. each element has been separated by a character in the delimiters argument. + The separator can only be 1 character long. The default delimiters are space or tab + */ +inline std::vector Tokenize(const std::string& str, + const std::string& delimiters = " \t") +{ + std::vector tokens; + // Skip delimiters at beginning. 
+ std::string::size_type lastPos = str.find_first_not_of(delimiters, 0); + // Find first "non-delimiter". + std::string::size_type pos = str.find_first_of(delimiters, lastPos); + + while (std::string::npos != pos || std::string::npos != lastPos) { + // Found a token, add it to the vector. + tokens.push_back(str.substr(lastPos, pos - lastPos)); + // Skip delimiters. Note the "not_of" + lastPos = str.find_first_not_of(delimiters, pos); + // Find next "non-delimiter" + pos = str.find_first_of(delimiters, lastPos); + } + + return tokens; +} + +//! tokenise input string to vector of type T +template +inline std::vector Tokenize(const std::string &input, + const std::string& delimiters = " \t") +{ + std::vector stringVector = Tokenize(input, delimiters); + return Scan(stringVector); +} + +/** only split of the first delimiter. Used by class FeatureFunction for parse key=value pair. + * Value may have = character + */ +inline std::vector TokenizeFirstOnly(const std::string& str, + const std::string& delimiters = " \t") +{ + std::vector tokens; + std::string::size_type pos = str.find_first_of(delimiters); + + if (std::string::npos != pos) { + // Found a token, add it to the vector. + tokens.push_back(str.substr(0, pos)); + tokens.push_back(str.substr(pos + 1, str.size() - pos - 1)); + } else { + tokens.push_back(str); + } + + return tokens; +} + +inline std::vector TokenizeMultiCharSeparator( + const std::string& str, const std::string& separator) +{ + std::vector tokens; + + size_t pos = 0; + // Find first "non-delimiter". + std::string::size_type nextPos = str.find(separator, pos); + + while (nextPos != std::string::npos) { + // Found a token, add it to the vector. + tokens.push_back(str.substr(pos, nextPos - pos)); + // Skip delimiters. 
Note the "not_of" + pos = nextPos + separator.size(); + // Find next "non-delimiter" + nextPos = str.find(separator, pos); + } + tokens.push_back(str.substr(pos, nextPos - pos)); + + return tokens; +} + +// speeded up version of above +inline void TokenizeMultiCharSeparator(std::vector &output, + const std::string& str, const std::string& separator) +{ + size_t pos = 0; + // Find first "non-delimiter". + std::string::size_type nextPos = str.find(separator, pos); + + while (nextPos != std::string::npos) { + // Found a token, add it to the vector. + output.push_back(Trim(str.substr(pos, nextPos - pos))); + // Skip delimiters. Note the "not_of" + pos = nextPos + separator.size(); + // Find next "non-delimiter" + nextPos = str.find(separator, pos); + } + output.push_back(Trim(str.substr(pos, nextPos - pos))); +} + +//! get string representation of any object/variable, as long as it can pipe to a stream +template +inline std::string SPrint(const T &input) +{ + std::stringstream stream(""); + stream << input; + return stream.str(); +} + +//! irst number are in log 10, transform to natural log +inline float TransformLMScore(float irstScore) +{ + return irstScore * 2.30258509299405f; +} + +//! transform prob to natural log score +inline float TransformScore(float prob) +{ + return log(prob); +} + +//! make sure score doesn't fall below LOWEST_SCORE +inline float FloorScore(float logScore) +{ + return (std::max)(logScore, LOWEST_SCORE); +} + +inline float UntransformLMScore(float logNScore) +{ + // opposite of above + return logNScore / 2.30258509299405f; +} + +inline bool FileExists(const std::string& filePath) +{ + std::ifstream ifs(filePath.c_str()); + return !ifs.fail(); +} + +const std::string ToLower(const std::string& str); + +//! 
delete and remove every element of a collection object such as set, list etc +template +void RemoveAllInColl(COLL &coll) +{ + for (typename COLL::const_iterator iter = coll.begin(); iter != coll.end(); + ++iter) { + delete (*iter); + } + coll.clear(); +} + +template +void Swap(T &a, T &b) +{ + T &c = a; + a = b; + b = c; +} + +// grab the underlying contain of priority queue +template +S& Container(std::priority_queue& q) +{ + struct HackedQueue: private std::priority_queue { + static S& Container(std::priority_queue& q) { + return q.*&HackedQueue::c; + } + }; + return HackedQueue::Container(q); +} + +#define HERE __FILE__ << ":" << __LINE__ + +/** Enforce rounding */ +inline void FixPrecision(std::ostream& stream, size_t size = 3) +{ + stream.setf(std::ios::fixed); + stream.precision(size); +} + +} + diff --git a/mosesdecoder/moses2/legacy/gzfilebuf.h b/mosesdecoder/moses2/legacy/gzfilebuf.h new file mode 100644 index 0000000000000000000000000000000000000000..db59980951190cb8703aefb41a38c35e755ceb5b --- /dev/null +++ b/mosesdecoder/moses2/legacy/gzfilebuf.h @@ -0,0 +1,94 @@ +#ifndef moses_gzfile_buf_h +#define moses_gzfile_buf_h + +#include +#include +#include +#include + +namespace Moses2 +{ + +/** wrapper around gzip input stream. 
Unknown parentage + * @todo replace with boost version - output stream already uses it + */ +class gzfilebuf: public std::streambuf +{ +public: + gzfilebuf(const char *filename) { + _gzf = gzopen(filename, "rb"); + if (!_gzf) throw std::runtime_error( + "Could not open " + std::string(filename) + "."); + setg(_buff + sizeof(int), // beginning of putback area + _buff + sizeof(int), // read position + _buff + sizeof(int)); // end position + } + ~gzfilebuf() { + gzclose(_gzf); + } +protected: + virtual int_type overflow(int_type /* c */) { + throw; + } + + // write multiple characters + virtual std::streamsize xsputn(const char* /* s */, std::streamsize /* num */) { + throw; + } + + virtual std::streampos seekpos(std::streampos /* sp */, + std::ios_base::openmode /* which = std::ios_base::in | std::ios_base::out */) { + throw; + } + + //read one character + virtual int_type underflow() { + // is read position before end of _buff? + if (gptr() < egptr()) { + return traits_type::to_int_type(*gptr()); + } + + /* process size of putback area + * - use number of characters read + * - but at most four + */ + unsigned int numPutback = gptr() - eback(); + if (numPutback > sizeof(int)) { + numPutback = sizeof(int); + } + + /* copy up to four characters previously read into + * the putback _buff (area of first four characters) + */ + std::memmove(_buff + (sizeof(int) - numPutback), gptr() - numPutback, + numPutback); + + // read new characters + int num = gzread(_gzf, _buff + sizeof(int), _buffsize - sizeof(int)); + if (num <= 0) { + // ERROR or EOF + return EOF; + } + + // reset _buff pointers + setg(_buff + (sizeof(int) - numPutback), // beginning of putback area + _buff + sizeof(int), // read position + _buff + sizeof(int) + num); // end of buffer + + // return next character + return traits_type::to_int_type(*gptr()); + } + + std::streamsize xsgetn(char* s, std::streamsize num) { + return gzread(_gzf, s, num); + } + +private: + gzFile _gzf; + static const unsigned int 
_buffsize = 1024; + char _buff[_buffsize]; +}; + +} + +#endif diff --git a/mosesdecoder/moses2/legacy/xmlrpc-c.h b/mosesdecoder/moses2/legacy/xmlrpc-c.h new file mode 100644 index 0000000000000000000000000000000000000000..1cdccad16d9947c2ffacdc8065f54ccf2f6fae87 --- /dev/null +++ b/mosesdecoder/moses2/legacy/xmlrpc-c.h @@ -0,0 +1,10 @@ +#pragma once + +#ifdef HAVE_XMLRPC_C +#include +#else +namespace xmlrpc_c +{ +class value; +} +#endif diff --git a/mosesdecoder/moses2/parameters/AllOptions.cpp b/mosesdecoder/moses2/parameters/AllOptions.cpp new file mode 100644 index 0000000000000000000000000000000000000000..664b8dc6d4dd27d0707dadb5ab147b9895352afd --- /dev/null +++ b/mosesdecoder/moses2/parameters/AllOptions.cpp @@ -0,0 +1,118 @@ +// -*- mode: c++; indent-tabs-mode: nil; tab-width: 2 -*- +#include "../legacy/Parameter.h" +#include "../legacy/Util2.h" +#include "AllOptions.h" + +namespace Moses2 +{ +AllOptions:: +AllOptions() + : mira(false) + , use_legacy_pt(false) +{ } + +AllOptions:: +AllOptions(Parameter const& param) +{ + init(param); +} + +bool +AllOptions:: +init(Parameter const& param) +{ + if (!search.init(param)) return false; + if (!cube.init(param)) return false; + if (!nbest.init(param)) return false; + if (!reordering.init(param)) return false; + if (!context.init(param)) return false; + if (!input.init(param)) return false; + if (!mbr.init(param)) return false; + if (!lmbr.init(param)) return false; + if (!output.init(param)) return false; + if (!unk.init(param)) return false; + if (!server.init(param)) return false; + if (!syntax.init(param)) return false; + + param.SetParameter(mira, "mira", false); + + return sanity_check(); +} + +bool +AllOptions:: +sanity_check() +{ + using namespace std; + if (lmbr.enabled) { + if (mbr.enabled) { + cerr << "Error: Cannot use both n-best mbr and lattice mbr together" << endl; + return false; + } + mbr.enabled = true; + } + if (search.consensus) { + if (mbr.enabled) { + cerr << "Error: Cannot use consensus 
decoding together with mbr" + << endl; + return false; + } + mbr.enabled = true; + } + + // RecoverPath should only be used with confusion net or word lattice input + if (output.RecoverPath && input.input_type == SentenceInput) { + TRACE_ERR("--recover-input-path should only be used with confusion net or word lattice input!\n"); + output.RecoverPath = false; + } + + // set m_nbest_options.enabled = true if necessary: + nbest.enabled = (nbest.enabled || mira || search.consensus + || nbest.nbest_size > 0 + || mbr.enabled || lmbr.enabled + || !output.SearchGraph.empty() + || !output.SearchGraphExtended.empty() + || !output.SearchGraphSLF.empty() + || !output.SearchGraphHG.empty() + || !output.SearchGraphPB.empty() + || output.lattice_sample_size != 0); + + return true; +} + +#ifdef HAVE_XMLRPC_C +bool +AllOptions:: +update(std::mapconst& param) +{ + if (!search.update(param)) return false; + if (!cube.update(param)) return false; + if (!nbest.update(param)) return false; + if (!reordering.update(param)) return false; + if (!context.update(param)) return false; + if (!input.update(param)) return false; + if (!mbr.update(param)) return false; + if (!lmbr.update(param)) return false; + if (!output.update(param)) return false; + if (!unk.update(param)) return false; + //if (!server.update(param)) return false; + //if (!syntax.update(param)) return false; + return sanity_check(); +} +#endif + +bool +AllOptions:: +NBestDistinct() const +{ + return (nbest.only_distinct + || mbr.enabled || lmbr.enabled + || output.lattice_sample_size + || !output.SearchGraph.empty() + || !output.SearchGraphExtended.empty() + || !output.SearchGraphSLF.empty() + || !output.SearchGraphHG.empty()); +} + + +} diff --git a/mosesdecoder/moses2/parameters/AllOptions.h b/mosesdecoder/moses2/parameters/AllOptions.h new file mode 100644 index 0000000000000000000000000000000000000000..2f09cd3854deec9dcaa23e242838036ac32e251e --- /dev/null +++ b/mosesdecoder/moses2/parameters/AllOptions.h @@ -0,0 +1,50 @@ 
+// -*- mode: c++; indent-tabs-mode: nil; tab-width: 2 -*- +#pragma once +#include +#include +#include "OptionsBaseClass.h" +#include "SearchOptions.h" +#include "CubePruningOptions.h" +#include "NBestOptions.h" +#include "ReorderingOptions.h" +#include "ContextParameters.h" +#include "InputOptions.h" +#include "MBR_Options.h" +#include "LMBR_Options.h" +#include "ReportingOptions.h" +#include "OOVHandlingOptions.h" +#include "ServerOptions.h" +#include "SyntaxOptions.h" + +namespace Moses2 +{ +struct + AllOptions : public OptionsBaseClass { + typedef boost::shared_ptr ptr; + SearchOptions search; + CubePruningOptions cube; + NBestOptions nbest; + ReorderingOptions reordering; + ContextParameters context; + InputOptions input; + MBR_Options mbr; + LMBR_Options lmbr; + ReportingOptions output; + OOVHandlingOptions unk; + ServerOptions server; + SyntaxOptions syntax; + bool mira; + bool use_legacy_pt; + // StackOptions stack; + // BeamSearchOptions beam; + bool init(Parameter const& param); + bool sanity_check(); + AllOptions(); + AllOptions(Parameter const& param); + + bool update(std::mapconst& param); + bool NBestDistinct() const; + +}; + +} diff --git a/mosesdecoder/moses2/parameters/BeamSearchOptions.h b/mosesdecoder/moses2/parameters/BeamSearchOptions.h new file mode 100644 index 0000000000000000000000000000000000000000..590c7a53f7db01dbba0deaf564310326edb0e17b --- /dev/null +++ b/mosesdecoder/moses2/parameters/BeamSearchOptions.h @@ -0,0 +1,14 @@ +// -*- mode: c++; indent-tabs-mode: nil; tab-width: 2 -*- +#pragma once +#include +#include "OptionsBaseClass.h" +namespace Moses2 +{ + +struct + BeamSearchOptions : public OptionsBaseClass { + bool init(Parameter const& param); + BeamSearchOptions(Parameter const& param); +}; + +} diff --git a/mosesdecoder/moses2/parameters/BookkeepingOptions.cpp b/mosesdecoder/moses2/parameters/BookkeepingOptions.cpp new file mode 100644 index 0000000000000000000000000000000000000000..d54f84644a4162e26f7e8c69b10bf455422996fe --- 
/dev/null +++ b/mosesdecoder/moses2/parameters/BookkeepingOptions.cpp @@ -0,0 +1,26 @@ +#include "BookkeepingOptions.h" +#include "../legacy/Parameter.h" + +namespace Moses2 +{ + +bool +BookkeepingOptions:: +init(Parameter const& P) +{ + bool& x = need_alignment_info; + P.SetParameter(x, "print-alignment-info", false); + if (!x) P.SetParameter(x, "print-alignment-info-in-n-best", false); + if (!x) { + PARAM_VEC const* params = P.GetParam("alignment-output-file"); + x = params && params->size(); + } + return true; +} + +BookkeepingOptions:: +BookkeepingOptions() + : need_alignment_info(false) +{ } + +} diff --git a/mosesdecoder/moses2/parameters/BookkeepingOptions.h b/mosesdecoder/moses2/parameters/BookkeepingOptions.h new file mode 100644 index 0000000000000000000000000000000000000000..75a04a2a03a03abed2c66cf8e1aa279c0ed56255 --- /dev/null +++ b/mosesdecoder/moses2/parameters/BookkeepingOptions.h @@ -0,0 +1,17 @@ +// -*- mode: c++; indent-tabs-mode: nil; tab-width: 2 -*- +#pragma once +#include "OptionsBaseClass.h" + +namespace Moses2 +{ +class Parameter; + +struct BookkeepingOptions : public OptionsBaseClass { + bool need_alignment_info; + bool init(Parameter const& param); + BookkeepingOptions(); +}; + + + +} diff --git a/mosesdecoder/moses2/parameters/ContextParameters.cpp b/mosesdecoder/moses2/parameters/ContextParameters.cpp new file mode 100644 index 0000000000000000000000000000000000000000..3c5b894eefa17281a50157e64a061c998d0b041d --- /dev/null +++ b/mosesdecoder/moses2/parameters/ContextParameters.cpp @@ -0,0 +1,51 @@ +#include "ContextParameters.h" +#include "../legacy/Parameter.h" + +namespace Moses2 +{ + +ContextParameters:: +ContextParameters() + : look_ahead(0), look_back(0) +{ } + +bool +ContextParameters:: +init(Parameter const& params) +{ + look_back = look_ahead = 0; + params.SetParameter(context_string, "context-string", std::string("")); + std::string context_window; + params.SetParameter(context_window, "context-window", std::string("")); + + if 
(context_window == "") + return true; + + if (context_window.substr(0,3) == "all") { + look_back = look_ahead = std::numeric_limits::max(); + return true; + } + + size_t p = context_window.find_first_of("0123456789"); + if (p == 0) + look_back = look_ahead = atoi(context_window.c_str()); + + if (p == 1) { + if (context_window[0] == '-') + look_back = atoi(context_window.substr(1).c_str()); + else if (context_window[0] == '+') + look_ahead = atoi(context_window.substr(1).c_str()); + else + UTIL_THROW2("Invalid specification of context window."); + } + + if (p == 2) { + if (context_window.substr(0,2) == "+-" || + context_window.substr(0,2) == "-+") + look_back = look_ahead = atoi(context_window.substr(p).c_str()); + else + UTIL_THROW2("Invalid specification of context window."); + } + return true; +} +} diff --git a/mosesdecoder/moses2/parameters/ContextParameters.h b/mosesdecoder/moses2/parameters/ContextParameters.h new file mode 100644 index 0000000000000000000000000000000000000000..5226e8ecadc6003fe63ef1e12427ac97af3fa85a --- /dev/null +++ b/mosesdecoder/moses2/parameters/ContextParameters.h @@ -0,0 +1,19 @@ +// -*- mode: c++; indent-tabs-mode: nil; tab-width: 2 -*- +#pragma once +#include +#include "OptionsBaseClass.h" + +namespace Moses2 +{ + +class ContextParameters : public OptionsBaseClass +{ +public: + ContextParameters(); + bool init(Parameter const& params); + size_t look_ahead; // # of words to look ahead for context-sensitive decoding + size_t look_back; // # of works to look back for context-sensitive decoding + std::string context_string; // fixed context string specified on command line +}; + +} diff --git a/mosesdecoder/moses2/parameters/CubePruningOptions.cpp b/mosesdecoder/moses2/parameters/CubePruningOptions.cpp new file mode 100644 index 0000000000000000000000000000000000000000..0772eaddb5d380e2cdb0d07bf15400da24af5da6 --- /dev/null +++ b/mosesdecoder/moses2/parameters/CubePruningOptions.cpp @@ -0,0 +1,76 @@ +// -*- mode: c++; indent-tabs-mode: 
nil; tab-width: 2 -*- +#include "CubePruningOptions.h" +#include "../TypeDef.h" +#include "../legacy/Parameter.h" + +namespace Moses2 +{ + +CubePruningOptions:: +CubePruningOptions() + : pop_limit(DEFAULT_CUBE_PRUNING_POP_LIMIT) + , diversity(DEFAULT_CUBE_PRUNING_DIVERSITY) + , lazy_scoring(false) + , deterministic_search(false) +{} + +bool +CubePruningOptions:: +init(Parameter const& param) +{ + param.SetParameter(pop_limit, "cube-pruning-pop-limit", + DEFAULT_CUBE_PRUNING_POP_LIMIT); + param.SetParameter(diversity, "cube-pruning-diversity", + DEFAULT_CUBE_PRUNING_DIVERSITY); + param.SetParameter(lazy_scoring, "cube-pruning-lazy-scoring", false); + //param.SetParameter(deterministic_search, "cube-pruning-deterministic-search", false); + return true; +} + +#ifdef HAVE_XMLRPC_C +bool +CubePruningOptions:: +update(std::mapconst& params) +{ + typedef std::map params_t; + + params_t::const_iterator si = params.find("cube-pruning-pop-limit"); + if (si != params.end()) pop_limit = xmlrpc_c::value_int(si->second); + + si = params.find("cube-pruning-diversity"); + if (si != params.end()) diversity = xmlrpc_c::value_int(si->second); + + si = params.find("cube-pruning-lazy-scoring"); + if (si != params.end()) { + std::string spec = xmlrpc_c::value_string(si->second); + if (spec == "true" or spec == "on" or spec == "1") + lazy_scoring = true; + else if (spec == "false" or spec == "off" or spec == "0") + lazy_scoring = false; + else { + char const* msg + = "Error parsing specification for cube-pruning-lazy-scoring"; + xmlrpc_c::fault(msg, xmlrpc_c::fault::CODE_PARSE); + } + } + + si = params.find("cube-pruning-deterministic-search"); + if (si != params.end()) { + std::string spec = xmlrpc_c::value_string(si->second); + if (spec == "true" or spec == "on" or spec == "1") + deterministic_search = true; + else if (spec == "false" or spec == "off" or spec == "0") + deterministic_search = false; + else { + char const* msg + = "Error parsing specification for 
cube-pruning-deterministic-search"; + xmlrpc_c::fault(msg, xmlrpc_c::fault::CODE_PARSE); + } + } + + return true; +} +#endif + + +} diff --git a/mosesdecoder/moses2/parameters/CubePruningOptions.h b/mosesdecoder/moses2/parameters/CubePruningOptions.h new file mode 100644 index 0000000000000000000000000000000000000000..6fa43b7ec7a43e4cbe473a583bacc5707616475a --- /dev/null +++ b/mosesdecoder/moses2/parameters/CubePruningOptions.h @@ -0,0 +1,24 @@ +// -*- mode: c++; indent-tabs-mode: nil; tab-width: 2 -*- +#pragma once +#include +#include "OptionsBaseClass.h" + +namespace Moses2 +{ + +struct + CubePruningOptions : public OptionsBaseClass { + size_t pop_limit; + size_t diversity; + bool lazy_scoring; + bool deterministic_search; + + bool init(Parameter const& param); + CubePruningOptions(Parameter const& param); + CubePruningOptions(); + + bool + update(std::mapconst& params); +}; + +} diff --git a/mosesdecoder/moses2/parameters/InputOptions.cpp b/mosesdecoder/moses2/parameters/InputOptions.cpp new file mode 100644 index 0000000000000000000000000000000000000000..7a8c9242c5c0a6f03f13bdf5138def6fbfce45d5 --- /dev/null +++ b/mosesdecoder/moses2/parameters/InputOptions.cpp @@ -0,0 +1,99 @@ +// -*- mode: c++; indent-tabs-mode: nil; tab-width: 2 -*- +#include "InputOptions.h" +#include +#include +#include "../legacy/Parameter.h" + +namespace Moses2 +{ + +InputOptions:: +InputOptions() + : continue_partial_translation(false) + , input_type(SentenceInput) + , xml_policy(XmlPassThrough) + , placeholder_factor(NOT_FOUND) +{ + xml_brackets.first = "<"; + xml_brackets.second = ">"; + factor_order.assign(1,0); + factor_delimiter = "|"; +} + +bool +InputOptions:: +init(Parameter const& param) +{ + param.SetParameter(input_type, "inputtype", SentenceInput); +#if 0 + if (input_type == SentenceInput) { + VERBOSE(2, "input type is: text input"); + } else if (input_type == ConfusionNetworkInput) { + VERBOSE(2, "input type is: confusion net"); + } else if (input_type == WordLatticeInput) 
{ + VERBOSE(2, "input type is: word lattice"); + } else if (input_type == TreeInputType) { + VERBOSE(2, "input type is: tree"); + } else if (input_type == TabbedSentenceInput) { + VERBOSE(2, "input type is: tabbed sentence"); + } else if (input_type == ForestInputType) { + VERBOSE(2, "input type is: forest"); + } +#endif + + + param.SetParameter(continue_partial_translation, + "continue-partial-translation", false); + + param.SetParameter(xml_policy, "xml-input", XmlPassThrough); + + // specify XML tags opening and closing brackets for XML option + // Do we really want this to be configurable???? UG + const PARAM_VEC *pspec; + pspec = param.GetParam("xml-brackets"); + if (pspec && pspec->size()) { + std::vector brackets = Tokenize(pspec->at(0)); + if(brackets.size()!=2) { + std::cerr << "invalid xml-brackets value, " + << "must specify exactly 2 blank-delimited strings " + << "for XML tags opening and closing brackets" + << std::endl; + exit(1); + } + + xml_brackets.first= brackets[0]; + xml_brackets.second=brackets[1]; + +#if 0 + VERBOSE(1,"XML tags opening and closing brackets for XML input are: " + << xml_brackets.first << " and " + << xml_brackets.second << std::endl); +#endif + } + + pspec = param.GetParam("input-factors"); + if (pspec) factor_order = Scan(*pspec); + if (factor_order.empty()) factor_order.assign(1,0); + param.SetParameter(placeholder_factor, "placeholder-factor", NOT_FOUND); + + param.SetParameter(factor_delimiter, "factor-delimiter", "|"); + param.SetParameter(input_file_path,"input-file",""); + + return true; +} + + +#ifdef HAVE_XMLRPC_C +bool +InputOptions:: +update(std::mapconst& param) +{ + typedef std::map params_t; + params_t::const_iterator si = param.find("xml-input"); + if (si != param.end()) + xml_policy = Scan(xmlrpc_c::value_string(si->second)); + return true; +} +#endif + +} diff --git a/mosesdecoder/moses2/parameters/InputOptions.h b/mosesdecoder/moses2/parameters/InputOptions.h new file mode 100644 index 
0000000000000000000000000000000000000000..6e70e1e1e627058d11dc2f2953a9694b64db2bda --- /dev/null +++ b/mosesdecoder/moses2/parameters/InputOptions.h @@ -0,0 +1,31 @@ +// -*- mode: c++; indent-tabs-mode: nil; tab-width: 2 -*- +#pragma once +#include +#include +#include "OptionsBaseClass.h" +#include "../TypeDef.h" + +namespace Moses2 +{ +struct + InputOptions : public OptionsBaseClass { + bool continue_partial_translation; + InputTypeEnum input_type; + XmlInputType xml_policy; // pass through, ignore, exclusive, inclusive + std::vector factor_order; // input factor order + std::string factor_delimiter; + FactorType placeholder_factor; // where to store original text for placeholders + std::string input_file_path; + std::pair xml_brackets; + // strings to use as XML tags' opening and closing brackets. + // Default are "<" and ">" + + InputOptions(); + + bool init(Parameter const& param); + bool update(std::mapconst& param); + +}; + +} + diff --git a/mosesdecoder/moses2/parameters/LMBR_Options.cpp b/mosesdecoder/moses2/parameters/LMBR_Options.cpp new file mode 100644 index 0000000000000000000000000000000000000000..a65c071b974e5b61e4ba2c1139abe93dd1486c9c --- /dev/null +++ b/mosesdecoder/moses2/parameters/LMBR_Options.cpp @@ -0,0 +1,39 @@ +// -*- mode: c++; indent-tabs-mode: nil; tab-width: 2 -*- +#include "LMBR_Options.h" +#include "../legacy/Parameter.h" + +namespace Moses2 +{ + +LMBR_Options:: +LMBR_Options() + : enabled(false) + , use_lattice_hyp_set(false) + , precision(0.8f) + , ratio(0.6f) + , map_weight(0.8f) + , pruning_factor(30) +{ } + +bool +LMBR_Options:: +init(Parameter const& param) +{ + param.SetParameter(enabled, "lminimum-bayes-risk", false); + + param.SetParameter(ratio, "lmbr-r", 0.6f); + param.SetParameter(precision, "lmbr-p", 0.8f); + param.SetParameter(map_weight, "lmbr-map-weight", 0.0f); + param.SetParameter(pruning_factor, "lmbr-pruning-factor", size_t(30)); + param.SetParameter(use_lattice_hyp_set, "lattice-hypo-set", false); + + PARAM_VEC 
const* params = param.GetParam("lmbr-thetas"); + if (params) theta = Scan(*params); + + return true; +} + + + + +} diff --git a/mosesdecoder/moses2/parameters/LMBR_Options.h b/mosesdecoder/moses2/parameters/LMBR_Options.h new file mode 100644 index 0000000000000000000000000000000000000000..84e5fd75900c212db17c9ba474df05425eb4fe42 --- /dev/null +++ b/mosesdecoder/moses2/parameters/LMBR_Options.h @@ -0,0 +1,25 @@ +// -*- mode: c++; indent-tabs-mode: nil; tab-width: 2 -*- +#pragma once +#include +#include +#include "OptionsBaseClass.h" + +namespace Moses2 +{ + +// Options for mimum bayes risk decoding +struct + LMBR_Options : public OptionsBaseClass { + bool enabled; + bool use_lattice_hyp_set; //! to use nbest as hypothesis set during lattice MBR + float precision; //! unigram precision theta - see Tromble et al 08 for more details + float ratio; //! decaying factor for ngram thetas - see Tromble et al 08 + float map_weight; //! Weight given to the map solution. See Kumar et al 09 + size_t pruning_factor; //! average number of nodes per word wanted in pruned lattice + std::vector theta; //! 
theta(s) for lattice mbr calculation + bool init(Parameter const& param); + LMBR_Options(); +}; + +} + diff --git a/mosesdecoder/moses2/parameters/LookupOptions.h b/mosesdecoder/moses2/parameters/LookupOptions.h new file mode 100644 index 0000000000000000000000000000000000000000..58aa733cfb88fcdee9427d53318f286886f147d9 --- /dev/null +++ b/mosesdecoder/moses2/parameters/LookupOptions.h @@ -0,0 +1,16 @@ +// -*- mode: c++; indent-tabs-mode: nil; tab-width: 2 -*- +#pragma once +#include +#include "OptionsBaseClass.h" + +namespace Moses2 +{ + +struct + LookupOptions : public OptionsBaseClass { + bool init(Parameter const& param); + LookupOptions() {} +}; + +} + diff --git a/mosesdecoder/moses2/parameters/MBR_Options.cpp b/mosesdecoder/moses2/parameters/MBR_Options.cpp new file mode 100644 index 0000000000000000000000000000000000000000..0e7abd3846c4c0920b11ebc523e794e91d62f88f --- /dev/null +++ b/mosesdecoder/moses2/parameters/MBR_Options.cpp @@ -0,0 +1,26 @@ +// -*- mode: c++; indent-tabs-mode: nil; tab-width: 2 -*- +#include "MBR_Options.h" +#include "../legacy/Parameter.h" + +namespace Moses2 +{ + +MBR_Options:: +MBR_Options() + : enabled(false) + , size(200) + , scale(1.0f) +{} + + +bool +MBR_Options:: +init(Parameter const& param) +{ + param.SetParameter(enabled, "minimum-bayes-risk", false); + param.SetParameter(size, "mbr-size", 200); + param.SetParameter(scale, "mbr-scale", 1.0f); + return true; +} + +} diff --git a/mosesdecoder/moses2/parameters/MBR_Options.h b/mosesdecoder/moses2/parameters/MBR_Options.h new file mode 100644 index 0000000000000000000000000000000000000000..0f8068ca01804538e74a09b1588819dc854ca3b4 --- /dev/null +++ b/mosesdecoder/moses2/parameters/MBR_Options.h @@ -0,0 +1,20 @@ +// -*- mode: c++; indent-tabs-mode: nil; tab-width: 2 -*- +#pragma once +#include +#include "OptionsBaseClass.h" +namespace Moses2 +{ + +// Options for mimum bayes risk decoding +struct + MBR_Options : public OptionsBaseClass { + bool enabled; + size_t size; //! 
number of translation candidates considered + float scale; /*! scaling factor for computing marginal probability + * of candidate translation */ + bool init(Parameter const& param); + MBR_Options(); +}; + +} + diff --git a/mosesdecoder/moses2/parameters/NBestOptions.cpp b/mosesdecoder/moses2/parameters/NBestOptions.cpp new file mode 100644 index 0000000000000000000000000000000000000000..d72c155e286f8346a0bf58c5d9581c8ff7374f15 --- /dev/null +++ b/mosesdecoder/moses2/parameters/NBestOptions.cpp @@ -0,0 +1,68 @@ +// -*- mode: c++; indent-tabs-mode: nil; tab-width: 2 -*- +#include "NBestOptions.h" +#include "../legacy/Parameter.h" + +namespace Moses2 +{ + +NBestOptions:: +NBestOptions() + : nbest_size(0) + , factor(20) + , enabled(false) + , print_trees(false) + , only_distinct(false) + , include_alignment_info(false) + , include_feature_labels(true) + , include_segmentation(false) + , include_passthrough(false) + , include_all_factors(false) +{} + + +bool +NBestOptions:: +init(Parameter const& P) +{ + const PARAM_VEC *params; + params = P.GetParam("n-best-list"); + if (params) { + if (params->size() >= 2) { + output_file_path = params->at(0); + nbest_size = Scan( params->at(1) ); + only_distinct = (params->size()>2 && params->at(2)=="distinct"); + } else { + std::cerr << "wrong format for switch -n-best-list file size [distinct]"; + return false; + } + } else nbest_size = 0; + + P.SetParameter(factor, "n-best-factor", 20); + P.SetParameter(include_alignment_info, "print-alignment-info-in-n-best", false ); + P.SetParameter(include_feature_labels, "labeled-n-best-list", true ); + P.SetParameter(include_segmentation, "include-segmentation-in-n-best", false ); + P.SetParameter(include_passthrough, "print-passthrough-in-n-best", false ); + P.SetParameter(include_all_factors, "report-all-factors-in-n-best", false ); + P.SetParameter(print_trees, "n-best-trees", false ); + + enabled = output_file_path.size(); + return true; +} + +#ifdef HAVE_XMLRPC_C +bool +NBestOptions:: 
+update(std::mapconst& param) +{ + typedef std::map params_t; + params_t::const_iterator si = param.find("nbest"); + if (si != param.end()) + nbest_size = xmlrpc_c::value_int(si->second); + only_distinct = check(param, "nbest-distinct", only_distinct); + enabled = (nbest_size > 0); + return true; +} +#endif + + +} // namespace Moses diff --git a/mosesdecoder/moses2/parameters/NBestOptions.h b/mosesdecoder/moses2/parameters/NBestOptions.h new file mode 100644 index 0000000000000000000000000000000000000000..d3caed425d87e6422121d8b292178eb25b410238 --- /dev/null +++ b/mosesdecoder/moses2/parameters/NBestOptions.h @@ -0,0 +1,31 @@ +// -*- mode: c++; indent-tabs-mode: nil; tab-width: 2 -*- +#pragma once +#include +#include "OptionsBaseClass.h" +namespace Moses2 +{ + +struct NBestOptions : public OptionsBaseClass { + size_t nbest_size; + size_t factor; + bool enabled; + bool print_trees; + bool only_distinct; + + bool include_alignment_info; + bool include_segmentation; + bool include_feature_labels; + bool include_passthrough; + + bool include_all_factors; + + std::string output_file_path; + + bool init(Parameter const& param); + + bool update(std::mapconst& param); + + NBestOptions(); +}; + +} diff --git a/mosesdecoder/moses2/parameters/OOVHandlingOptions.cpp b/mosesdecoder/moses2/parameters/OOVHandlingOptions.cpp new file mode 100644 index 0000000000000000000000000000000000000000..c7a5e30f2d7ff377a0b8fd83c17153d71bbacbe7 --- /dev/null +++ b/mosesdecoder/moses2/parameters/OOVHandlingOptions.cpp @@ -0,0 +1,48 @@ +// -*- mode: c++; indent-tabs-mode: nil; tab-width: 2 -*- +#include "OOVHandlingOptions.h" +#include +#include +#include "../legacy/Parameter.h" + +namespace Moses2 +{ + +OOVHandlingOptions:: +OOVHandlingOptions() +{ + drop = false; + mark = false; + prefix = "UNK"; + suffix = ""; + word_deletion_enabled = false; + always_create_direct_transopt = false; +} + +bool +OOVHandlingOptions:: +init(Parameter const& param) +{ + 
param.SetParameter(drop,"drop-unknown",false); + param.SetParameter(mark,"mark-unknown",false); + param.SetParameter(word_deletion_enabled, "phrase-drop-allowed", false); + param.SetParameter(always_create_direct_transopt, "always-create-direct-transopt", false); + param.SetParameter(prefix,"unknown-word-prefix","UNK"); + param.SetParameter(suffix,"unknown-word-suffix",""); + return true; +} + + +#ifdef HAVE_XMLRPC_C +bool +OOVHandlingOptions:: +update(std::mapconst& param) +{ + typedef std::map params_t; + // params_t::const_iterator si = param.find("xml-input"); + // if (si != param.end()) + // xml_policy = Scan(xmlrpc_c::value_string(si->second)); + return true; +} +#endif + +} diff --git a/mosesdecoder/moses2/parameters/OOVHandlingOptions.h b/mosesdecoder/moses2/parameters/OOVHandlingOptions.h new file mode 100644 index 0000000000000000000000000000000000000000..1b56d6d93e38027379d7bce3683483ce2f10c35e --- /dev/null +++ b/mosesdecoder/moses2/parameters/OOVHandlingOptions.h @@ -0,0 +1,26 @@ +// -*- mode: c++; indent-tabs-mode: nil; tab-width: 2 -*- +#pragma once +#include +#include +#include "OptionsBaseClass.h" + +namespace Moses2 +{ +struct + OOVHandlingOptions : public OptionsBaseClass { + bool drop; + bool mark; + std::string prefix; + std::string suffix; + + bool word_deletion_enabled; + bool always_create_direct_transopt; + OOVHandlingOptions(); + + bool init(Parameter const& param); + bool update(std::mapconst& param); + +}; + +} + diff --git a/mosesdecoder/moses2/parameters/OptionsBaseClass.cpp b/mosesdecoder/moses2/parameters/OptionsBaseClass.cpp new file mode 100644 index 0000000000000000000000000000000000000000..8ccb0a5639aabb10a13099cb40270c3629a7f5e0 --- /dev/null +++ b/mosesdecoder/moses2/parameters/OptionsBaseClass.cpp @@ -0,0 +1,29 @@ +// -*- mode: c++; indent-tabs-mode: nil; tab-width:2 -*- +#include "OptionsBaseClass.h" +#include "../legacy/Parameter.h" + +namespace Moses2 +{ + +#ifdef HAVE_XMLRPC_C +bool +OptionsBaseClass:: 
+update(std::mapconst& params) +{ + return true; +} +#endif + +#ifdef HAVE_XMLRPC_C +bool +OptionsBaseClass:: +check(std::map const& param, + std::string const key, bool dfltval) +{ + std::map::const_iterator m; + m = param.find(key); + if (m == param.end()) return dfltval; + return Scan(xmlrpc_c::value_string(m->second)); +} +#endif +} diff --git a/mosesdecoder/moses2/parameters/OptionsBaseClass.h b/mosesdecoder/moses2/parameters/OptionsBaseClass.h new file mode 100644 index 0000000000000000000000000000000000000000..5265e9b23dd6a5975304f7b08fcb4da635201b4d --- /dev/null +++ b/mosesdecoder/moses2/parameters/OptionsBaseClass.h @@ -0,0 +1,19 @@ +// -*- mode: c++; indent-tabs-mode: nil; tab-width: 2 -*- +#pragma once +#include "../legacy/xmlrpc-c.h" +#include +#include +namespace Moses2 +{ +class Parameter; + +struct OptionsBaseClass { +#ifdef HAVE_XMLRPC_C + virtual bool + update(std::mapconst& params); +#endif + bool + check(std::map const& param, + std::string const key, bool dfltval); +}; +} diff --git a/mosesdecoder/moses2/parameters/ReorderingOptions.cpp b/mosesdecoder/moses2/parameters/ReorderingOptions.cpp new file mode 100644 index 0000000000000000000000000000000000000000..64e777de76f7c12a0eb200f9a748dfea78a7474d --- /dev/null +++ b/mosesdecoder/moses2/parameters/ReorderingOptions.cpp @@ -0,0 +1,31 @@ +// -*- mode: c++; indent-tabs-mode: nil; tab-width: 2 -*- +#include "ReorderingOptions.h" +#include "../legacy/Parameter.h" + +namespace Moses2 +{ + +ReorderingOptions:: +ReorderingOptions() + : max_distortion(-1) + , monotone_at_punct(false) + , use_early_distortion_cost(false) +{} + + +ReorderingOptions:: +ReorderingOptions(Parameter const& param) +{ + init(param); +} + +bool +ReorderingOptions:: +init(Parameter const& param) +{ + param.SetParameter(max_distortion, "distortion-limit", -1); + param.SetParameter(monotone_at_punct, "monotone-at-punctuation", false); + param.SetParameter(use_early_distortion_cost, "early-distortion-cost", false); + return true; 
+} +} diff --git a/mosesdecoder/moses2/parameters/ReorderingOptions.h b/mosesdecoder/moses2/parameters/ReorderingOptions.h new file mode 100644 index 0000000000000000000000000000000000000000..319124e8303be09b97e4e01a7b4cdfe88cae4f26 --- /dev/null +++ b/mosesdecoder/moses2/parameters/ReorderingOptions.h @@ -0,0 +1,19 @@ +// -*- mode: c++; indent-tabs-mode: nil; tab-width: 2 -*- +#pragma once +#include +#include "OptionsBaseClass.h" +namespace Moses2 +{ + +struct + ReorderingOptions : public OptionsBaseClass { + int max_distortion; + bool monotone_at_punct; + bool use_early_distortion_cost; + bool init(Parameter const& param); + ReorderingOptions(Parameter const& param); + ReorderingOptions(); +}; + +} + diff --git a/mosesdecoder/moses2/parameters/ReportingOptions.cpp b/mosesdecoder/moses2/parameters/ReportingOptions.cpp new file mode 100644 index 0000000000000000000000000000000000000000..428cc056602db49058a244a705e35391998c215f --- /dev/null +++ b/mosesdecoder/moses2/parameters/ReportingOptions.cpp @@ -0,0 +1,152 @@ +// -*- mode: c++; indent-tabs-mode: nil; tab-width: 2 -*- +#include "ReportingOptions.h" +#include "../legacy/Parameter.h" + +namespace Moses2 +{ +using namespace std; + +ReportingOptions:: +ReportingOptions() + : start_translation_id(0) + , ReportAllFactors(false) + , ReportSegmentation(0) + , PrintAlignmentInfo(false) + , PrintAllDerivations(false) + , PrintTranslationOptions(false) + , WA_SortOrder(NoSort) + , WordGraph(false) + , DontPruneSearchGraph(false) + , RecoverPath(false) + , ReportHypoScore(false) + , PrintID(false) + , PrintPassThrough(false) + , include_lhs_in_search_graph(false) + , lattice_sample_size(0) +{ + factor_order.assign(1,0); + factor_delimiter = "|"; +} + +bool +ReportingOptions:: +init(Parameter const& param) +{ + param.SetParameter(start_translation_id, "start-translation-id", 0); + + // including factors in the output + param.SetParameter(ReportAllFactors, "report-all-factors", false); + + // segmentation reporting + 
ReportSegmentation = (param.GetParam("report-segmentation-enriched") + ? 2 : param.GetParam("report-segmentation") + ? 1 : 0); + + // word alignment reporting + param.SetParameter(PrintAlignmentInfo, "print-alignment-info", false); + param.SetParameter(WA_SortOrder, "sort-word-alignment", NoSort); + std::string e; // hack to save us param.SetParameter(...) + param.SetParameter(AlignmentOutputFile,"alignment-output-file", e); + + + param.SetParameter(PrintAllDerivations, "print-all-derivations", false); + param.SetParameter(PrintTranslationOptions, "print-translation-option", false); + + // output a word graph + PARAM_VEC const* params; + params = param.GetParam("output-word-graph"); + WordGraph = (params && params->size() == 2); // what are the two options? + + // dump the search graph + param.SetParameter(SearchGraph, "output-search-graph", e); + param.SetParameter(SearchGraphExtended, "output-search-graph-extended", e); + param.SetParameter(SearchGraphSLF,"output-search-graph-slf", e); + param.SetParameter(SearchGraphHG, "output-search-graph-hypergraph", e); +#ifdef HAVE_PROTOBUF + param.SetParameter(SearchGraphPB, "output-search-graph-pb", e); +#endif + + param.SetParameter(DontPruneSearchGraph, "unpruned-search-graph", false); + param.SetParameter(include_lhs_in_search_graph, + "include-lhs-in-search-graph", false ); + + + // miscellaneous + param.SetParameter(RecoverPath, "recover-input-path",false); + param.SetParameter(ReportHypoScore, "output-hypo-score",false); + param.SetParameter(PrintID, "print-id",false); + param.SetParameter(PrintPassThrough, "print-passthrough",false); + param.SetParameter(detailed_all_transrep_filepath, + "translation-all-details", e); + param.SetParameter(detailed_transrep_filepath, "translation-details", e); + param.SetParameter(detailed_tree_transrep_filepath, + "tree-translation-details", e); + + params = param.GetParam("lattice-samples"); + if (params) { + if (params->size() ==2 ) { + lattice_sample_filepath = params->at(0); + 
lattice_sample_size = Scan(params->at(1)); + } else { + std::cerr <<"wrong format for switch -lattice-samples file size"; + return false; + } + } + + + if (ReportAllFactors) { + factor_order.clear(); + for (size_t i = 0; i < MAX_NUM_FACTORS; ++i) + factor_order.push_back(i); + } else { + params= param.GetParam("output-factors"); + if (params) factor_order = Scan(*params); + if (factor_order.empty()) factor_order.assign(1,0); + } + + param.SetParameter(factor_delimiter, "factor-delimiter", std::string("|")); + param.SetParameter(factor_delimiter, "output-factor-delimiter", factor_delimiter); + + return true; +} + +#ifdef HAVE_XMLRPC_C +bool +ReportingOptions:: +update(std::mapconst& param) +{ + ReportAllFactors = check(param, "report-all-factors", ReportAllFactors); + + + std::map::const_iterator m; + m = param.find("output-factors"); + if (m != param.end()) { + factor_order=Tokenize(xmlrpc_c::value_string(m->second),","); + } + + if (ReportAllFactors) { + factor_order.clear(); + for (size_t i = 0; i < MAX_NUM_FACTORS; ++i) + factor_order.push_back(i); + } + + m = param.find("align"); + if (m != param.end() && Scan(xmlrpc_c::value_string(m->second))) + ReportSegmentation = 1; + + PrintAlignmentInfo = check(param,"word-align",PrintAlignmentInfo); + + m = param.find("factor-delimiter"); + if (m != param.end()) { + factor_delimiter = Trim(xmlrpc_c::value_string(m->second)); + } + + m = param.find("output-factor-delimiter"); + if (m != param.end()) { + factor_delimiter = Trim(xmlrpc_c::value_string(m->second)); + } + + return true; +} +#endif +} diff --git a/mosesdecoder/moses2/parameters/ReportingOptions.h b/mosesdecoder/moses2/parameters/ReportingOptions.h new file mode 100644 index 0000000000000000000000000000000000000000..6b491f3b655a50b7feae0db42fffcba155ed60c7 --- /dev/null +++ b/mosesdecoder/moses2/parameters/ReportingOptions.h @@ -0,0 +1,69 @@ +// -*- mode: c++; indent-tabs-mode: nil; tab-width: 2 -*- +#pragma once +#include +#include +#include 
"OptionsBaseClass.h" +#include "../TypeDef.h" + +namespace Moses2 +{ + +struct + ReportingOptions : public OptionsBaseClass { + long start_translation_id; + + std::vector factor_order; + std::string factor_delimiter; + + bool ReportAllFactors; // m_reportAllFactors; + int ReportSegmentation; // 0: no 1: m_reportSegmentation 2: ..._enriched + + bool PrintAlignmentInfo; // m_PrintAlignmentInfo + bool PrintAllDerivations; + bool PrintTranslationOptions; + + WordAlignmentSort WA_SortOrder; // 0: no, 1: target order + std::string AlignmentOutputFile; + + bool WordGraph; + + std::string SearchGraph; + std::string SearchGraphExtended; + std::string SearchGraphSLF; + std::string SearchGraphHG; + std::string SearchGraphPB; + bool DontPruneSearchGraph; + + bool RecoverPath; // recover input path? + bool ReportHypoScore; + + bool PrintID; + bool PrintPassThrough; + + // transrep = translation reporting + std::string detailed_transrep_filepath; + std::string detailed_tree_transrep_filepath; + std::string detailed_all_transrep_filepath; + bool include_lhs_in_search_graph; + + + std::string lattice_sample_filepath; + size_t lattice_sample_size; + + bool init(Parameter const& param); + + /// do we need to keep the search graph from decoding? 
+ bool NeedSearchGraph() const { + return !(SearchGraph.empty() && SearchGraphExtended.empty()); + } + +#ifdef HAVE_XMLRPC_C + bool update(std::mapconst& param); +#endif + + + ReportingOptions(); +}; + +} + diff --git a/mosesdecoder/moses2/parameters/SearchOptions.cpp b/mosesdecoder/moses2/parameters/SearchOptions.cpp new file mode 100644 index 0000000000000000000000000000000000000000..b3a468896552db110d87be58d3350332aeab7726 --- /dev/null +++ b/mosesdecoder/moses2/parameters/SearchOptions.cpp @@ -0,0 +1,106 @@ +// -*- mode: c++; indent-tabs-mode: nil; tab-width: 2 -*- +#include "SearchOptions.h" +#include "../legacy/Parameter.h" + +namespace Moses2 +{ + +SearchOptions:: +SearchOptions() + : algo(Normal) + , stack_size(DEFAULT_MAX_HYPOSTACK_SIZE) + , stack_diversity(0) + , disable_discarding(false) + , max_phrase_length(DEFAULT_MAX_PHRASE_LENGTH) + , max_trans_opt_per_cov(DEFAULT_MAX_TRANS_OPT_SIZE) + , max_partial_trans_opt(DEFAULT_MAX_PART_TRANS_OPT_SIZE) + , beam_width(DEFAULT_BEAM_WIDTH) + , timeout(0) + , consensus(false) + , early_discarding_threshold(DEFAULT_EARLY_DISCARDING_THRESHOLD) + , trans_opt_threshold(DEFAULT_TRANSLATION_OPTION_THRESHOLD) +{ } + +SearchOptions:: +SearchOptions(Parameter const& param) + : stack_diversity(0) +{ + init(param); +} + +bool +SearchOptions:: +init(Parameter const& param) +{ + param.SetParameter(algo, "search-algorithm", Normal); + param.SetParameter(stack_size, "stack", DEFAULT_MAX_HYPOSTACK_SIZE); + param.SetParameter(stack_diversity, "stack-diversity", size_t(0)); + param.SetParameter(beam_width, "beam-threshold", DEFAULT_BEAM_WIDTH); + param.SetParameter(early_discarding_threshold, "early-discarding-threshold", + DEFAULT_EARLY_DISCARDING_THRESHOLD); + param.SetParameter(timeout, "time-out", 0); + param.SetParameter(max_phrase_length, "max-phrase-length", + DEFAULT_MAX_PHRASE_LENGTH); + param.SetParameter(trans_opt_threshold, "translation-option-threshold", + DEFAULT_TRANSLATION_OPTION_THRESHOLD); + 
param.SetParameter(max_trans_opt_per_cov, "max-trans-opt-per-coverage", + DEFAULT_MAX_TRANS_OPT_SIZE); + param.SetParameter(max_partial_trans_opt, "max-partial-trans-opt", + DEFAULT_MAX_PART_TRANS_OPT_SIZE); + + param.SetParameter(consensus, "consensus-decoding", false); + param.SetParameter(disable_discarding, "disable-discarding", false); + + // transformation to log of a few scores + beam_width = TransformScore(beam_width); + trans_opt_threshold = TransformScore(trans_opt_threshold); + early_discarding_threshold = TransformScore(early_discarding_threshold); + + return true; +} + +bool +is_syntax(SearchAlgorithm algo) +{ + return (algo == CYKPlus || algo == ChartIncremental || + algo == SyntaxS2T || algo == SyntaxT2S || + algo == SyntaxF2S || algo == SyntaxT2S_SCFG); +} + +#ifdef HAVE_XMLRPC_C +bool +SearchOptions:: +update(std::mapconst& params) +{ + typedef std::map params_t; + + params_t::const_iterator si = params.find("search-algorithm"); + if (si != params.end()) { + // use named parameters + std::string spec = xmlrpc_c::value_string(si->second); + if (spec == "normal" || spec == "0") algo = Normal; + else if (spec == "cube" || spec == "1") algo = CubePruning; + else throw xmlrpc_c::fault("Unsupported search algorithm", + xmlrpc_c::fault::CODE_PARSE); + } + + si = params.find("stack"); + if (si != params.end()) stack_size = xmlrpc_c::value_int(si->second); + + si = params.find("stack-diversity"); + if (si != params.end()) stack_diversity = xmlrpc_c::value_int(si->second); + + si = params.find("beam-threshold"); + if (si != params.end()) beam_width = xmlrpc_c::value_double(si->second); + + si = params.find("time-out"); + if (si != params.end()) timeout = xmlrpc_c::value_int(si->second); + + si = params.find("max-phrase-length"); + if (si != params.end()) max_phrase_length = xmlrpc_c::value_int(si->second); + + return true; +} +#endif + +} diff --git a/mosesdecoder/moses2/parameters/SearchOptions.h b/mosesdecoder/moses2/parameters/SearchOptions.h new file 
mode 100644 index 0000000000000000000000000000000000000000..31e364d1443dba0df05956db023c80c4accc853b --- /dev/null +++ b/mosesdecoder/moses2/parameters/SearchOptions.h @@ -0,0 +1,53 @@ +// -*- mode: c++; indent-tabs-mode: nil; tab-width: 2 -*- +#pragma once +#include +#include +#include "OptionsBaseClass.h" +#include "../TypeDef.h" + +namespace Moses2 +{ + +bool is_syntax(SearchAlgorithm algo); + +struct + SearchOptions : public OptionsBaseClass { + SearchAlgorithm algo; + + // stack decoding + size_t stack_size; // maxHypoStackSize; + size_t stack_diversity; // minHypoStackDiversity; + bool disable_discarding; + // Disable discarding of bad hypotheses from HypothesisStackNormal + size_t max_phrase_length; + size_t max_trans_opt_per_cov; + size_t max_partial_trans_opt; + // beam search + float beam_width; + + int timeout; + + bool consensus; //! Use Consensus decoding (DeNero et al 2009) + + // reordering options + // bool reorderingConstraint; //! use additional reordering constraints + // bool useEarlyDistortionCost; + + float early_discarding_threshold; + float trans_opt_threshold; + + bool init(Parameter const& param); + SearchOptions(Parameter const& param); + SearchOptions(); + + bool + UseEarlyDiscarding() const { + return early_discarding_threshold != -std::numeric_limits::infinity(); + } + + bool + update(std::mapconst& params); + +}; + +} diff --git a/mosesdecoder/moses2/parameters/ServerOptions.cpp b/mosesdecoder/moses2/parameters/ServerOptions.cpp new file mode 100644 index 0000000000000000000000000000000000000000..3a21c1891bf330088a069fbeaafd399e3acb9ff9 --- /dev/null +++ b/mosesdecoder/moses2/parameters/ServerOptions.cpp @@ -0,0 +1,83 @@ +// -*- mode: c++; indent-tabs-mode: nil; tab-width: 2 -*- +#include +#include +#include "../legacy/Parameter.h" +#include "ServerOptions.h" +#include "../legacy/Util2.h" +#include "util/exception.hh" + +namespace Moses2 +{ + +// parse the session timeout specifciation for moses server +// Format is "d[[h[m[s]]]]". 
+// If none of 'dhms' is given, it is assumed that it's seconds. +// Specs can be combined, e.g. 2h30m, although it's probably nonsense +// to be so specific. +size_t +parse_timespec(std::string const& spec) +{ + size_t t = 0, timeout = 0; + BOOST_FOREACH(char const& c, spec) { + if (c >= '0' && c <= '9') { + t = t * 10 + c - '0'; + } else { + if (c == 'd') timeout += t * 24 * 3600; + else if (c == 'h') timeout += t * 3600; + else if (c == 'm') timeout += t * 60; + else if (c == 's') timeout += t; + else UTIL_THROW2("Can't parse specification '" << spec + << "' at " << HERE); + t = 0; // unit consumed; start accumulating the next number + } + } + return timeout + t; // a trailing number with no unit suffix counts as seconds +} + +ServerOptions:: +ServerOptions() + : is_serial(false) + , numThreads(15) // why 15? + , sessionTimeout(1800) // = 30 min + , sessionCacheSize(25) + , port(8080) + , maxConn(15) + , maxConnBacklog(15) + , keepaliveTimeout(15) + , keepaliveMaxConn(30) + , timeout(15) +{ } + +ServerOptions:: +ServerOptions(Parameter const& P) +{ + init(P); +} + +bool +ServerOptions:: +init(Parameter const& P) +{ + // Settings for the abyss server + P.SetParameter(this->port, "server-port", 8080); + P.SetParameter(this->is_serial, "serial", false); + P.SetParameter(this->logfile, "server-log", std::string("/dev/null")); + P.SetParameter(this->numThreads, "threads", uint32_t(15)); + + // defaults reflect recommended defaults (according to Hieu) + // -> http://xmlrpc-c.sourceforge.net/doc/libxmlrpc_server_abyss.html#max_conn + P.SetParameter(this->maxConn,"server-maxconn", 15); + P.SetParameter(this->maxConnBacklog,"server-maxconn-backlog", 15); + P.SetParameter(this->keepaliveTimeout,"server-keepalive-timeout", 15); + P.SetParameter(this->keepaliveMaxConn,"server-keepalive-maxconn", 30); + P.SetParameter(this->timeout,"server-timeout",15); + + // the stuff below is related to Moses translation sessions + std::string timeout_spec; + P.SetParameter(timeout_spec, "session-timeout",std::string("30m")); + this->sessionTimeout = parse_timespec(timeout_spec); + 
P.SetParameter(this->sessionCacheSize, "session-cache_size", size_t(25)); + + return true; +} +} // namespace Moses diff --git a/mosesdecoder/moses2/parameters/ServerOptions.h b/mosesdecoder/moses2/parameters/ServerOptions.h new file mode 100644 index 0000000000000000000000000000000000000000..2b67e5156cc34f1592ac21ccd86632961bd421d7 --- /dev/null +++ b/mosesdecoder/moses2/parameters/ServerOptions.h @@ -0,0 +1,40 @@ +// -*- mode: c++; indent-tabs-mode: nil; tab-width: 2 -*- +#pragma once + +#include "../legacy/xmlrpc-c.h" +#include +#include +#include + +namespace Moses2 +{ +class Parameter; + +struct + ServerOptions { + bool is_serial; + uint32_t numThreads; // might not be used any more, actually + + size_t sessionTimeout; // this is related to Moses translation sessions + size_t sessionCacheSize; // this is related to Moses translation sessions + + int port; // this is for the abyss server + std::string logfile; // this is for the abyss server + int maxConn; // this is for the abyss server + int maxConnBacklog; // this is for the abyss server + int keepaliveTimeout; // this is for the abyss server + int keepaliveMaxConn; // this is for the abyss server + int timeout; // this is for the abyss server + + bool init(Parameter const& param); + ServerOptions(Parameter const& param); + ServerOptions(); + + bool + update(std::mapconst& params) { + return true; + } + +}; + +} diff --git a/mosesdecoder/moses2/parameters/SyntaxOptions.cpp b/mosesdecoder/moses2/parameters/SyntaxOptions.cpp new file mode 100644 index 0000000000000000000000000000000000000000..4c67306154c2820b7ef1aaec6377e9c494bf5472 --- /dev/null +++ b/mosesdecoder/moses2/parameters/SyntaxOptions.cpp @@ -0,0 +1,47 @@ +/* + * SyntaxOptions.cpp + * + * Created on: 13 Apr 2016 + * Author: hieu + */ + +#include "SyntaxOptions.h" +#include "../legacy/Parameter.h" + +namespace Moses2 +{ +SyntaxOptions::SyntaxOptions() + : s2t_parsing_algo(RecursiveCYKPlus) + , default_non_term_only_for_empty_range(false) + , 
source_label_overlap(SourceLabelOverlapAdd) + , rule_limit(DEFAULT_MAX_TRANS_OPT_SIZE) +{} + +bool SyntaxOptions::init(Parameter const& param) +{ + param.SetParameter(rule_limit, "rule-limit", DEFAULT_MAX_TRANS_OPT_SIZE); + param.SetParameter(s2t_parsing_algo, "s2t-parsing-algorithm", + RecursiveCYKPlus); + param.SetParameter(default_non_term_only_for_empty_range, + "default-non-term-for-empty-range-only", false); + param.SetParameter(source_label_overlap, "source-label-overlap", + SourceLabelOverlapAdd); + return true; +} + +bool SyntaxOptions::update(std::mapconst& param) +{ + typedef std::map params_t; + // params_t::const_iterator si = param.find("xml-input"); + // if (si != param.end()) + // xml_policy = Scan(xmlrpc_c::value_string(si->second)); + return true; +} + +void SyntaxOptions::LoadNonTerminals(Parameter const& param, FactorCollection& factorCollection) +{ + +} + + +} /* namespace Moses2 */ diff --git a/mosesdecoder/moses2/parameters/SyntaxOptions.h b/mosesdecoder/moses2/parameters/SyntaxOptions.h new file mode 100644 index 0000000000000000000000000000000000000000..c32a0c85caab0cbecc1d787e349d4b65ee7d6581 --- /dev/null +++ b/mosesdecoder/moses2/parameters/SyntaxOptions.h @@ -0,0 +1,39 @@ +/* + * SyntaxOptions.h + * + * Created on: 13 Apr 2016 + * Author: hieu + */ +#pragma once +#include +#include +#include "OptionsBaseClass.h" +#include "../SCFG/Word.h" + +namespace Moses2 +{ +class FactorCollection; +class Parameter; + +typedef std::pair UnknownLHSEntry; +typedef std::vector UnknownLHSList; + +struct + SyntaxOptions : public OptionsBaseClass { + S2TParsingAlgorithm s2t_parsing_algo; + SCFG::Word input_default_non_terminal; + SCFG::Word output_default_non_terminal; + bool default_non_term_only_for_empty_range; // whatever that means + UnknownLHSList unknown_lhs; + SourceLabelOverlap source_label_overlap; // m_sourceLabelOverlap; + size_t rule_limit; + + SyntaxOptions(); + + bool init(Parameter const& param); + bool update(std::mapconst& param); + void 
LoadNonTerminals(Parameter const& param, FactorCollection& factorCollection); +}; + +} /* namespace Moses2 */ + diff --git a/mosesdecoder/moses2/pugiconfig.hpp b/mosesdecoder/moses2/pugiconfig.hpp new file mode 100644 index 0000000000000000000000000000000000000000..1e3bdd1f39bb5966b74cf9730c009b44ac7ec419 --- /dev/null +++ b/mosesdecoder/moses2/pugiconfig.hpp @@ -0,0 +1,74 @@ +/** + * pugixml parser - version 1.7 + * -------------------------------------------------------- + * Copyright (C) 2006-2015, by Arseny Kapoulkine (arseny.kapoulkine@gmail.com) + * Report bugs and download new versions at http://pugixml.org/ + * + * This library is distributed under the MIT License. See notice at the end + * of this file. + * + * This work is based on the pugxml parser, which is: + * Copyright (C) 2003, by Kristen Wegner (kristen@tima.net) + */ + +#ifndef HEADER_PUGICONFIG_HPP +#define HEADER_PUGICONFIG_HPP + +// Uncomment this to enable wchar_t mode +// #define PUGIXML_WCHAR_MODE + +// Uncomment this to enable compact mode +// #define PUGIXML_COMPACT + +// Uncomment this to disable XPath +// #define PUGIXML_NO_XPATH + +// Uncomment this to disable STL +// #define PUGIXML_NO_STL + +// Uncomment this to disable exceptions +// #define PUGIXML_NO_EXCEPTIONS + +// Set this to control attributes for public classes/functions, i.e.: +// #define PUGIXML_API __declspec(dllexport) // to export all public symbols from DLL +// #define PUGIXML_CLASS __declspec(dllimport) // to import all classes from DLL +// #define PUGIXML_FUNCTION __fastcall // to set calling conventions to all public functions to fastcall +// In absence of PUGIXML_CLASS/PUGIXML_FUNCTION definitions PUGIXML_API is used instead + +// Tune these constants to adjust memory-related behavior +// #define PUGIXML_MEMORY_PAGE_SIZE 32768 +// #define PUGIXML_MEMORY_OUTPUT_STACK 10240 +// #define PUGIXML_MEMORY_XPATH_PAGE_SIZE 4096 + +// Uncomment this to switch to header-only version +// #define PUGIXML_HEADER_ONLY + +// 
Uncomment this to enable long long support +// #define PUGIXML_HAS_LONG_LONG + +#endif + +/** + * Copyright (c) 2006-2015 Arseny Kapoulkine + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ diff --git a/mosesdecoder/moses2/pugixml.cpp b/mosesdecoder/moses2/pugixml.cpp new file mode 100644 index 0000000000000000000000000000000000000000..a39f25880de738604ffe18dd7a6f781bc70afd6b --- /dev/null +++ b/mosesdecoder/moses2/pugixml.cpp @@ -0,0 +1,11456 @@ +/** + * pugixml parser - version 1.7 + * -------------------------------------------------------- + * Copyright (C) 2006-2015, by Arseny Kapoulkine (arseny.kapoulkine@gmail.com) + * Report bugs and download new versions at http://pugixml.org/ + * + * This library is distributed under the MIT License. See notice at the end + * of this file. 
+ * + * This work is based on the pugxml parser, which is: + * Copyright (C) 2003, by Kristen Wegner (kristen@tima.net) + */ + +#ifndef SOURCE_PUGIXML_CPP +#define SOURCE_PUGIXML_CPP + +#include "pugixml.hpp" + +#include +#include +#include +#include +#include + +#ifdef PUGIXML_WCHAR_MODE +# include +#endif + +#ifndef PUGIXML_NO_XPATH +# include +# include +# ifdef PUGIXML_NO_EXCEPTIONS +# include +# endif +#endif + +#ifndef PUGIXML_NO_STL +# include +# include +# include +#endif + +// For placement new +#include + +#ifdef _MSC_VER +# pragma warning(push) +# pragma warning(disable: 4127) // conditional expression is constant +# pragma warning(disable: 4324) // structure was padded due to __declspec(align()) +# pragma warning(disable: 4611) // interaction between '_setjmp' and C++ object destruction is non-portable +# pragma warning(disable: 4702) // unreachable code +# pragma warning(disable: 4996) // this function or variable may be unsafe +# pragma warning(disable: 4793) // function compiled as native: presence of '_setjmp' makes a function unmanaged +#endif + +#ifdef __INTEL_COMPILER +# pragma warning(disable: 177) // function was declared but never referenced +# pragma warning(disable: 279) // controlling expression is constant +# pragma warning(disable: 1478 1786) // function was declared "deprecated" +# pragma warning(disable: 1684) // conversion from pointer to same-sized integral type +#endif + +#if defined(__BORLANDC__) && defined(PUGIXML_HEADER_ONLY) +# pragma warn -8080 // symbol is declared but never used; disabling this inside push/pop bracket does not make the warning go away +#endif + +#ifdef __BORLANDC__ +# pragma option push +# pragma warn -8008 // condition is always false +# pragma warn -8066 // unreachable code +#endif + +#ifdef __SNC__ +// Using diag_push/diag_pop does not disable the warnings inside templates due to a compiler bug +# pragma diag_suppress=178 // function was declared but never referenced +# pragma diag_suppress=237 // 
controlling expression is constant +#endif + +// Inlining controls +#if defined(_MSC_VER) && _MSC_VER >= 1300 +# define PUGI__NO_INLINE __declspec(noinline) +#elif defined(__GNUC__) +# define PUGI__NO_INLINE __attribute__((noinline)) +#else +# define PUGI__NO_INLINE +#endif + +// Branch weight controls +#if defined(__GNUC__) +# define PUGI__UNLIKELY(cond) __builtin_expect(cond, 0) +#else +# define PUGI__UNLIKELY(cond) (cond) +#endif + +// Simple static assertion +#define PUGI__STATIC_ASSERT(cond) { static const char condition_failed[(cond) ? 1 : -1] = {0}; (void)condition_failed[0]; } + +// Digital Mars C++ bug workaround for passing char loaded from memory via stack +#ifdef __DMC__ +# define PUGI__DMC_VOLATILE volatile +#else +# define PUGI__DMC_VOLATILE +#endif + +// Borland C++ bug workaround for not defining ::memcpy depending on header include order (can't always use std::memcpy because some compilers don't have it at all) +#if defined(__BORLANDC__) && !defined(__MEM_H_USING_LIST) +using std::memcpy; +using std::memmove; +using std::memset; +#endif + +// In some environments MSVC is a compiler but the CRT lacks certain MSVC-specific features +#if defined(_MSC_VER) && !defined(__S3E__) +# define PUGI__MSVC_CRT_VERSION _MSC_VER +#endif + +#ifdef PUGIXML_HEADER_ONLY +# define PUGI__NS_BEGIN namespace pugi { namespace impl { +# define PUGI__NS_END } } +# define PUGI__FN inline +# define PUGI__FN_NO_INLINE inline +#else +# if defined(_MSC_VER) && _MSC_VER < 1300 // MSVC6 seems to have an amusing bug with anonymous namespaces inside namespaces +# define PUGI__NS_BEGIN namespace pugi { namespace impl { +# define PUGI__NS_END } } +# else +# define PUGI__NS_BEGIN namespace pugi { namespace impl { namespace { +# define PUGI__NS_END } } } +# endif +# define PUGI__FN +# define PUGI__FN_NO_INLINE PUGI__NO_INLINE +#endif + +// uintptr_t +#if !defined(_MSC_VER) || _MSC_VER >= 1600 +# include +#else +namespace pugi +{ +# ifndef _UINTPTR_T_DEFINED +typedef size_t uintptr_t; +# 
endif + +typedef unsigned __int8 uint8_t; +typedef unsigned __int16 uint16_t; +typedef unsigned __int32 uint32_t; +} +#endif + +// Memory allocation +PUGI__NS_BEGIN +PUGI__FN void* default_allocate(size_t size) +{ + return malloc(size); +} + +PUGI__FN void default_deallocate(void* ptr) +{ + free(ptr); +} + +template +struct xml_memory_management_function_storage { + static allocation_function allocate; + static deallocation_function deallocate; +}; + +// Global allocation functions are stored in class statics so that in header mode linker deduplicates them +// Without a template<> we'll get multiple definitions of the same static +template allocation_function xml_memory_management_function_storage::allocate = default_allocate; +template deallocation_function xml_memory_management_function_storage::deallocate = default_deallocate; + +typedef xml_memory_management_function_storage xml_memory; +PUGI__NS_END + +// String utilities +PUGI__NS_BEGIN +// Get string length +PUGI__FN size_t strlength(const char_t* s) +{ + assert(s); + +#ifdef PUGIXML_WCHAR_MODE + return wcslen(s); +#else + return strlen(s); +#endif +} + +// Compare two strings +PUGI__FN bool strequal(const char_t* src, const char_t* dst) +{ + assert(src && dst); + +#ifdef PUGIXML_WCHAR_MODE + return wcscmp(src, dst) == 0; +#else + return strcmp(src, dst) == 0; +#endif +} + +// Compare lhs with [rhs_begin, rhs_end) +PUGI__FN bool strequalrange(const char_t* lhs, const char_t* rhs, size_t count) +{ + for (size_t i = 0; i < count; ++i) + if (lhs[i] != rhs[i]) + return false; + + return lhs[count] == 0; +} + +// Get length of wide string, even if CRT lacks wide character support +PUGI__FN size_t strlength_wide(const wchar_t* s) +{ + assert(s); + +#ifdef PUGIXML_WCHAR_MODE + return wcslen(s); +#else + const wchar_t* end = s; + while (*end) end++; + return static_cast(end - s); +#endif +} +PUGI__NS_END + +// auto_ptr-like object for exception recovery +PUGI__NS_BEGIN +template struct auto_deleter { + T* data; + D 
deleter; + + auto_deleter(T* data_, D deleter_): data(data_), deleter(deleter_) { + } + + ~auto_deleter() { + if (data) deleter(data); + } + + T* release() { + T* result = data; + data = 0; + return result; + } +}; +PUGI__NS_END + +#ifdef PUGIXML_COMPACT +PUGI__NS_BEGIN +class compact_hash_table +{ +public: + compact_hash_table(): _items(0), _capacity(0), _count(0) { + } + + void clear() { + if (_items) { + xml_memory::deallocate(_items); + _items = 0; + _capacity = 0; + _count = 0; + } + } + + void** find(const void* key) { + assert(key); + + if (_capacity == 0) return 0; + + size_t hashmod = _capacity - 1; + size_t bucket = hash(key) & hashmod; + + for (size_t probe = 0; probe <= hashmod; ++probe) { + item_t& probe_item = _items[bucket]; + + if (probe_item.key == key) + return &probe_item.value; + + if (probe_item.key == 0) + return 0; + + // hash collision, quadratic probing + bucket = (bucket + probe + 1) & hashmod; + } + + assert(!"Hash table is full"); + return 0; + } + + void** insert(const void* key) { + assert(key); + assert(_capacity != 0 && _count < _capacity - _capacity / 4); + + size_t hashmod = _capacity - 1; + size_t bucket = hash(key) & hashmod; + + for (size_t probe = 0; probe <= hashmod; ++probe) { + item_t& probe_item = _items[bucket]; + + if (probe_item.key == 0) { + probe_item.key = key; + _count++; + return &probe_item.value; + } + + if (probe_item.key == key) + return &probe_item.value; + + // hash collision, quadratic probing + bucket = (bucket + probe + 1) & hashmod; + } + + assert(!"Hash table is full"); + return 0; + } + + bool reserve() { + if (_count + 16 >= _capacity - _capacity / 4) + return rehash(); + + return true; + } + +private: + struct item_t { + const void* key; + void* value; + }; + + item_t* _items; + size_t _capacity; + + size_t _count; + + bool rehash(); + + static unsigned int hash(const void* key) { + unsigned int h = static_cast(reinterpret_cast(key)); + + // MurmurHash3 32-bit finalizer + h ^= h >> 16; + h *= 
0x85ebca6bu; + h ^= h >> 13; + h *= 0xc2b2ae35u; + h ^= h >> 16; + + return h; + } +}; + +PUGI__FN_NO_INLINE bool compact_hash_table::rehash() +{ + compact_hash_table rt; + rt._capacity = (_capacity == 0) ? 32 : _capacity * 2; + rt._items = static_cast(xml_memory::allocate(sizeof(item_t) * rt._capacity)); + + if (!rt._items) + return false; + + memset(rt._items, 0, sizeof(item_t) * rt._capacity); + + for (size_t i = 0; i < _capacity; ++i) + if (_items[i].key) + *rt.insert(_items[i].key) = _items[i].value; + + if (_items) + xml_memory::deallocate(_items); + + _capacity = rt._capacity; + _items = rt._items; + + assert(_count == rt._count); + + return true; +} + +PUGI__NS_END +#endif + +PUGI__NS_BEGIN +static const size_t xml_memory_page_size = +#ifdef PUGIXML_MEMORY_PAGE_SIZE + PUGIXML_MEMORY_PAGE_SIZE +#else + 32768 +#endif + ; + +#ifdef PUGIXML_COMPACT +static const uintptr_t xml_memory_block_alignment = 4; + +static const uintptr_t xml_memory_page_alignment = sizeof(void*); +#else +static const uintptr_t xml_memory_block_alignment = sizeof(void*); + +static const uintptr_t xml_memory_page_alignment = 64; +static const uintptr_t xml_memory_page_pointer_mask = ~(xml_memory_page_alignment - 1); +#endif + +// extra metadata bits +static const uintptr_t xml_memory_page_contents_shared_mask = 32; +static const uintptr_t xml_memory_page_name_allocated_mask = 16; +static const uintptr_t xml_memory_page_value_allocated_mask = 8; +static const uintptr_t xml_memory_page_type_mask = 7; + +// combined masks for string uniqueness +static const uintptr_t xml_memory_page_name_allocated_or_shared_mask = xml_memory_page_name_allocated_mask | xml_memory_page_contents_shared_mask; +static const uintptr_t xml_memory_page_value_allocated_or_shared_mask = xml_memory_page_value_allocated_mask | xml_memory_page_contents_shared_mask; + +#ifdef PUGIXML_COMPACT +#define PUGI__GETPAGE_IMPL(header) (header).get_page() +#else +#define PUGI__GETPAGE_IMPL(header) reinterpret_cast((header) & 
impl::xml_memory_page_pointer_mask) +#endif + +#define PUGI__GETPAGE(n) PUGI__GETPAGE_IMPL((n)->header) +#define PUGI__NODETYPE(n) static_cast(((n)->header & impl::xml_memory_page_type_mask) + 1) + +struct xml_allocator; + +struct xml_memory_page { + static xml_memory_page* construct(void* memory) { + xml_memory_page* result = static_cast(memory); + + result->allocator = 0; + result->prev = 0; + result->next = 0; + result->busy_size = 0; + result->freed_size = 0; + +#ifdef PUGIXML_COMPACT + result->compact_string_base = 0; + result->compact_shared_parent = 0; + result->compact_page_marker = 0; +#endif + + return result; + } + + xml_allocator* allocator; + + xml_memory_page* prev; + xml_memory_page* next; + + size_t busy_size; + size_t freed_size; + +#ifdef PUGIXML_COMPACT + char_t* compact_string_base; + void* compact_shared_parent; + uint32_t* compact_page_marker; +#endif +}; + +struct xml_memory_string_header { + uint16_t page_offset; // offset from page->data + uint16_t full_size; // 0 if string occupies whole page +}; + +struct xml_allocator { + xml_allocator(xml_memory_page* root): _root(root), _busy_size(root->busy_size) { +#ifdef PUGIXML_COMPACT + _hash = 0; +#endif + } + + xml_memory_page* allocate_page(size_t data_size) { + size_t size = sizeof(xml_memory_page) + data_size; + + // allocate block with some alignment, leaving memory for worst-case padding + void* memory = xml_memory::allocate(size + xml_memory_page_alignment); + if (!memory) return 0; + + // align to next page boundary (note: this guarantees at least 1 usable byte before the page) + char* page_memory = reinterpret_cast((reinterpret_cast(memory) + xml_memory_page_alignment) & ~(xml_memory_page_alignment - 1)); + + // prepare page structure + xml_memory_page* page = xml_memory_page::construct(page_memory); + assert(page); + + page->allocator = _root->allocator; + + // record the offset for freeing the memory block + assert(page_memory > memory && page_memory - static_cast(memory) <= 127); + 
page_memory[-1] = static_cast(page_memory - static_cast(memory)); + + return page; + } + + static void deallocate_page(xml_memory_page* page) { + char* page_memory = reinterpret_cast(page); + + xml_memory::deallocate(page_memory - page_memory[-1]); + } + + void* allocate_memory_oob(size_t size, xml_memory_page*& out_page); + + void* allocate_memory(size_t size, xml_memory_page*& out_page) { + if (PUGI__UNLIKELY(_busy_size + size > xml_memory_page_size)) + return allocate_memory_oob(size, out_page); + + void* buf = reinterpret_cast(_root) + sizeof(xml_memory_page) + _busy_size; + + _busy_size += size; + + out_page = _root; + + return buf; + } + +#ifdef PUGIXML_COMPACT + void* allocate_object(size_t size, xml_memory_page*& out_page) { + void* result = allocate_memory(size + sizeof(uint32_t), out_page); + if (!result) return 0; + + // adjust for marker + ptrdiff_t offset = static_cast(result) - reinterpret_cast(out_page->compact_page_marker); + + if (PUGI__UNLIKELY(static_cast(offset) >= 256 * xml_memory_block_alignment)) { + // insert new marker + uint32_t* marker = static_cast(result); + + *marker = static_cast(reinterpret_cast(marker) - reinterpret_cast(out_page)); + out_page->compact_page_marker = marker; + + // since we don't reuse the page space until we reallocate it, we can just pretend that we freed the marker block + // this will make sure deallocate_memory correctly tracks the size + out_page->freed_size += sizeof(uint32_t); + + return marker + 1; + } else { + // roll back uint32_t part + _busy_size -= sizeof(uint32_t); + + return result; + } + } +#else + void* allocate_object(size_t size, xml_memory_page*& out_page) { + return allocate_memory(size, out_page); + } +#endif + + void deallocate_memory(void* ptr, size_t size, xml_memory_page* page) { + if (page == _root) page->busy_size = _busy_size; + + assert(ptr >= reinterpret_cast(page) + sizeof(xml_memory_page) && ptr < reinterpret_cast(page) + sizeof(xml_memory_page) + page->busy_size); + (void)!ptr; + + 
page->freed_size += size; + assert(page->freed_size <= page->busy_size); + + if (page->freed_size == page->busy_size) { + if (page->next == 0) { + assert(_root == page); + + // top page freed, just reset sizes + page->busy_size = 0; + page->freed_size = 0; + +#ifdef PUGIXML_COMPACT + // reset compact state to maximize efficiency + page->compact_string_base = 0; + page->compact_shared_parent = 0; + page->compact_page_marker = 0; +#endif + + _busy_size = 0; + } else { + assert(_root != page); + assert(page->prev); + + // remove from the list + page->prev->next = page->next; + page->next->prev = page->prev; + + // deallocate + deallocate_page(page); + } + } + } + + char_t* allocate_string(size_t length) { + static const size_t max_encoded_offset = (1 << 16) * xml_memory_block_alignment; + + PUGI__STATIC_ASSERT(xml_memory_page_size <= max_encoded_offset); + + // allocate memory for string and header block + size_t size = sizeof(xml_memory_string_header) + length * sizeof(char_t); + + // round size up to block alignment boundary + size_t full_size = (size + (xml_memory_block_alignment - 1)) & ~(xml_memory_block_alignment - 1); + + xml_memory_page* page; + xml_memory_string_header* header = static_cast(allocate_memory(full_size, page)); + + if (!header) return 0; + + // setup header + ptrdiff_t page_offset = reinterpret_cast(header) - reinterpret_cast(page) - sizeof(xml_memory_page); + + assert(page_offset % xml_memory_block_alignment == 0); + assert(page_offset >= 0 && static_cast(page_offset) < max_encoded_offset); + header->page_offset = static_cast(static_cast(page_offset) / xml_memory_block_alignment); + + // full_size == 0 for large strings that occupy the whole page + assert(full_size % xml_memory_block_alignment == 0); + assert(full_size < max_encoded_offset || (page->busy_size == full_size && page_offset == 0)); + header->full_size = static_cast(full_size < max_encoded_offset ? 
full_size / xml_memory_block_alignment : 0); + + // round-trip through void* to avoid 'cast increases required alignment of target type' warning + // header is guaranteed a pointer-sized alignment, which should be enough for char_t + return static_cast(static_cast(header + 1)); + } + + void deallocate_string(char_t* string) { + // this function casts pointers through void* to avoid 'cast increases required alignment of target type' warnings + // we're guaranteed the proper (pointer-sized) alignment on the input string if it was allocated via allocate_string + + // get header + xml_memory_string_header* header = static_cast(static_cast(string)) - 1; + assert(header); + + // deallocate + size_t page_offset = sizeof(xml_memory_page) + header->page_offset * xml_memory_block_alignment; + xml_memory_page* page = reinterpret_cast(static_cast(reinterpret_cast(header) - page_offset)); + + // if full_size == 0 then this string occupies the whole page + size_t full_size = header->full_size == 0 ? page->busy_size : header->full_size * xml_memory_block_alignment; + + deallocate_memory(header, full_size, page); + } + + bool reserve() { +#ifdef PUGIXML_COMPACT + return _hash->reserve(); +#else + return true; +#endif + } + + xml_memory_page* _root; + size_t _busy_size; + +#ifdef PUGIXML_COMPACT + compact_hash_table* _hash; +#endif +}; + +PUGI__FN_NO_INLINE void* xml_allocator::allocate_memory_oob(size_t size, xml_memory_page*& out_page) +{ + const size_t large_allocation_threshold = xml_memory_page_size / 4; + + xml_memory_page* page = allocate_page(size <= large_allocation_threshold ? 
xml_memory_page_size : size); + out_page = page; + + if (!page) return 0; + + if (size <= large_allocation_threshold) { + _root->busy_size = _busy_size; + + // insert page at the end of linked list + page->prev = _root; + _root->next = page; + _root = page; + + _busy_size = size; + } else { + // insert page before the end of linked list, so that it is deleted as soon as possible + // the last page is not deleted even if it's empty (see deallocate_memory) + assert(_root->prev); + + page->prev = _root->prev; + page->next = _root; + + _root->prev->next = page; + _root->prev = page; + + page->busy_size = size; + } + + return reinterpret_cast(page) + sizeof(xml_memory_page); +} +PUGI__NS_END + +#ifdef PUGIXML_COMPACT +PUGI__NS_BEGIN +static const uintptr_t compact_alignment_log2 = 2; +static const uintptr_t compact_alignment = 1 << compact_alignment_log2; + +class compact_header +{ +public: + compact_header(xml_memory_page* page, unsigned int flags) { + PUGI__STATIC_ASSERT(xml_memory_block_alignment == compact_alignment); + + ptrdiff_t offset = (reinterpret_cast(this) - reinterpret_cast(page->compact_page_marker)); + assert(offset % compact_alignment == 0 && static_cast(offset) < 256 * compact_alignment); + + _page = static_cast(offset >> compact_alignment_log2); + _flags = static_cast(flags); + } + + void operator&=(uintptr_t mod) { + _flags &= static_cast(mod); + } + + void operator|=(uintptr_t mod) { + _flags |= static_cast(mod); + } + + uintptr_t operator&(uintptr_t mod) const { + return _flags & mod; + } + + xml_memory_page* get_page() const { + // round-trip through void* to silence 'cast increases required alignment of target type' warnings + const char* page_marker = reinterpret_cast(this) - (_page << compact_alignment_log2); + const char* page = page_marker - *reinterpret_cast(static_cast(page_marker)); + + return const_cast(reinterpret_cast(static_cast(page))); + } + +private: + unsigned char _page; + unsigned char _flags; +}; + +PUGI__FN xml_memory_page* 
compact_get_page(const void* object, int header_offset) +{ + const compact_header* header = reinterpret_cast(static_cast(object) - header_offset); + + return header->get_page(); +} + +template PUGI__FN_NO_INLINE T* compact_get_value(const void* object) +{ + return static_cast(*compact_get_page(object, header_offset)->allocator->_hash->find(object)); +} + +template PUGI__FN_NO_INLINE void compact_set_value(const void* object, T* value) +{ + *compact_get_page(object, header_offset)->allocator->_hash->insert(object) = value; +} + +template class compact_pointer +{ +public: + compact_pointer(): _data(0) { + } + + void operator=(const compact_pointer& rhs) { + *this = rhs + 0; + } + + void operator=(T* value) { + if (value) { + // value is guaranteed to be compact-aligned; 'this' is not + // our decoding is based on 'this' aligned to compact alignment downwards (see operator T*) + // so for negative offsets (e.g. -3) we need to adjust the diff by compact_alignment - 1 to + // compensate for arithmetic shift rounding for negative values + ptrdiff_t diff = reinterpret_cast(value) - reinterpret_cast(this); + ptrdiff_t offset = ((diff + int(compact_alignment - 1)) >> compact_alignment_log2) - start; + + if (static_cast(offset) <= 253) + _data = static_cast(offset + 1); + else { + compact_set_value(this, value); + + _data = 255; + } + } else + _data = 0; + } + + operator T*() const { + if (_data) { + if (_data < 255) { + uintptr_t base = reinterpret_cast(this) & ~(compact_alignment - 1); + + return reinterpret_cast(base + ((_data - 1 + start) << compact_alignment_log2)); + } else + return compact_get_value(this); + } else + return 0; + } + + T* operator->() const { + return *this; + } + +private: + unsigned char _data; +}; + +template class compact_pointer_parent +{ +public: + compact_pointer_parent(): _data(0) { + } + + void operator=(const compact_pointer_parent& rhs) { + *this = rhs + 0; + } + + void operator=(T* value) { + if (value) { + // value is guaranteed to be 
compact-aligned; 'this' is not + // our decoding is based on 'this' aligned to compact alignment downwards (see operator T*) + // so for negative offsets (e.g. -3) we need to adjust the diff by compact_alignment - 1 to + // compensate for arithmetic shift behavior for negative values + ptrdiff_t diff = reinterpret_cast(value) - reinterpret_cast(this); + ptrdiff_t offset = ((diff + int(compact_alignment - 1)) >> compact_alignment_log2) + 65533; + + if (static_cast(offset) <= 65533) { + _data = static_cast(offset + 1); + } else { + xml_memory_page* page = compact_get_page(this, header_offset); + + if (PUGI__UNLIKELY(page->compact_shared_parent == 0)) + page->compact_shared_parent = value; + + if (page->compact_shared_parent == value) { + _data = 65534; + } else { + compact_set_value(this, value); + + _data = 65535; + } + } + } else { + _data = 0; + } + } + + operator T*() const { + if (_data) { + if (_data < 65534) { + uintptr_t base = reinterpret_cast(this) & ~(compact_alignment - 1); + + return reinterpret_cast(base + ((_data - 1 - 65533) << compact_alignment_log2)); + } else if (_data == 65534) + return static_cast(compact_get_page(this, header_offset)->compact_shared_parent); + else + return compact_get_value(this); + } else + return 0; + } + + T* operator->() const { + return *this; + } + +private: + uint16_t _data; +}; + +template class compact_string +{ +public: + compact_string(): _data(0) { + } + + void operator=(const compact_string& rhs) { + *this = rhs + 0; + } + + void operator=(char_t* value) { + if (value) { + xml_memory_page* page = compact_get_page(this, header_offset); + + if (PUGI__UNLIKELY(page->compact_string_base == 0)) + page->compact_string_base = value; + + ptrdiff_t offset = value - page->compact_string_base; + + if (static_cast(offset) < (65535 << 7)) { + // round-trip through void* to silence 'cast increases required alignment of target type' warnings + uint16_t* base = reinterpret_cast(static_cast(reinterpret_cast(this) - base_offset)); + 
+ if (*base == 0) { + *base = static_cast((offset >> 7) + 1); + _data = static_cast((offset & 127) + 1); + } else { + ptrdiff_t remainder = offset - ((*base - 1) << 7); + + if (static_cast(remainder) <= 253) { + _data = static_cast(remainder + 1); + } else { + compact_set_value(this, value); + + _data = 255; + } + } + } else { + compact_set_value(this, value); + + _data = 255; + } + } else { + _data = 0; + } + } + + operator char_t*() const { + if (_data) { + if (_data < 255) { + xml_memory_page* page = compact_get_page(this, header_offset); + + // round-trip through void* to silence 'cast increases required alignment of target type' warnings + const uint16_t* base = reinterpret_cast(static_cast(reinterpret_cast(this) - base_offset)); + assert(*base); + + ptrdiff_t offset = ((*base - 1) << 7) + (_data - 1); + + return page->compact_string_base + offset; + } else { + return compact_get_value(this); + } + } else + return 0; + } + +private: + unsigned char _data; +}; +PUGI__NS_END +#endif + +#ifdef PUGIXML_COMPACT +namespace pugi +{ +struct xml_attribute_struct { + xml_attribute_struct(impl::xml_memory_page* page): header(page, 0), namevalue_base(0) { + PUGI__STATIC_ASSERT(sizeof(xml_attribute_struct) == 8); + } + + impl::compact_header header; + + uint16_t namevalue_base; + + impl::compact_string<4, 2> name; + impl::compact_string<5, 3> value; + + impl::compact_pointer prev_attribute_c; + impl::compact_pointer next_attribute; +}; + +struct xml_node_struct { + xml_node_struct(impl::xml_memory_page* page, xml_node_type type): header(page, type - 1), namevalue_base(0) { + PUGI__STATIC_ASSERT(sizeof(xml_node_struct) == 12); + } + + impl::compact_header header; + + uint16_t namevalue_base; + + impl::compact_string<4, 2> name; + impl::compact_string<5, 3> value; + + impl::compact_pointer_parent parent; + + impl::compact_pointer first_child; + + impl::compact_pointer prev_sibling_c; + impl::compact_pointer next_sibling; + + impl::compact_pointer first_attribute; +}; +} 
+#else +namespace pugi +{ +struct xml_attribute_struct { + xml_attribute_struct(impl::xml_memory_page* page): header(reinterpret_cast(page)), name(0), value(0), prev_attribute_c(0), next_attribute(0) { + } + + uintptr_t header; + + char_t* name; + char_t* value; + + xml_attribute_struct* prev_attribute_c; + xml_attribute_struct* next_attribute; +}; + +struct xml_node_struct { + xml_node_struct(impl::xml_memory_page* page, xml_node_type type): header(reinterpret_cast(page) | (type - 1)), name(0), value(0), parent(0), first_child(0), prev_sibling_c(0), next_sibling(0), first_attribute(0) { + } + + uintptr_t header; + + char_t* name; + char_t* value; + + xml_node_struct* parent; + + xml_node_struct* first_child; + + xml_node_struct* prev_sibling_c; + xml_node_struct* next_sibling; + + xml_attribute_struct* first_attribute; +}; +} +#endif + +PUGI__NS_BEGIN +struct xml_extra_buffer { + char_t* buffer; + xml_extra_buffer* next; +}; + +struct xml_document_struct: public xml_node_struct, public xml_allocator { + xml_document_struct(xml_memory_page* page): xml_node_struct(page, node_document), xml_allocator(page), buffer(0), extra_buffers(0) { +#ifdef PUGIXML_COMPACT + _hash = &hash; +#endif + } + + const char_t* buffer; + + xml_extra_buffer* extra_buffers; + +#ifdef PUGIXML_COMPACT + compact_hash_table hash; +#endif +}; + +template inline xml_allocator& get_allocator(const Object* object) +{ + assert(object); + + return *PUGI__GETPAGE(object)->allocator; +} + +template inline xml_document_struct& get_document(const Object* object) +{ + assert(object); + + return *static_cast(PUGI__GETPAGE(object)->allocator); +} +PUGI__NS_END + +// Low-level DOM operations +PUGI__NS_BEGIN +inline xml_attribute_struct* allocate_attribute(xml_allocator& alloc) +{ + xml_memory_page* page; + void* memory = alloc.allocate_object(sizeof(xml_attribute_struct), page); + if (!memory) return 0; + + return new (memory) xml_attribute_struct(page); +} + +inline xml_node_struct* 
allocate_node(xml_allocator& alloc, xml_node_type type) +{ + xml_memory_page* page; + void* memory = alloc.allocate_object(sizeof(xml_node_struct), page); + if (!memory) return 0; + + return new (memory) xml_node_struct(page, type); +} + +inline void destroy_attribute(xml_attribute_struct* a, xml_allocator& alloc) +{ + if (a->header & impl::xml_memory_page_name_allocated_mask) + alloc.deallocate_string(a->name); + + if (a->header & impl::xml_memory_page_value_allocated_mask) + alloc.deallocate_string(a->value); + + alloc.deallocate_memory(a, sizeof(xml_attribute_struct), PUGI__GETPAGE(a)); +} + +inline void destroy_node(xml_node_struct* n, xml_allocator& alloc) +{ + if (n->header & impl::xml_memory_page_name_allocated_mask) + alloc.deallocate_string(n->name); + + if (n->header & impl::xml_memory_page_value_allocated_mask) + alloc.deallocate_string(n->value); + + for (xml_attribute_struct* attr = n->first_attribute; attr; ) { + xml_attribute_struct* next = attr->next_attribute; + + destroy_attribute(attr, alloc); + + attr = next; + } + + for (xml_node_struct* child = n->first_child; child; ) { + xml_node_struct* next = child->next_sibling; + + destroy_node(child, alloc); + + child = next; + } + + alloc.deallocate_memory(n, sizeof(xml_node_struct), PUGI__GETPAGE(n)); +} + +inline void append_node(xml_node_struct* child, xml_node_struct* node) +{ + child->parent = node; + + xml_node_struct* head = node->first_child; + + if (head) { + xml_node_struct* tail = head->prev_sibling_c; + + tail->next_sibling = child; + child->prev_sibling_c = tail; + head->prev_sibling_c = child; + } else { + node->first_child = child; + child->prev_sibling_c = child; + } +} + +inline void prepend_node(xml_node_struct* child, xml_node_struct* node) +{ + child->parent = node; + + xml_node_struct* head = node->first_child; + + if (head) { + child->prev_sibling_c = head->prev_sibling_c; + head->prev_sibling_c = child; + } else + child->prev_sibling_c = child; + + child->next_sibling = head; + 
node->first_child = child; +} + +inline void insert_node_after(xml_node_struct* child, xml_node_struct* node) +{ + xml_node_struct* parent = node->parent; + + child->parent = parent; + + if (node->next_sibling) + node->next_sibling->prev_sibling_c = child; + else + parent->first_child->prev_sibling_c = child; + + child->next_sibling = node->next_sibling; + child->prev_sibling_c = node; + + node->next_sibling = child; +} + +inline void insert_node_before(xml_node_struct* child, xml_node_struct* node) +{ + xml_node_struct* parent = node->parent; + + child->parent = parent; + + if (node->prev_sibling_c->next_sibling) + node->prev_sibling_c->next_sibling = child; + else + parent->first_child = child; + + child->prev_sibling_c = node->prev_sibling_c; + child->next_sibling = node; + + node->prev_sibling_c = child; +} + +inline void remove_node(xml_node_struct* node) +{ + xml_node_struct* parent = node->parent; + + if (node->next_sibling) + node->next_sibling->prev_sibling_c = node->prev_sibling_c; + else + parent->first_child->prev_sibling_c = node->prev_sibling_c; + + if (node->prev_sibling_c->next_sibling) + node->prev_sibling_c->next_sibling = node->next_sibling; + else + parent->first_child = node->next_sibling; + + node->parent = 0; + node->prev_sibling_c = 0; + node->next_sibling = 0; +} + +inline void append_attribute(xml_attribute_struct* attr, xml_node_struct* node) +{ + xml_attribute_struct* head = node->first_attribute; + + if (head) { + xml_attribute_struct* tail = head->prev_attribute_c; + + tail->next_attribute = attr; + attr->prev_attribute_c = tail; + head->prev_attribute_c = attr; + } else { + node->first_attribute = attr; + attr->prev_attribute_c = attr; + } +} + +inline void prepend_attribute(xml_attribute_struct* attr, xml_node_struct* node) +{ + xml_attribute_struct* head = node->first_attribute; + + if (head) { + attr->prev_attribute_c = head->prev_attribute_c; + head->prev_attribute_c = attr; + } else + attr->prev_attribute_c = attr; + + 
attr->next_attribute = head; + node->first_attribute = attr; +} + +inline void insert_attribute_after(xml_attribute_struct* attr, xml_attribute_struct* place, xml_node_struct* node) +{ + if (place->next_attribute) + place->next_attribute->prev_attribute_c = attr; + else + node->first_attribute->prev_attribute_c = attr; + + attr->next_attribute = place->next_attribute; + attr->prev_attribute_c = place; + place->next_attribute = attr; +} + +inline void insert_attribute_before(xml_attribute_struct* attr, xml_attribute_struct* place, xml_node_struct* node) +{ + if (place->prev_attribute_c->next_attribute) + place->prev_attribute_c->next_attribute = attr; + else + node->first_attribute = attr; + + attr->prev_attribute_c = place->prev_attribute_c; + attr->next_attribute = place; + place->prev_attribute_c = attr; +} + +inline void remove_attribute(xml_attribute_struct* attr, xml_node_struct* node) +{ + if (attr->next_attribute) + attr->next_attribute->prev_attribute_c = attr->prev_attribute_c; + else + node->first_attribute->prev_attribute_c = attr->prev_attribute_c; + + if (attr->prev_attribute_c->next_attribute) + attr->prev_attribute_c->next_attribute = attr->next_attribute; + else + node->first_attribute = attr->next_attribute; + + attr->prev_attribute_c = 0; + attr->next_attribute = 0; +} + +PUGI__FN_NO_INLINE xml_node_struct* append_new_node(xml_node_struct* node, xml_allocator& alloc, xml_node_type type = node_element) +{ + if (!alloc.reserve()) return 0; + + xml_node_struct* child = allocate_node(alloc, type); + if (!child) return 0; + + append_node(child, node); + + return child; +} + +PUGI__FN_NO_INLINE xml_attribute_struct* append_new_attribute(xml_node_struct* node, xml_allocator& alloc) +{ + if (!alloc.reserve()) return 0; + + xml_attribute_struct* attr = allocate_attribute(alloc); + if (!attr) return 0; + + append_attribute(attr, node); + + return attr; +} +PUGI__NS_END + +// Helper classes for code generation +PUGI__NS_BEGIN +struct opt_false { + enum { 
value = 0 }; +}; + +struct opt_true { + enum { value = 1 }; +}; +PUGI__NS_END + +// Unicode utilities +PUGI__NS_BEGIN +inline uint16_t endian_swap(uint16_t value) +{ + return static_cast(((value & 0xff) << 8) | (value >> 8)); +} + +inline uint32_t endian_swap(uint32_t value) +{ + return ((value & 0xff) << 24) | ((value & 0xff00) << 8) | ((value & 0xff0000) >> 8) | (value >> 24); +} + +struct utf8_counter { + typedef size_t value_type; + + static value_type low(value_type result, uint32_t ch) { + // U+0000..U+007F + if (ch < 0x80) return result + 1; + // U+0080..U+07FF + else if (ch < 0x800) return result + 2; + // U+0800..U+FFFF + else return result + 3; + } + + static value_type high(value_type result, uint32_t) { + // U+10000..U+10FFFF + return result + 4; + } +}; + +struct utf8_writer { + typedef uint8_t* value_type; + + static value_type low(value_type result, uint32_t ch) { + // U+0000..U+007F + if (ch < 0x80) { + *result = static_cast(ch); + return result + 1; + } + // U+0080..U+07FF + else if (ch < 0x800) { + result[0] = static_cast(0xC0 | (ch >> 6)); + result[1] = static_cast(0x80 | (ch & 0x3F)); + return result + 2; + } + // U+0800..U+FFFF + else { + result[0] = static_cast(0xE0 | (ch >> 12)); + result[1] = static_cast(0x80 | ((ch >> 6) & 0x3F)); + result[2] = static_cast(0x80 | (ch & 0x3F)); + return result + 3; + } + } + + static value_type high(value_type result, uint32_t ch) { + // U+10000..U+10FFFF + result[0] = static_cast(0xF0 | (ch >> 18)); + result[1] = static_cast(0x80 | ((ch >> 12) & 0x3F)); + result[2] = static_cast(0x80 | ((ch >> 6) & 0x3F)); + result[3] = static_cast(0x80 | (ch & 0x3F)); + return result + 4; + } + + static value_type any(value_type result, uint32_t ch) { + return (ch < 0x10000) ? 
low(result, ch) : high(result, ch); + } +}; + +struct utf16_counter { + typedef size_t value_type; + + static value_type low(value_type result, uint32_t) { + return result + 1; + } + + static value_type high(value_type result, uint32_t) { + return result + 2; + } +}; + +struct utf16_writer { + typedef uint16_t* value_type; + + static value_type low(value_type result, uint32_t ch) { + *result = static_cast(ch); + + return result + 1; + } + + static value_type high(value_type result, uint32_t ch) { + uint32_t msh = static_cast(ch - 0x10000) >> 10; + uint32_t lsh = static_cast(ch - 0x10000) & 0x3ff; + + result[0] = static_cast(0xD800 + msh); + result[1] = static_cast(0xDC00 + lsh); + + return result + 2; + } + + static value_type any(value_type result, uint32_t ch) { + return (ch < 0x10000) ? low(result, ch) : high(result, ch); + } +}; + +struct utf32_counter { + typedef size_t value_type; + + static value_type low(value_type result, uint32_t) { + return result + 1; + } + + static value_type high(value_type result, uint32_t) { + return result + 1; + } +}; + +struct utf32_writer { + typedef uint32_t* value_type; + + static value_type low(value_type result, uint32_t ch) { + *result = ch; + + return result + 1; + } + + static value_type high(value_type result, uint32_t ch) { + *result = ch; + + return result + 1; + } + + static value_type any(value_type result, uint32_t ch) { + *result = ch; + + return result + 1; + } +}; + +struct latin1_writer { + typedef uint8_t* value_type; + + static value_type low(value_type result, uint32_t ch) { + *result = static_cast(ch > 255 ? '?' 
: ch); + + return result + 1; + } + + static value_type high(value_type result, uint32_t ch) { + (void)ch; + + *result = '?'; + + return result + 1; + } +}; + +struct utf8_decoder { + typedef uint8_t type; + + template static inline typename Traits::value_type process(const uint8_t* data, size_t size, typename Traits::value_type result, Traits) { + const uint8_t utf8_byte_mask = 0x3f; + + while (size) { + uint8_t lead = *data; + + // 0xxxxxxx -> U+0000..U+007F + if (lead < 0x80) { + result = Traits::low(result, lead); + data += 1; + size -= 1; + + // process aligned single-byte (ascii) blocks + if ((reinterpret_cast(data) & 3) == 0) { + // round-trip through void* to silence 'cast increases required alignment of target type' warnings + while (size >= 4 && (*static_cast(static_cast(data)) & 0x80808080) == 0) { + result = Traits::low(result, data[0]); + result = Traits::low(result, data[1]); + result = Traits::low(result, data[2]); + result = Traits::low(result, data[3]); + data += 4; + size -= 4; + } + } + } + // 110xxxxx -> U+0080..U+07FF + else if (static_cast(lead - 0xC0) < 0x20 && size >= 2 && (data[1] & 0xc0) == 0x80) { + result = Traits::low(result, ((lead & ~0xC0) << 6) | (data[1] & utf8_byte_mask)); + data += 2; + size -= 2; + } + // 1110xxxx -> U+0800-U+FFFF + else if (static_cast(lead - 0xE0) < 0x10 && size >= 3 && (data[1] & 0xc0) == 0x80 && (data[2] & 0xc0) == 0x80) { + result = Traits::low(result, ((lead & ~0xE0) << 12) | ((data[1] & utf8_byte_mask) << 6) | (data[2] & utf8_byte_mask)); + data += 3; + size -= 3; + } + // 11110xxx -> U+10000..U+10FFFF + else if (static_cast(lead - 0xF0) < 0x08 && size >= 4 && (data[1] & 0xc0) == 0x80 && (data[2] & 0xc0) == 0x80 && (data[3] & 0xc0) == 0x80) { + result = Traits::high(result, ((lead & ~0xF0) << 18) | ((data[1] & utf8_byte_mask) << 12) | ((data[2] & utf8_byte_mask) << 6) | (data[3] & utf8_byte_mask)); + data += 4; + size -= 4; + } + // 10xxxxxx or 11111xxx -> invalid + else { + data += 1; + size -= 1; + } + } 
+ + return result; + } +}; + +template struct utf16_decoder { + typedef uint16_t type; + + template static inline typename Traits::value_type process(const uint16_t* data, size_t size, typename Traits::value_type result, Traits) { + while (size) { + uint16_t lead = opt_swap::value ? endian_swap(*data) : *data; + + // U+0000..U+D7FF + if (lead < 0xD800) { + result = Traits::low(result, lead); + data += 1; + size -= 1; + } + // U+E000..U+FFFF + else if (static_cast(lead - 0xE000) < 0x2000) { + result = Traits::low(result, lead); + data += 1; + size -= 1; + } + // surrogate pair lead + else if (static_cast(lead - 0xD800) < 0x400 && size >= 2) { + uint16_t next = opt_swap::value ? endian_swap(data[1]) : data[1]; + + if (static_cast(next - 0xDC00) < 0x400) { + result = Traits::high(result, 0x10000 + ((lead & 0x3ff) << 10) + (next & 0x3ff)); + data += 2; + size -= 2; + } else { + data += 1; + size -= 1; + } + } else { + data += 1; + size -= 1; + } + } + + return result; + } +}; + +template struct utf32_decoder { + typedef uint32_t type; + + template static inline typename Traits::value_type process(const uint32_t* data, size_t size, typename Traits::value_type result, Traits) { + while (size) { + uint32_t lead = opt_swap::value ? 
endian_swap(*data) : *data; + + // U+0000..U+FFFF + if (lead < 0x10000) { + result = Traits::low(result, lead); + data += 1; + size -= 1; + } + // U+10000..U+10FFFF + else { + result = Traits::high(result, lead); + data += 1; + size -= 1; + } + } + + return result; + } +}; + +struct latin1_decoder { + typedef uint8_t type; + + template static inline typename Traits::value_type process(const uint8_t* data, size_t size, typename Traits::value_type result, Traits) { + while (size) { + result = Traits::low(result, *data); + data += 1; + size -= 1; + } + + return result; + } +}; + +template struct wchar_selector; + +template <> struct wchar_selector<2> { + typedef uint16_t type; + typedef utf16_counter counter; + typedef utf16_writer writer; + typedef utf16_decoder decoder; +}; + +template <> struct wchar_selector<4> { + typedef uint32_t type; + typedef utf32_counter counter; + typedef utf32_writer writer; + typedef utf32_decoder decoder; +}; + +typedef wchar_selector::counter wchar_counter; +typedef wchar_selector::writer wchar_writer; + +struct wchar_decoder { + typedef wchar_t type; + + template static inline typename Traits::value_type process(const wchar_t* data, size_t size, typename Traits::value_type result, Traits traits) { + typedef wchar_selector::decoder decoder; + + return decoder::process(reinterpret_cast(data), size, result, traits); + } +}; + +#ifdef PUGIXML_WCHAR_MODE +PUGI__FN void convert_wchar_endian_swap(wchar_t* result, const wchar_t* data, size_t length) +{ + for (size_t i = 0; i < length; ++i) + result[i] = static_cast(endian_swap(static_cast::type>(data[i]))); +} +#endif +PUGI__NS_END + +PUGI__NS_BEGIN +enum chartype_t { + ct_parse_pcdata = 1, // \0, &, \r, < + ct_parse_attr = 2, // \0, &, \r, ', " + ct_parse_attr_ws = 4, // \0, &, \r, ', ", \n, tab + ct_space = 8, // \r, \n, space, tab + ct_parse_cdata = 16, // \0, ], >, \r + ct_parse_comment = 32, // \0, -, >, \r + ct_symbol = 64, // Any symbol > 127, a-z, A-Z, 0-9, _, :, -, . 
+ ct_start_symbol = 128 // Any symbol > 127, a-z, A-Z, _, : +}; + +static const unsigned char chartype_table[256] = { + 55, 0, 0, 0, 0, 0, 0, 0, 0, 12, 12, 0, 0, 63, 0, 0, // 0-15 + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 16-31 + 8, 0, 6, 0, 0, 0, 7, 6, 0, 0, 0, 0, 0, 96, 64, 0, // 32-47 + 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 192, 0, 1, 0, 48, 0, // 48-63 + 0, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, // 64-79 + 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 0, 0, 16, 0, 192, // 80-95 + 0, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, // 96-111 + 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 0, 0, 0, 0, 0, // 112-127 + + 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, // 128+ + 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, + 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, + 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, + 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, + 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, + 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, + 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192 +}; + +enum chartypex_t { + ctx_special_pcdata = 1, // Any symbol >= 0 and < 32 (except \t, \r, \n), &, <, > + ctx_special_attr = 2, // Any symbol >= 0 and < 32 (except \t), &, <, >, " + ctx_start_symbol = 4, // Any symbol > 127, a-z, A-Z, _ + ctx_digit = 8, // 0-9 + ctx_symbol = 16 // Any symbol > 127, a-z, A-Z, 0-9, _, -, . 
+}; + +static const unsigned char chartypex_table[256] = { + 3, 3, 3, 3, 3, 3, 3, 3, 3, 0, 2, 3, 3, 2, 3, 3, // 0-15 + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // 16-31 + 0, 0, 2, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 16, 16, 0, // 32-47 + 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 0, 0, 3, 0, 3, 0, // 48-63 + + 0, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, // 64-79 + 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 0, 0, 0, 0, 20, // 80-95 + 0, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, // 96-111 + 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 0, 0, 0, 0, 0, // 112-127 + + 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, // 128+ + 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, + 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, + 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, + 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, + 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, + 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, + 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20 +}; + +#ifdef PUGIXML_WCHAR_MODE +#define PUGI__IS_CHARTYPE_IMPL(c, ct, table) ((static_cast(c) < 128 ? table[static_cast(c)] : table[128]) & (ct)) +#else +#define PUGI__IS_CHARTYPE_IMPL(c, ct, table) (table[static_cast(c)] & (ct)) +#endif + +#define PUGI__IS_CHARTYPE(c, ct) PUGI__IS_CHARTYPE_IMPL(c, ct, chartype_table) +#define PUGI__IS_CHARTYPEX(c, ct) PUGI__IS_CHARTYPE_IMPL(c, ct, chartypex_table) + +PUGI__FN bool is_little_endian() +{ + unsigned int ui = 1; + + return *reinterpret_cast(&ui) == 1; +} + +PUGI__FN xml_encoding get_wchar_encoding() +{ + PUGI__STATIC_ASSERT(sizeof(wchar_t) == 2 || sizeof(wchar_t) == 4); + + if (sizeof(wchar_t) == 2) + return is_little_endian() ? encoding_utf16_le : encoding_utf16_be; + else + return is_little_endian() ? 
encoding_utf32_le : encoding_utf32_be; +} + +PUGI__FN xml_encoding guess_buffer_encoding(uint8_t d0, uint8_t d1, uint8_t d2, uint8_t d3) +{ + // look for BOM in first few bytes + if (d0 == 0 && d1 == 0 && d2 == 0xfe && d3 == 0xff) return encoding_utf32_be; + if (d0 == 0xff && d1 == 0xfe && d2 == 0 && d3 == 0) return encoding_utf32_le; + if (d0 == 0xfe && d1 == 0xff) return encoding_utf16_be; + if (d0 == 0xff && d1 == 0xfe) return encoding_utf16_le; + if (d0 == 0xef && d1 == 0xbb && d2 == 0xbf) return encoding_utf8; + + // look for <, (contents); + + PUGI__DMC_VOLATILE uint8_t d0 = data[0], d1 = data[1], d2 = data[2], d3 = data[3]; + + return guess_buffer_encoding(d0, d1, d2, d3); +} + +PUGI__FN bool get_mutable_buffer(char_t*& out_buffer, size_t& out_length, const void* contents, size_t size, bool is_mutable) +{ + size_t length = size / sizeof(char_t); + + if (is_mutable) { + out_buffer = static_cast(const_cast(contents)); + out_length = length; + } else { + char_t* buffer = static_cast(xml_memory::allocate((length + 1) * sizeof(char_t))); + if (!buffer) return false; + + if (contents) + memcpy(buffer, contents, length * sizeof(char_t)); + else + assert(length == 0); + + buffer[length] = 0; + + out_buffer = buffer; + out_length = length + 1; + } + + return true; +} + +#ifdef PUGIXML_WCHAR_MODE +PUGI__FN bool need_endian_swap_utf(xml_encoding le, xml_encoding re) +{ + return (le == encoding_utf16_be && re == encoding_utf16_le) || (le == encoding_utf16_le && re == encoding_utf16_be) || + (le == encoding_utf32_be && re == encoding_utf32_le) || (le == encoding_utf32_le && re == encoding_utf32_be); +} + +PUGI__FN bool convert_buffer_endian_swap(char_t*& out_buffer, size_t& out_length, const void* contents, size_t size, bool is_mutable) +{ + const char_t* data = static_cast(contents); + size_t length = size / sizeof(char_t); + + if (is_mutable) { + char_t* buffer = const_cast(data); + + convert_wchar_endian_swap(buffer, data, length); + + out_buffer = buffer; + 
out_length = length; + } else { + char_t* buffer = static_cast(xml_memory::allocate((length + 1) * sizeof(char_t))); + if (!buffer) return false; + + convert_wchar_endian_swap(buffer, data, length); + buffer[length] = 0; + + out_buffer = buffer; + out_length = length + 1; + } + + return true; +} + +template PUGI__FN bool convert_buffer_generic(char_t*& out_buffer, size_t& out_length, const void* contents, size_t size, D) +{ + const typename D::type* data = static_cast(contents); + size_t data_length = size / sizeof(typename D::type); + + // first pass: get length in wchar_t units + size_t length = D::process(data, data_length, 0, wchar_counter()); + + // allocate buffer of suitable length + char_t* buffer = static_cast(xml_memory::allocate((length + 1) * sizeof(char_t))); + if (!buffer) return false; + + // second pass: convert utf16 input to wchar_t + wchar_writer::value_type obegin = reinterpret_cast(buffer); + wchar_writer::value_type oend = D::process(data, data_length, obegin, wchar_writer()); + + assert(oend == obegin + length); + *oend = 0; + + out_buffer = buffer; + out_length = length + 1; + + return true; +} + +PUGI__FN bool convert_buffer(char_t*& out_buffer, size_t& out_length, xml_encoding encoding, const void* contents, size_t size, bool is_mutable) +{ + // get native encoding + xml_encoding wchar_encoding = get_wchar_encoding(); + + // fast path: no conversion required + if (encoding == wchar_encoding) + return get_mutable_buffer(out_buffer, out_length, contents, size, is_mutable); + + // only endian-swapping is required + if (need_endian_swap_utf(encoding, wchar_encoding)) + return convert_buffer_endian_swap(out_buffer, out_length, contents, size, is_mutable); + + // source encoding is utf8 + if (encoding == encoding_utf8) + return convert_buffer_generic(out_buffer, out_length, contents, size, utf8_decoder()); + + // source encoding is utf16 + if (encoding == encoding_utf16_be || encoding == encoding_utf16_le) { + xml_encoding native_encoding = 
is_little_endian() ? encoding_utf16_le : encoding_utf16_be; + + return (native_encoding == encoding) ? + convert_buffer_generic(out_buffer, out_length, contents, size, utf16_decoder()) : + convert_buffer_generic(out_buffer, out_length, contents, size, utf16_decoder()); + } + + // source encoding is utf32 + if (encoding == encoding_utf32_be || encoding == encoding_utf32_le) { + xml_encoding native_encoding = is_little_endian() ? encoding_utf32_le : encoding_utf32_be; + + return (native_encoding == encoding) ? + convert_buffer_generic(out_buffer, out_length, contents, size, utf32_decoder()) : + convert_buffer_generic(out_buffer, out_length, contents, size, utf32_decoder()); + } + + // source encoding is latin1 + if (encoding == encoding_latin1) + return convert_buffer_generic(out_buffer, out_length, contents, size, latin1_decoder()); + + assert(!"Invalid encoding"); + return false; +} +#else +template PUGI__FN bool convert_buffer_generic(char_t*& out_buffer, size_t& out_length, const void* contents, size_t size, D) +{ + const typename D::type* data = static_cast(contents); + size_t data_length = size / sizeof(typename D::type); + + // first pass: get length in utf8 units + size_t length = D::process(data, data_length, 0, utf8_counter()); + + // allocate buffer of suitable length + char_t* buffer = static_cast(xml_memory::allocate((length + 1) * sizeof(char_t))); + if (!buffer) return false; + + // second pass: convert utf16 input to utf8 + uint8_t* obegin = reinterpret_cast(buffer); + uint8_t* oend = D::process(data, data_length, obegin, utf8_writer()); + + assert(oend == obegin + length); + *oend = 0; + + out_buffer = buffer; + out_length = length + 1; + + return true; +} + +PUGI__FN size_t get_latin1_7bit_prefix_length(const uint8_t* data, size_t size) +{ + for (size_t i = 0; i < size; ++i) + if (data[i] > 127) + return i; + + return size; +} + +PUGI__FN bool convert_buffer_latin1(char_t*& out_buffer, size_t& out_length, const void* contents, size_t size, bool 
is_mutable) +{ + const uint8_t* data = static_cast(contents); + size_t data_length = size; + + // get size of prefix that does not need utf8 conversion + size_t prefix_length = get_latin1_7bit_prefix_length(data, data_length); + assert(prefix_length <= data_length); + + const uint8_t* postfix = data + prefix_length; + size_t postfix_length = data_length - prefix_length; + + // if no conversion is needed, just return the original buffer + if (postfix_length == 0) return get_mutable_buffer(out_buffer, out_length, contents, size, is_mutable); + + // first pass: get length in utf8 units + size_t length = prefix_length + latin1_decoder::process(postfix, postfix_length, 0, utf8_counter()); + + // allocate buffer of suitable length + char_t* buffer = static_cast(xml_memory::allocate((length + 1) * sizeof(char_t))); + if (!buffer) return false; + + // second pass: convert latin1 input to utf8 + memcpy(buffer, data, prefix_length); + + uint8_t* obegin = reinterpret_cast(buffer); + uint8_t* oend = latin1_decoder::process(postfix, postfix_length, obegin + prefix_length, utf8_writer()); + + assert(oend == obegin + length); + *oend = 0; + + out_buffer = buffer; + out_length = length + 1; + + return true; +} + +PUGI__FN bool convert_buffer(char_t*& out_buffer, size_t& out_length, xml_encoding encoding, const void* contents, size_t size, bool is_mutable) +{ + // fast path: no conversion required + if (encoding == encoding_utf8) + return get_mutable_buffer(out_buffer, out_length, contents, size, is_mutable); + + // source encoding is utf16 + if (encoding == encoding_utf16_be || encoding == encoding_utf16_le) { + xml_encoding native_encoding = is_little_endian() ? encoding_utf16_le : encoding_utf16_be; + + return (native_encoding == encoding) ? 
+ convert_buffer_generic(out_buffer, out_length, contents, size, utf16_decoder()) : + convert_buffer_generic(out_buffer, out_length, contents, size, utf16_decoder()); + } + + // source encoding is utf32 + if (encoding == encoding_utf32_be || encoding == encoding_utf32_le) { + xml_encoding native_encoding = is_little_endian() ? encoding_utf32_le : encoding_utf32_be; + + return (native_encoding == encoding) ? + convert_buffer_generic(out_buffer, out_length, contents, size, utf32_decoder()) : + convert_buffer_generic(out_buffer, out_length, contents, size, utf32_decoder()); + } + + // source encoding is latin1 + if (encoding == encoding_latin1) + return convert_buffer_latin1(out_buffer, out_length, contents, size, is_mutable); + + assert(!"Invalid encoding"); + return false; +} +#endif + +PUGI__FN size_t as_utf8_begin(const wchar_t* str, size_t length) +{ + // get length in utf8 characters + return wchar_decoder::process(str, length, 0, utf8_counter()); +} + +PUGI__FN void as_utf8_end(char* buffer, size_t size, const wchar_t* str, size_t length) +{ + // convert to utf8 + uint8_t* begin = reinterpret_cast(buffer); + uint8_t* end = wchar_decoder::process(str, length, begin, utf8_writer()); + + assert(begin + size == end); + (void)!end; + (void)!size; +} + +#ifndef PUGIXML_NO_STL +PUGI__FN std::string as_utf8_impl(const wchar_t* str, size_t length) +{ + // first pass: get length in utf8 characters + size_t size = as_utf8_begin(str, length); + + // allocate resulting string + std::string result; + result.resize(size); + + // second pass: convert to utf8 + if (size > 0) as_utf8_end(&result[0], size, str, length); + + return result; +} + +PUGI__FN std::basic_string as_wide_impl(const char* str, size_t size) +{ + const uint8_t* data = reinterpret_cast(str); + + // first pass: get length in wchar_t units + size_t length = utf8_decoder::process(data, size, 0, wchar_counter()); + + // allocate resulting string + std::basic_string result; + result.resize(length); + + // second 
pass: convert to wchar_t + if (length > 0) { + wchar_writer::value_type begin = reinterpret_cast(&result[0]); + wchar_writer::value_type end = utf8_decoder::process(data, size, begin, wchar_writer()); + + assert(begin + length == end); + (void)!end; + } + + return result; +} +#endif + +template +inline bool strcpy_insitu_allow(size_t length, const Header& header, uintptr_t header_mask, char_t* target) +{ + // never reuse shared memory + if (header & xml_memory_page_contents_shared_mask) return false; + + size_t target_length = strlength(target); + + // always reuse document buffer memory if possible + if ((header & header_mask) == 0) return target_length >= length; + + // reuse heap memory if waste is not too great + const size_t reuse_threshold = 32; + + return target_length >= length && (target_length < reuse_threshold || target_length - length < target_length / 2); +} + +template +PUGI__FN bool strcpy_insitu(String& dest, Header& header, uintptr_t header_mask, const char_t* source, size_t source_length) +{ + if (source_length == 0) { + // empty string and null pointer are equivalent, so just deallocate old memory + xml_allocator* alloc = PUGI__GETPAGE_IMPL(header)->allocator; + + if (header & header_mask) alloc->deallocate_string(dest); + + // mark the string as not allocated + dest = 0; + header &= ~header_mask; + + return true; + } else if (dest && strcpy_insitu_allow(source_length, header, header_mask, dest)) { + // we can reuse old buffer, so just copy the new data (including zero terminator) + memcpy(dest, source, source_length * sizeof(char_t)); + dest[source_length] = 0; + + return true; + } else { + xml_allocator* alloc = PUGI__GETPAGE_IMPL(header)->allocator; + + if (!alloc->reserve()) return false; + + // allocate new buffer + char_t* buf = alloc->allocate_string(source_length + 1); + if (!buf) return false; + + // copy the string (including zero terminator) + memcpy(buf, source, source_length * sizeof(char_t)); + buf[source_length] = 0; + + // 
deallocate old buffer (*after* the above to protect against overlapping memory and/or allocation failures) + if (header & header_mask) alloc->deallocate_string(dest); + + // the string is now allocated, so set the flag + dest = buf; + header |= header_mask; + + return true; + } +} + +struct gap { + char_t* end; + size_t size; + + gap(): end(0), size(0) { + } + + // Push new gap, move s count bytes further (skipping the gap). + // Collapse previous gap. + void push(char_t*& s, size_t count) { + if (end) { // there was a gap already; collapse it + // Move [old_gap_end, new_gap_start) to [old_gap_start, ...) + assert(s >= end); + memmove(end - size, end, reinterpret_cast(s) - reinterpret_cast(end)); + } + + s += count; // end of current gap + + // "merge" two gaps + end = s; + size += count; + } + + // Collapse all gaps, return past-the-end pointer + char_t* flush(char_t* s) { + if (end) { + // Move [old_gap_end, current_pos) to [old_gap_start, ...) + assert(s >= end); + memmove(end - size, end, reinterpret_cast(s) - reinterpret_cast(end)); + + return s - size; + } else return s; + } +}; + +PUGI__FN char_t* strconv_escape(char_t* s, gap& g) +{ + char_t* stre = s + 1; + + switch (*stre) { + case '#': { // &#... + unsigned int ucsc = 0; + + if (stre[1] == 'x') { // &#x... (hex code) + stre += 2; + + char_t ch = *stre; + + if (ch == ';') return stre; + + for (;;) { + if (static_cast(ch - '0') <= 9) + ucsc = 16 * ucsc + (ch - '0'); + else if (static_cast((ch | ' ') - 'a') <= 5) + ucsc = 16 * ucsc + ((ch | ' ') - 'a' + 10); + else if (ch == ';') + break; + else // cancel + return stre; + + ch = *++stre; + } + + ++stre; + } else { // &#... 
(dec code) + char_t ch = *++stre; + + if (ch == ';') return stre; + + for (;;) { + if (static_cast(static_cast(ch) - '0') <= 9) + ucsc = 10 * ucsc + (ch - '0'); + else if (ch == ';') + break; + else // cancel + return stre; + + ch = *++stre; + } + + ++stre; + } + +#ifdef PUGIXML_WCHAR_MODE + s = reinterpret_cast(wchar_writer::any(reinterpret_cast(s), ucsc)); +#else + s = reinterpret_cast(utf8_writer::any(reinterpret_cast(s), ucsc)); +#endif + + g.push(s, stre - s); + return stre; + } + + case 'a': { // &a + ++stre; + + if (*stre == 'm') { // &am + if (*++stre == 'p' && *++stre == ';') { // & + *s++ = '&'; + ++stre; + + g.push(s, stre - s); + return stre; + } + } else if (*stre == 'p') { // &ap + if (*++stre == 'o' && *++stre == 's' && *++stre == ';') { // ' + *s++ = '\''; + ++stre; + + g.push(s, stre - s); + return stre; + } + } + break; + } + + case 'g': { // &g + if (*++stre == 't' && *++stre == ';') { // > + *s++ = '>'; + ++stre; + + g.push(s, stre - s); + return stre; + } + break; + } + + case 'l': { // &l + if (*++stre == 't' && *++stre == ';') { // < + *s++ = '<'; + ++stre; + + g.push(s, stre - s); + return stre; + } + break; + } + + case 'q': { // &q + if (*++stre == 'u' && *++stre == 'o' && *++stre == 't' && *++stre == ';') { // " + *s++ = '"'; + ++stre; + + g.push(s, stre - s); + return stre; + } + break; + } + + default: + break; + } + + return stre; +} + +// Parser utilities +#define PUGI__ENDSWITH(c, e) ((c) == (e) || ((c) == 0 && endch == (e))) +#define PUGI__SKIPWS() { while (PUGI__IS_CHARTYPE(*s, ct_space)) ++s; } +#define PUGI__OPTSET(OPT) ( optmsk & (OPT) ) +#define PUGI__PUSHNODE(TYPE) { cursor = append_new_node(cursor, alloc, TYPE); if (!cursor) PUGI__THROW_ERROR(status_out_of_memory, s); } +#define PUGI__POPNODE() { cursor = cursor->parent; } +#define PUGI__SCANFOR(X) { while (*s != 0 && !(X)) ++s; } +#define PUGI__SCANWHILE(X) { while (X) ++s; } +#define PUGI__SCANWHILE_UNROLL(X) { for (;;) { char_t ss = s[0]; if (PUGI__UNLIKELY(!(X))) { break; 
} ss = s[1]; if (PUGI__UNLIKELY(!(X))) { s += 1; break; } ss = s[2]; if (PUGI__UNLIKELY(!(X))) { s += 2; break; } ss = s[3]; if (PUGI__UNLIKELY(!(X))) { s += 3; break; } s += 4; } } +#define PUGI__ENDSEG() { ch = *s; *s = 0; ++s; } +#define PUGI__THROW_ERROR(err, m) return error_offset = m, error_status = err, static_cast(0) +#define PUGI__CHECK_ERROR(err, m) { if (*s == 0) PUGI__THROW_ERROR(err, m); } + +PUGI__FN char_t* strconv_comment(char_t* s, char_t endch) +{ + gap g; + + while (true) { + PUGI__SCANWHILE_UNROLL(!PUGI__IS_CHARTYPE(ss, ct_parse_comment)); + + if (*s == '\r') { // Either a single 0x0d or 0x0d 0x0a pair + *s++ = '\n'; // replace first one with 0x0a + + if (*s == '\n') g.push(s, 1); + } else if (s[0] == '-' && s[1] == '-' && PUGI__ENDSWITH(s[2], '>')) { // comment ends here + *g.flush(s) = 0; + + return s + (s[2] == '>' ? 3 : 2); + } else if (*s == 0) { + return 0; + } else ++s; + } +} + +PUGI__FN char_t* strconv_cdata(char_t* s, char_t endch) +{ + gap g; + + while (true) { + PUGI__SCANWHILE_UNROLL(!PUGI__IS_CHARTYPE(ss, ct_parse_cdata)); + + if (*s == '\r') { // Either a single 0x0d or 0x0d 0x0a pair + *s++ = '\n'; // replace first one with 0x0a + + if (*s == '\n') g.push(s, 1); + } else if (s[0] == ']' && s[1] == ']' && PUGI__ENDSWITH(s[2], '>')) { // CDATA ends here + *g.flush(s) = 0; + + return s + 1; + } else if (*s == 0) { + return 0; + } else ++s; + } +} + +typedef char_t* (*strconv_pcdata_t)(char_t*); + +template struct strconv_pcdata_impl { + static char_t* parse(char_t* s) { + gap g; + + char_t* begin = s; + + while (true) { + PUGI__SCANWHILE_UNROLL(!PUGI__IS_CHARTYPE(ss, ct_parse_pcdata)); + + if (*s == '<') { // PCDATA ends here + char_t* end = g.flush(s); + + if (opt_trim::value) + while (end > begin && PUGI__IS_CHARTYPE(end[-1], ct_space)) + --end; + + *end = 0; + + return s + 1; + } else if (opt_eol::value && *s == '\r') { // Either a single 0x0d or 0x0d 0x0a pair + *s++ = '\n'; // replace first one with 0x0a + + if (*s == '\n') 
g.push(s, 1); + } else if (opt_escape::value && *s == '&') { + s = strconv_escape(s, g); + } else if (*s == 0) { + char_t* end = g.flush(s); + + if (opt_trim::value) + while (end > begin && PUGI__IS_CHARTYPE(end[-1], ct_space)) + --end; + + *end = 0; + + return s; + } else ++s; + } + } +}; + +PUGI__FN strconv_pcdata_t get_strconv_pcdata(unsigned int optmask) +{ + PUGI__STATIC_ASSERT(parse_escapes == 0x10 && parse_eol == 0x20 && parse_trim_pcdata == 0x0800); + + switch (((optmask >> 4) & 3) | ((optmask >> 9) & 4)) { // get bitmask for flags (eol escapes trim) + case 0: + return strconv_pcdata_impl::parse; + case 1: + return strconv_pcdata_impl::parse; + case 2: + return strconv_pcdata_impl::parse; + case 3: + return strconv_pcdata_impl::parse; + case 4: + return strconv_pcdata_impl::parse; + case 5: + return strconv_pcdata_impl::parse; + case 6: + return strconv_pcdata_impl::parse; + case 7: + return strconv_pcdata_impl::parse; + default: + assert(false); + return 0; // should not get here + } +} + +typedef char_t* (*strconv_attribute_t)(char_t*, char_t); + +template struct strconv_attribute_impl { + static char_t* parse_wnorm(char_t* s, char_t end_quote) { + gap g; + + // trim leading whitespaces + if (PUGI__IS_CHARTYPE(*s, ct_space)) { + char_t* str = s; + + do ++str; + while (PUGI__IS_CHARTYPE(*str, ct_space)); + + g.push(s, str - s); + } + + while (true) { + PUGI__SCANWHILE_UNROLL(!PUGI__IS_CHARTYPE(ss, ct_parse_attr_ws | ct_space)); + + if (*s == end_quote) { + char_t* str = g.flush(s); + + do *str-- = 0; + while (PUGI__IS_CHARTYPE(*str, ct_space)); + + return s + 1; + } else if (PUGI__IS_CHARTYPE(*s, ct_space)) { + *s++ = ' '; + + if (PUGI__IS_CHARTYPE(*s, ct_space)) { + char_t* str = s + 1; + while (PUGI__IS_CHARTYPE(*str, ct_space)) ++str; + + g.push(s, str - s); + } + } else if (opt_escape::value && *s == '&') { + s = strconv_escape(s, g); + } else if (!*s) { + return 0; + } else ++s; + } + } + + static char_t* parse_wconv(char_t* s, char_t end_quote) { + 
gap g; + + while (true) { + PUGI__SCANWHILE_UNROLL(!PUGI__IS_CHARTYPE(ss, ct_parse_attr_ws)); + + if (*s == end_quote) { + *g.flush(s) = 0; + + return s + 1; + } else if (PUGI__IS_CHARTYPE(*s, ct_space)) { + if (*s == '\r') { + *s++ = ' '; + + if (*s == '\n') g.push(s, 1); + } else *s++ = ' '; + } else if (opt_escape::value && *s == '&') { + s = strconv_escape(s, g); + } else if (!*s) { + return 0; + } else ++s; + } + } + + static char_t* parse_eol(char_t* s, char_t end_quote) { + gap g; + + while (true) { + PUGI__SCANWHILE_UNROLL(!PUGI__IS_CHARTYPE(ss, ct_parse_attr)); + + if (*s == end_quote) { + *g.flush(s) = 0; + + return s + 1; + } else if (*s == '\r') { + *s++ = '\n'; + + if (*s == '\n') g.push(s, 1); + } else if (opt_escape::value && *s == '&') { + s = strconv_escape(s, g); + } else if (!*s) { + return 0; + } else ++s; + } + } + + static char_t* parse_simple(char_t* s, char_t end_quote) { + gap g; + + while (true) { + PUGI__SCANWHILE_UNROLL(!PUGI__IS_CHARTYPE(ss, ct_parse_attr)); + + if (*s == end_quote) { + *g.flush(s) = 0; + + return s + 1; + } else if (opt_escape::value && *s == '&') { + s = strconv_escape(s, g); + } else if (!*s) { + return 0; + } else ++s; + } + } +}; + +PUGI__FN strconv_attribute_t get_strconv_attribute(unsigned int optmask) +{ + PUGI__STATIC_ASSERT(parse_escapes == 0x10 && parse_eol == 0x20 && parse_wconv_attribute == 0x40 && parse_wnorm_attribute == 0x80); + + switch ((optmask >> 4) & 15) { // get bitmask for flags (wconv wnorm eol escapes) + case 0: + return strconv_attribute_impl::parse_simple; + case 1: + return strconv_attribute_impl::parse_simple; + case 2: + return strconv_attribute_impl::parse_eol; + case 3: + return strconv_attribute_impl::parse_eol; + case 4: + return strconv_attribute_impl::parse_wconv; + case 5: + return strconv_attribute_impl::parse_wconv; + case 6: + return strconv_attribute_impl::parse_wconv; + case 7: + return strconv_attribute_impl::parse_wconv; + case 8: + return strconv_attribute_impl::parse_wnorm; 
+ case 9: + return strconv_attribute_impl::parse_wnorm; + case 10: + return strconv_attribute_impl::parse_wnorm; + case 11: + return strconv_attribute_impl::parse_wnorm; + case 12: + return strconv_attribute_impl::parse_wnorm; + case 13: + return strconv_attribute_impl::parse_wnorm; + case 14: + return strconv_attribute_impl::parse_wnorm; + case 15: + return strconv_attribute_impl::parse_wnorm; + default: + assert(false); + return 0; // should not get here + } +} + +inline xml_parse_result make_parse_result(xml_parse_status status, ptrdiff_t offset = 0) +{ + xml_parse_result result; + result.status = status; + result.offset = offset; + + return result; +} + +struct xml_parser { + xml_allocator alloc; + xml_allocator* alloc_state; + char_t* error_offset; + xml_parse_status error_status; + + xml_parser(xml_allocator* alloc_): alloc(*alloc_), alloc_state(alloc_), error_offset(0), error_status(status_ok) { + } + + ~xml_parser() { + *alloc_state = alloc; + } + + // DOCTYPE consists of nested sections of the following possible types: + // , , "...", '...' + // + // + // First group can not contain nested groups + // Second group can contain nested groups of the same type + // Third group can contain all other groups + char_t* parse_doctype_primitive(char_t* s) { + if (*s == '"' || *s == '\'') { + // quoted string + char_t ch = *s++; + PUGI__SCANFOR(*s == ch); + if (!*s) PUGI__THROW_ERROR(status_bad_doctype, s); + + s++; + } else if (s[0] == '<' && s[1] == '?') { + // + s += 2; + PUGI__SCANFOR(s[0] == '?' && s[1] == '>'); // no need for ENDSWITH because ?> can't terminate proper doctype + if (!*s) PUGI__THROW_ERROR(status_bad_doctype, s); + + s += 2; + } else if (s[0] == '<' && s[1] == '!' 
&& s[2] == '-' && s[3] == '-') { + s += 4; + PUGI__SCANFOR(s[0] == '-' && s[1] == '-' && s[2] == '>'); // no need for ENDSWITH because --> can't terminate proper doctype + if (!*s) PUGI__THROW_ERROR(status_bad_doctype, s); + + s += 3; + } else PUGI__THROW_ERROR(status_bad_doctype, s); + + return s; + } + + char_t* parse_doctype_ignore(char_t* s) { + size_t depth = 0; + + assert(s[0] == '<' && s[1] == '!' && s[2] == '['); + s += 3; + + while (*s) { + if (s[0] == '<' && s[1] == '!' && s[2] == '[') { + // nested ignore section + s += 3; + depth++; + } else if (s[0] == ']' && s[1] == ']' && s[2] == '>') { + // ignore section end + s += 3; + + if (depth == 0) + return s; + + depth--; + } else s++; + } + + PUGI__THROW_ERROR(status_bad_doctype, s); + } + + char_t* parse_doctype_group(char_t* s, char_t endch) { + size_t depth = 0; + + assert((s[0] == '<' || s[0] == 0) && s[1] == '!'); + s += 2; + + while (*s) { + if (s[0] == '<' && s[1] == '!' && s[2] != '-') { + if (s[2] == '[') { + // ignore + s = parse_doctype_ignore(s); + if (!s) return s; + } else { + // some control group + s += 2; + depth++; + } + } else if (s[0] == '<' || s[0] == '"' || s[0] == '\'') { + // unknown tag (forbidden), or some primitive group + s = parse_doctype_primitive(s); + if (!s) return s; + } else if (*s == '>') { + if (depth == 0) + return s; + + depth--; + s++; + } else s++; + } + + if (depth != 0 || endch != '>') PUGI__THROW_ERROR(status_bad_doctype, s); + + return s; + } + + char_t* parse_exclamation(char_t* s, xml_node_struct* cursor, unsigned int optmsk, char_t endch) { + // parse node contents, starting with exclamation mark + ++s; + + if (*s == '-') { // 'value = s; // Save the offset. + } + + if (PUGI__OPTSET(parse_eol) && PUGI__OPTSET(parse_comments)) { + s = strconv_comment(s, endch); + + if (!s) PUGI__THROW_ERROR(status_bad_comment, cursor->value); + } else { + // Scan for terminating '-->'. 
+ PUGI__SCANFOR(s[0] == '-' && s[1] == '-' && PUGI__ENDSWITH(s[2], '>')); + PUGI__CHECK_ERROR(status_bad_comment, s); + + if (PUGI__OPTSET(parse_comments)) + *s = 0; // Zero-terminate this segment at the first terminating '-'. + + s += (s[2] == '>' ? 3 : 2); // Step over the '\0->'. + } + } else PUGI__THROW_ERROR(status_bad_comment, s); + } else if (*s == '[') { + // 'value = s; // Save the offset. + + if (PUGI__OPTSET(parse_eol)) { + s = strconv_cdata(s, endch); + + if (!s) PUGI__THROW_ERROR(status_bad_cdata, cursor->value); + } else { + // Scan for terminating ']]>'. + PUGI__SCANFOR(s[0] == ']' && s[1] == ']' && PUGI__ENDSWITH(s[2], '>')); + PUGI__CHECK_ERROR(status_bad_cdata, s); + + *s++ = 0; // Zero-terminate this segment. + } + } else { // Flagged for discard, but we still have to scan for the terminator. + // Scan for terminating ']]>'. + PUGI__SCANFOR(s[0] == ']' && s[1] == ']' && PUGI__ENDSWITH(s[2], '>')); + PUGI__CHECK_ERROR(status_bad_cdata, s); + + ++s; + } + + s += (s[1] == '>' ? 2 : 1); // Step over the last ']>'. 
+ } else PUGI__THROW_ERROR(status_bad_cdata, s); + } else if (s[0] == 'D' && s[1] == 'O' && s[2] == 'C' && s[3] == 'T' && s[4] == 'Y' && s[5] == 'P' && PUGI__ENDSWITH(s[6], 'E')) { + s -= 2; + + if (cursor->parent) PUGI__THROW_ERROR(status_bad_doctype, s); + + char_t* mark = s + 9; + + s = parse_doctype_group(s, endch); + if (!s) return s; + + assert((*s == 0 && endch == '>') || *s == '>'); + if (*s) *s++ = 0; + + if (PUGI__OPTSET(parse_doctype)) { + while (PUGI__IS_CHARTYPE(*mark, ct_space)) ++mark; + + PUGI__PUSHNODE(node_doctype); + + cursor->value = mark; + } + } else if (*s == 0 && endch == '-') PUGI__THROW_ERROR(status_bad_comment, s); + else if (*s == 0 && endch == '[') PUGI__THROW_ERROR(status_bad_cdata, s); + else PUGI__THROW_ERROR(status_unrecognized_tag, s); + + return s; + } + + char_t* parse_question(char_t* s, xml_node_struct*& ref_cursor, unsigned int optmsk, char_t endch) { + // load into registers + xml_node_struct* cursor = ref_cursor; + char_t ch = 0; + + // parse node contents, starting with question mark + ++s; + + // read PI target + char_t* target = s; + + if (!PUGI__IS_CHARTYPE(*s, ct_start_symbol)) PUGI__THROW_ERROR(status_bad_pi, s); + + PUGI__SCANWHILE(PUGI__IS_CHARTYPE(*s, ct_symbol)); + PUGI__CHECK_ERROR(status_bad_pi, s); + + // determine node type; stricmp / strcasecmp is not portable + bool declaration = (target[0] | ' ') == 'x' && (target[1] | ' ') == 'm' && (target[2] | ' ') == 'l' && target + 3 == s; + + if (declaration ? 
PUGI__OPTSET(parse_declaration) : PUGI__OPTSET(parse_pi)) { + if (declaration) { + // disallow non top-level declarations + if (cursor->parent) PUGI__THROW_ERROR(status_bad_pi, s); + + PUGI__PUSHNODE(node_declaration); + } else { + PUGI__PUSHNODE(node_pi); + } + + cursor->name = target; + + PUGI__ENDSEG(); + + // parse value/attributes + if (ch == '?') { + // empty node + if (!PUGI__ENDSWITH(*s, '>')) PUGI__THROW_ERROR(status_bad_pi, s); + s += (*s == '>'); + + PUGI__POPNODE(); + } else if (PUGI__IS_CHARTYPE(ch, ct_space)) { + PUGI__SKIPWS(); + + // scan for tag end + char_t* value = s; + + PUGI__SCANFOR(s[0] == '?' && PUGI__ENDSWITH(s[1], '>')); + PUGI__CHECK_ERROR(status_bad_pi, s); + + if (declaration) { + // replace ending ? with / so that 'element' terminates properly + *s = '/'; + + // we exit from this function with cursor at node_declaration, which is a signal to parse() to go to LOC_ATTRIBUTES + s = value; + } else { + // store value and step over > + cursor->value = value; + + PUGI__POPNODE(); + + PUGI__ENDSEG(); + + s += (*s == '>'); + } + } else PUGI__THROW_ERROR(status_bad_pi, s); + } else { + // scan for tag end + PUGI__SCANFOR(s[0] == '?' && PUGI__ENDSWITH(s[1], '>')); + PUGI__CHECK_ERROR(status_bad_pi, s); + + s += (s[1] == '>' ? 2 : 1); + } + + // store from registers + ref_cursor = cursor; + + return s; + } + + char_t* parse_tree(char_t* s, xml_node_struct* root, unsigned int optmsk, char_t endch) { + strconv_attribute_t strconv_attribute = get_strconv_attribute(optmsk); + strconv_pcdata_t strconv_pcdata = get_strconv_pcdata(optmsk); + + char_t ch = 0; + xml_node_struct* cursor = root; + char_t* mark = s; + + while (*s != 0) { + if (*s == '<') { + ++s; + +LOC_TAG: + if (PUGI__IS_CHARTYPE(*s, ct_start_symbol)) { // '<#...' + PUGI__PUSHNODE(node_element); // Append a new node to the tree. + + cursor->name = s; + + PUGI__SCANWHILE_UNROLL(PUGI__IS_CHARTYPE(ss, ct_symbol)); // Scan for a terminator. 
+ PUGI__ENDSEG(); // Save char in 'ch', terminate & step over. + + if (ch == '>') { + // end of tag + } else if (PUGI__IS_CHARTYPE(ch, ct_space)) { +LOC_ATTRIBUTES: + while (true) { + PUGI__SKIPWS(); // Eat any whitespace. + + if (PUGI__IS_CHARTYPE(*s, ct_start_symbol)) { // <... #... + xml_attribute_struct* a = append_new_attribute(cursor, alloc); // Make space for this attribute. + if (!a) PUGI__THROW_ERROR(status_out_of_memory, s); + + a->name = s; // Save the offset. + + PUGI__SCANWHILE_UNROLL(PUGI__IS_CHARTYPE(ss, ct_symbol)); // Scan for a terminator. + PUGI__ENDSEG(); // Save char in 'ch', terminate & step over. + + if (PUGI__IS_CHARTYPE(ch, ct_space)) { + PUGI__SKIPWS(); // Eat any whitespace. + + ch = *s; + ++s; + } + + if (ch == '=') { // '<... #=...' + PUGI__SKIPWS(); // Eat any whitespace. + + if (*s == '"' || *s == '\'') { // '<... #="...' + ch = *s; // Save quote char to avoid breaking on "''" -or- '""'. + ++s; // Step over the quote. + a->value = s; // Save the offset. + + s = strconv_attribute(s, ch); + + if (!s) PUGI__THROW_ERROR(status_bad_attribute, a->value); + + // After this line the loop continues from the start; + // Whitespaces, / and > are ok, symbols and EOF are wrong, + // everything else will be detected + if (PUGI__IS_CHARTYPE(*s, ct_start_symbol)) PUGI__THROW_ERROR(status_bad_attribute, s); + } else PUGI__THROW_ERROR(status_bad_attribute, s); + } else PUGI__THROW_ERROR(status_bad_attribute, s); + } else if (*s == '/') { + ++s; + + if (*s == '>') { + PUGI__POPNODE(); + s++; + break; + } else if (*s == 0 && endch == '>') { + PUGI__POPNODE(); + break; + } else PUGI__THROW_ERROR(status_bad_start_element, s); + } else if (*s == '>') { + ++s; + + break; + } else if (*s == 0 && endch == '>') { + break; + } else PUGI__THROW_ERROR(status_bad_start_element, s); + } + + // !!! + } else if (ch == '/') { // '<#.../' + if (!PUGI__ENDSWITH(*s, '>')) PUGI__THROW_ERROR(status_bad_start_element, s); + + PUGI__POPNODE(); // Pop. 
+ + s += (*s == '>'); + } else if (ch == 0) { + // we stepped over null terminator, backtrack & handle closing tag + --s; + + if (endch != '>') PUGI__THROW_ERROR(status_bad_start_element, s); + } else PUGI__THROW_ERROR(status_bad_start_element, s); + } else if (*s == '/') { + ++s; + + char_t* name = cursor->name; + if (!name) PUGI__THROW_ERROR(status_end_element_mismatch, s); + + while (PUGI__IS_CHARTYPE(*s, ct_symbol)) { + if (*s++ != *name++) PUGI__THROW_ERROR(status_end_element_mismatch, s); + } + + if (*name) { + if (*s == 0 && name[0] == endch && name[1] == 0) PUGI__THROW_ERROR(status_bad_end_element, s); + else PUGI__THROW_ERROR(status_end_element_mismatch, s); + } + + PUGI__POPNODE(); // Pop. + + PUGI__SKIPWS(); + + if (*s == 0) { + if (endch != '>') PUGI__THROW_ERROR(status_bad_end_element, s); + } else { + if (*s != '>') PUGI__THROW_ERROR(status_bad_end_element, s); + ++s; + } + } else if (*s == '?') { // 'first_child) continue; + } + } + + if (!PUGI__OPTSET(parse_trim_pcdata)) + s = mark; + + if (cursor->parent || PUGI__OPTSET(parse_fragment)) { + PUGI__PUSHNODE(node_pcdata); // Append a new node on the tree. + cursor->value = s; // Save the offset. + + s = strconv_pcdata(s); + + PUGI__POPNODE(); // Pop since this is a standalone. + + if (!*s) break; + } else { + PUGI__SCANFOR(*s == '<'); // '...<' + if (!*s) break; + + ++s; + } + + // We're after '<' + goto LOC_TAG; + } + } + + // check that last tag is closed + if (cursor != root) PUGI__THROW_ERROR(status_end_element_mismatch, s); + + return s; + } + +#ifdef PUGIXML_WCHAR_MODE + static char_t* parse_skip_bom(char_t* s) { + unsigned int bom = 0xfeff; + return (s[0] == static_cast(bom)) ? s + 1 : s; + } +#else + static char_t* parse_skip_bom(char_t* s) { + return (s[0] == '\xef' && s[1] == '\xbb' && s[2] == '\xbf') ? 
s + 3 : s; + } +#endif + + static bool has_element_node_siblings(xml_node_struct* node) { + while (node) { + if (PUGI__NODETYPE(node) == node_element) return true; + + node = node->next_sibling; + } + + return false; + } + + static xml_parse_result parse(char_t* buffer, size_t length, xml_document_struct* xmldoc, xml_node_struct* root, unsigned int optmsk) { + // early-out for empty documents + if (length == 0) + return make_parse_result(PUGI__OPTSET(parse_fragment) ? status_ok : status_no_document_element); + + // get last child of the root before parsing + xml_node_struct* last_root_child = root->first_child ? root->first_child->prev_sibling_c + 0 : 0; + + // create parser on stack + xml_parser parser(static_cast(xmldoc)); + + // save last character and make buffer zero-terminated (speeds up parsing) + char_t endch = buffer[length - 1]; + buffer[length - 1] = 0; + + // skip BOM to make sure it does not end up as part of parse output + char_t* buffer_data = parse_skip_bom(buffer); + + // perform actual parsing + parser.parse_tree(buffer_data, root, optmsk, endch); + + xml_parse_result result = make_parse_result(parser.error_status, parser.error_offset ? parser.error_offset - buffer : 0); + assert(result.offset >= 0 && static_cast(result.offset) <= length); + + if (result) { + // since we removed last character, we have to handle the only possible false positive (stray <) + if (endch == '<') + return make_parse_result(status_unrecognized_tag, length - 1); + + // check if there are any element nodes parsed + xml_node_struct* first_root_child_parsed = last_root_child ? 
last_root_child->next_sibling + 0 : root->first_child+ 0; + + if (!PUGI__OPTSET(parse_fragment) && !has_element_node_siblings(first_root_child_parsed)) + return make_parse_result(status_no_document_element, length - 1); + } else { + // roll back offset if it occurs on a null terminator in the source buffer + if (result.offset > 0 && static_cast(result.offset) == length - 1 && endch == 0) + result.offset--; + } + + return result; + } +}; + +// Output facilities +PUGI__FN xml_encoding get_write_native_encoding() +{ +#ifdef PUGIXML_WCHAR_MODE + return get_wchar_encoding(); +#else + return encoding_utf8; +#endif +} + +PUGI__FN xml_encoding get_write_encoding(xml_encoding encoding) +{ + // replace wchar encoding with utf implementation + if (encoding == encoding_wchar) return get_wchar_encoding(); + + // replace utf16 encoding with utf16 with specific endianness + if (encoding == encoding_utf16) return is_little_endian() ? encoding_utf16_le : encoding_utf16_be; + + // replace utf32 encoding with utf32 with specific endianness + if (encoding == encoding_utf32) return is_little_endian() ? 
encoding_utf32_le : encoding_utf32_be; + + // only do autodetection if no explicit encoding is requested + if (encoding != encoding_auto) return encoding; + + // assume utf8 encoding + return encoding_utf8; +} + +template PUGI__FN size_t convert_buffer_output_generic(typename T::value_type dest, const char_t* data, size_t length, D, T) +{ + PUGI__STATIC_ASSERT(sizeof(char_t) == sizeof(typename D::type)); + + typename T::value_type end = D::process(reinterpret_cast(data), length, dest, T()); + + return static_cast(end - dest) * sizeof(*dest); +} + +template PUGI__FN size_t convert_buffer_output_generic(typename T::value_type dest, const char_t* data, size_t length, D, T, bool opt_swap) +{ + PUGI__STATIC_ASSERT(sizeof(char_t) == sizeof(typename D::type)); + + typename T::value_type end = D::process(reinterpret_cast(data), length, dest, T()); + + if (opt_swap) { + for (typename T::value_type i = dest; i != end; ++i) + *i = endian_swap(*i); + } + + return static_cast(end - dest) * sizeof(*dest); +} + +#ifdef PUGIXML_WCHAR_MODE +PUGI__FN size_t get_valid_length(const char_t* data, size_t length) +{ + if (length < 1) return 0; + + // discard last character if it's the lead of a surrogate pair + return (sizeof(wchar_t) == 2 && static_cast(static_cast(data[length - 1]) - 0xD800) < 0x400) ? length - 1 : length; +} + +PUGI__FN size_t convert_buffer_output(char_t* r_char, uint8_t* r_u8, uint16_t* r_u16, uint32_t* r_u32, const char_t* data, size_t length, xml_encoding encoding) +{ + // only endian-swapping is required + if (need_endian_swap_utf(encoding, get_wchar_encoding())) { + convert_wchar_endian_swap(r_char, data, length); + + return length * sizeof(char_t); + } + + // convert to utf8 + if (encoding == encoding_utf8) + return convert_buffer_output_generic(r_u8, data, length, wchar_decoder(), utf8_writer()); + + // convert to utf16 + if (encoding == encoding_utf16_be || encoding == encoding_utf16_le) { + xml_encoding native_encoding = is_little_endian() ? 
encoding_utf16_le : encoding_utf16_be; + + return convert_buffer_output_generic(r_u16, data, length, wchar_decoder(), utf16_writer(), native_encoding != encoding); + } + + // convert to utf32 + if (encoding == encoding_utf32_be || encoding == encoding_utf32_le) { + xml_encoding native_encoding = is_little_endian() ? encoding_utf32_le : encoding_utf32_be; + + return convert_buffer_output_generic(r_u32, data, length, wchar_decoder(), utf32_writer(), native_encoding != encoding); + } + + // convert to latin1 + if (encoding == encoding_latin1) + return convert_buffer_output_generic(r_u8, data, length, wchar_decoder(), latin1_writer()); + + assert(!"Invalid encoding"); + return 0; +} +#else +PUGI__FN size_t get_valid_length(const char_t* data, size_t length) +{ + if (length < 5) return 0; + + for (size_t i = 1; i <= 4; ++i) { + uint8_t ch = static_cast(data[length - i]); + + // either a standalone character or a leading one + if ((ch & 0xc0) != 0x80) return length - i; + } + + // there are four non-leading characters at the end, sequence tail is broken so might as well process the whole chunk + return length; +} + +PUGI__FN size_t convert_buffer_output(char_t* /* r_char */, uint8_t* r_u8, uint16_t* r_u16, uint32_t* r_u32, const char_t* data, size_t length, xml_encoding encoding) +{ + if (encoding == encoding_utf16_be || encoding == encoding_utf16_le) { + xml_encoding native_encoding = is_little_endian() ? encoding_utf16_le : encoding_utf16_be; + + return convert_buffer_output_generic(r_u16, data, length, utf8_decoder(), utf16_writer(), native_encoding != encoding); + } + + if (encoding == encoding_utf32_be || encoding == encoding_utf32_le) { + xml_encoding native_encoding = is_little_endian() ? 
encoding_utf32_le : encoding_utf32_be; + + return convert_buffer_output_generic(r_u32, data, length, utf8_decoder(), utf32_writer(), native_encoding != encoding); + } + + if (encoding == encoding_latin1) + return convert_buffer_output_generic(r_u8, data, length, utf8_decoder(), latin1_writer()); + + assert(!"Invalid encoding"); + return 0; +} +#endif + +class xml_buffered_writer +{ + xml_buffered_writer(const xml_buffered_writer&); + xml_buffered_writer& operator=(const xml_buffered_writer&); + +public: + xml_buffered_writer(xml_writer& writer_, xml_encoding user_encoding): writer(writer_), bufsize(0), encoding(get_write_encoding(user_encoding)) { + PUGI__STATIC_ASSERT(bufcapacity >= 8); + } + + size_t flush() { + flush(buffer, bufsize); + bufsize = 0; + return 0; + } + + void flush(const char_t* data, size_t size) { + if (size == 0) return; + + // fast path, just write data + if (encoding == get_write_native_encoding()) + writer.write(data, size * sizeof(char_t)); + else { + // convert chunk + size_t result = convert_buffer_output(scratch.data_char, scratch.data_u8, scratch.data_u16, scratch.data_u32, data, size, encoding); + assert(result <= sizeof(scratch)); + + // write data + writer.write(scratch.data_u8, result); + } + } + + void write_direct(const char_t* data, size_t length) { + // flush the remaining buffer contents + flush(); + + // handle large chunks + if (length > bufcapacity) { + if (encoding == get_write_native_encoding()) { + // fast path, can just write data chunk + writer.write(data, length * sizeof(char_t)); + return; + } + + // need to convert in suitable chunks + while (length > bufcapacity) { + // get chunk size by selecting such number of characters that are guaranteed to fit into scratch buffer + // and form a complete codepoint sequence (i.e. 
discard start of last codepoint if necessary) + size_t chunk_size = get_valid_length(data, bufcapacity); + assert(chunk_size); + + // convert chunk and write + flush(data, chunk_size); + + // iterate + data += chunk_size; + length -= chunk_size; + } + + // small tail is copied below + bufsize = 0; + } + + memcpy(buffer + bufsize, data, length * sizeof(char_t)); + bufsize += length; + } + + void write_buffer(const char_t* data, size_t length) { + size_t offset = bufsize; + + if (offset + length <= bufcapacity) { + memcpy(buffer + offset, data, length * sizeof(char_t)); + bufsize = offset + length; + } else { + write_direct(data, length); + } + } + + void write_string(const char_t* data) { + // write the part of the string that fits in the buffer + size_t offset = bufsize; + + while (*data && offset < bufcapacity) + buffer[offset++] = *data++; + + // write the rest + if (offset < bufcapacity) { + bufsize = offset; + } else { + // backtrack a bit if we have split the codepoint + size_t length = offset - bufsize; + size_t extra = length - get_valid_length(data - length, length); + + bufsize = offset - extra; + + write_direct(data - extra, strlength(data) + extra); + } + } + + void write(char_t d0) { + size_t offset = bufsize; + if (offset > bufcapacity - 1) offset = flush(); + + buffer[offset + 0] = d0; + bufsize = offset + 1; + } + + void write(char_t d0, char_t d1) { + size_t offset = bufsize; + if (offset > bufcapacity - 2) offset = flush(); + + buffer[offset + 0] = d0; + buffer[offset + 1] = d1; + bufsize = offset + 2; + } + + void write(char_t d0, char_t d1, char_t d2) { + size_t offset = bufsize; + if (offset > bufcapacity - 3) offset = flush(); + + buffer[offset + 0] = d0; + buffer[offset + 1] = d1; + buffer[offset + 2] = d2; + bufsize = offset + 3; + } + + void write(char_t d0, char_t d1, char_t d2, char_t d3) { + size_t offset = bufsize; + if (offset > bufcapacity - 4) offset = flush(); + + buffer[offset + 0] = d0; + buffer[offset + 1] = d1; + buffer[offset + 
2] = d2; + buffer[offset + 3] = d3; + bufsize = offset + 4; + } + + void write(char_t d0, char_t d1, char_t d2, char_t d3, char_t d4) { + size_t offset = bufsize; + if (offset > bufcapacity - 5) offset = flush(); + + buffer[offset + 0] = d0; + buffer[offset + 1] = d1; + buffer[offset + 2] = d2; + buffer[offset + 3] = d3; + buffer[offset + 4] = d4; + bufsize = offset + 5; + } + + void write(char_t d0, char_t d1, char_t d2, char_t d3, char_t d4, char_t d5) { + size_t offset = bufsize; + if (offset > bufcapacity - 6) offset = flush(); + + buffer[offset + 0] = d0; + buffer[offset + 1] = d1; + buffer[offset + 2] = d2; + buffer[offset + 3] = d3; + buffer[offset + 4] = d4; + buffer[offset + 5] = d5; + bufsize = offset + 6; + } + + // utf8 maximum expansion: x4 (-> utf32) + // utf16 maximum expansion: x2 (-> utf32) + // utf32 maximum expansion: x1 + enum { + bufcapacitybytes = +#ifdef PUGIXML_MEMORY_OUTPUT_STACK + PUGIXML_MEMORY_OUTPUT_STACK +#else + 10240 +#endif + , + bufcapacity = bufcapacitybytes / (sizeof(char_t) + 4) + }; + + char_t buffer[bufcapacity]; + + union { + uint8_t data_u8[4 * bufcapacity]; + uint16_t data_u16[2 * bufcapacity]; + uint32_t data_u32[bufcapacity]; + char_t data_char[bufcapacity]; + } scratch; + + xml_writer& writer; + size_t bufsize; + xml_encoding encoding; +}; + +PUGI__FN void text_output_escaped(xml_buffered_writer& writer, const char_t* s, chartypex_t type) +{ + while (*s) { + const char_t* prev = s; + + // While *s is a usual symbol + PUGI__SCANWHILE_UNROLL(!PUGI__IS_CHARTYPEX(ss, type)); + + writer.write_buffer(prev, static_cast(s - prev)); + + switch (*s) { + case 0: + break; + case '&': + writer.write('&', 'a', 'm', 'p', ';'); + ++s; + break; + case '<': + writer.write('&', 'l', 't', ';'); + ++s; + break; + case '>': + writer.write('&', 'g', 't', ';'); + ++s; + break; + case '"': + writer.write('&', 'q', 'u', 'o', 't', ';'); + ++s; + break; + default: { // s is not a usual symbol + unsigned int ch = static_cast(*s++); + assert(ch < 
32); + + writer.write('&', '#', static_cast((ch / 10) + '0'), static_cast((ch % 10) + '0'), ';'); + } + } + } +} + +PUGI__FN void text_output(xml_buffered_writer& writer, const char_t* s, chartypex_t type, unsigned int flags) +{ + if (flags & format_no_escapes) + writer.write_string(s); + else + text_output_escaped(writer, s, type); +} + +PUGI__FN void text_output_cdata(xml_buffered_writer& writer, const char_t* s) +{ + do { + writer.write('<', '!', '[', 'C', 'D'); + writer.write('A', 'T', 'A', '['); + + const char_t* prev = s; + + // look for ]]> sequence - we can't output it as is since it terminates CDATA + while (*s && !(s[0] == ']' && s[1] == ']' && s[2] == '>')) ++s; + + // skip ]] if we stopped at ]]>, > will go to the next CDATA section + if (*s) s += 2; + + writer.write_buffer(prev, static_cast(s - prev)); + + writer.write(']', ']', '>'); + } while (*s); +} + +PUGI__FN void text_output_indent(xml_buffered_writer& writer, const char_t* indent, size_t indent_length, unsigned int depth) +{ + switch (indent_length) { + case 1: { + for (unsigned int i = 0; i < depth; ++i) + writer.write(indent[0]); + break; + } + + case 2: { + for (unsigned int i = 0; i < depth; ++i) + writer.write(indent[0], indent[1]); + break; + } + + case 3: { + for (unsigned int i = 0; i < depth; ++i) + writer.write(indent[0], indent[1], indent[2]); + break; + } + + case 4: { + for (unsigned int i = 0; i < depth; ++i) + writer.write(indent[0], indent[1], indent[2], indent[3]); + break; + } + + default: { + for (unsigned int i = 0; i < depth; ++i) + writer.write_buffer(indent, indent_length); + } + } +} + +PUGI__FN void node_output_comment(xml_buffered_writer& writer, const char_t* s) +{ + writer.write('<', '!', '-', '-'); + + while (*s) { + const char_t* prev = s; + + // look for -\0 or -- sequence - we can't output it since -- is illegal in comment body + while (*s && !(s[0] == '-' && (s[1] == '-' || s[1] == 0))) ++s; + + writer.write_buffer(prev, static_cast(s - prev)); + + if (*s) { + 
assert(*s == '-'); + + writer.write('-', ' '); + ++s; + } + } + + writer.write('-', '-', '>'); +} + +PUGI__FN void node_output_pi_value(xml_buffered_writer& writer, const char_t* s) +{ + while (*s) { + const char_t* prev = s; + + // look for ?> sequence - we can't output it since ?> terminates PI + while (*s && !(s[0] == '?' && s[1] == '>')) ++s; + + writer.write_buffer(prev, static_cast(s - prev)); + + if (*s) { + assert(s[0] == '?' && s[1] == '>'); + + writer.write('?', ' ', '>'); + s += 2; + } + } +} + +PUGI__FN void node_output_attributes(xml_buffered_writer& writer, xml_node_struct* node, const char_t* indent, size_t indent_length, unsigned int flags, unsigned int depth) +{ + const char_t* default_name = PUGIXML_TEXT(":anonymous"); + + for (xml_attribute_struct* a = node->first_attribute; a; a = a->next_attribute) { + if ((flags & (format_indent_attributes | format_raw)) == format_indent_attributes) { + writer.write('\n'); + + text_output_indent(writer, indent, indent_length, depth + 1); + } else { + writer.write(' '); + } + + writer.write_string(a->name ? a->name + 0 : default_name); + writer.write('=', '"'); + + if (a->value) + text_output(writer, a->value, ctx_special_attr, flags); + + writer.write('"'); + } +} + +PUGI__FN bool node_output_start(xml_buffered_writer& writer, xml_node_struct* node, const char_t* indent, size_t indent_length, unsigned int flags, unsigned int depth) +{ + const char_t* default_name = PUGIXML_TEXT(":anonymous"); + const char_t* name = node->name ? 
node->name + 0 : default_name; + + writer.write('<'); + writer.write_string(name); + + if (node->first_attribute) + node_output_attributes(writer, node, indent, indent_length, flags, depth); + + if (!node->first_child) { + writer.write(' ', '/', '>'); + + return false; + } else { + writer.write('>'); + + return true; + } +} + +PUGI__FN void node_output_end(xml_buffered_writer& writer, xml_node_struct* node) +{ + const char_t* default_name = PUGIXML_TEXT(":anonymous"); + const char_t* name = node->name ? node->name + 0 : default_name; + + writer.write('<', '/'); + writer.write_string(name); + writer.write('>'); +} + +PUGI__FN void node_output_simple(xml_buffered_writer& writer, xml_node_struct* node, unsigned int flags) +{ + const char_t* default_name = PUGIXML_TEXT(":anonymous"); + + switch (PUGI__NODETYPE(node)) { + case node_pcdata: + text_output(writer, node->value ? node->value + 0 : PUGIXML_TEXT(""), ctx_special_pcdata, flags); + break; + + case node_cdata: + text_output_cdata(writer, node->value ? node->value + 0 : PUGIXML_TEXT("")); + break; + + case node_comment: + node_output_comment(writer, node->value ? node->value + 0 : PUGIXML_TEXT("")); + break; + + case node_pi: + writer.write('<', '?'); + writer.write_string(node->name ? node->name + 0 : default_name); + + if (node->value) { + writer.write(' '); + node_output_pi_value(writer, node->value); + } + + writer.write('?', '>'); + break; + + case node_declaration: + writer.write('<', '?'); + writer.write_string(node->name ? 
node->name + 0 : default_name); + node_output_attributes(writer, node, PUGIXML_TEXT(""), 0, flags | format_raw, 0); + writer.write('?', '>'); + break; + + case node_doctype: + writer.write('<', '!', 'D', 'O', 'C'); + writer.write('T', 'Y', 'P', 'E'); + + if (node->value) { + writer.write(' '); + writer.write_string(node->value); + } + + writer.write('>'); + break; + + default: + assert(!"Invalid node type"); + } +} + +enum indent_flags_t { + indent_newline = 1, + indent_indent = 2 +}; + +PUGI__FN void node_output(xml_buffered_writer& writer, xml_node_struct* root, const char_t* indent, unsigned int flags, unsigned int depth) +{ + size_t indent_length = ((flags & (format_indent | format_indent_attributes)) && (flags & format_raw) == 0) ? strlength(indent) : 0; + unsigned int indent_flags = indent_indent; + + xml_node_struct* node = root; + + do { + assert(node); + + // begin writing current node + if (PUGI__NODETYPE(node) == node_pcdata || PUGI__NODETYPE(node) == node_cdata) { + node_output_simple(writer, node, flags); + + indent_flags = 0; + } else { + if ((indent_flags & indent_newline) && (flags & format_raw) == 0) + writer.write('\n'); + + if ((indent_flags & indent_indent) && indent_length) + text_output_indent(writer, indent, indent_length, depth); + + if (PUGI__NODETYPE(node) == node_element) { + indent_flags = indent_newline | indent_indent; + + if (node_output_start(writer, node, indent, indent_length, flags, depth)) { + node = node->first_child; + depth++; + continue; + } + } else if (PUGI__NODETYPE(node) == node_document) { + indent_flags = indent_indent; + + if (node->first_child) { + node = node->first_child; + continue; + } + } else { + node_output_simple(writer, node, flags); + + indent_flags = indent_newline | indent_indent; + } + } + + // continue to the next node + while (node != root) { + if (node->next_sibling) { + node = node->next_sibling; + break; + } + + node = node->parent; + + // write closing node + if (PUGI__NODETYPE(node) == 
node_element) { + depth--; + + if ((indent_flags & indent_newline) && (flags & format_raw) == 0) + writer.write('\n'); + + if ((indent_flags & indent_indent) && indent_length) + text_output_indent(writer, indent, indent_length, depth); + + node_output_end(writer, node); + + indent_flags = indent_newline | indent_indent; + } + } + } while (node != root); + + if ((indent_flags & indent_newline) && (flags & format_raw) == 0) + writer.write('\n'); +} + +PUGI__FN bool has_declaration(xml_node_struct* node) +{ + for (xml_node_struct* child = node->first_child; child; child = child->next_sibling) { + xml_node_type type = PUGI__NODETYPE(child); + + if (type == node_declaration) return true; + if (type == node_element) return false; + } + + return false; +} + +PUGI__FN bool is_attribute_of(xml_attribute_struct* attr, xml_node_struct* node) +{ + for (xml_attribute_struct* a = node->first_attribute; a; a = a->next_attribute) + if (a == attr) + return true; + + return false; +} + +PUGI__FN bool allow_insert_attribute(xml_node_type parent) +{ + return parent == node_element || parent == node_declaration; +} + +PUGI__FN bool allow_insert_child(xml_node_type parent, xml_node_type child) +{ + if (parent != node_document && parent != node_element) return false; + if (child == node_document || child == node_null) return false; + if (parent != node_document && (child == node_declaration || child == node_doctype)) return false; + + return true; +} + +PUGI__FN bool allow_move(xml_node parent, xml_node child) +{ + // check that child can be a child of parent + if (!allow_insert_child(parent.type(), child.type())) + return false; + + // check that node is not moved between documents + if (parent.root() != child.root()) + return false; + + // check that new parent is not in the child subtree + xml_node cur = parent; + + while (cur) { + if (cur == child) + return false; + + cur = cur.parent(); + } + + return true; +} + +template +PUGI__FN void node_copy_string(String& dest, Header& header, 
uintptr_t header_mask, char_t* source, Header& source_header, xml_allocator* alloc) +{ + assert(!dest && (header & header_mask) == 0); + + if (source) { + if (alloc && (source_header & header_mask) == 0) { + dest = source; + + // since strcpy_insitu can reuse document buffer memory we need to mark both source and dest as shared + header |= xml_memory_page_contents_shared_mask; + source_header |= xml_memory_page_contents_shared_mask; + } else + strcpy_insitu(dest, header, header_mask, source, strlength(source)); + } +} + +PUGI__FN void node_copy_contents(xml_node_struct* dn, xml_node_struct* sn, xml_allocator* shared_alloc) +{ + node_copy_string(dn->name, dn->header, xml_memory_page_name_allocated_mask, sn->name, sn->header, shared_alloc); + node_copy_string(dn->value, dn->header, xml_memory_page_value_allocated_mask, sn->value, sn->header, shared_alloc); + + for (xml_attribute_struct* sa = sn->first_attribute; sa; sa = sa->next_attribute) { + xml_attribute_struct* da = append_new_attribute(dn, get_allocator(dn)); + + if (da) { + node_copy_string(da->name, da->header, xml_memory_page_name_allocated_mask, sa->name, sa->header, shared_alloc); + node_copy_string(da->value, da->header, xml_memory_page_value_allocated_mask, sa->value, sa->header, shared_alloc); + } + } +} + +PUGI__FN void node_copy_tree(xml_node_struct* dn, xml_node_struct* sn) +{ + xml_allocator& alloc = get_allocator(dn); + xml_allocator* shared_alloc = (&alloc == &get_allocator(sn)) ? 
&alloc : 0; + + node_copy_contents(dn, sn, shared_alloc); + + xml_node_struct* dit = dn; + xml_node_struct* sit = sn->first_child; + + while (sit && sit != sn) { + if (sit != dn) { + xml_node_struct* copy = append_new_node(dit, alloc, PUGI__NODETYPE(sit)); + + if (copy) { + node_copy_contents(copy, sit, shared_alloc); + + if (sit->first_child) { + dit = copy; + sit = sit->first_child; + continue; + } + } + } + + // continue to the next node + do { + if (sit->next_sibling) { + sit = sit->next_sibling; + break; + } + + sit = sit->parent; + dit = dit->parent; + } while (sit != sn); + } +} + +PUGI__FN void node_copy_attribute(xml_attribute_struct* da, xml_attribute_struct* sa) +{ + xml_allocator& alloc = get_allocator(da); + xml_allocator* shared_alloc = (&alloc == &get_allocator(sa)) ? &alloc : 0; + + node_copy_string(da->name, da->header, xml_memory_page_name_allocated_mask, sa->name, sa->header, shared_alloc); + node_copy_string(da->value, da->header, xml_memory_page_value_allocated_mask, sa->value, sa->header, shared_alloc); +} + +inline bool is_text_node(xml_node_struct* node) +{ + xml_node_type type = PUGI__NODETYPE(node); + + return type == node_pcdata || type == node_cdata; +} + +// get value with conversion functions +template U string_to_integer(const char_t* value, U minneg, U maxpos) +{ + U result = 0; + const char_t* s = value; + + while (PUGI__IS_CHARTYPE(*s, ct_space)) + s++; + + bool negative = (*s == '-'); + + s += (*s == '+' || *s == '-'); + + bool overflow = false; + + if (s[0] == '0' && (s[1] | ' ') == 'x') { + s += 2; + + // since overflow detection relies on length of the sequence skip leading zeros + while (*s == '0') + s++; + + const char_t* start = s; + + for (;;) { + if (static_cast(*s - '0') < 10) + result = result * 16 + (*s - '0'); + else if (static_cast((*s | ' ') - 'a') < 6) + result = result * 16 + ((*s | ' ') - 'a' + 10); + else + break; + + s++; + } + + size_t digits = static_cast(s - start); + + overflow = digits > sizeof(U) * 2; + } 
else { + // since overflow detection relies on length of the sequence skip leading zeros + while (*s == '0') + s++; + + const char_t* start = s; + + for (;;) { + if (static_cast(*s - '0') < 10) + result = result * 10 + (*s - '0'); + else + break; + + s++; + } + + size_t digits = static_cast(s - start); + + PUGI__STATIC_ASSERT(sizeof(U) == 8 || sizeof(U) == 4 || sizeof(U) == 2); + + const size_t max_digits10 = sizeof(U) == 8 ? 20 : sizeof(U) == 4 ? 10 : 5; + const char_t max_lead = sizeof(U) == 8 ? '1' : sizeof(U) == 4 ? '4' : '6'; + const size_t high_bit = sizeof(U) * 8 - 1; + + overflow = digits >= max_digits10 && !(digits == max_digits10 && (*start < max_lead || (*start == max_lead && result >> high_bit))); + } + + if (negative) + return (overflow || result > minneg) ? 0 - minneg : 0 - result; + else + return (overflow || result > maxpos) ? maxpos : result; +} + +PUGI__FN int get_value_int(const char_t* value) +{ + return string_to_integer(value, 0 - static_cast(INT_MIN), INT_MAX); +} + +PUGI__FN unsigned int get_value_uint(const char_t* value) +{ + return string_to_integer(value, 0, UINT_MAX); +} + +PUGI__FN double get_value_double(const char_t* value) +{ +#ifdef PUGIXML_WCHAR_MODE + return wcstod(value, 0); +#else + return strtod(value, 0); +#endif +} + +PUGI__FN float get_value_float(const char_t* value) +{ +#ifdef PUGIXML_WCHAR_MODE + return static_cast(wcstod(value, 0)); +#else + return static_cast(strtod(value, 0)); +#endif +} + +PUGI__FN bool get_value_bool(const char_t* value) +{ + // only look at first char + char_t first = *value; + + // 1*, t* (true), T* (True), y* (yes), Y* (YES) + return (first == '1' || first == 't' || first == 'T' || first == 'y' || first == 'Y'); +} + +#ifdef PUGIXML_HAS_LONG_LONG +PUGI__FN long long get_value_llong(const char_t* value) +{ + return string_to_integer(value, 0 - static_cast(LLONG_MIN), LLONG_MAX); +} + +PUGI__FN unsigned long long get_value_ullong(const char_t* value) +{ + return string_to_integer(value, 0, 
ULLONG_MAX); +} +#endif + +template +PUGI__FN char_t* integer_to_string(char_t* begin, char_t* end, U value, bool negative) +{ + char_t* result = end - 1; + U rest = negative ? 0 - value : value; + + do { + *result-- = static_cast('0' + (rest % 10)); + rest /= 10; + } while (rest); + + assert(result >= begin); + (void)begin; + + *result = '-'; + + return result + !negative; +} + +// set value with conversion functions +template +PUGI__FN bool set_value_ascii(String& dest, Header& header, uintptr_t header_mask, char* buf) +{ +#ifdef PUGIXML_WCHAR_MODE + char_t wbuf[128]; + assert(strlen(buf) < sizeof(wbuf) / sizeof(wbuf[0])); + + size_t offset = 0; + for (; buf[offset]; ++offset) wbuf[offset] = buf[offset]; + + return strcpy_insitu(dest, header, header_mask, wbuf, offset); +#else + return strcpy_insitu(dest, header, header_mask, buf, strlen(buf)); +#endif +} + +template +PUGI__FN bool set_value_convert(String& dest, Header& header, uintptr_t header_mask, int value) +{ + char_t buf[64]; + char_t* end = buf + sizeof(buf) / sizeof(buf[0]); + char_t* begin = integer_to_string(buf, end, value, value < 0); + + return strcpy_insitu(dest, header, header_mask, begin, end - begin); +} + +template +PUGI__FN bool set_value_convert(String& dest, Header& header, uintptr_t header_mask, unsigned int value) +{ + char_t buf[64]; + char_t* end = buf + sizeof(buf) / sizeof(buf[0]); + char_t* begin = integer_to_string(buf, end, value, false); + + return strcpy_insitu(dest, header, header_mask, begin, end - begin); +} + +template +PUGI__FN bool set_value_convert(String& dest, Header& header, uintptr_t header_mask, float value) +{ + char buf[128]; + sprintf(buf, "%.9g", value); + + return set_value_ascii(dest, header, header_mask, buf); +} + +template +PUGI__FN bool set_value_convert(String& dest, Header& header, uintptr_t header_mask, double value) +{ + char buf[128]; + sprintf(buf, "%.17g", value); + + return set_value_ascii(dest, header, header_mask, buf); +} + +template +PUGI__FN bool 
set_value_convert(String& dest, Header& header, uintptr_t header_mask, bool value) +{ + return strcpy_insitu(dest, header, header_mask, value ? PUGIXML_TEXT("true") : PUGIXML_TEXT("false"), value ? 4 : 5); +} + +#ifdef PUGIXML_HAS_LONG_LONG +template +PUGI__FN bool set_value_convert(String& dest, Header& header, uintptr_t header_mask, long long value) +{ + char_t buf[64]; + char_t* end = buf + sizeof(buf) / sizeof(buf[0]); + char_t* begin = integer_to_string(buf, end, value, value < 0); + + return strcpy_insitu(dest, header, header_mask, begin, end - begin); +} + +template +PUGI__FN bool set_value_convert(String& dest, Header& header, uintptr_t header_mask, unsigned long long value) +{ + char_t buf[64]; + char_t* end = buf + sizeof(buf) / sizeof(buf[0]); + char_t* begin = integer_to_string(buf, end, value, false); + + return strcpy_insitu(dest, header, header_mask, begin, end - begin); +} +#endif + +PUGI__FN xml_parse_result load_buffer_impl(xml_document_struct* doc, xml_node_struct* root, void* contents, size_t size, unsigned int options, xml_encoding encoding, bool is_mutable, bool own, char_t** out_buffer) +{ + // check input buffer + if (!contents && size) return make_parse_result(status_io_error); + + // get actual encoding + xml_encoding buffer_encoding = impl::get_buffer_encoding(encoding, contents, size); + + // get private buffer + char_t* buffer = 0; + size_t length = 0; + + if (!impl::convert_buffer(buffer, length, buffer_encoding, contents, size, is_mutable)) return impl::make_parse_result(status_out_of_memory); + + // delete original buffer if we performed a conversion + if (own && buffer != contents && contents) impl::xml_memory::deallocate(contents); + + // grab onto buffer if it's our buffer, user is responsible for deallocating contents himself + if (own || buffer != contents) *out_buffer = buffer; + + // store buffer for offset_debug + doc->buffer = buffer; + + // parse + xml_parse_result res = impl::xml_parser::parse(buffer, length, doc, root, 
options); + + // remember encoding + res.encoding = buffer_encoding; + + return res; +} + +// we need to get length of entire file to load it in memory; the only (relatively) sane way to do it is via seek/tell trick +PUGI__FN xml_parse_status get_file_size(FILE* file, size_t& out_result) +{ +#if defined(PUGI__MSVC_CRT_VERSION) && PUGI__MSVC_CRT_VERSION >= 1400 && !defined(_WIN32_WCE) + // there are 64-bit versions of fseek/ftell, let's use them + typedef __int64 length_type; + + _fseeki64(file, 0, SEEK_END); + length_type length = _ftelli64(file); + _fseeki64(file, 0, SEEK_SET); +#elif defined(__MINGW32__) && !defined(__NO_MINGW_LFS) && (!defined(__STRICT_ANSI__) || defined(__MINGW64_VERSION_MAJOR)) + // there are 64-bit versions of fseek/ftell, let's use them + typedef off64_t length_type; + + fseeko64(file, 0, SEEK_END); + length_type length = ftello64(file); + fseeko64(file, 0, SEEK_SET); +#else + // if this is a 32-bit OS, long is enough; if this is a unix system, long is 64-bit, which is enough; otherwise we can't do anything anyway. 
+ typedef long length_type; + + fseek(file, 0, SEEK_END); + length_type length = ftell(file); + fseek(file, 0, SEEK_SET); +#endif + + // check for I/O errors + if (length < 0) return status_io_error; + + // check for overflow + size_t result = static_cast(length); + + if (static_cast(result) != length) return status_out_of_memory; + + // finalize + out_result = result; + + return status_ok; +} + +// This function assumes that buffer has extra sizeof(char_t) writable bytes after size +PUGI__FN size_t zero_terminate_buffer(void* buffer, size_t size, xml_encoding encoding) +{ + // We only need to zero-terminate if encoding conversion does not do it for us +#ifdef PUGIXML_WCHAR_MODE + xml_encoding wchar_encoding = get_wchar_encoding(); + + if (encoding == wchar_encoding || need_endian_swap_utf(encoding, wchar_encoding)) { + size_t length = size / sizeof(char_t); + + static_cast(buffer)[length] = 0; + return (length + 1) * sizeof(char_t); + } +#else + if (encoding == encoding_utf8) { + static_cast(buffer)[size] = 0; + return size + 1; + } +#endif + + return size; +} + +PUGI__FN xml_parse_result load_file_impl(xml_document_struct* doc, FILE* file, unsigned int options, xml_encoding encoding, char_t** out_buffer) +{ + if (!file) return make_parse_result(status_file_not_found); + + // get file size (can result in I/O errors) + size_t size = 0; + xml_parse_status size_status = get_file_size(file, size); + if (size_status != status_ok) return make_parse_result(size_status); + + size_t max_suffix_size = sizeof(char_t); + + // allocate buffer for the whole file + char* contents = static_cast(xml_memory::allocate(size + max_suffix_size)); + if (!contents) return make_parse_result(status_out_of_memory); + + // read file in memory + size_t read_size = fread(contents, 1, size, file); + + if (read_size != size) { + xml_memory::deallocate(contents); + return make_parse_result(status_io_error); + } + + xml_encoding real_encoding = get_buffer_encoding(encoding, contents, size); + + 
return load_buffer_impl(doc, doc, contents, zero_terminate_buffer(contents, size, real_encoding), options, real_encoding, true, true, out_buffer); +} + +#ifndef PUGIXML_NO_STL +template struct xml_stream_chunk { + static xml_stream_chunk* create() { + void* memory = xml_memory::allocate(sizeof(xml_stream_chunk)); + if (!memory) return 0; + + return new (memory) xml_stream_chunk(); + } + + static void destroy(xml_stream_chunk* chunk) { + // free chunk chain + while (chunk) { + xml_stream_chunk* next_ = chunk->next; + + xml_memory::deallocate(chunk); + + chunk = next_; + } + } + + xml_stream_chunk(): next(0), size(0) { + } + + xml_stream_chunk* next; + size_t size; + + T data[xml_memory_page_size / sizeof(T)]; +}; + +template PUGI__FN xml_parse_status load_stream_data_noseek(std::basic_istream& stream, void** out_buffer, size_t* out_size) +{ + auto_deleter > chunks(0, xml_stream_chunk::destroy); + + // read file to a chunk list + size_t total = 0; + xml_stream_chunk* last = 0; + + while (!stream.eof()) { + // allocate new chunk + xml_stream_chunk* chunk = xml_stream_chunk::create(); + if (!chunk) return status_out_of_memory; + + // append chunk to list + if (last) last = last->next = chunk; + else chunks.data = last = chunk; + + // read data to chunk + stream.read(chunk->data, static_cast(sizeof(chunk->data) / sizeof(T))); + chunk->size = static_cast(stream.gcount()) * sizeof(T); + + // read may set failbit | eofbit in case gcount() is less than read length, so check for other I/O errors + if (stream.bad() || (!stream.eof() && stream.fail())) return status_io_error; + + // guard against huge files (chunk size is small enough to make this overflow check work) + if (total + chunk->size < total) return status_out_of_memory; + total += chunk->size; + } + + size_t max_suffix_size = sizeof(char_t); + + // copy chunk list to a contiguous buffer + char* buffer = static_cast(xml_memory::allocate(total + max_suffix_size)); + if (!buffer) return status_out_of_memory; + + char* 
write = buffer; + + for (xml_stream_chunk* chunk = chunks.data; chunk; chunk = chunk->next) { + assert(write + chunk->size <= buffer + total); + memcpy(write, chunk->data, chunk->size); + write += chunk->size; + } + + assert(write == buffer + total); + + // return buffer + *out_buffer = buffer; + *out_size = total; + + return status_ok; +} + +template PUGI__FN xml_parse_status load_stream_data_seek(std::basic_istream& stream, void** out_buffer, size_t* out_size) +{ + // get length of remaining data in stream + typename std::basic_istream::pos_type pos = stream.tellg(); + stream.seekg(0, std::ios::end); + std::streamoff length = stream.tellg() - pos; + stream.seekg(pos); + + if (stream.fail() || pos < 0) return status_io_error; + + // guard against huge files + size_t read_length = static_cast(length); + + if (static_cast(read_length) != length || length < 0) return status_out_of_memory; + + size_t max_suffix_size = sizeof(char_t); + + // read stream data into memory (guard against stream exceptions with buffer holder) + auto_deleter buffer(xml_memory::allocate(read_length * sizeof(T) + max_suffix_size), xml_memory::deallocate); + if (!buffer.data) return status_out_of_memory; + + stream.read(static_cast(buffer.data), static_cast(read_length)); + + // read may set failbit | eofbit in case gcount() is less than read_length (i.e. 
line ending conversion), so check for other I/O errors + if (stream.bad() || (!stream.eof() && stream.fail())) return status_io_error; + + // return buffer + size_t actual_length = static_cast(stream.gcount()); + assert(actual_length <= read_length); + + *out_buffer = buffer.release(); + *out_size = actual_length * sizeof(T); + + return status_ok; +} + +template PUGI__FN xml_parse_result load_stream_impl(xml_document_struct* doc, std::basic_istream& stream, unsigned int options, xml_encoding encoding, char_t** out_buffer) +{ + void* buffer = 0; + size_t size = 0; + xml_parse_status status = status_ok; + + // if stream has an error bit set, bail out (otherwise tellg() can fail and we'll clear error bits) + if (stream.fail()) return make_parse_result(status_io_error); + + // load stream to memory (using seek-based implementation if possible, since it's faster and takes less memory) + if (stream.tellg() < 0) { + stream.clear(); // clear error flags that could be set by a failing tellg + status = load_stream_data_noseek(stream, &buffer, &size); + } else + status = load_stream_data_seek(stream, &buffer, &size); + + if (status != status_ok) return make_parse_result(status); + + xml_encoding real_encoding = get_buffer_encoding(encoding, buffer, size); + + return load_buffer_impl(doc, doc, buffer, zero_terminate_buffer(buffer, size, real_encoding), options, real_encoding, true, true, out_buffer); +} +#endif + +#if defined(PUGI__MSVC_CRT_VERSION) || defined(__BORLANDC__) || (defined(__MINGW32__) && (!defined(__STRICT_ANSI__) || defined(__MINGW64_VERSION_MAJOR))) +PUGI__FN FILE* open_file_wide(const wchar_t* path, const wchar_t* mode) +{ + return _wfopen(path, mode); +} +#else +PUGI__FN char* convert_path_heap(const wchar_t* str) +{ + assert(str); + + // first pass: get length in utf8 characters + size_t length = strlength_wide(str); + size_t size = as_utf8_begin(str, length); + + // allocate resulting string + char* result = static_cast(xml_memory::allocate(size + 1)); + if 
(!result) return 0; + + // second pass: convert to utf8 + as_utf8_end(result, size, str, length); + + // zero-terminate + result[size] = 0; + + return result; +} + +PUGI__FN FILE* open_file_wide(const wchar_t* path, const wchar_t* mode) +{ + // there is no standard function to open wide paths, so our best bet is to try utf8 path + char* path_utf8 = convert_path_heap(path); + if (!path_utf8) return 0; + + // convert mode to ASCII (we mirror _wfopen interface) + char mode_ascii[4] = {0}; + for (size_t i = 0; mode[i]; ++i) mode_ascii[i] = static_cast(mode[i]); + + // try to open the utf8 path + FILE* result = fopen(path_utf8, mode_ascii); + + // free dummy buffer + xml_memory::deallocate(path_utf8); + + return result; +} +#endif + +PUGI__FN bool save_file_impl(const xml_document& doc, FILE* file, const char_t* indent, unsigned int flags, xml_encoding encoding) +{ + if (!file) return false; + + xml_writer_file writer(file); + doc.save(writer, indent, flags, encoding); + + return ferror(file) == 0; +} + +struct name_null_sentry { + xml_node_struct* node; + char_t* name; + + name_null_sentry(xml_node_struct* node_): node(node_), name(node_->name) { + node->name = 0; + } + + ~name_null_sentry() { + node->name = name; + } +}; +PUGI__NS_END + +namespace pugi +{ +PUGI__FN xml_writer_file::xml_writer_file(void* file_): file(file_) +{ +} + +PUGI__FN void xml_writer_file::write(const void* data, size_t size) +{ + size_t result = fwrite(data, 1, size, static_cast(file)); + (void)!result; // unfortunately we can't do proper error handling here +} + +#ifndef PUGIXML_NO_STL +PUGI__FN xml_writer_stream::xml_writer_stream(std::basic_ostream >& stream): narrow_stream(&stream), wide_stream(0) +{ +} + +PUGI__FN xml_writer_stream::xml_writer_stream(std::basic_ostream >& stream): narrow_stream(0), wide_stream(&stream) +{ +} + +PUGI__FN void xml_writer_stream::write(const void* data, size_t size) +{ + if (narrow_stream) { + assert(!wide_stream); + 
narrow_stream->write(reinterpret_cast(data), static_cast(size)); + } else { + assert(wide_stream); + assert(size % sizeof(wchar_t) == 0); + + wide_stream->write(reinterpret_cast(data), static_cast(size / sizeof(wchar_t))); + } +} +#endif + +PUGI__FN xml_tree_walker::xml_tree_walker(): _depth(0) +{ +} + +PUGI__FN xml_tree_walker::~xml_tree_walker() +{ +} + +PUGI__FN int xml_tree_walker::depth() const +{ + return _depth; +} + +PUGI__FN bool xml_tree_walker::begin(xml_node&) +{ + return true; +} + +PUGI__FN bool xml_tree_walker::end(xml_node&) +{ + return true; +} + +PUGI__FN xml_attribute::xml_attribute(): _attr(0) +{ +} + +PUGI__FN xml_attribute::xml_attribute(xml_attribute_struct* attr): _attr(attr) +{ +} + +PUGI__FN static void unspecified_bool_xml_attribute(xml_attribute***) +{ +} + +PUGI__FN xml_attribute::operator xml_attribute::unspecified_bool_type() const +{ + return _attr ? unspecified_bool_xml_attribute : 0; +} + +PUGI__FN bool xml_attribute::operator!() const +{ + return !_attr; +} + +PUGI__FN bool xml_attribute::operator==(const xml_attribute& r) const +{ + return (_attr == r._attr); +} + +PUGI__FN bool xml_attribute::operator!=(const xml_attribute& r) const +{ + return (_attr != r._attr); +} + +PUGI__FN bool xml_attribute::operator<(const xml_attribute& r) const +{ + return (_attr < r._attr); +} + +PUGI__FN bool xml_attribute::operator>(const xml_attribute& r) const +{ + return (_attr > r._attr); +} + +PUGI__FN bool xml_attribute::operator<=(const xml_attribute& r) const +{ + return (_attr <= r._attr); +} + +PUGI__FN bool xml_attribute::operator>=(const xml_attribute& r) const +{ + return (_attr >= r._attr); +} + +PUGI__FN xml_attribute xml_attribute::next_attribute() const +{ + return _attr ? xml_attribute(_attr->next_attribute) : xml_attribute(); +} + +PUGI__FN xml_attribute xml_attribute::previous_attribute() const +{ + return _attr && _attr->prev_attribute_c->next_attribute ? 
xml_attribute(_attr->prev_attribute_c) : xml_attribute(); +} + +PUGI__FN const char_t* xml_attribute::as_string(const char_t* def) const +{ + return (_attr && _attr->value) ? _attr->value + 0 : def; +} + +PUGI__FN int xml_attribute::as_int(int def) const +{ + return (_attr && _attr->value) ? impl::get_value_int(_attr->value) : def; +} + +PUGI__FN unsigned int xml_attribute::as_uint(unsigned int def) const +{ + return (_attr && _attr->value) ? impl::get_value_uint(_attr->value) : def; +} + +PUGI__FN double xml_attribute::as_double(double def) const +{ + return (_attr && _attr->value) ? impl::get_value_double(_attr->value) : def; +} + +PUGI__FN float xml_attribute::as_float(float def) const +{ + return (_attr && _attr->value) ? impl::get_value_float(_attr->value) : def; +} + +PUGI__FN bool xml_attribute::as_bool(bool def) const +{ + return (_attr && _attr->value) ? impl::get_value_bool(_attr->value) : def; +} + +#ifdef PUGIXML_HAS_LONG_LONG +PUGI__FN long long xml_attribute::as_llong(long long def) const +{ + return (_attr && _attr->value) ? impl::get_value_llong(_attr->value) : def; +} + +PUGI__FN unsigned long long xml_attribute::as_ullong(unsigned long long def) const +{ + return (_attr && _attr->value) ? impl::get_value_ullong(_attr->value) : def; +} +#endif + +PUGI__FN bool xml_attribute::empty() const +{ + return !_attr; +} + +PUGI__FN const char_t* xml_attribute::name() const +{ + return (_attr && _attr->name) ? _attr->name + 0 : PUGIXML_TEXT(""); +} + +PUGI__FN const char_t* xml_attribute::value() const +{ + return (_attr && _attr->value) ? 
_attr->value + 0 : PUGIXML_TEXT(""); +} + +PUGI__FN size_t xml_attribute::hash_value() const +{ + return static_cast(reinterpret_cast(_attr) / sizeof(xml_attribute_struct)); +} + +PUGI__FN xml_attribute_struct* xml_attribute::internal_object() const +{ + return _attr; +} + +PUGI__FN xml_attribute& xml_attribute::operator=(const char_t* rhs) +{ + set_value(rhs); + return *this; +} + +PUGI__FN xml_attribute& xml_attribute::operator=(int rhs) +{ + set_value(rhs); + return *this; +} + +PUGI__FN xml_attribute& xml_attribute::operator=(unsigned int rhs) +{ + set_value(rhs); + return *this; +} + +PUGI__FN xml_attribute& xml_attribute::operator=(double rhs) +{ + set_value(rhs); + return *this; +} + +PUGI__FN xml_attribute& xml_attribute::operator=(float rhs) +{ + set_value(rhs); + return *this; +} + +PUGI__FN xml_attribute& xml_attribute::operator=(bool rhs) +{ + set_value(rhs); + return *this; +} + +#ifdef PUGIXML_HAS_LONG_LONG +PUGI__FN xml_attribute& xml_attribute::operator=(long long rhs) +{ + set_value(rhs); + return *this; +} + +PUGI__FN xml_attribute& xml_attribute::operator=(unsigned long long rhs) +{ + set_value(rhs); + return *this; +} +#endif + +PUGI__FN bool xml_attribute::set_name(const char_t* rhs) +{ + if (!_attr) return false; + + return impl::strcpy_insitu(_attr->name, _attr->header, impl::xml_memory_page_name_allocated_mask, rhs, impl::strlength(rhs)); +} + +PUGI__FN bool xml_attribute::set_value(const char_t* rhs) +{ + if (!_attr) return false; + + return impl::strcpy_insitu(_attr->value, _attr->header, impl::xml_memory_page_value_allocated_mask, rhs, impl::strlength(rhs)); +} + +PUGI__FN bool xml_attribute::set_value(int rhs) +{ + if (!_attr) return false; + + return impl::set_value_convert(_attr->value, _attr->header, impl::xml_memory_page_value_allocated_mask, rhs); +} + +PUGI__FN bool xml_attribute::set_value(unsigned int rhs) +{ + if (!_attr) return false; + + return impl::set_value_convert(_attr->value, _attr->header, 
impl::xml_memory_page_value_allocated_mask, rhs); +} + +PUGI__FN bool xml_attribute::set_value(double rhs) +{ + if (!_attr) return false; + + return impl::set_value_convert(_attr->value, _attr->header, impl::xml_memory_page_value_allocated_mask, rhs); +} + +PUGI__FN bool xml_attribute::set_value(float rhs) +{ + if (!_attr) return false; + + return impl::set_value_convert(_attr->value, _attr->header, impl::xml_memory_page_value_allocated_mask, rhs); +} + +PUGI__FN bool xml_attribute::set_value(bool rhs) +{ + if (!_attr) return false; + + return impl::set_value_convert(_attr->value, _attr->header, impl::xml_memory_page_value_allocated_mask, rhs); +} + +#ifdef PUGIXML_HAS_LONG_LONG +PUGI__FN bool xml_attribute::set_value(long long rhs) +{ + if (!_attr) return false; + + return impl::set_value_convert(_attr->value, _attr->header, impl::xml_memory_page_value_allocated_mask, rhs); +} + +PUGI__FN bool xml_attribute::set_value(unsigned long long rhs) +{ + if (!_attr) return false; + + return impl::set_value_convert(_attr->value, _attr->header, impl::xml_memory_page_value_allocated_mask, rhs); +} +#endif + +#ifdef __BORLANDC__ +PUGI__FN bool operator&&(const xml_attribute& lhs, bool rhs) +{ + return (bool)lhs && rhs; +} + +PUGI__FN bool operator||(const xml_attribute& lhs, bool rhs) +{ + return (bool)lhs || rhs; +} +#endif + +PUGI__FN xml_node::xml_node(): _root(0) +{ +} + +PUGI__FN xml_node::xml_node(xml_node_struct* p): _root(p) +{ +} + +PUGI__FN static void unspecified_bool_xml_node(xml_node***) +{ +} + +PUGI__FN xml_node::operator xml_node::unspecified_bool_type() const +{ + return _root ? unspecified_bool_xml_node : 0; +} + +PUGI__FN bool xml_node::operator!() const +{ + return !_root; +} + +PUGI__FN xml_node::iterator xml_node::begin() const +{ + return iterator(_root ? 
_root->first_child + 0 : 0, _root); +} + +PUGI__FN xml_node::iterator xml_node::end() const +{ + return iterator(0, _root); +} + +PUGI__FN xml_node::attribute_iterator xml_node::attributes_begin() const +{ + return attribute_iterator(_root ? _root->first_attribute + 0 : 0, _root); +} + +PUGI__FN xml_node::attribute_iterator xml_node::attributes_end() const +{ + return attribute_iterator(0, _root); +} + +PUGI__FN xml_object_range xml_node::children() const +{ + return xml_object_range(begin(), end()); +} + +PUGI__FN xml_object_range xml_node::children(const char_t* name_) const +{ + return xml_object_range(xml_named_node_iterator(child(name_)._root, _root, name_), xml_named_node_iterator(0, _root, name_)); +} + +PUGI__FN xml_object_range xml_node::attributes() const +{ + return xml_object_range(attributes_begin(), attributes_end()); +} + +PUGI__FN bool xml_node::operator==(const xml_node& r) const +{ + return (_root == r._root); +} + +PUGI__FN bool xml_node::operator!=(const xml_node& r) const +{ + return (_root != r._root); +} + +PUGI__FN bool xml_node::operator<(const xml_node& r) const +{ + return (_root < r._root); +} + +PUGI__FN bool xml_node::operator>(const xml_node& r) const +{ + return (_root > r._root); +} + +PUGI__FN bool xml_node::operator<=(const xml_node& r) const +{ + return (_root <= r._root); +} + +PUGI__FN bool xml_node::operator>=(const xml_node& r) const +{ + return (_root >= r._root); +} + +PUGI__FN bool xml_node::empty() const +{ + return !_root; +} + +PUGI__FN const char_t* xml_node::name() const +{ + return (_root && _root->name) ? _root->name + 0 : PUGIXML_TEXT(""); +} + +PUGI__FN xml_node_type xml_node::type() const +{ + return _root ? PUGI__NODETYPE(_root) : node_null; +} + +PUGI__FN const char_t* xml_node::value() const +{ + return (_root && _root->value) ? 
_root->value + 0 : PUGIXML_TEXT(""); +} + +PUGI__FN xml_node xml_node::child(const char_t* name_) const +{ + if (!_root) return xml_node(); + + for (xml_node_struct* i = _root->first_child; i; i = i->next_sibling) + if (i->name && impl::strequal(name_, i->name)) return xml_node(i); + + return xml_node(); +} + +PUGI__FN xml_attribute xml_node::attribute(const char_t* name_) const +{ + if (!_root) return xml_attribute(); + + for (xml_attribute_struct* i = _root->first_attribute; i; i = i->next_attribute) + if (i->name && impl::strequal(name_, i->name)) + return xml_attribute(i); + + return xml_attribute(); +} + +PUGI__FN xml_node xml_node::next_sibling(const char_t* name_) const +{ + if (!_root) return xml_node(); + + for (xml_node_struct* i = _root->next_sibling; i; i = i->next_sibling) + if (i->name && impl::strequal(name_, i->name)) return xml_node(i); + + return xml_node(); +} + +PUGI__FN xml_node xml_node::next_sibling() const +{ + return _root ? xml_node(_root->next_sibling) : xml_node(); +} + +PUGI__FN xml_node xml_node::previous_sibling(const char_t* name_) const +{ + if (!_root) return xml_node(); + + for (xml_node_struct* i = _root->prev_sibling_c; i->next_sibling; i = i->prev_sibling_c) + if (i->name && impl::strequal(name_, i->name)) return xml_node(i); + + return xml_node(); +} + +PUGI__FN xml_attribute xml_node::attribute(const char_t* name_, xml_attribute& hint_) const +{ + xml_attribute_struct* hint = hint_._attr; + + // if hint is not an attribute of node, behavior is not defined + assert(!hint || (_root && impl::is_attribute_of(hint, _root))); + + if (!_root) return xml_attribute(); + + // optimistically search from hint up until the end + for (xml_attribute_struct* i = hint; i; i = i->next_attribute) + if (i->name && impl::strequal(name_, i->name)) { + // update hint to maximize efficiency of searching for consecutive attributes + hint_._attr = i->next_attribute; + + return xml_attribute(i); + } + + // wrap around and search from the first 
attribute until the hint + // 'j' null pointer check is technically redundant, but it prevents a crash in case the assertion above fails + for (xml_attribute_struct* j = _root->first_attribute; j && j != hint; j = j->next_attribute) + if (j->name && impl::strequal(name_, j->name)) { + // update hint to maximize efficiency of searching for consecutive attributes + hint_._attr = j->next_attribute; + + return xml_attribute(j); + } + + return xml_attribute(); +} + +PUGI__FN xml_node xml_node::previous_sibling() const +{ + if (!_root) return xml_node(); + + if (_root->prev_sibling_c->next_sibling) return xml_node(_root->prev_sibling_c); + else return xml_node(); +} + +PUGI__FN xml_node xml_node::parent() const +{ + return _root ? xml_node(_root->parent) : xml_node(); +} + +PUGI__FN xml_node xml_node::root() const +{ + return _root ? xml_node(&impl::get_document(_root)) : xml_node(); +} + +PUGI__FN xml_text xml_node::text() const +{ + return xml_text(_root); +} + +PUGI__FN const char_t* xml_node::child_value() const +{ + if (!_root) return PUGIXML_TEXT(""); + + for (xml_node_struct* i = _root->first_child; i; i = i->next_sibling) + if (impl::is_text_node(i) && i->value) + return i->value; + + return PUGIXML_TEXT(""); +} + +PUGI__FN const char_t* xml_node::child_value(const char_t* name_) const +{ + return child(name_).child_value(); +} + +PUGI__FN xml_attribute xml_node::first_attribute() const +{ + return _root ? xml_attribute(_root->first_attribute) : xml_attribute(); +} + +PUGI__FN xml_attribute xml_node::last_attribute() const +{ + return _root && _root->first_attribute ? xml_attribute(_root->first_attribute->prev_attribute_c) : xml_attribute(); +} + +PUGI__FN xml_node xml_node::first_child() const +{ + return _root ? xml_node(_root->first_child) : xml_node(); +} + +PUGI__FN xml_node xml_node::last_child() const +{ + return _root && _root->first_child ? 
xml_node(_root->first_child->prev_sibling_c) : xml_node(); +} + +PUGI__FN bool xml_node::set_name(const char_t* rhs) +{ + xml_node_type type_ = _root ? PUGI__NODETYPE(_root) : node_null; + + if (type_ != node_element && type_ != node_pi && type_ != node_declaration) + return false; + + return impl::strcpy_insitu(_root->name, _root->header, impl::xml_memory_page_name_allocated_mask, rhs, impl::strlength(rhs)); +} + +PUGI__FN bool xml_node::set_value(const char_t* rhs) +{ + xml_node_type type_ = _root ? PUGI__NODETYPE(_root) : node_null; + + if (type_ != node_pcdata && type_ != node_cdata && type_ != node_comment && type_ != node_pi && type_ != node_doctype) + return false; + + return impl::strcpy_insitu(_root->value, _root->header, impl::xml_memory_page_value_allocated_mask, rhs, impl::strlength(rhs)); +} + +PUGI__FN xml_attribute xml_node::append_attribute(const char_t* name_) +{ + if (!impl::allow_insert_attribute(type())) return xml_attribute(); + + impl::xml_allocator& alloc = impl::get_allocator(_root); + if (!alloc.reserve()) return xml_attribute(); + + xml_attribute a(impl::allocate_attribute(alloc)); + if (!a) return xml_attribute(); + + impl::append_attribute(a._attr, _root); + + a.set_name(name_); + + return a; +} + +PUGI__FN xml_attribute xml_node::prepend_attribute(const char_t* name_) +{ + if (!impl::allow_insert_attribute(type())) return xml_attribute(); + + impl::xml_allocator& alloc = impl::get_allocator(_root); + if (!alloc.reserve()) return xml_attribute(); + + xml_attribute a(impl::allocate_attribute(alloc)); + if (!a) return xml_attribute(); + + impl::prepend_attribute(a._attr, _root); + + a.set_name(name_); + + return a; +} + +PUGI__FN xml_attribute xml_node::insert_attribute_after(const char_t* name_, const xml_attribute& attr) +{ + if (!impl::allow_insert_attribute(type())) return xml_attribute(); + if (!attr || !impl::is_attribute_of(attr._attr, _root)) return xml_attribute(); + + impl::xml_allocator& alloc = impl::get_allocator(_root); + if 
(!alloc.reserve()) return xml_attribute(); + + xml_attribute a(impl::allocate_attribute(alloc)); + if (!a) return xml_attribute(); + + impl::insert_attribute_after(a._attr, attr._attr, _root); + + a.set_name(name_); + + return a; +} + +PUGI__FN xml_attribute xml_node::insert_attribute_before(const char_t* name_, const xml_attribute& attr) +{ + if (!impl::allow_insert_attribute(type())) return xml_attribute(); + if (!attr || !impl::is_attribute_of(attr._attr, _root)) return xml_attribute(); + + impl::xml_allocator& alloc = impl::get_allocator(_root); + if (!alloc.reserve()) return xml_attribute(); + + xml_attribute a(impl::allocate_attribute(alloc)); + if (!a) return xml_attribute(); + + impl::insert_attribute_before(a._attr, attr._attr, _root); + + a.set_name(name_); + + return a; +} + +PUGI__FN xml_attribute xml_node::append_copy(const xml_attribute& proto) +{ + if (!proto) return xml_attribute(); + if (!impl::allow_insert_attribute(type())) return xml_attribute(); + + impl::xml_allocator& alloc = impl::get_allocator(_root); + if (!alloc.reserve()) return xml_attribute(); + + xml_attribute a(impl::allocate_attribute(alloc)); + if (!a) return xml_attribute(); + + impl::append_attribute(a._attr, _root); + impl::node_copy_attribute(a._attr, proto._attr); + + return a; +} + +PUGI__FN xml_attribute xml_node::prepend_copy(const xml_attribute& proto) +{ + if (!proto) return xml_attribute(); + if (!impl::allow_insert_attribute(type())) return xml_attribute(); + + impl::xml_allocator& alloc = impl::get_allocator(_root); + if (!alloc.reserve()) return xml_attribute(); + + xml_attribute a(impl::allocate_attribute(alloc)); + if (!a) return xml_attribute(); + + impl::prepend_attribute(a._attr, _root); + impl::node_copy_attribute(a._attr, proto._attr); + + return a; +} + +PUGI__FN xml_attribute xml_node::insert_copy_after(const xml_attribute& proto, const xml_attribute& attr) +{ + if (!proto) return xml_attribute(); + if (!impl::allow_insert_attribute(type())) return 
xml_attribute(); + if (!attr || !impl::is_attribute_of(attr._attr, _root)) return xml_attribute(); + + impl::xml_allocator& alloc = impl::get_allocator(_root); + if (!alloc.reserve()) return xml_attribute(); + + xml_attribute a(impl::allocate_attribute(alloc)); + if (!a) return xml_attribute(); + + impl::insert_attribute_after(a._attr, attr._attr, _root); + impl::node_copy_attribute(a._attr, proto._attr); + + return a; +} + +PUGI__FN xml_attribute xml_node::insert_copy_before(const xml_attribute& proto, const xml_attribute& attr) +{ + if (!proto) return xml_attribute(); + if (!impl::allow_insert_attribute(type())) return xml_attribute(); + if (!attr || !impl::is_attribute_of(attr._attr, _root)) return xml_attribute(); + + impl::xml_allocator& alloc = impl::get_allocator(_root); + if (!alloc.reserve()) return xml_attribute(); + + xml_attribute a(impl::allocate_attribute(alloc)); + if (!a) return xml_attribute(); + + impl::insert_attribute_before(a._attr, attr._attr, _root); + impl::node_copy_attribute(a._attr, proto._attr); + + return a; +} + +PUGI__FN xml_node xml_node::append_child(xml_node_type type_) +{ + if (!impl::allow_insert_child(type(), type_)) return xml_node(); + + impl::xml_allocator& alloc = impl::get_allocator(_root); + if (!alloc.reserve()) return xml_node(); + + xml_node n(impl::allocate_node(alloc, type_)); + if (!n) return xml_node(); + + impl::append_node(n._root, _root); + + if (type_ == node_declaration) n.set_name(PUGIXML_TEXT("xml")); + + return n; +} + +PUGI__FN xml_node xml_node::prepend_child(xml_node_type type_) +{ + if (!impl::allow_insert_child(type(), type_)) return xml_node(); + + impl::xml_allocator& alloc = impl::get_allocator(_root); + if (!alloc.reserve()) return xml_node(); + + xml_node n(impl::allocate_node(alloc, type_)); + if (!n) return xml_node(); + + impl::prepend_node(n._root, _root); + + if (type_ == node_declaration) n.set_name(PUGIXML_TEXT("xml")); + + return n; +} + +PUGI__FN xml_node 
xml_node::insert_child_before(xml_node_type type_, const xml_node& node) +{ + if (!impl::allow_insert_child(type(), type_)) return xml_node(); + if (!node._root || node._root->parent != _root) return xml_node(); + + impl::xml_allocator& alloc = impl::get_allocator(_root); + if (!alloc.reserve()) return xml_node(); + + xml_node n(impl::allocate_node(alloc, type_)); + if (!n) return xml_node(); + + impl::insert_node_before(n._root, node._root); + + if (type_ == node_declaration) n.set_name(PUGIXML_TEXT("xml")); + + return n; +} + +PUGI__FN xml_node xml_node::insert_child_after(xml_node_type type_, const xml_node& node) +{ + if (!impl::allow_insert_child(type(), type_)) return xml_node(); + if (!node._root || node._root->parent != _root) return xml_node(); + + impl::xml_allocator& alloc = impl::get_allocator(_root); + if (!alloc.reserve()) return xml_node(); + + xml_node n(impl::allocate_node(alloc, type_)); + if (!n) return xml_node(); + + impl::insert_node_after(n._root, node._root); + + if (type_ == node_declaration) n.set_name(PUGIXML_TEXT("xml")); + + return n; +} + +PUGI__FN xml_node xml_node::append_child(const char_t* name_) +{ + xml_node result = append_child(node_element); + + result.set_name(name_); + + return result; +} + +PUGI__FN xml_node xml_node::prepend_child(const char_t* name_) +{ + xml_node result = prepend_child(node_element); + + result.set_name(name_); + + return result; +} + +PUGI__FN xml_node xml_node::insert_child_after(const char_t* name_, const xml_node& node) +{ + xml_node result = insert_child_after(node_element, node); + + result.set_name(name_); + + return result; +} + +PUGI__FN xml_node xml_node::insert_child_before(const char_t* name_, const xml_node& node) +{ + xml_node result = insert_child_before(node_element, node); + + result.set_name(name_); + + return result; +} + +PUGI__FN xml_node xml_node::append_copy(const xml_node& proto) +{ + xml_node_type type_ = proto.type(); + if (!impl::allow_insert_child(type(), type_)) return 
xml_node(); + + impl::xml_allocator& alloc = impl::get_allocator(_root); + if (!alloc.reserve()) return xml_node(); + + xml_node n(impl::allocate_node(alloc, type_)); + if (!n) return xml_node(); + + impl::append_node(n._root, _root); + impl::node_copy_tree(n._root, proto._root); + + return n; +} + +PUGI__FN xml_node xml_node::prepend_copy(const xml_node& proto) +{ + xml_node_type type_ = proto.type(); + if (!impl::allow_insert_child(type(), type_)) return xml_node(); + + impl::xml_allocator& alloc = impl::get_allocator(_root); + if (!alloc.reserve()) return xml_node(); + + xml_node n(impl::allocate_node(alloc, type_)); + if (!n) return xml_node(); + + impl::prepend_node(n._root, _root); + impl::node_copy_tree(n._root, proto._root); + + return n; +} + +PUGI__FN xml_node xml_node::insert_copy_after(const xml_node& proto, const xml_node& node) +{ + xml_node_type type_ = proto.type(); + if (!impl::allow_insert_child(type(), type_)) return xml_node(); + if (!node._root || node._root->parent != _root) return xml_node(); + + impl::xml_allocator& alloc = impl::get_allocator(_root); + if (!alloc.reserve()) return xml_node(); + + xml_node n(impl::allocate_node(alloc, type_)); + if (!n) return xml_node(); + + impl::insert_node_after(n._root, node._root); + impl::node_copy_tree(n._root, proto._root); + + return n; +} + +PUGI__FN xml_node xml_node::insert_copy_before(const xml_node& proto, const xml_node& node) +{ + xml_node_type type_ = proto.type(); + if (!impl::allow_insert_child(type(), type_)) return xml_node(); + if (!node._root || node._root->parent != _root) return xml_node(); + + impl::xml_allocator& alloc = impl::get_allocator(_root); + if (!alloc.reserve()) return xml_node(); + + xml_node n(impl::allocate_node(alloc, type_)); + if (!n) return xml_node(); + + impl::insert_node_before(n._root, node._root); + impl::node_copy_tree(n._root, proto._root); + + return n; +} + +PUGI__FN xml_node xml_node::append_move(const xml_node& moved) +{ + if (!impl::allow_move(*this, 
moved)) return xml_node(); + + impl::xml_allocator& alloc = impl::get_allocator(_root); + if (!alloc.reserve()) return xml_node(); + + // disable document_buffer_order optimization since moving nodes around changes document order without changing buffer pointers + impl::get_document(_root).header |= impl::xml_memory_page_contents_shared_mask; + + impl::remove_node(moved._root); + impl::append_node(moved._root, _root); + + return moved; +} + +PUGI__FN xml_node xml_node::prepend_move(const xml_node& moved) +{ + if (!impl::allow_move(*this, moved)) return xml_node(); + + impl::xml_allocator& alloc = impl::get_allocator(_root); + if (!alloc.reserve()) return xml_node(); + + // disable document_buffer_order optimization since moving nodes around changes document order without changing buffer pointers + impl::get_document(_root).header |= impl::xml_memory_page_contents_shared_mask; + + impl::remove_node(moved._root); + impl::prepend_node(moved._root, _root); + + return moved; +} + +PUGI__FN xml_node xml_node::insert_move_after(const xml_node& moved, const xml_node& node) +{ + if (!impl::allow_move(*this, moved)) return xml_node(); + if (!node._root || node._root->parent != _root) return xml_node(); + if (moved._root == node._root) return xml_node(); + + impl::xml_allocator& alloc = impl::get_allocator(_root); + if (!alloc.reserve()) return xml_node(); + + // disable document_buffer_order optimization since moving nodes around changes document order without changing buffer pointers + impl::get_document(_root).header |= impl::xml_memory_page_contents_shared_mask; + + impl::remove_node(moved._root); + impl::insert_node_after(moved._root, node._root); + + return moved; +} + +PUGI__FN xml_node xml_node::insert_move_before(const xml_node& moved, const xml_node& node) +{ + if (!impl::allow_move(*this, moved)) return xml_node(); + if (!node._root || node._root->parent != _root) return xml_node(); + if (moved._root == node._root) return xml_node(); + + impl::xml_allocator& alloc 
= impl::get_allocator(_root); + if (!alloc.reserve()) return xml_node(); + + // disable document_buffer_order optimization since moving nodes around changes document order without changing buffer pointers + impl::get_document(_root).header |= impl::xml_memory_page_contents_shared_mask; + + impl::remove_node(moved._root); + impl::insert_node_before(moved._root, node._root); + + return moved; +} + +PUGI__FN bool xml_node::remove_attribute(const char_t* name_) +{ + return remove_attribute(attribute(name_)); +} + +PUGI__FN bool xml_node::remove_attribute(const xml_attribute& a) +{ + if (!_root || !a._attr) return false; + if (!impl::is_attribute_of(a._attr, _root)) return false; + + impl::xml_allocator& alloc = impl::get_allocator(_root); + if (!alloc.reserve()) return false; + + impl::remove_attribute(a._attr, _root); + impl::destroy_attribute(a._attr, alloc); + + return true; +} + +PUGI__FN bool xml_node::remove_child(const char_t* name_) +{ + return remove_child(child(name_)); +} + +PUGI__FN bool xml_node::remove_child(const xml_node& n) +{ + if (!_root || !n._root || n._root->parent != _root) return false; + + impl::xml_allocator& alloc = impl::get_allocator(_root); + if (!alloc.reserve()) return false; + + impl::remove_node(n._root); + impl::destroy_node(n._root, alloc); + + return true; +} + +PUGI__FN xml_parse_result xml_node::append_buffer(const void* contents, size_t size, unsigned int options, xml_encoding encoding) +{ + // append_buffer is only valid for elements/documents + if (!impl::allow_insert_child(type(), node_element)) return impl::make_parse_result(status_append_invalid_root); + + // get document node + impl::xml_document_struct* doc = &impl::get_document(_root); + + // disable document_buffer_order optimization since in a document with multiple buffers comparing buffer pointers does not make sense + doc->header |= impl::xml_memory_page_contents_shared_mask; + + // get extra buffer element (we'll store the document fragment buffer there so that we 
can deallocate it later) + impl::xml_memory_page* page = 0; + impl::xml_extra_buffer* extra = static_cast(doc->allocate_memory(sizeof(impl::xml_extra_buffer), page)); + (void)page; + + if (!extra) return impl::make_parse_result(status_out_of_memory); + + // add extra buffer to the list + extra->buffer = 0; + extra->next = doc->extra_buffers; + doc->extra_buffers = extra; + + // name of the root has to be NULL before parsing - otherwise closing node mismatches will not be detected at the top level + impl::name_null_sentry sentry(_root); + + return impl::load_buffer_impl(doc, _root, const_cast(contents), size, options, encoding, false, false, &extra->buffer); +} + +PUGI__FN xml_node xml_node::find_child_by_attribute(const char_t* name_, const char_t* attr_name, const char_t* attr_value) const +{ + if (!_root) return xml_node(); + + for (xml_node_struct* i = _root->first_child; i; i = i->next_sibling) + if (i->name && impl::strequal(name_, i->name)) { + for (xml_attribute_struct* a = i->first_attribute; a; a = a->next_attribute) + if (a->name && impl::strequal(attr_name, a->name) && impl::strequal(attr_value, a->value ? a->value + 0 : PUGIXML_TEXT(""))) + return xml_node(i); + } + + return xml_node(); +} + +PUGI__FN xml_node xml_node::find_child_by_attribute(const char_t* attr_name, const char_t* attr_value) const +{ + if (!_root) return xml_node(); + + for (xml_node_struct* i = _root->first_child; i; i = i->next_sibling) + for (xml_attribute_struct* a = i->first_attribute; a; a = a->next_attribute) + if (a->name && impl::strequal(attr_name, a->name) && impl::strequal(attr_value, a->value ? a->value + 0 : PUGIXML_TEXT(""))) + return xml_node(i); + + return xml_node(); +} + +#ifndef PUGIXML_NO_STL +PUGI__FN string_t xml_node::path(char_t delimiter) const +{ + if (!_root) return string_t(); + + size_t offset = 0; + + for (xml_node_struct* i = _root; i; i = i->parent) { + offset += (i != _root); + offset += i->name ? 
impl::strlength(i->name) : 0; + } + + string_t result; + result.resize(offset); + + for (xml_node_struct* j = _root; j; j = j->parent) { + if (j != _root) + result[--offset] = delimiter; + + if (j->name && *j->name) { + size_t length = impl::strlength(j->name); + + offset -= length; + memcpy(&result[offset], j->name, length * sizeof(char_t)); + } + } + + assert(offset == 0); + + return result; +} +#endif + +PUGI__FN xml_node xml_node::first_element_by_path(const char_t* path_, char_t delimiter) const +{ + xml_node found = *this; // Current search context. + + if (!_root || !path_ || !path_[0]) return found; + + if (path_[0] == delimiter) { + // Absolute path; e.g. '/foo/bar' + found = found.root(); + ++path_; + } + + const char_t* path_segment = path_; + + while (*path_segment == delimiter) ++path_segment; + + const char_t* path_segment_end = path_segment; + + while (*path_segment_end && *path_segment_end != delimiter) ++path_segment_end; + + if (path_segment == path_segment_end) return found; + + const char_t* next_segment = path_segment_end; + + while (*next_segment == delimiter) ++next_segment; + + if (*path_segment == '.' && path_segment + 1 == path_segment_end) + return found.first_element_by_path(next_segment, delimiter); + else if (*path_segment == '.' && *(path_segment+1) == '.' 
&& path_segment + 2 == path_segment_end) + return found.parent().first_element_by_path(next_segment, delimiter); + else { + for (xml_node_struct* j = found._root->first_child; j; j = j->next_sibling) { + if (j->name && impl::strequalrange(j->name, path_segment, static_cast(path_segment_end - path_segment))) { + xml_node subsearch = xml_node(j).first_element_by_path(next_segment, delimiter); + + if (subsearch) return subsearch; + } + } + + return xml_node(); + } +} + +PUGI__FN bool xml_node::traverse(xml_tree_walker& walker) +{ + walker._depth = -1; + + xml_node arg_begin = *this; + if (!walker.begin(arg_begin)) return false; + + xml_node cur = first_child(); + + if (cur) { + ++walker._depth; + + do { + xml_node arg_for_each = cur; + if (!walker.for_each(arg_for_each)) + return false; + + if (cur.first_child()) { + ++walker._depth; + cur = cur.first_child(); + } else if (cur.next_sibling()) + cur = cur.next_sibling(); + else { + // Borland C++ workaround + while (!cur.next_sibling() && cur != *this && !cur.parent().empty()) { + --walker._depth; + cur = cur.parent(); + } + + if (cur != *this) + cur = cur.next_sibling(); + } + } while (cur && cur != *this); + } + + assert(walker._depth == -1); + + xml_node arg_end = *this; + return walker.end(arg_end); +} + +PUGI__FN size_t xml_node::hash_value() const +{ + return static_cast(reinterpret_cast(_root) / sizeof(xml_node_struct)); +} + +PUGI__FN xml_node_struct* xml_node::internal_object() const +{ + return _root; +} + +PUGI__FN void xml_node::print(xml_writer& writer, const char_t* indent, unsigned int flags, xml_encoding encoding, unsigned int depth) const +{ + if (!_root) return; + + impl::xml_buffered_writer buffered_writer(writer, encoding); + + impl::node_output(buffered_writer, _root, indent, flags, depth); + + buffered_writer.flush(); +} + +#ifndef PUGIXML_NO_STL +PUGI__FN void xml_node::print(std::basic_ostream >& stream, const char_t* indent, unsigned int flags, xml_encoding encoding, unsigned int depth) const 
+{ + xml_writer_stream writer(stream); + + print(writer, indent, flags, encoding, depth); +} + +PUGI__FN void xml_node::print(std::basic_ostream >& stream, const char_t* indent, unsigned int flags, unsigned int depth) const +{ + xml_writer_stream writer(stream); + + print(writer, indent, flags, encoding_wchar, depth); +} +#endif + +PUGI__FN ptrdiff_t xml_node::offset_debug() const +{ + if (!_root) return -1; + + impl::xml_document_struct& doc = impl::get_document(_root); + + // we can determine the offset reliably only if there is exactly once parse buffer + if (!doc.buffer || doc.extra_buffers) return -1; + + switch (type()) { + case node_document: + return 0; + + case node_element: + case node_declaration: + case node_pi: + return _root->name && (_root->header & impl::xml_memory_page_name_allocated_or_shared_mask) == 0 ? _root->name - doc.buffer : -1; + + case node_pcdata: + case node_cdata: + case node_comment: + case node_doctype: + return _root->value && (_root->header & impl::xml_memory_page_value_allocated_or_shared_mask) == 0 ? 
_root->value - doc.buffer : -1; + + default: + return -1; + } +} + +#ifdef __BORLANDC__ +PUGI__FN bool operator&&(const xml_node& lhs, bool rhs) +{ + return (bool)lhs && rhs; +} + +PUGI__FN bool operator||(const xml_node& lhs, bool rhs) +{ + return (bool)lhs || rhs; +} +#endif + +PUGI__FN xml_text::xml_text(xml_node_struct* root): _root(root) +{ +} + +PUGI__FN xml_node_struct* xml_text::_data() const +{ + if (!_root || impl::is_text_node(_root)) return _root; + + for (xml_node_struct* node = _root->first_child; node; node = node->next_sibling) + if (impl::is_text_node(node)) + return node; + + return 0; +} + +PUGI__FN xml_node_struct* xml_text::_data_new() +{ + xml_node_struct* d = _data(); + if (d) return d; + + return xml_node(_root).append_child(node_pcdata).internal_object(); +} + +PUGI__FN xml_text::xml_text(): _root(0) +{ +} + +PUGI__FN static void unspecified_bool_xml_text(xml_text***) +{ +} + +PUGI__FN xml_text::operator xml_text::unspecified_bool_type() const +{ + return _data() ? unspecified_bool_xml_text : 0; +} + +PUGI__FN bool xml_text::operator!() const +{ + return !_data(); +} + +PUGI__FN bool xml_text::empty() const +{ + return _data() == 0; +} + +PUGI__FN const char_t* xml_text::get() const +{ + xml_node_struct* d = _data(); + + return (d && d->value) ? d->value + 0 : PUGIXML_TEXT(""); +} + +PUGI__FN const char_t* xml_text::as_string(const char_t* def) const +{ + xml_node_struct* d = _data(); + + return (d && d->value) ? d->value + 0 : def; +} + +PUGI__FN int xml_text::as_int(int def) const +{ + xml_node_struct* d = _data(); + + return (d && d->value) ? impl::get_value_int(d->value) : def; +} + +PUGI__FN unsigned int xml_text::as_uint(unsigned int def) const +{ + xml_node_struct* d = _data(); + + return (d && d->value) ? impl::get_value_uint(d->value) : def; +} + +PUGI__FN double xml_text::as_double(double def) const +{ + xml_node_struct* d = _data(); + + return (d && d->value) ? 
impl::get_value_double(d->value) : def; +} + +PUGI__FN float xml_text::as_float(float def) const +{ + xml_node_struct* d = _data(); + + return (d && d->value) ? impl::get_value_float(d->value) : def; +} + +PUGI__FN bool xml_text::as_bool(bool def) const +{ + xml_node_struct* d = _data(); + + return (d && d->value) ? impl::get_value_bool(d->value) : def; +} + +#ifdef PUGIXML_HAS_LONG_LONG +PUGI__FN long long xml_text::as_llong(long long def) const +{ + xml_node_struct* d = _data(); + + return (d && d->value) ? impl::get_value_llong(d->value) : def; +} + +PUGI__FN unsigned long long xml_text::as_ullong(unsigned long long def) const +{ + xml_node_struct* d = _data(); + + return (d && d->value) ? impl::get_value_ullong(d->value) : def; +} +#endif + +PUGI__FN bool xml_text::set(const char_t* rhs) +{ + xml_node_struct* dn = _data_new(); + + return dn ? impl::strcpy_insitu(dn->value, dn->header, impl::xml_memory_page_value_allocated_mask, rhs, impl::strlength(rhs)) : false; +} + +PUGI__FN bool xml_text::set(int rhs) +{ + xml_node_struct* dn = _data_new(); + + return dn ? impl::set_value_convert(dn->value, dn->header, impl::xml_memory_page_value_allocated_mask, rhs) : false; +} + +PUGI__FN bool xml_text::set(unsigned int rhs) +{ + xml_node_struct* dn = _data_new(); + + return dn ? impl::set_value_convert(dn->value, dn->header, impl::xml_memory_page_value_allocated_mask, rhs) : false; +} + +PUGI__FN bool xml_text::set(float rhs) +{ + xml_node_struct* dn = _data_new(); + + return dn ? impl::set_value_convert(dn->value, dn->header, impl::xml_memory_page_value_allocated_mask, rhs) : false; +} + +PUGI__FN bool xml_text::set(double rhs) +{ + xml_node_struct* dn = _data_new(); + + return dn ? impl::set_value_convert(dn->value, dn->header, impl::xml_memory_page_value_allocated_mask, rhs) : false; +} + +PUGI__FN bool xml_text::set(bool rhs) +{ + xml_node_struct* dn = _data_new(); + + return dn ? 
impl::set_value_convert(dn->value, dn->header, impl::xml_memory_page_value_allocated_mask, rhs) : false; +} + +#ifdef PUGIXML_HAS_LONG_LONG +PUGI__FN bool xml_text::set(long long rhs) +{ + xml_node_struct* dn = _data_new(); + + return dn ? impl::set_value_convert(dn->value, dn->header, impl::xml_memory_page_value_allocated_mask, rhs) : false; +} + +PUGI__FN bool xml_text::set(unsigned long long rhs) +{ + xml_node_struct* dn = _data_new(); + + return dn ? impl::set_value_convert(dn->value, dn->header, impl::xml_memory_page_value_allocated_mask, rhs) : false; +} +#endif + +PUGI__FN xml_text& xml_text::operator=(const char_t* rhs) +{ + set(rhs); + return *this; +} + +PUGI__FN xml_text& xml_text::operator=(int rhs) +{ + set(rhs); + return *this; +} + +PUGI__FN xml_text& xml_text::operator=(unsigned int rhs) +{ + set(rhs); + return *this; +} + +PUGI__FN xml_text& xml_text::operator=(double rhs) +{ + set(rhs); + return *this; +} + +PUGI__FN xml_text& xml_text::operator=(float rhs) +{ + set(rhs); + return *this; +} + +PUGI__FN xml_text& xml_text::operator=(bool rhs) +{ + set(rhs); + return *this; +} + +#ifdef PUGIXML_HAS_LONG_LONG +PUGI__FN xml_text& xml_text::operator=(long long rhs) +{ + set(rhs); + return *this; +} + +PUGI__FN xml_text& xml_text::operator=(unsigned long long rhs) +{ + set(rhs); + return *this; +} +#endif + +PUGI__FN xml_node xml_text::data() const +{ + return xml_node(_data()); +} + +#ifdef __BORLANDC__ +PUGI__FN bool operator&&(const xml_text& lhs, bool rhs) +{ + return (bool)lhs && rhs; +} + +PUGI__FN bool operator||(const xml_text& lhs, bool rhs) +{ + return (bool)lhs || rhs; +} +#endif + +PUGI__FN xml_node_iterator::xml_node_iterator() +{ +} + +PUGI__FN xml_node_iterator::xml_node_iterator(const xml_node& node): _wrap(node), _parent(node.parent()) +{ +} + +PUGI__FN xml_node_iterator::xml_node_iterator(xml_node_struct* ref, xml_node_struct* parent): _wrap(ref), _parent(parent) +{ +} + +PUGI__FN bool xml_node_iterator::operator==(const 
xml_node_iterator& rhs) const +{ + return _wrap._root == rhs._wrap._root && _parent._root == rhs._parent._root; +} + +PUGI__FN bool xml_node_iterator::operator!=(const xml_node_iterator& rhs) const +{ + return _wrap._root != rhs._wrap._root || _parent._root != rhs._parent._root; +} + +PUGI__FN xml_node& xml_node_iterator::operator*() const +{ + assert(_wrap._root); + return _wrap; +} + +PUGI__FN xml_node* xml_node_iterator::operator->() const +{ + assert(_wrap._root); + return const_cast(&_wrap); // BCC32 workaround +} + +PUGI__FN const xml_node_iterator& xml_node_iterator::operator++() +{ + assert(_wrap._root); + _wrap._root = _wrap._root->next_sibling; + return *this; +} + +PUGI__FN xml_node_iterator xml_node_iterator::operator++(int) +{ + xml_node_iterator temp = *this; + ++*this; + return temp; +} + +PUGI__FN const xml_node_iterator& xml_node_iterator::operator--() +{ + _wrap = _wrap._root ? _wrap.previous_sibling() : _parent.last_child(); + return *this; +} + +PUGI__FN xml_node_iterator xml_node_iterator::operator--(int) +{ + xml_node_iterator temp = *this; + --*this; + return temp; +} + +PUGI__FN xml_attribute_iterator::xml_attribute_iterator() +{ +} + +PUGI__FN xml_attribute_iterator::xml_attribute_iterator(const xml_attribute& attr, const xml_node& parent): _wrap(attr), _parent(parent) +{ +} + +PUGI__FN xml_attribute_iterator::xml_attribute_iterator(xml_attribute_struct* ref, xml_node_struct* parent): _wrap(ref), _parent(parent) +{ +} + +PUGI__FN bool xml_attribute_iterator::operator==(const xml_attribute_iterator& rhs) const +{ + return _wrap._attr == rhs._wrap._attr && _parent._root == rhs._parent._root; +} + +PUGI__FN bool xml_attribute_iterator::operator!=(const xml_attribute_iterator& rhs) const +{ + return _wrap._attr != rhs._wrap._attr || _parent._root != rhs._parent._root; +} + +PUGI__FN xml_attribute& xml_attribute_iterator::operator*() const +{ + assert(_wrap._attr); + return _wrap; +} + +PUGI__FN xml_attribute* 
xml_attribute_iterator::operator->() const +{ + assert(_wrap._attr); + return const_cast(&_wrap); // BCC32 workaround +} + +PUGI__FN const xml_attribute_iterator& xml_attribute_iterator::operator++() +{ + assert(_wrap._attr); + _wrap._attr = _wrap._attr->next_attribute; + return *this; +} + +PUGI__FN xml_attribute_iterator xml_attribute_iterator::operator++(int) +{ + xml_attribute_iterator temp = *this; + ++*this; + return temp; +} + +PUGI__FN const xml_attribute_iterator& xml_attribute_iterator::operator--() +{ + _wrap = _wrap._attr ? _wrap.previous_attribute() : _parent.last_attribute(); + return *this; +} + +PUGI__FN xml_attribute_iterator xml_attribute_iterator::operator--(int) +{ + xml_attribute_iterator temp = *this; + --*this; + return temp; +} + +PUGI__FN xml_named_node_iterator::xml_named_node_iterator(): _name(0) +{ +} + +PUGI__FN xml_named_node_iterator::xml_named_node_iterator(const xml_node& node, const char_t* name): _wrap(node), _parent(node.parent()), _name(name) +{ +} + +PUGI__FN xml_named_node_iterator::xml_named_node_iterator(xml_node_struct* ref, xml_node_struct* parent, const char_t* name): _wrap(ref), _parent(parent), _name(name) +{ +} + +PUGI__FN bool xml_named_node_iterator::operator==(const xml_named_node_iterator& rhs) const +{ + return _wrap._root == rhs._wrap._root && _parent._root == rhs._parent._root; +} + +PUGI__FN bool xml_named_node_iterator::operator!=(const xml_named_node_iterator& rhs) const +{ + return _wrap._root != rhs._wrap._root || _parent._root != rhs._parent._root; +} + +PUGI__FN xml_node& xml_named_node_iterator::operator*() const +{ + assert(_wrap._root); + return _wrap; +} + +PUGI__FN xml_node* xml_named_node_iterator::operator->() const +{ + assert(_wrap._root); + return const_cast(&_wrap); // BCC32 workaround +} + +PUGI__FN const xml_named_node_iterator& xml_named_node_iterator::operator++() +{ + assert(_wrap._root); + _wrap = _wrap.next_sibling(_name); + return *this; +} + +PUGI__FN xml_named_node_iterator 
xml_named_node_iterator::operator++(int) +{ + xml_named_node_iterator temp = *this; + ++*this; + return temp; +} + +PUGI__FN const xml_named_node_iterator& xml_named_node_iterator::operator--() +{ + if (_wrap._root) + _wrap = _wrap.previous_sibling(_name); + else { + _wrap = _parent.last_child(); + + if (!impl::strequal(_wrap.name(), _name)) + _wrap = _wrap.previous_sibling(_name); + } + + return *this; +} + +PUGI__FN xml_named_node_iterator xml_named_node_iterator::operator--(int) +{ + xml_named_node_iterator temp = *this; + --*this; + return temp; +} + +PUGI__FN xml_parse_result::xml_parse_result(): status(status_internal_error), offset(0), encoding(encoding_auto) +{ +} + +PUGI__FN xml_parse_result::operator bool() const +{ + return status == status_ok; +} + +PUGI__FN const char* xml_parse_result::description() const +{ + switch (status) { + case status_ok: + return "No error"; + + case status_file_not_found: + return "File was not found"; + case status_io_error: + return "Error reading from file/stream"; + case status_out_of_memory: + return "Could not allocate memory"; + case status_internal_error: + return "Internal error occurred"; + + case status_unrecognized_tag: + return "Could not determine tag type"; + + case status_bad_pi: + return "Error parsing document declaration/processing instruction"; + case status_bad_comment: + return "Error parsing comment"; + case status_bad_cdata: + return "Error parsing CDATA section"; + case status_bad_doctype: + return "Error parsing document type declaration"; + case status_bad_pcdata: + return "Error parsing PCDATA section"; + case status_bad_start_element: + return "Error parsing start element tag"; + case status_bad_attribute: + return "Error parsing element attribute"; + case status_bad_end_element: + return "Error parsing end element tag"; + case status_end_element_mismatch: + return "Start-end tags mismatch"; + + case status_append_invalid_root: + return "Unable to append nodes: root is not an element or document"; 
+ + case status_no_document_element: + return "No document element found"; + + default: + return "Unknown error"; + } +} + +PUGI__FN xml_document::xml_document(): _buffer(0) +{ + create(); +} + +PUGI__FN xml_document::~xml_document() +{ + destroy(); +} + +PUGI__FN void xml_document::reset() +{ + destroy(); + create(); +} + +PUGI__FN void xml_document::reset(const xml_document& proto) +{ + reset(); + + for (xml_node cur = proto.first_child(); cur; cur = cur.next_sibling()) + append_copy(cur); +} + +PUGI__FN void xml_document::create() +{ + assert(!_root); + +#ifdef PUGIXML_COMPACT + const size_t page_offset = sizeof(uint32_t); +#else + const size_t page_offset = 0; +#endif + + // initialize sentinel page + PUGI__STATIC_ASSERT(sizeof(impl::xml_memory_page) + sizeof(impl::xml_document_struct) + impl::xml_memory_page_alignment - sizeof(void*) + page_offset <= sizeof(_memory)); + + // align upwards to page boundary + void* page_memory = reinterpret_cast((reinterpret_cast(_memory) + (impl::xml_memory_page_alignment - 1)) & ~(impl::xml_memory_page_alignment - 1)); + + // prepare page structure + impl::xml_memory_page* page = impl::xml_memory_page::construct(page_memory); + assert(page); + + page->busy_size = impl::xml_memory_page_size; + + // setup first page marker +#ifdef PUGIXML_COMPACT + // round-trip through void* to avoid 'cast increases required alignment of target type' warning + page->compact_page_marker = reinterpret_cast(static_cast(reinterpret_cast(page) + sizeof(impl::xml_memory_page))); + *page->compact_page_marker = sizeof(impl::xml_memory_page); +#endif + + // allocate new root + _root = new (reinterpret_cast(page) + sizeof(impl::xml_memory_page) + page_offset) impl::xml_document_struct(page); + _root->prev_sibling_c = _root; + + // setup sentinel page + page->allocator = static_cast(_root); + + // verify the document allocation + assert(reinterpret_cast(_root) + sizeof(impl::xml_document_struct) <= _memory + sizeof(_memory)); +} + +PUGI__FN void 
xml_document::destroy() +{ + assert(_root); + + // destroy static storage + if (_buffer) { + impl::xml_memory::deallocate(_buffer); + _buffer = 0; + } + + // destroy extra buffers (note: no need to destroy linked list nodes, they're allocated using document allocator) + for (impl::xml_extra_buffer* extra = static_cast(_root)->extra_buffers; extra; extra = extra->next) { + if (extra->buffer) impl::xml_memory::deallocate(extra->buffer); + } + + // destroy dynamic storage, leave sentinel page (it's in static memory) + impl::xml_memory_page* root_page = PUGI__GETPAGE(_root); + assert(root_page && !root_page->prev); + assert(reinterpret_cast(root_page) >= _memory && reinterpret_cast(root_page) < _memory + sizeof(_memory)); + + for (impl::xml_memory_page* page = root_page->next; page; ) { + impl::xml_memory_page* next = page->next; + + impl::xml_allocator::deallocate_page(page); + + page = next; + } + +#ifdef PUGIXML_COMPACT + // destroy hash table + static_cast(_root)->hash.clear(); +#endif + + _root = 0; +} + +#ifndef PUGIXML_NO_STL +PUGI__FN xml_parse_result xml_document::load(std::basic_istream >& stream, unsigned int options, xml_encoding encoding) +{ + reset(); + + return impl::load_stream_impl(static_cast(_root), stream, options, encoding, &_buffer); +} + +PUGI__FN xml_parse_result xml_document::load(std::basic_istream >& stream, unsigned int options) +{ + reset(); + + return impl::load_stream_impl(static_cast(_root), stream, options, encoding_wchar, &_buffer); +} +#endif + +PUGI__FN xml_parse_result xml_document::load_string(const char_t* contents, unsigned int options) +{ + // Force native encoding (skip autodetection) +#ifdef PUGIXML_WCHAR_MODE + xml_encoding encoding = encoding_wchar; +#else + xml_encoding encoding = encoding_utf8; +#endif + + return load_buffer(contents, impl::strlength(contents) * sizeof(char_t), options, encoding); +} + +PUGI__FN xml_parse_result xml_document::load(const char_t* contents, unsigned int options) +{ + return 
load_string(contents, options); +} + +PUGI__FN xml_parse_result xml_document::load_file(const char* path_, unsigned int options, xml_encoding encoding) +{ + reset(); + + using impl::auto_deleter; // MSVC7 workaround + auto_deleter file(fopen(path_, "rb"), fclose); + + return impl::load_file_impl(static_cast(_root), file.data, options, encoding, &_buffer); +} + +PUGI__FN xml_parse_result xml_document::load_file(const wchar_t* path_, unsigned int options, xml_encoding encoding) +{ + reset(); + + using impl::auto_deleter; // MSVC7 workaround + auto_deleter file(impl::open_file_wide(path_, L"rb"), fclose); + + return impl::load_file_impl(static_cast(_root), file.data, options, encoding, &_buffer); +} + +PUGI__FN xml_parse_result xml_document::load_buffer(const void* contents, size_t size, unsigned int options, xml_encoding encoding) +{ + reset(); + + return impl::load_buffer_impl(static_cast(_root), _root, const_cast(contents), size, options, encoding, false, false, &_buffer); +} + +PUGI__FN xml_parse_result xml_document::load_buffer_inplace(void* contents, size_t size, unsigned int options, xml_encoding encoding) +{ + reset(); + + return impl::load_buffer_impl(static_cast(_root), _root, contents, size, options, encoding, true, false, &_buffer); +} + +PUGI__FN xml_parse_result xml_document::load_buffer_inplace_own(void* contents, size_t size, unsigned int options, xml_encoding encoding) +{ + reset(); + + return impl::load_buffer_impl(static_cast(_root), _root, contents, size, options, encoding, true, true, &_buffer); +} + +PUGI__FN void xml_document::save(xml_writer& writer, const char_t* indent, unsigned int flags, xml_encoding encoding) const +{ + impl::xml_buffered_writer buffered_writer(writer, encoding); + + if ((flags & format_write_bom) && encoding != encoding_latin1) { + // BOM always represents the codepoint U+FEFF, so just write it in native encoding +#ifdef PUGIXML_WCHAR_MODE + unsigned int bom = 0xfeff; + buffered_writer.write(static_cast(bom)); +#else + 
buffered_writer.write('\xef', '\xbb', '\xbf'); +#endif + } + + if (!(flags & format_no_declaration) && !impl::has_declaration(_root)) { + buffered_writer.write_string(PUGIXML_TEXT("'); + if (!(flags & format_raw)) buffered_writer.write('\n'); + } + + impl::node_output(buffered_writer, _root, indent, flags, 0); + + buffered_writer.flush(); +} + +#ifndef PUGIXML_NO_STL +PUGI__FN void xml_document::save(std::basic_ostream >& stream, const char_t* indent, unsigned int flags, xml_encoding encoding) const +{ + xml_writer_stream writer(stream); + + save(writer, indent, flags, encoding); +} + +PUGI__FN void xml_document::save(std::basic_ostream >& stream, const char_t* indent, unsigned int flags) const +{ + xml_writer_stream writer(stream); + + save(writer, indent, flags, encoding_wchar); +} +#endif + +PUGI__FN bool xml_document::save_file(const char* path_, const char_t* indent, unsigned int flags, xml_encoding encoding) const +{ + using impl::auto_deleter; // MSVC7 workaround + auto_deleter file(fopen(path_, (flags & format_save_file_text) ? "w" : "wb"), fclose); + + return impl::save_file_impl(*this, file.data, indent, flags, encoding); +} + +PUGI__FN bool xml_document::save_file(const wchar_t* path_, const char_t* indent, unsigned int flags, xml_encoding encoding) const +{ + using impl::auto_deleter; // MSVC7 workaround + auto_deleter file(impl::open_file_wide(path_, (flags & format_save_file_text) ? 
L"w" : L"wb"), fclose); + + return impl::save_file_impl(*this, file.data, indent, flags, encoding); +} + +PUGI__FN xml_node xml_document::document_element() const +{ + assert(_root); + + for (xml_node_struct* i = _root->first_child; i; i = i->next_sibling) + if (PUGI__NODETYPE(i) == node_element) + return xml_node(i); + + return xml_node(); +} + +#ifndef PUGIXML_NO_STL +PUGI__FN std::string PUGIXML_FUNCTION as_utf8(const wchar_t* str) +{ + assert(str); + + return impl::as_utf8_impl(str, impl::strlength_wide(str)); +} + +PUGI__FN std::string PUGIXML_FUNCTION as_utf8(const std::basic_string& str) +{ + return impl::as_utf8_impl(str.c_str(), str.size()); +} + +PUGI__FN std::basic_string PUGIXML_FUNCTION as_wide(const char* str) +{ + assert(str); + + return impl::as_wide_impl(str, strlen(str)); +} + +PUGI__FN std::basic_string PUGIXML_FUNCTION as_wide(const std::string& str) +{ + return impl::as_wide_impl(str.c_str(), str.size()); +} +#endif + +PUGI__FN void PUGIXML_FUNCTION set_memory_management_functions(allocation_function allocate, deallocation_function deallocate) +{ + impl::xml_memory::allocate = allocate; + impl::xml_memory::deallocate = deallocate; +} + +PUGI__FN allocation_function PUGIXML_FUNCTION get_memory_allocation_function() +{ + return impl::xml_memory::allocate; +} + +PUGI__FN deallocation_function PUGIXML_FUNCTION get_memory_deallocation_function() +{ + return impl::xml_memory::deallocate; +} +} + +#if !defined(PUGIXML_NO_STL) && (defined(_MSC_VER) || defined(__ICC)) +namespace std +{ +// Workarounds for (non-standard) iterator category detection for older versions (MSVC7/IC8 and earlier) +PUGI__FN std::bidirectional_iterator_tag _Iter_cat(const pugi::xml_node_iterator&) +{ + return std::bidirectional_iterator_tag(); +} + +PUGI__FN std::bidirectional_iterator_tag _Iter_cat(const pugi::xml_attribute_iterator&) +{ + return std::bidirectional_iterator_tag(); +} + +PUGI__FN std::bidirectional_iterator_tag _Iter_cat(const pugi::xml_named_node_iterator&) +{ 
+ return std::bidirectional_iterator_tag(); +} +} +#endif + +#if !defined(PUGIXML_NO_STL) && defined(__SUNPRO_CC) +namespace std +{ +// Workarounds for (non-standard) iterator category detection +PUGI__FN std::bidirectional_iterator_tag __iterator_category(const pugi::xml_node_iterator&) +{ + return std::bidirectional_iterator_tag(); +} + +PUGI__FN std::bidirectional_iterator_tag __iterator_category(const pugi::xml_attribute_iterator&) +{ + return std::bidirectional_iterator_tag(); +} + +PUGI__FN std::bidirectional_iterator_tag __iterator_category(const pugi::xml_named_node_iterator&) +{ + return std::bidirectional_iterator_tag(); +} +} +#endif + +#ifndef PUGIXML_NO_XPATH +// STL replacements +PUGI__NS_BEGIN +struct equal_to { + template bool operator()(const T& lhs, const T& rhs) const { + return lhs == rhs; + } +}; + +struct not_equal_to { + template bool operator()(const T& lhs, const T& rhs) const { + return lhs != rhs; + } +}; + +struct less { + template bool operator()(const T& lhs, const T& rhs) const { + return lhs < rhs; + } +}; + +struct less_equal { + template bool operator()(const T& lhs, const T& rhs) const { + return lhs <= rhs; + } +}; + +template void swap(T& lhs, T& rhs) +{ + T temp = lhs; + lhs = rhs; + rhs = temp; +} + +template I min_element(I begin, I end, const Pred& pred) +{ + I result = begin; + + for (I it = begin + 1; it != end; ++it) + if (pred(*it, *result)) + result = it; + + return result; +} + +template void reverse(I begin, I end) +{ + while (end - begin > 1) swap(*begin++, *--end); +} + +template I unique(I begin, I end) +{ + // fast skip head + while (end - begin > 1 && *begin != *(begin + 1)) begin++; + + if (begin == end) return begin; + + // last written element + I write = begin++; + + // merge unique elements + while (begin != end) { + if (*begin != *write) + *++write = *begin++; + else + begin++; + } + + // past-the-end (write points to live element) + return write + 1; +} + +template void copy_backwards(I begin, I end, I 
target) +{ + while (begin != end) *--target = *--end; +} + +template void insertion_sort(I begin, I end, const Pred& pred, T*) +{ + assert(begin != end); + + for (I it = begin + 1; it != end; ++it) { + T val = *it; + + if (pred(val, *begin)) { + // move to front + copy_backwards(begin, it, it + 1); + *begin = val; + } else { + I hole = it; + + // move hole backwards + while (pred(val, *(hole - 1))) { + *hole = *(hole - 1); + hole--; + } + + // fill hole with element + *hole = val; + } + } +} + +// std variant for elements with == +template void partition(I begin, I middle, I end, const Pred& pred, I* out_eqbeg, I* out_eqend) +{ + I eqbeg = middle, eqend = middle + 1; + + // expand equal range + while (eqbeg != begin && *(eqbeg - 1) == *eqbeg) --eqbeg; + while (eqend != end && *eqend == *eqbeg) ++eqend; + + // process outer elements + I ltend = eqbeg, gtbeg = eqend; + + for (;;) { + // find the element from the right side that belongs to the left one + for (; gtbeg != end; ++gtbeg) + if (!pred(*eqbeg, *gtbeg)) { + if (*gtbeg == *eqbeg) swap(*gtbeg, *eqend++); + else break; + } + + // find the element from the left side that belongs to the right one + for (; ltend != begin; --ltend) + if (!pred(*(ltend - 1), *eqbeg)) { + if (*eqbeg == *(ltend - 1)) swap(*(ltend - 1), *--eqbeg); + else break; + } + + // scanned all elements + if (gtbeg == end && ltend == begin) { + *out_eqbeg = eqbeg; + *out_eqend = eqend; + return; + } + + // make room for elements by moving equal area + if (gtbeg == end) { + if (--ltend != --eqbeg) swap(*ltend, *eqbeg); + swap(*eqbeg, *--eqend); + } else if (ltend == begin) { + if (eqend != gtbeg) swap(*eqbeg, *eqend); + ++eqend; + swap(*gtbeg++, *eqbeg++); + } else swap(*gtbeg++, *--ltend); + } +} + +template void median3(I first, I middle, I last, const Pred& pred) +{ + if (pred(*middle, *first)) swap(*middle, *first); + if (pred(*last, *middle)) swap(*last, *middle); + if (pred(*middle, *first)) swap(*middle, *first); +} + +template void median(I 
first, I middle, I last, const Pred& pred) +{ + if (last - first <= 40) { + // median of three for small chunks + median3(first, middle, last, pred); + } else { + // median of nine + size_t step = (last - first + 1) / 8; + + median3(first, first + step, first + 2 * step, pred); + median3(middle - step, middle, middle + step, pred); + median3(last - 2 * step, last - step, last, pred); + median3(first + step, middle, last - step, pred); + } +} + +template void sort(I begin, I end, const Pred& pred) +{ + // sort large chunks + while (end - begin > 32) { + // find median element + I middle = begin + (end - begin) / 2; + median(begin, middle, end - 1, pred); + + // partition in three chunks (< = >) + I eqbeg, eqend; + partition(begin, middle, end, pred, &eqbeg, &eqend); + + // loop on larger half + if (eqbeg - begin > end - eqend) { + sort(eqend, end, pred); + end = eqbeg; + } else { + sort(begin, eqbeg, pred); + begin = eqend; + } + } + + // insertion sort small chunk + if (begin != end) insertion_sort(begin, end, pred, &*begin); +} +PUGI__NS_END + +// Allocator used for AST and evaluation stacks +PUGI__NS_BEGIN +static const size_t xpath_memory_page_size = +#ifdef PUGIXML_MEMORY_XPATH_PAGE_SIZE + PUGIXML_MEMORY_XPATH_PAGE_SIZE +#else + 4096 +#endif + ; + +static const uintptr_t xpath_memory_block_alignment = sizeof(double) > sizeof(void*) ? 
sizeof(double) : sizeof(void*); + +struct xpath_memory_block { + xpath_memory_block* next; + size_t capacity; + + union { + char data[xpath_memory_page_size]; + double alignment; + }; +}; + +class xpath_allocator +{ + xpath_memory_block* _root; + size_t _root_size; + +public: +#ifdef PUGIXML_NO_EXCEPTIONS + jmp_buf* error_handler; +#endif + + xpath_allocator(xpath_memory_block* root, size_t root_size = 0): _root(root), _root_size(root_size) { +#ifdef PUGIXML_NO_EXCEPTIONS + error_handler = 0; +#endif + } + + void* allocate_nothrow(size_t size) { + // round size up to block alignment boundary + size = (size + xpath_memory_block_alignment - 1) & ~(xpath_memory_block_alignment - 1); + + if (_root_size + size <= _root->capacity) { + void* buf = &_root->data[0] + _root_size; + _root_size += size; + return buf; + } else { + // make sure we have at least 1/4th of the page free after allocation to satisfy subsequent allocation requests + size_t block_capacity_base = sizeof(_root->data); + size_t block_capacity_req = size + block_capacity_base / 4; + size_t block_capacity = (block_capacity_base > block_capacity_req) ? 
block_capacity_base : block_capacity_req; + + size_t block_size = block_capacity + offsetof(xpath_memory_block, data); + + xpath_memory_block* block = static_cast(xml_memory::allocate(block_size)); + if (!block) return 0; + + block->next = _root; + block->capacity = block_capacity; + + _root = block; + _root_size = size; + + return block->data; + } + } + + void* allocate(size_t size) { + void* result = allocate_nothrow(size); + + if (!result) { +#ifdef PUGIXML_NO_EXCEPTIONS + assert(error_handler); + longjmp(*error_handler, 1); +#else + throw std::bad_alloc(); +#endif + } + + return result; + } + + void* reallocate(void* ptr, size_t old_size, size_t new_size) { + // round size up to block alignment boundary + old_size = (old_size + xpath_memory_block_alignment - 1) & ~(xpath_memory_block_alignment - 1); + new_size = (new_size + xpath_memory_block_alignment - 1) & ~(xpath_memory_block_alignment - 1); + + // we can only reallocate the last object + assert(ptr == 0 || static_cast(ptr) + old_size == &_root->data[0] + _root_size); + + // adjust root size so that we have not allocated the object at all + bool only_object = (_root_size == old_size); + + if (ptr) _root_size -= old_size; + + // allocate a new version (this will obviously reuse the memory if possible) + void* result = allocate(new_size); + assert(result); + + // we have a new block + if (result != ptr && ptr) { + // copy old data + assert(new_size >= old_size); + memcpy(result, ptr, old_size); + + // free the previous page if it had no other objects + if (only_object) { + assert(_root->data == result); + assert(_root->next); + + xpath_memory_block* next = _root->next->next; + + if (next) { + // deallocate the whole page, unless it was the first one + xml_memory::deallocate(_root->next); + _root->next = next; + } + } + } + + return result; + } + + void revert(const xpath_allocator& state) { + // free all new pages + xpath_memory_block* cur = _root; + + while (cur != state._root) { + xpath_memory_block* next = 
cur->next; + + xml_memory::deallocate(cur); + + cur = next; + } + + // restore state + _root = state._root; + _root_size = state._root_size; + } + + void release() { + xpath_memory_block* cur = _root; + assert(cur); + + while (cur->next) { + xpath_memory_block* next = cur->next; + + xml_memory::deallocate(cur); + + cur = next; + } + } +}; + +struct xpath_allocator_capture { + xpath_allocator_capture(xpath_allocator* alloc): _target(alloc), _state(*alloc) { + } + + ~xpath_allocator_capture() { + _target->revert(_state); + } + + xpath_allocator* _target; + xpath_allocator _state; +}; + +struct xpath_stack { + xpath_allocator* result; + xpath_allocator* temp; +}; + +struct xpath_stack_data { + xpath_memory_block blocks[2]; + xpath_allocator result; + xpath_allocator temp; + xpath_stack stack; + +#ifdef PUGIXML_NO_EXCEPTIONS + jmp_buf error_handler; +#endif + + xpath_stack_data(): result(blocks + 0), temp(blocks + 1) { + blocks[0].next = blocks[1].next = 0; + blocks[0].capacity = blocks[1].capacity = sizeof(blocks[0].data); + + stack.result = &result; + stack.temp = &temp; + +#ifdef PUGIXML_NO_EXCEPTIONS + result.error_handler = temp.error_handler = &error_handler; +#endif + } + + ~xpath_stack_data() { + result.release(); + temp.release(); + } +}; +PUGI__NS_END + +// String class +PUGI__NS_BEGIN +class xpath_string +{ + const char_t* _buffer; + bool _uses_heap; + size_t _length_heap; + + static char_t* duplicate_string(const char_t* string, size_t length, xpath_allocator* alloc) { + char_t* result = static_cast(alloc->allocate((length + 1) * sizeof(char_t))); + assert(result); + + memcpy(result, string, length * sizeof(char_t)); + result[length] = 0; + + return result; + } + + xpath_string(const char_t* buffer, bool uses_heap_, size_t length_heap): _buffer(buffer), _uses_heap(uses_heap_), _length_heap(length_heap) { + } + +public: + static xpath_string from_const(const char_t* str) { + return xpath_string(str, false, 0); + } + + static xpath_string 
from_heap_preallocated(const char_t* begin, const char_t* end) { + assert(begin <= end && *end == 0); + + return xpath_string(begin, true, static_cast(end - begin)); + } + + static xpath_string from_heap(const char_t* begin, const char_t* end, xpath_allocator* alloc) { + assert(begin <= end); + + size_t length = static_cast(end - begin); + + return length == 0 ? xpath_string() : xpath_string(duplicate_string(begin, length, alloc), true, length); + } + + xpath_string(): _buffer(PUGIXML_TEXT("")), _uses_heap(false), _length_heap(0) { + } + + void append(const xpath_string& o, xpath_allocator* alloc) { + // skip empty sources + if (!*o._buffer) return; + + // fast append for constant empty target and constant source + if (!*_buffer && !_uses_heap && !o._uses_heap) { + _buffer = o._buffer; + } else { + // need to make heap copy + size_t target_length = length(); + size_t source_length = o.length(); + size_t result_length = target_length + source_length; + + // allocate new buffer + char_t* result = static_cast(alloc->reallocate(_uses_heap ? const_cast(_buffer) : 0, (target_length + 1) * sizeof(char_t), (result_length + 1) * sizeof(char_t))); + assert(result); + + // append first string to the new buffer in case there was no reallocation + if (!_uses_heap) memcpy(result, _buffer, target_length * sizeof(char_t)); + + // append second string to the new buffer + memcpy(result + target_length, o._buffer, source_length * sizeof(char_t)); + result[result_length] = 0; + + // finalize + _buffer = result; + _uses_heap = true; + _length_heap = result_length; + } + } + + const char_t* c_str() const { + return _buffer; + } + + size_t length() const { + return _uses_heap ? 
_length_heap : strlength(_buffer); + } + + char_t* data(xpath_allocator* alloc) { + // make private heap copy + if (!_uses_heap) { + size_t length_ = strlength(_buffer); + + _buffer = duplicate_string(_buffer, length_, alloc); + _uses_heap = true; + _length_heap = length_; + } + + return const_cast(_buffer); + } + + bool empty() const { + return *_buffer == 0; + } + + bool operator==(const xpath_string& o) const { + return strequal(_buffer, o._buffer); + } + + bool operator!=(const xpath_string& o) const { + return !strequal(_buffer, o._buffer); + } + + bool uses_heap() const { + return _uses_heap; + } +}; +PUGI__NS_END + +PUGI__NS_BEGIN +PUGI__FN bool starts_with(const char_t* string, const char_t* pattern) +{ + while (*pattern && *string == *pattern) { + string++; + pattern++; + } + + return *pattern == 0; +} + +PUGI__FN const char_t* find_char(const char_t* s, char_t c) +{ +#ifdef PUGIXML_WCHAR_MODE + return wcschr(s, c); +#else + return strchr(s, c); +#endif +} + +PUGI__FN const char_t* find_substring(const char_t* s, const char_t* p) +{ +#ifdef PUGIXML_WCHAR_MODE + // MSVC6 wcsstr bug workaround (if s is empty it always returns 0) + return (*p == 0) ? s : wcsstr(s, p); +#else + return strstr(s, p); +#endif +} + +// Converts symbol to lower case, if it is an ASCII one +PUGI__FN char_t tolower_ascii(char_t ch) +{ + return static_cast(ch - 'A') < 26 ? 
static_cast(ch | ' ') : ch; +} + +PUGI__FN xpath_string string_value(const xpath_node& na, xpath_allocator* alloc) +{ + if (na.attribute()) + return xpath_string::from_const(na.attribute().value()); + else { + xml_node n = na.node(); + + switch (n.type()) { + case node_pcdata: + case node_cdata: + case node_comment: + case node_pi: + return xpath_string::from_const(n.value()); + + case node_document: + case node_element: { + xpath_string result; + + xml_node cur = n.first_child(); + + while (cur && cur != n) { + if (cur.type() == node_pcdata || cur.type() == node_cdata) + result.append(xpath_string::from_const(cur.value()), alloc); + + if (cur.first_child()) + cur = cur.first_child(); + else if (cur.next_sibling()) + cur = cur.next_sibling(); + else { + while (!cur.next_sibling() && cur != n) + cur = cur.parent(); + + if (cur != n) cur = cur.next_sibling(); + } + } + + return result; + } + + default: + return xpath_string(); + } + } +} + +PUGI__FN bool node_is_before_sibling(xml_node_struct* ln, xml_node_struct* rn) +{ + assert(ln->parent == rn->parent); + + // there is no common ancestor (the shared parent is null), nodes are from different documents + if (!ln->parent) return ln < rn; + + // determine sibling order + xml_node_struct* ls = ln; + xml_node_struct* rs = rn; + + while (ls && rs) { + if (ls == rn) return true; + if (rs == ln) return false; + + ls = ls->next_sibling; + rs = rs->next_sibling; + } + + // if rn sibling chain ended ln must be before rn + return !rs; +} + +PUGI__FN bool node_is_before(xml_node_struct* ln, xml_node_struct* rn) +{ + // find common ancestor at the same depth, if any + xml_node_struct* lp = ln; + xml_node_struct* rp = rn; + + while (lp && rp && lp->parent != rp->parent) { + lp = lp->parent; + rp = rp->parent; + } + + // parents are the same! 
+ if (lp && rp) return node_is_before_sibling(lp, rp); + + // nodes are at different depths, need to normalize heights + bool left_higher = !lp; + + while (lp) { + lp = lp->parent; + ln = ln->parent; + } + + while (rp) { + rp = rp->parent; + rn = rn->parent; + } + + // one node is the ancestor of the other + if (ln == rn) return left_higher; + + // find common ancestor... again + while (ln->parent != rn->parent) { + ln = ln->parent; + rn = rn->parent; + } + + return node_is_before_sibling(ln, rn); +} + +PUGI__FN bool node_is_ancestor(xml_node_struct* parent, xml_node_struct* node) +{ + while (node && node != parent) node = node->parent; + + return parent && node == parent; +} + +PUGI__FN const void* document_buffer_order(const xpath_node& xnode) +{ + xml_node_struct* node = xnode.node().internal_object(); + + if (node) { + if ((get_document(node).header & xml_memory_page_contents_shared_mask) == 0) { + if (node->name && (node->header & impl::xml_memory_page_name_allocated_or_shared_mask) == 0) return node->name; + if (node->value && (node->header & impl::xml_memory_page_value_allocated_or_shared_mask) == 0) return node->value; + } + + return 0; + } + + xml_attribute_struct* attr = xnode.attribute().internal_object(); + + if (attr) { + if ((get_document(attr).header & xml_memory_page_contents_shared_mask) == 0) { + if ((attr->header & impl::xml_memory_page_name_allocated_or_shared_mask) == 0) return attr->name; + if ((attr->header & impl::xml_memory_page_value_allocated_or_shared_mask) == 0) return attr->value; + } + + return 0; + } + + return 0; +} + +struct document_order_comparator { + bool operator()(const xpath_node& lhs, const xpath_node& rhs) const { + // optimized document order based check + const void* lo = document_buffer_order(lhs); + const void* ro = document_buffer_order(rhs); + + if (lo && ro) return lo < ro; + + // slow comparison + xml_node ln = lhs.node(), rn = rhs.node(); + + // compare attributes + if (lhs.attribute() && rhs.attribute()) { + // 
shared parent + if (lhs.parent() == rhs.parent()) { + // determine sibling order + for (xml_attribute a = lhs.attribute(); a; a = a.next_attribute()) + if (a == rhs.attribute()) + return true; + + return false; + } + + // compare attribute parents + ln = lhs.parent(); + rn = rhs.parent(); + } else if (lhs.attribute()) { + // attributes go after the parent element + if (lhs.parent() == rhs.node()) return false; + + ln = lhs.parent(); + } else if (rhs.attribute()) { + // attributes go after the parent element + if (rhs.parent() == lhs.node()) return true; + + rn = rhs.parent(); + } + + if (ln == rn) return false; + + if (!ln || !rn) return ln < rn; + + return node_is_before(ln.internal_object(), rn.internal_object()); + } +}; + +struct duplicate_comparator { + bool operator()(const xpath_node& lhs, const xpath_node& rhs) const { + if (lhs.attribute()) return rhs.attribute() ? lhs.attribute() < rhs.attribute() : true; + else return rhs.attribute() ? false : lhs.node() < rhs.node(); + } +}; + +PUGI__FN double gen_nan() +{ +#if defined(__STDC_IEC_559__) || ((FLT_RADIX - 0 == 2) && (FLT_MAX_EXP - 0 == 128) && (FLT_MANT_DIG - 0 == 24)) + union { + float f; + uint32_t i; + } u[sizeof(float) == sizeof(uint32_t) ? 1 : -1]; + u[0].i = 0x7fc00000; + return u[0].f; +#else + // fallback + const volatile double zero = 0.0; + return zero / zero; +#endif +} + +PUGI__FN bool is_nan(double value) +{ +#if defined(PUGI__MSVC_CRT_VERSION) || defined(__BORLANDC__) + return !!_isnan(value); +#elif defined(fpclassify) && defined(FP_NAN) + return fpclassify(value) == FP_NAN; +#else + // fallback + const volatile double v = value; + return v != v; +#endif +} + +PUGI__FN const char_t* convert_number_to_string_special(double value) +{ +#if defined(PUGI__MSVC_CRT_VERSION) || defined(__BORLANDC__) + if (_finite(value)) return (value == 0) ? PUGIXML_TEXT("0") : 0; + if (_isnan(value)) return PUGIXML_TEXT("NaN"); + return value > 0 ? 
PUGIXML_TEXT("Infinity") : PUGIXML_TEXT("-Infinity"); +#elif defined(fpclassify) && defined(FP_NAN) && defined(FP_INFINITE) && defined(FP_ZERO) + switch (fpclassify(value)) { + case FP_NAN: + return PUGIXML_TEXT("NaN"); + + case FP_INFINITE: + return value > 0 ? PUGIXML_TEXT("Infinity") : PUGIXML_TEXT("-Infinity"); + + case FP_ZERO: + return PUGIXML_TEXT("0"); + + default: + return 0; + } +#else + // fallback + const volatile double v = value; + + if (v == 0) return PUGIXML_TEXT("0"); + if (v != v) return PUGIXML_TEXT("NaN"); + if (v * 2 == v) return value > 0 ? PUGIXML_TEXT("Infinity") : PUGIXML_TEXT("-Infinity"); + return 0; +#endif +} + +PUGI__FN bool convert_number_to_boolean(double value) +{ + return (value != 0 && !is_nan(value)); +} + +PUGI__FN void truncate_zeros(char* begin, char* end) +{ + while (begin != end && end[-1] == '0') end--; + + *end = 0; +} + +// gets mantissa digits in the form of 0.xxxxx with 0. implied and the exponent +#if defined(PUGI__MSVC_CRT_VERSION) && PUGI__MSVC_CRT_VERSION >= 1400 && !defined(_WIN32_WCE) +PUGI__FN void convert_number_to_mantissa_exponent(double value, char* buffer, size_t buffer_size, char** out_mantissa, int* out_exponent) +{ + // get base values + int sign, exponent; + _ecvt_s(buffer, buffer_size, value, DBL_DIG + 1, &exponent, &sign); + + // truncate redundant zeros + truncate_zeros(buffer, buffer + strlen(buffer)); + + // fill results + *out_mantissa = buffer; + *out_exponent = exponent; +} +#else +PUGI__FN void convert_number_to_mantissa_exponent(double value, char* buffer, size_t buffer_size, char** out_mantissa, int* out_exponent) +{ + // get a scientific notation value with IEEE DBL_DIG decimals + sprintf(buffer, "%.*e", DBL_DIG, value); + assert(strlen(buffer) < buffer_size); + (void)!buffer_size; + + // get the exponent (possibly negative) + char* exponent_string = strchr(buffer, 'e'); + assert(exponent_string); + + int exponent = atoi(exponent_string + 1); + + // extract mantissa string: skip sign + char* 
mantissa = buffer[0] == '-' ? buffer + 1 : buffer; + assert(mantissa[0] != '0' && mantissa[1] == '.'); + + // divide mantissa by 10 to eliminate integer part + mantissa[1] = mantissa[0]; + mantissa++; + exponent++; + + // remove extra mantissa digits and zero-terminate mantissa + truncate_zeros(mantissa, exponent_string); + + // fill results + *out_mantissa = mantissa; + *out_exponent = exponent; +} +#endif + +PUGI__FN xpath_string convert_number_to_string(double value, xpath_allocator* alloc) +{ + // try special number conversion + const char_t* special = convert_number_to_string_special(value); + if (special) return xpath_string::from_const(special); + + // get mantissa + exponent form + char mantissa_buffer[32]; + + char* mantissa; + int exponent; + convert_number_to_mantissa_exponent(value, mantissa_buffer, sizeof(mantissa_buffer), &mantissa, &exponent); + + // allocate a buffer of suitable length for the number + size_t result_size = strlen(mantissa_buffer) + (exponent > 0 ? exponent : -exponent) + 4; + char_t* result = static_cast(alloc->allocate(sizeof(char_t) * result_size)); + assert(result); + + // make the number! + char_t* s = result; + + // sign + if (value < 0) *s++ = '-'; + + // integer part + if (exponent <= 0) { + *s++ = '0'; + } else { + while (exponent > 0) { + assert(*mantissa == 0 || static_cast(static_cast(*mantissa) - '0') <= 9); + *s++ = *mantissa ? 
*mantissa++ : '0'; + exponent--; + } + } + + // fractional part + if (*mantissa) { + // decimal point + *s++ = '.'; + + // extra zeroes from negative exponent + while (exponent < 0) { + *s++ = '0'; + exponent++; + } + + // extra mantissa digits + while (*mantissa) { + assert(static_cast(*mantissa - '0') <= 9); + *s++ = *mantissa++; + } + } + + // zero-terminate + assert(s < result + result_size); + *s = 0; + + return xpath_string::from_heap_preallocated(result, s); +} + +PUGI__FN bool check_string_to_number_format(const char_t* string) +{ + // parse leading whitespace + while (PUGI__IS_CHARTYPE(*string, ct_space)) ++string; + + // parse sign + if (*string == '-') ++string; + + if (!*string) return false; + + // if there is no integer part, there should be a decimal part with at least one digit + if (!PUGI__IS_CHARTYPEX(string[0], ctx_digit) && (string[0] != '.' || !PUGI__IS_CHARTYPEX(string[1], ctx_digit))) return false; + + // parse integer part + while (PUGI__IS_CHARTYPEX(*string, ctx_digit)) ++string; + + // parse decimal part + if (*string == '.') { + ++string; + + while (PUGI__IS_CHARTYPEX(*string, ctx_digit)) ++string; + } + + // parse trailing whitespace + while (PUGI__IS_CHARTYPE(*string, ct_space)) ++string; + + return *string == 0; +} + +PUGI__FN double convert_string_to_number(const char_t* string) +{ + // check string format + if (!check_string_to_number_format(string)) return gen_nan(); + + // parse string +#ifdef PUGIXML_WCHAR_MODE + return wcstod(string, 0); +#else + return strtod(string, 0); +#endif +} + +PUGI__FN bool convert_string_to_number_scratch(char_t (&buffer)[32], const char_t* begin, const char_t* end, double* out_result) +{ + size_t length = static_cast(end - begin); + char_t* scratch = buffer; + + if (length >= sizeof(buffer) / sizeof(buffer[0])) { + // need to make dummy on-heap copy + scratch = static_cast(xml_memory::allocate((length + 1) * sizeof(char_t))); + if (!scratch) return false; + } + + // copy string to zero-terminated 
buffer and perform conversion + memcpy(scratch, begin, length * sizeof(char_t)); + scratch[length] = 0; + + *out_result = convert_string_to_number(scratch); + + // free dummy buffer + if (scratch != buffer) xml_memory::deallocate(scratch); + + return true; +} + +PUGI__FN double round_nearest(double value) +{ + return floor(value + 0.5); +} + +PUGI__FN double round_nearest_nzero(double value) +{ + // same as round_nearest, but returns -0 for [-0.5, -0] + // ceil is used to differentiate between +0 and -0 (we return -0 for [-0.5, -0] and +0 for +0) + return (value >= -0.5 && value <= 0) ? ceil(value) : floor(value + 0.5); +} + +PUGI__FN const char_t* qualified_name(const xpath_node& node) +{ + return node.attribute() ? node.attribute().name() : node.node().name(); +} + +PUGI__FN const char_t* local_name(const xpath_node& node) +{ + const char_t* name = qualified_name(node); + const char_t* p = find_char(name, ':'); + + return p ? p + 1 : name; +} + +struct namespace_uri_predicate { + const char_t* prefix; + size_t prefix_length; + + namespace_uri_predicate(const char_t* name) { + const char_t* pos = find_char(name, ':'); + + prefix = pos ? name : 0; + prefix_length = pos ? static_cast(pos - name) : 0; + } + + bool operator()(xml_attribute a) const { + const char_t* name = a.name(); + + if (!starts_with(name, PUGIXML_TEXT("xmlns"))) return false; + + return prefix ? 
name[5] == ':' && strequalrange(name + 6, prefix, prefix_length) : name[5] == 0; + } +}; + +PUGI__FN const char_t* namespace_uri(xml_node node) +{ + namespace_uri_predicate pred = node.name(); + + xml_node p = node; + + while (p) { + xml_attribute a = p.find_attribute(pred); + + if (a) return a.value(); + + p = p.parent(); + } + + return PUGIXML_TEXT(""); +} + +PUGI__FN const char_t* namespace_uri(xml_attribute attr, xml_node parent) +{ + namespace_uri_predicate pred = attr.name(); + + // Default namespace does not apply to attributes + if (!pred.prefix) return PUGIXML_TEXT(""); + + xml_node p = parent; + + while (p) { + xml_attribute a = p.find_attribute(pred); + + if (a) return a.value(); + + p = p.parent(); + } + + return PUGIXML_TEXT(""); +} + +PUGI__FN const char_t* namespace_uri(const xpath_node& node) +{ + return node.attribute() ? namespace_uri(node.attribute(), node.parent()) : namespace_uri(node.node()); +} + +PUGI__FN char_t* normalize_space(char_t* buffer) +{ + char_t* write = buffer; + + for (char_t* it = buffer; *it; ) { + char_t ch = *it++; + + if (PUGI__IS_CHARTYPE(ch, ct_space)) { + // replace whitespace sequence with single space + while (PUGI__IS_CHARTYPE(*it, ct_space)) it++; + + // avoid leading spaces + if (write != buffer) *write++ = ' '; + } else *write++ = ch; + } + + // remove trailing space + if (write != buffer && PUGI__IS_CHARTYPE(write[-1], ct_space)) write--; + + // zero-terminate + *write = 0; + + return write; +} + +PUGI__FN char_t* translate(char_t* buffer, const char_t* from, const char_t* to, size_t to_length) +{ + char_t* write = buffer; + + while (*buffer) { + PUGI__DMC_VOLATILE char_t ch = *buffer++; + + const char_t* pos = find_char(from, ch); + + if (!pos) + *write++ = ch; // do not process + else if (static_cast(pos - from) < to_length) + *write++ = to[pos - from]; // replace + } + + // zero-terminate + *write = 0; + + return write; +} + +PUGI__FN unsigned char* translate_table_generate(xpath_allocator* alloc, const char_t* 
from, const char_t* to) +{ + unsigned char table[128] = {0}; + + while (*from) { + unsigned int fc = static_cast(*from); + unsigned int tc = static_cast(*to); + + if (fc >= 128 || tc >= 128) + return 0; + + // code=128 means "skip character" + if (!table[fc]) + table[fc] = static_cast(tc ? tc : 128); + + from++; + if (tc) to++; + } + + for (int i = 0; i < 128; ++i) + if (!table[i]) + table[i] = static_cast(i); + + void* result = alloc->allocate_nothrow(sizeof(table)); + + if (result) { + memcpy(result, table, sizeof(table)); + } + + return static_cast(result); +} + +PUGI__FN char_t* translate_table(char_t* buffer, const unsigned char* table) +{ + char_t* write = buffer; + + while (*buffer) { + char_t ch = *buffer++; + unsigned int index = static_cast(ch); + + if (index < 128) { + unsigned char code = table[index]; + + // code=128 means "skip character" (table size is 128 so 128 can be a special value) + // this code skips these characters without extra branches + *write = static_cast(code); + write += 1 - (code >> 7); + } else { + *write++ = ch; + } + } + + // zero-terminate + *write = 0; + + return write; +} + +inline bool is_xpath_attribute(const char_t* name) +{ + return !(starts_with(name, PUGIXML_TEXT("xmlns")) && (name[5] == 0 || name[5] == ':')); +} + +struct xpath_variable_boolean: xpath_variable { + xpath_variable_boolean(): xpath_variable(xpath_type_boolean), value(false) { + } + + bool value; + char_t name[1]; +}; + +struct xpath_variable_number: xpath_variable { + xpath_variable_number(): xpath_variable(xpath_type_number), value(0) { + } + + double value; + char_t name[1]; +}; + +struct xpath_variable_string: xpath_variable { + xpath_variable_string(): xpath_variable(xpath_type_string), value(0) { + } + + ~xpath_variable_string() { + if (value) xml_memory::deallocate(value); + } + + char_t* value; + char_t name[1]; +}; + +struct xpath_variable_node_set: xpath_variable { + xpath_variable_node_set(): xpath_variable(xpath_type_node_set) { + } + + 
xpath_node_set value; + char_t name[1]; +}; + +static const xpath_node_set dummy_node_set; + +PUGI__FN unsigned int hash_string(const char_t* str) +{ + // Jenkins one-at-a-time hash (http://en.wikipedia.org/wiki/Jenkins_hash_function#one-at-a-time) + unsigned int result = 0; + + while (*str) { + result += static_cast(*str++); + result += result << 10; + result ^= result >> 6; + } + + result += result << 3; + result ^= result >> 11; + result += result << 15; + + return result; +} + +template PUGI__FN T* new_xpath_variable(const char_t* name) +{ + size_t length = strlength(name); + if (length == 0) return 0; // empty variable names are invalid + + // $$ we can't use offsetof(T, name) because T is non-POD, so we just allocate additional length characters + void* memory = xml_memory::allocate(sizeof(T) + length * sizeof(char_t)); + if (!memory) return 0; + + T* result = new (memory) T(); + + memcpy(result->name, name, (length + 1) * sizeof(char_t)); + + return result; +} + +PUGI__FN xpath_variable* new_xpath_variable(xpath_value_type type, const char_t* name) +{ + switch (type) { + case xpath_type_node_set: + return new_xpath_variable(name); + + case xpath_type_number: + return new_xpath_variable(name); + + case xpath_type_string: + return new_xpath_variable(name); + + case xpath_type_boolean: + return new_xpath_variable(name); + + default: + return 0; + } +} + +template PUGI__FN void delete_xpath_variable(T* var) +{ + var->~T(); + xml_memory::deallocate(var); +} + +PUGI__FN void delete_xpath_variable(xpath_value_type type, xpath_variable* var) +{ + switch (type) { + case xpath_type_node_set: + delete_xpath_variable(static_cast(var)); + break; + + case xpath_type_number: + delete_xpath_variable(static_cast(var)); + break; + + case xpath_type_string: + delete_xpath_variable(static_cast(var)); + break; + + case xpath_type_boolean: + delete_xpath_variable(static_cast(var)); + break; + + default: + assert(!"Invalid variable type"); + } +} + +PUGI__FN bool 
copy_xpath_variable(xpath_variable* lhs, const xpath_variable* rhs) +{ + switch (rhs->type()) { + case xpath_type_node_set: + return lhs->set(static_cast(rhs)->value); + + case xpath_type_number: + return lhs->set(static_cast(rhs)->value); + + case xpath_type_string: + return lhs->set(static_cast(rhs)->value); + + case xpath_type_boolean: + return lhs->set(static_cast(rhs)->value); + + default: + assert(!"Invalid variable type"); + return false; + } +} + +PUGI__FN bool get_variable_scratch(char_t (&buffer)[32], xpath_variable_set* set, const char_t* begin, const char_t* end, xpath_variable** out_result) +{ + size_t length = static_cast(end - begin); + char_t* scratch = buffer; + + if (length >= sizeof(buffer) / sizeof(buffer[0])) { + // need to make dummy on-heap copy + scratch = static_cast(xml_memory::allocate((length + 1) * sizeof(char_t))); + if (!scratch) return false; + } + + // copy string to zero-terminated buffer and perform lookup + memcpy(scratch, begin, length * sizeof(char_t)); + scratch[length] = 0; + + *out_result = set->get(scratch); + + // free dummy buffer + if (scratch != buffer) xml_memory::deallocate(scratch); + + return true; +} +PUGI__NS_END + +// Internal node set class +PUGI__NS_BEGIN +PUGI__FN xpath_node_set::type_t xpath_get_order(const xpath_node* begin, const xpath_node* end) +{ + if (end - begin < 2) + return xpath_node_set::type_sorted; + + document_order_comparator cmp; + + bool first = cmp(begin[0], begin[1]); + + for (const xpath_node* it = begin + 1; it + 1 < end; ++it) + if (cmp(it[0], it[1]) != first) + return xpath_node_set::type_unsorted; + + return first ? xpath_node_set::type_sorted : xpath_node_set::type_sorted_reverse; +} + +PUGI__FN xpath_node_set::type_t xpath_sort(xpath_node* begin, xpath_node* end, xpath_node_set::type_t type, bool rev) +{ + xpath_node_set::type_t order = rev ? 
xpath_node_set::type_sorted_reverse : xpath_node_set::type_sorted; + + if (type == xpath_node_set::type_unsorted) { + xpath_node_set::type_t sorted = xpath_get_order(begin, end); + + if (sorted == xpath_node_set::type_unsorted) { + sort(begin, end, document_order_comparator()); + + type = xpath_node_set::type_sorted; + } else + type = sorted; + } + + if (type != order) reverse(begin, end); + + return order; +} + +PUGI__FN xpath_node xpath_first(const xpath_node* begin, const xpath_node* end, xpath_node_set::type_t type) +{ + if (begin == end) return xpath_node(); + + switch (type) { + case xpath_node_set::type_sorted: + return *begin; + + case xpath_node_set::type_sorted_reverse: + return *(end - 1); + + case xpath_node_set::type_unsorted: + return *min_element(begin, end, document_order_comparator()); + + default: + assert(!"Invalid node set type"); + return xpath_node(); + } +} + +class xpath_node_set_raw +{ + xpath_node_set::type_t _type; + + xpath_node* _begin; + xpath_node* _end; + xpath_node* _eos; + +public: + xpath_node_set_raw(): _type(xpath_node_set::type_unsorted), _begin(0), _end(0), _eos(0) { + } + + xpath_node* begin() const { + return _begin; + } + + xpath_node* end() const { + return _end; + } + + bool empty() const { + return _begin == _end; + } + + size_t size() const { + return static_cast(_end - _begin); + } + + xpath_node first() const { + return xpath_first(_begin, _end, _type); + } + + void push_back_grow(const xpath_node& node, xpath_allocator* alloc); + + void push_back(const xpath_node& node, xpath_allocator* alloc) { + if (_end != _eos) + *_end++ = node; + else + push_back_grow(node, alloc); + } + + void append(const xpath_node* begin_, const xpath_node* end_, xpath_allocator* alloc) { + if (begin_ == end_) return; + + size_t size_ = static_cast(_end - _begin); + size_t capacity = static_cast(_eos - _begin); + size_t count = static_cast(end_ - begin_); + + if (size_ + count > capacity) { + // reallocate the old array or allocate a new one 
+ xpath_node* data = static_cast(alloc->reallocate(_begin, capacity * sizeof(xpath_node), (size_ + count) * sizeof(xpath_node))); + assert(data); + + // finalize + _begin = data; + _end = data + size_; + _eos = data + size_ + count; + } + + memcpy(_end, begin_, count * sizeof(xpath_node)); + _end += count; + } + + void sort_do() { + _type = xpath_sort(_begin, _end, _type, false); + } + + void truncate(xpath_node* pos) { + assert(_begin <= pos && pos <= _end); + + _end = pos; + } + + void remove_duplicates() { + if (_type == xpath_node_set::type_unsorted) + sort(_begin, _end, duplicate_comparator()); + + _end = unique(_begin, _end); + } + + xpath_node_set::type_t type() const { + return _type; + } + + void set_type(xpath_node_set::type_t value) { + _type = value; + } +}; + +PUGI__FN_NO_INLINE void xpath_node_set_raw::push_back_grow(const xpath_node& node, xpath_allocator* alloc) +{ + size_t capacity = static_cast(_eos - _begin); + + // get new capacity (1.5x rule) + size_t new_capacity = capacity + capacity / 2 + 1; + + // reallocate the old array or allocate a new one + xpath_node* data = static_cast(alloc->reallocate(_begin, capacity * sizeof(xpath_node), new_capacity * sizeof(xpath_node))); + assert(data); + + // finalize + _begin = data; + _end = data + capacity; + _eos = data + new_capacity; + + // push + *_end++ = node; +} +PUGI__NS_END + +PUGI__NS_BEGIN +struct xpath_context { + xpath_node n; + size_t position, size; + + xpath_context(const xpath_node& n_, size_t position_, size_t size_): n(n_), position(position_), size(size_) { + } +}; + +enum lexeme_t { + lex_none = 0, + lex_equal, + lex_not_equal, + lex_less, + lex_greater, + lex_less_or_equal, + lex_greater_or_equal, + lex_plus, + lex_minus, + lex_multiply, + lex_union, + lex_var_ref, + lex_open_brace, + lex_close_brace, + lex_quoted_string, + lex_number, + lex_slash, + lex_double_slash, + lex_open_square_brace, + lex_close_square_brace, + lex_string, + lex_comma, + lex_axis_attribute, + lex_dot, + 
lex_double_dot, + lex_double_colon, + lex_eof +}; + +struct xpath_lexer_string { + const char_t* begin; + const char_t* end; + + xpath_lexer_string(): begin(0), end(0) { + } + + bool operator==(const char_t* other) const { + size_t length = static_cast(end - begin); + + return strequalrange(other, begin, length); + } +}; + +class xpath_lexer +{ + const char_t* _cur; + const char_t* _cur_lexeme_pos; + xpath_lexer_string _cur_lexeme_contents; + + lexeme_t _cur_lexeme; + +public: + explicit xpath_lexer(const char_t* query): _cur(query) { + next(); + } + + const char_t* state() const { + return _cur; + } + + void next() { + const char_t* cur = _cur; + + while (PUGI__IS_CHARTYPE(*cur, ct_space)) ++cur; + + // save lexeme position for error reporting + _cur_lexeme_pos = cur; + + switch (*cur) { + case 0: + _cur_lexeme = lex_eof; + break; + + case '>': + if (*(cur+1) == '=') { + cur += 2; + _cur_lexeme = lex_greater_or_equal; + } else { + cur += 1; + _cur_lexeme = lex_greater; + } + break; + + case '<': + if (*(cur+1) == '=') { + cur += 2; + _cur_lexeme = lex_less_or_equal; + } else { + cur += 1; + _cur_lexeme = lex_less; + } + break; + + case '!': + if (*(cur+1) == '=') { + cur += 2; + _cur_lexeme = lex_not_equal; + } else { + _cur_lexeme = lex_none; + } + break; + + case '=': + cur += 1; + _cur_lexeme = lex_equal; + + break; + + case '+': + cur += 1; + _cur_lexeme = lex_plus; + + break; + + case '-': + cur += 1; + _cur_lexeme = lex_minus; + + break; + + case '*': + cur += 1; + _cur_lexeme = lex_multiply; + + break; + + case '|': + cur += 1; + _cur_lexeme = lex_union; + + break; + + case '$': + cur += 1; + + if (PUGI__IS_CHARTYPEX(*cur, ctx_start_symbol)) { + _cur_lexeme_contents.begin = cur; + + while (PUGI__IS_CHARTYPEX(*cur, ctx_symbol)) cur++; + + if (cur[0] == ':' && PUGI__IS_CHARTYPEX(cur[1], ctx_symbol)) { // qname + cur++; // : + + while (PUGI__IS_CHARTYPEX(*cur, ctx_symbol)) cur++; + } + + _cur_lexeme_contents.end = cur; + + _cur_lexeme = lex_var_ref; + } else { 
+ _cur_lexeme = lex_none; + } + + break; + + case '(': + cur += 1; + _cur_lexeme = lex_open_brace; + + break; + + case ')': + cur += 1; + _cur_lexeme = lex_close_brace; + + break; + + case '[': + cur += 1; + _cur_lexeme = lex_open_square_brace; + + break; + + case ']': + cur += 1; + _cur_lexeme = lex_close_square_brace; + + break; + + case ',': + cur += 1; + _cur_lexeme = lex_comma; + + break; + + case '/': + if (*(cur+1) == '/') { + cur += 2; + _cur_lexeme = lex_double_slash; + } else { + cur += 1; + _cur_lexeme = lex_slash; + } + break; + + case '.': + if (*(cur+1) == '.') { + cur += 2; + _cur_lexeme = lex_double_dot; + } else if (PUGI__IS_CHARTYPEX(*(cur+1), ctx_digit)) { + _cur_lexeme_contents.begin = cur; // . + + ++cur; + + while (PUGI__IS_CHARTYPEX(*cur, ctx_digit)) cur++; + + _cur_lexeme_contents.end = cur; + + _cur_lexeme = lex_number; + } else { + cur += 1; + _cur_lexeme = lex_dot; + } + break; + + case '@': + cur += 1; + _cur_lexeme = lex_axis_attribute; + + break; + + case '"': + case '\'': { + char_t terminator = *cur; + + ++cur; + + _cur_lexeme_contents.begin = cur; + while (*cur && *cur != terminator) cur++; + _cur_lexeme_contents.end = cur; + + if (!*cur) + _cur_lexeme = lex_none; + else { + cur += 1; + _cur_lexeme = lex_quoted_string; + } + + break; + } + + case ':': + if (*(cur+1) == ':') { + cur += 2; + _cur_lexeme = lex_double_colon; + } else { + _cur_lexeme = lex_none; + } + break; + + default: + if (PUGI__IS_CHARTYPEX(*cur, ctx_digit)) { + _cur_lexeme_contents.begin = cur; + + while (PUGI__IS_CHARTYPEX(*cur, ctx_digit)) cur++; + + if (*cur == '.') { + cur++; + + while (PUGI__IS_CHARTYPEX(*cur, ctx_digit)) cur++; + } + + _cur_lexeme_contents.end = cur; + + _cur_lexeme = lex_number; + } else if (PUGI__IS_CHARTYPEX(*cur, ctx_start_symbol)) { + _cur_lexeme_contents.begin = cur; + + while (PUGI__IS_CHARTYPEX(*cur, ctx_symbol)) cur++; + + if (cur[0] == ':') { + if (cur[1] == '*') { // namespace test ncname:* + cur += 2; // :* + } else if 
(PUGI__IS_CHARTYPEX(cur[1], ctx_symbol)) { // namespace test qname + cur++; // : + + while (PUGI__IS_CHARTYPEX(*cur, ctx_symbol)) cur++; + } + } + + _cur_lexeme_contents.end = cur; + + _cur_lexeme = lex_string; + } else { + _cur_lexeme = lex_none; + } + } + + _cur = cur; + } + + lexeme_t current() const { + return _cur_lexeme; + } + + const char_t* current_pos() const { + return _cur_lexeme_pos; + } + + const xpath_lexer_string& contents() const { + assert(_cur_lexeme == lex_var_ref || _cur_lexeme == lex_number || _cur_lexeme == lex_string || _cur_lexeme == lex_quoted_string); + + return _cur_lexeme_contents; + } +}; + +enum ast_type_t { + ast_unknown, + ast_op_or, // left or right + ast_op_and, // left and right + ast_op_equal, // left = right + ast_op_not_equal, // left != right + ast_op_less, // left < right + ast_op_greater, // left > right + ast_op_less_or_equal, // left <= right + ast_op_greater_or_equal, // left >= right + ast_op_add, // left + right + ast_op_subtract, // left - right + ast_op_multiply, // left * right + ast_op_divide, // left / right + ast_op_mod, // left % right + ast_op_negate, // left - right + ast_op_union, // left | right + ast_predicate, // apply predicate to set; next points to next predicate + ast_filter, // select * from left where right + ast_string_constant, // string constant + ast_number_constant, // number constant + ast_variable, // variable + ast_func_last, // last() + ast_func_position, // position() + ast_func_count, // count(left) + ast_func_id, // id(left) + ast_func_local_name_0, // local-name() + ast_func_local_name_1, // local-name(left) + ast_func_namespace_uri_0, // namespace-uri() + ast_func_namespace_uri_1, // namespace-uri(left) + ast_func_name_0, // name() + ast_func_name_1, // name(left) + ast_func_string_0, // string() + ast_func_string_1, // string(left) + ast_func_concat, // concat(left, right, siblings) + ast_func_starts_with, // starts_with(left, right) + ast_func_contains, // contains(left, right) + 
ast_func_substring_before, // substring-before(left, right) + ast_func_substring_after, // substring-after(left, right) + ast_func_substring_2, // substring(left, right) + ast_func_substring_3, // substring(left, right, third) + ast_func_string_length_0, // string-length() + ast_func_string_length_1, // string-length(left) + ast_func_normalize_space_0, // normalize-space() + ast_func_normalize_space_1, // normalize-space(left) + ast_func_translate, // translate(left, right, third) + ast_func_boolean, // boolean(left) + ast_func_not, // not(left) + ast_func_true, // true() + ast_func_false, // false() + ast_func_lang, // lang(left) + ast_func_number_0, // number() + ast_func_number_1, // number(left) + ast_func_sum, // sum(left) + ast_func_floor, // floor(left) + ast_func_ceiling, // ceiling(left) + ast_func_round, // round(left) + ast_step, // process set left with step + ast_step_root, // select root node + + ast_opt_translate_table, // translate(left, right, third) where right/third are constants + ast_opt_compare_attribute // @name = 'string' +}; + +enum axis_t { + axis_ancestor, + axis_ancestor_or_self, + axis_attribute, + axis_child, + axis_descendant, + axis_descendant_or_self, + axis_following, + axis_following_sibling, + axis_namespace, + axis_parent, + axis_preceding, + axis_preceding_sibling, + axis_self +}; + +enum nodetest_t { + nodetest_none, + nodetest_name, + nodetest_type_node, + nodetest_type_comment, + nodetest_type_pi, + nodetest_type_text, + nodetest_pi, + nodetest_all, + nodetest_all_in_namespace +}; + +enum predicate_t { + predicate_default, + predicate_posinv, + predicate_constant, + predicate_constant_one +}; + +enum nodeset_eval_t { + nodeset_eval_all, + nodeset_eval_any, + nodeset_eval_first +}; + +template struct axis_to_type { + static const axis_t axis; +}; + +template const axis_t axis_to_type::axis = N; + +class xpath_ast_node +{ +private: + // node type + char _type; + char _rettype; + + // for ast_step + char _axis; + + // for 
ast_step/ast_predicate/ast_filter + char _test; + + // tree node structure + xpath_ast_node* _left; + xpath_ast_node* _right; + xpath_ast_node* _next; + + union { + // value for ast_string_constant + const char_t* string; + // value for ast_number_constant + double number; + // variable for ast_variable + xpath_variable* variable; + // node test for ast_step (node name/namespace/node type/pi target) + const char_t* nodetest; + // table for ast_opt_translate_table + const unsigned char* table; + } _data; + + xpath_ast_node(const xpath_ast_node&); + xpath_ast_node& operator=(const xpath_ast_node&); + + template static bool compare_eq(xpath_ast_node* lhs, xpath_ast_node* rhs, const xpath_context& c, const xpath_stack& stack, const Comp& comp) { + xpath_value_type lt = lhs->rettype(), rt = rhs->rettype(); + + if (lt != xpath_type_node_set && rt != xpath_type_node_set) { + if (lt == xpath_type_boolean || rt == xpath_type_boolean) + return comp(lhs->eval_boolean(c, stack), rhs->eval_boolean(c, stack)); + else if (lt == xpath_type_number || rt == xpath_type_number) + return comp(lhs->eval_number(c, stack), rhs->eval_number(c, stack)); + else if (lt == xpath_type_string || rt == xpath_type_string) { + xpath_allocator_capture cr(stack.result); + + xpath_string ls = lhs->eval_string(c, stack); + xpath_string rs = rhs->eval_string(c, stack); + + return comp(ls, rs); + } + } else if (lt == xpath_type_node_set && rt == xpath_type_node_set) { + xpath_allocator_capture cr(stack.result); + + xpath_node_set_raw ls = lhs->eval_node_set(c, stack, nodeset_eval_all); + xpath_node_set_raw rs = rhs->eval_node_set(c, stack, nodeset_eval_all); + + for (const xpath_node* li = ls.begin(); li != ls.end(); ++li) + for (const xpath_node* ri = rs.begin(); ri != rs.end(); ++ri) { + xpath_allocator_capture cri(stack.result); + + if (comp(string_value(*li, stack.result), string_value(*ri, stack.result))) + return true; + } + + return false; + } else { + if (lt == xpath_type_node_set) { + swap(lhs, 
rhs); + swap(lt, rt); + } + + if (lt == xpath_type_boolean) + return comp(lhs->eval_boolean(c, stack), rhs->eval_boolean(c, stack)); + else if (lt == xpath_type_number) { + xpath_allocator_capture cr(stack.result); + + double l = lhs->eval_number(c, stack); + xpath_node_set_raw rs = rhs->eval_node_set(c, stack, nodeset_eval_all); + + for (const xpath_node* ri = rs.begin(); ri != rs.end(); ++ri) { + xpath_allocator_capture cri(stack.result); + + if (comp(l, convert_string_to_number(string_value(*ri, stack.result).c_str()))) + return true; + } + + return false; + } else if (lt == xpath_type_string) { + xpath_allocator_capture cr(stack.result); + + xpath_string l = lhs->eval_string(c, stack); + xpath_node_set_raw rs = rhs->eval_node_set(c, stack, nodeset_eval_all); + + for (const xpath_node* ri = rs.begin(); ri != rs.end(); ++ri) { + xpath_allocator_capture cri(stack.result); + + if (comp(l, string_value(*ri, stack.result))) + return true; + } + + return false; + } + } + + assert(!"Wrong types"); + return false; + } + + static bool eval_once(xpath_node_set::type_t type, nodeset_eval_t eval) { + return type == xpath_node_set::type_sorted ? 
eval != nodeset_eval_all : eval == nodeset_eval_any; + } + + template static bool compare_rel(xpath_ast_node* lhs, xpath_ast_node* rhs, const xpath_context& c, const xpath_stack& stack, const Comp& comp) { + xpath_value_type lt = lhs->rettype(), rt = rhs->rettype(); + + if (lt != xpath_type_node_set && rt != xpath_type_node_set) + return comp(lhs->eval_number(c, stack), rhs->eval_number(c, stack)); + else if (lt == xpath_type_node_set && rt == xpath_type_node_set) { + xpath_allocator_capture cr(stack.result); + + xpath_node_set_raw ls = lhs->eval_node_set(c, stack, nodeset_eval_all); + xpath_node_set_raw rs = rhs->eval_node_set(c, stack, nodeset_eval_all); + + for (const xpath_node* li = ls.begin(); li != ls.end(); ++li) { + xpath_allocator_capture cri(stack.result); + + double l = convert_string_to_number(string_value(*li, stack.result).c_str()); + + for (const xpath_node* ri = rs.begin(); ri != rs.end(); ++ri) { + xpath_allocator_capture crii(stack.result); + + if (comp(l, convert_string_to_number(string_value(*ri, stack.result).c_str()))) + return true; + } + } + + return false; + } else if (lt != xpath_type_node_set && rt == xpath_type_node_set) { + xpath_allocator_capture cr(stack.result); + + double l = lhs->eval_number(c, stack); + xpath_node_set_raw rs = rhs->eval_node_set(c, stack, nodeset_eval_all); + + for (const xpath_node* ri = rs.begin(); ri != rs.end(); ++ri) { + xpath_allocator_capture cri(stack.result); + + if (comp(l, convert_string_to_number(string_value(*ri, stack.result).c_str()))) + return true; + } + + return false; + } else if (lt == xpath_type_node_set && rt != xpath_type_node_set) { + xpath_allocator_capture cr(stack.result); + + xpath_node_set_raw ls = lhs->eval_node_set(c, stack, nodeset_eval_all); + double r = rhs->eval_number(c, stack); + + for (const xpath_node* li = ls.begin(); li != ls.end(); ++li) { + xpath_allocator_capture cri(stack.result); + + if (comp(convert_string_to_number(string_value(*li, stack.result).c_str()), r)) + 
return true; + } + + return false; + } else { + assert(!"Wrong types"); + return false; + } + } + + static void apply_predicate_boolean(xpath_node_set_raw& ns, size_t first, xpath_ast_node* expr, const xpath_stack& stack, bool once) { + assert(ns.size() >= first); + assert(expr->rettype() != xpath_type_number); + + size_t i = 1; + size_t size = ns.size() - first; + + xpath_node* last = ns.begin() + first; + + // remove_if... or well, sort of + for (xpath_node* it = last; it != ns.end(); ++it, ++i) { + xpath_context c(*it, i, size); + + if (expr->eval_boolean(c, stack)) { + *last++ = *it; + + if (once) break; + } + } + + ns.truncate(last); + } + + static void apply_predicate_number(xpath_node_set_raw& ns, size_t first, xpath_ast_node* expr, const xpath_stack& stack, bool once) { + assert(ns.size() >= first); + assert(expr->rettype() == xpath_type_number); + + size_t i = 1; + size_t size = ns.size() - first; + + xpath_node* last = ns.begin() + first; + + // remove_if... or well, sort of + for (xpath_node* it = last; it != ns.end(); ++it, ++i) { + xpath_context c(*it, i, size); + + if (expr->eval_number(c, stack) == i) { + *last++ = *it; + + if (once) break; + } + } + + ns.truncate(last); + } + + static void apply_predicate_number_const(xpath_node_set_raw& ns, size_t first, xpath_ast_node* expr, const xpath_stack& stack) { + assert(ns.size() >= first); + assert(expr->rettype() == xpath_type_number); + + size_t size = ns.size() - first; + + xpath_node* last = ns.begin() + first; + + xpath_context c(xpath_node(), 1, size); + + double er = expr->eval_number(c, stack); + + if (er >= 1.0 && er <= size) { + size_t eri = static_cast(er); + + if (er == eri) { + xpath_node r = last[eri - 1]; + + *last++ = r; + } + } + + ns.truncate(last); + } + + void apply_predicate(xpath_node_set_raw& ns, size_t first, const xpath_stack& stack, bool once) { + if (ns.size() == first) return; + + assert(_type == ast_filter || _type == ast_predicate); + + if (_test == predicate_constant || 
_test == predicate_constant_one) + apply_predicate_number_const(ns, first, _right, stack); + else if (_right->rettype() == xpath_type_number) + apply_predicate_number(ns, first, _right, stack, once); + else + apply_predicate_boolean(ns, first, _right, stack, once); + } + + void apply_predicates(xpath_node_set_raw& ns, size_t first, const xpath_stack& stack, nodeset_eval_t eval) { + if (ns.size() == first) return; + + bool last_once = eval_once(ns.type(), eval); + + for (xpath_ast_node* pred = _right; pred; pred = pred->_next) + pred->apply_predicate(ns, first, stack, !pred->_next && last_once); + } + + bool step_push(xpath_node_set_raw& ns, xml_attribute_struct* a, xml_node_struct* parent, xpath_allocator* alloc) { + assert(a); + + const char_t* name = a->name ? a->name + 0 : PUGIXML_TEXT(""); + + switch (_test) { + case nodetest_name: + if (strequal(name, _data.nodetest) && is_xpath_attribute(name)) { + ns.push_back(xpath_node(xml_attribute(a), xml_node(parent)), alloc); + return true; + } + break; + + case nodetest_type_node: + case nodetest_all: + if (is_xpath_attribute(name)) { + ns.push_back(xpath_node(xml_attribute(a), xml_node(parent)), alloc); + return true; + } + break; + + case nodetest_all_in_namespace: + if (starts_with(name, _data.nodetest) && is_xpath_attribute(name)) { + ns.push_back(xpath_node(xml_attribute(a), xml_node(parent)), alloc); + return true; + } + break; + + default: + ; + } + + return false; + } + + bool step_push(xpath_node_set_raw& ns, xml_node_struct* n, xpath_allocator* alloc) { + assert(n); + + xml_node_type type = PUGI__NODETYPE(n); + + switch (_test) { + case nodetest_name: + if (type == node_element && n->name && strequal(n->name, _data.nodetest)) { + ns.push_back(xml_node(n), alloc); + return true; + } + break; + + case nodetest_type_node: + ns.push_back(xml_node(n), alloc); + return true; + + case nodetest_type_comment: + if (type == node_comment) { + ns.push_back(xml_node(n), alloc); + return true; + } + break; + + case 
nodetest_type_text: + if (type == node_pcdata || type == node_cdata) { + ns.push_back(xml_node(n), alloc); + return true; + } + break; + + case nodetest_type_pi: + if (type == node_pi) { + ns.push_back(xml_node(n), alloc); + return true; + } + break; + + case nodetest_pi: + if (type == node_pi && n->name && strequal(n->name, _data.nodetest)) { + ns.push_back(xml_node(n), alloc); + return true; + } + break; + + case nodetest_all: + if (type == node_element) { + ns.push_back(xml_node(n), alloc); + return true; + } + break; + + case nodetest_all_in_namespace: + if (type == node_element && n->name && starts_with(n->name, _data.nodetest)) { + ns.push_back(xml_node(n), alloc); + return true; + } + break; + + default: + assert(!"Unknown axis"); + } + + return false; + } + + template void step_fill(xpath_node_set_raw& ns, xml_node_struct* n, xpath_allocator* alloc, bool once, T) { + const axis_t axis = T::axis; + + switch (axis) { + case axis_attribute: { + for (xml_attribute_struct* a = n->first_attribute; a; a = a->next_attribute) + if (step_push(ns, a, n, alloc) & once) + return; + + break; + } + + case axis_child: { + for (xml_node_struct* c = n->first_child; c; c = c->next_sibling) + if (step_push(ns, c, alloc) & once) + return; + + break; + } + + case axis_descendant: + case axis_descendant_or_self: { + if (axis == axis_descendant_or_self) + if (step_push(ns, n, alloc) & once) + return; + + xml_node_struct* cur = n->first_child; + + while (cur) { + if (step_push(ns, cur, alloc) & once) + return; + + if (cur->first_child) + cur = cur->first_child; + else { + while (!cur->next_sibling) { + cur = cur->parent; + + if (cur == n) return; + } + + cur = cur->next_sibling; + } + } + + break; + } + + case axis_following_sibling: { + for (xml_node_struct* c = n->next_sibling; c; c = c->next_sibling) + if (step_push(ns, c, alloc) & once) + return; + + break; + } + + case axis_preceding_sibling: { + for (xml_node_struct* c = n->prev_sibling_c; c->next_sibling; c = 
c->prev_sibling_c) + if (step_push(ns, c, alloc) & once) + return; + + break; + } + + case axis_following: { + xml_node_struct* cur = n; + + // exit from this node so that we don't include descendants + while (!cur->next_sibling) { + cur = cur->parent; + + if (!cur) return; + } + + cur = cur->next_sibling; + + while (cur) { + if (step_push(ns, cur, alloc) & once) + return; + + if (cur->first_child) + cur = cur->first_child; + else { + while (!cur->next_sibling) { + cur = cur->parent; + + if (!cur) return; + } + + cur = cur->next_sibling; + } + } + + break; + } + + case axis_preceding: { + xml_node_struct* cur = n; + + // exit from this node so that we don't include descendants + while (!cur->prev_sibling_c->next_sibling) { + cur = cur->parent; + + if (!cur) return; + } + + cur = cur->prev_sibling_c; + + while (cur) { + if (cur->first_child) + cur = cur->first_child->prev_sibling_c; + else { + // leaf node, can't be ancestor + if (step_push(ns, cur, alloc) & once) + return; + + while (!cur->prev_sibling_c->next_sibling) { + cur = cur->parent; + + if (!cur) return; + + if (!node_is_ancestor(cur, n)) + if (step_push(ns, cur, alloc) & once) + return; + } + + cur = cur->prev_sibling_c; + } + } + + break; + } + + case axis_ancestor: + case axis_ancestor_or_self: { + if (axis == axis_ancestor_or_self) + if (step_push(ns, n, alloc) & once) + return; + + xml_node_struct* cur = n->parent; + + while (cur) { + if (step_push(ns, cur, alloc) & once) + return; + + cur = cur->parent; + } + + break; + } + + case axis_self: { + step_push(ns, n, alloc); + + break; + } + + case axis_parent: { + if (n->parent) + step_push(ns, n->parent, alloc); + + break; + } + + default: + assert(!"Unimplemented axis"); + } + } + + template void step_fill(xpath_node_set_raw& ns, xml_attribute_struct* a, xml_node_struct* p, xpath_allocator* alloc, bool once, T v) { + const axis_t axis = T::axis; + + switch (axis) { + case axis_ancestor: + case axis_ancestor_or_self: { + if (axis == 
axis_ancestor_or_self && _test == nodetest_type_node) // reject attributes based on principal node type test + if (step_push(ns, a, p, alloc) & once) + return; + + xml_node_struct* cur = p; + + while (cur) { + if (step_push(ns, cur, alloc) & once) + return; + + cur = cur->parent; + } + + break; + } + + case axis_descendant_or_self: + case axis_self: { + if (_test == nodetest_type_node) // reject attributes based on principal node type test + step_push(ns, a, p, alloc); + + break; + } + + case axis_following: { + xml_node_struct* cur = p; + + while (cur) { + if (cur->first_child) + cur = cur->first_child; + else { + while (!cur->next_sibling) { + cur = cur->parent; + + if (!cur) return; + } + + cur = cur->next_sibling; + } + + if (step_push(ns, cur, alloc) & once) + return; + } + + break; + } + + case axis_parent: { + step_push(ns, p, alloc); + + break; + } + + case axis_preceding: { + // preceding:: axis does not include attribute nodes and attribute ancestors (they are the same as parent's ancestors), so we can reuse node preceding + step_fill(ns, p, alloc, once, v); + break; + } + + default: + assert(!"Unimplemented axis"); + } + } + + template void step_fill(xpath_node_set_raw& ns, const xpath_node& xn, xpath_allocator* alloc, bool once, T v) { + const axis_t axis = T::axis; + const bool axis_has_attributes = (axis == axis_ancestor || axis == axis_ancestor_or_self || axis == axis_descendant_or_self || axis == axis_following || axis == axis_parent || axis == axis_preceding || axis == axis_self); + + if (xn.node()) + step_fill(ns, xn.node().internal_object(), alloc, once, v); + else if (axis_has_attributes && xn.attribute() && xn.parent()) + step_fill(ns, xn.attribute().internal_object(), xn.parent().internal_object(), alloc, once, v); + } + + template xpath_node_set_raw step_do(const xpath_context& c, const xpath_stack& stack, nodeset_eval_t eval, T v) { + const axis_t axis = T::axis; + const bool axis_reverse = (axis == axis_ancestor || axis == 
axis_ancestor_or_self || axis == axis_preceding || axis == axis_preceding_sibling); + const xpath_node_set::type_t axis_type = axis_reverse ? xpath_node_set::type_sorted_reverse : xpath_node_set::type_sorted; + + bool once = + (axis == axis_attribute && _test == nodetest_name) || + (!_right && eval_once(axis_type, eval)) || + (_right && !_right->_next && _right->_test == predicate_constant_one); + + xpath_node_set_raw ns; + ns.set_type(axis_type); + + if (_left) { + xpath_node_set_raw s = _left->eval_node_set(c, stack, nodeset_eval_all); + + // self axis preserves the original order + if (axis == axis_self) ns.set_type(s.type()); + + for (const xpath_node* it = s.begin(); it != s.end(); ++it) { + size_t size = ns.size(); + + // in general, all axes generate elements in a particular order, but there is no order guarantee if axis is applied to two nodes + if (axis != axis_self && size != 0) ns.set_type(xpath_node_set::type_unsorted); + + step_fill(ns, *it, stack.result, once, v); + if (_right) apply_predicates(ns, size, stack, eval); + } + } else { + step_fill(ns, c.n, stack.result, once, v); + if (_right) apply_predicates(ns, 0, stack, eval); + } + + // child, attribute and self axes always generate unique set of nodes + // for other axis, if the set stayed sorted, it stayed unique because the traversal algorithms do not visit the same node twice + if (axis != axis_child && axis != axis_attribute && axis != axis_self && ns.type() == xpath_node_set::type_unsorted) + ns.remove_duplicates(); + + return ns; + } + +public: + xpath_ast_node(ast_type_t type, xpath_value_type rettype_, const char_t* value): + _type(static_cast(type)), _rettype(static_cast(rettype_)), _axis(0), _test(0), _left(0), _right(0), _next(0) { + assert(type == ast_string_constant); + _data.string = value; + } + + xpath_ast_node(ast_type_t type, xpath_value_type rettype_, double value): + _type(static_cast(type)), _rettype(static_cast(rettype_)), _axis(0), _test(0), _left(0), _right(0), _next(0) { + 
assert(type == ast_number_constant); + _data.number = value; + } + + xpath_ast_node(ast_type_t type, xpath_value_type rettype_, xpath_variable* value): + _type(static_cast(type)), _rettype(static_cast(rettype_)), _axis(0), _test(0), _left(0), _right(0), _next(0) { + assert(type == ast_variable); + _data.variable = value; + } + + xpath_ast_node(ast_type_t type, xpath_value_type rettype_, xpath_ast_node* left = 0, xpath_ast_node* right = 0): + _type(static_cast(type)), _rettype(static_cast(rettype_)), _axis(0), _test(0), _left(left), _right(right), _next(0) { + } + + xpath_ast_node(ast_type_t type, xpath_ast_node* left, axis_t axis, nodetest_t test, const char_t* contents): + _type(static_cast(type)), _rettype(xpath_type_node_set), _axis(static_cast(axis)), _test(static_cast(test)), _left(left), _right(0), _next(0) { + assert(type == ast_step); + _data.nodetest = contents; + } + + xpath_ast_node(ast_type_t type, xpath_ast_node* left, xpath_ast_node* right, predicate_t test): + _type(static_cast(type)), _rettype(xpath_type_node_set), _axis(0), _test(static_cast(test)), _left(left), _right(right), _next(0) { + assert(type == ast_filter || type == ast_predicate); + } + + void set_next(xpath_ast_node* value) { + _next = value; + } + + void set_right(xpath_ast_node* value) { + _right = value; + } + + bool eval_boolean(const xpath_context& c, const xpath_stack& stack) { + switch (_type) { + case ast_op_or: + return _left->eval_boolean(c, stack) || _right->eval_boolean(c, stack); + + case ast_op_and: + return _left->eval_boolean(c, stack) && _right->eval_boolean(c, stack); + + case ast_op_equal: + return compare_eq(_left, _right, c, stack, equal_to()); + + case ast_op_not_equal: + return compare_eq(_left, _right, c, stack, not_equal_to()); + + case ast_op_less: + return compare_rel(_left, _right, c, stack, less()); + + case ast_op_greater: + return compare_rel(_right, _left, c, stack, less()); + + case ast_op_less_or_equal: + return compare_rel(_left, _right, c, stack, 
less_equal()); + + case ast_op_greater_or_equal: + return compare_rel(_right, _left, c, stack, less_equal()); + + case ast_func_starts_with: { + xpath_allocator_capture cr(stack.result); + + xpath_string lr = _left->eval_string(c, stack); + xpath_string rr = _right->eval_string(c, stack); + + return starts_with(lr.c_str(), rr.c_str()); + } + + case ast_func_contains: { + xpath_allocator_capture cr(stack.result); + + xpath_string lr = _left->eval_string(c, stack); + xpath_string rr = _right->eval_string(c, stack); + + return find_substring(lr.c_str(), rr.c_str()) != 0; + } + + case ast_func_boolean: + return _left->eval_boolean(c, stack); + + case ast_func_not: + return !_left->eval_boolean(c, stack); + + case ast_func_true: + return true; + + case ast_func_false: + return false; + + case ast_func_lang: { + if (c.n.attribute()) return false; + + xpath_allocator_capture cr(stack.result); + + xpath_string lang = _left->eval_string(c, stack); + + for (xml_node n = c.n.node(); n; n = n.parent()) { + xml_attribute a = n.attribute(PUGIXML_TEXT("xml:lang")); + + if (a) { + const char_t* value = a.value(); + + // strnicmp / strncasecmp is not portable + for (const char_t* lit = lang.c_str(); *lit; ++lit) { + if (tolower_ascii(*lit) != tolower_ascii(*value)) return false; + ++value; + } + + return *value == 0 || *value == '-'; + } + } + + return false; + } + + case ast_opt_compare_attribute: { + const char_t* value = (_right->_type == ast_string_constant) ? 
_right->_data.string : _right->_data.variable->get_string(); + + xml_attribute attr = c.n.node().attribute(_left->_data.nodetest); + + return attr && strequal(attr.value(), value) && is_xpath_attribute(attr.name()); + } + + case ast_variable: { + assert(_rettype == _data.variable->type()); + + if (_rettype == xpath_type_boolean) + return _data.variable->get_boolean(); + + // fallthrough to type conversion + } + + default: { + switch (_rettype) { + case xpath_type_number: + return convert_number_to_boolean(eval_number(c, stack)); + + case xpath_type_string: { + xpath_allocator_capture cr(stack.result); + + return !eval_string(c, stack).empty(); + } + + case xpath_type_node_set: { + xpath_allocator_capture cr(stack.result); + + return !eval_node_set(c, stack, nodeset_eval_any).empty(); + } + + default: + assert(!"Wrong expression for return type boolean"); + return false; + } + } + } + } + + double eval_number(const xpath_context& c, const xpath_stack& stack) { + switch (_type) { + case ast_op_add: + return _left->eval_number(c, stack) + _right->eval_number(c, stack); + + case ast_op_subtract: + return _left->eval_number(c, stack) - _right->eval_number(c, stack); + + case ast_op_multiply: + return _left->eval_number(c, stack) * _right->eval_number(c, stack); + + case ast_op_divide: + return _left->eval_number(c, stack) / _right->eval_number(c, stack); + + case ast_op_mod: + return fmod(_left->eval_number(c, stack), _right->eval_number(c, stack)); + + case ast_op_negate: + return -_left->eval_number(c, stack); + + case ast_number_constant: + return _data.number; + + case ast_func_last: + return static_cast(c.size); + + case ast_func_position: + return static_cast(c.position); + + case ast_func_count: { + xpath_allocator_capture cr(stack.result); + + return static_cast(_left->eval_node_set(c, stack, nodeset_eval_all).size()); + } + + case ast_func_string_length_0: { + xpath_allocator_capture cr(stack.result); + + return static_cast(string_value(c.n, 
stack.result).length()); + } + + case ast_func_string_length_1: { + xpath_allocator_capture cr(stack.result); + + return static_cast(_left->eval_string(c, stack).length()); + } + + case ast_func_number_0: { + xpath_allocator_capture cr(stack.result); + + return convert_string_to_number(string_value(c.n, stack.result).c_str()); + } + + case ast_func_number_1: + return _left->eval_number(c, stack); + + case ast_func_sum: { + xpath_allocator_capture cr(stack.result); + + double r = 0; + + xpath_node_set_raw ns = _left->eval_node_set(c, stack, nodeset_eval_all); + + for (const xpath_node* it = ns.begin(); it != ns.end(); ++it) { + xpath_allocator_capture cri(stack.result); + + r += convert_string_to_number(string_value(*it, stack.result).c_str()); + } + + return r; + } + + case ast_func_floor: { + double r = _left->eval_number(c, stack); + + return r == r ? floor(r) : r; + } + + case ast_func_ceiling: { + double r = _left->eval_number(c, stack); + + return r == r ? ceil(r) : r; + } + + case ast_func_round: + return round_nearest_nzero(_left->eval_number(c, stack)); + + case ast_variable: { + assert(_rettype == _data.variable->type()); + + if (_rettype == xpath_type_number) + return _data.variable->get_number(); + + // fallthrough to type conversion + } + + default: { + switch (_rettype) { + case xpath_type_boolean: + return eval_boolean(c, stack) ? 
1 : 0; + + case xpath_type_string: { + xpath_allocator_capture cr(stack.result); + + return convert_string_to_number(eval_string(c, stack).c_str()); + } + + case xpath_type_node_set: { + xpath_allocator_capture cr(stack.result); + + return convert_string_to_number(eval_string(c, stack).c_str()); + } + + default: + assert(!"Wrong expression for return type number"); + return 0; + } + + } + } + } + + xpath_string eval_string_concat(const xpath_context& c, const xpath_stack& stack) { + assert(_type == ast_func_concat); + + xpath_allocator_capture ct(stack.temp); + + // count the string number + size_t count = 1; + for (xpath_ast_node* nc = _right; nc; nc = nc->_next) count++; + + // gather all strings + xpath_string static_buffer[4]; + xpath_string* buffer = static_buffer; + + // allocate on-heap for large concats + if (count > sizeof(static_buffer) / sizeof(static_buffer[0])) { + buffer = static_cast(stack.temp->allocate(count * sizeof(xpath_string))); + assert(buffer); + } + + // evaluate all strings to temporary stack + xpath_stack swapped_stack = {stack.temp, stack.result}; + + buffer[0] = _left->eval_string(c, swapped_stack); + + size_t pos = 1; + for (xpath_ast_node* n = _right; n; n = n->_next, ++pos) buffer[pos] = n->eval_string(c, swapped_stack); + assert(pos == count); + + // get total length + size_t length = 0; + for (size_t i = 0; i < count; ++i) length += buffer[i].length(); + + // create final string + char_t* result = static_cast(stack.result->allocate((length + 1) * sizeof(char_t))); + assert(result); + + char_t* ri = result; + + for (size_t j = 0; j < count; ++j) + for (const char_t* bi = buffer[j].c_str(); *bi; ++bi) + *ri++ = *bi; + + *ri = 0; + + return xpath_string::from_heap_preallocated(result, ri); + } + + xpath_string eval_string(const xpath_context& c, const xpath_stack& stack) { + switch (_type) { + case ast_string_constant: + return xpath_string::from_const(_data.string); + + case ast_func_local_name_0: { + xpath_node na = c.n; + + return 
xpath_string::from_const(local_name(na)); + } + + case ast_func_local_name_1: { + xpath_allocator_capture cr(stack.result); + + xpath_node_set_raw ns = _left->eval_node_set(c, stack, nodeset_eval_first); + xpath_node na = ns.first(); + + return xpath_string::from_const(local_name(na)); + } + + case ast_func_name_0: { + xpath_node na = c.n; + + return xpath_string::from_const(qualified_name(na)); + } + + case ast_func_name_1: { + xpath_allocator_capture cr(stack.result); + + xpath_node_set_raw ns = _left->eval_node_set(c, stack, nodeset_eval_first); + xpath_node na = ns.first(); + + return xpath_string::from_const(qualified_name(na)); + } + + case ast_func_namespace_uri_0: { + xpath_node na = c.n; + + return xpath_string::from_const(namespace_uri(na)); + } + + case ast_func_namespace_uri_1: { + xpath_allocator_capture cr(stack.result); + + xpath_node_set_raw ns = _left->eval_node_set(c, stack, nodeset_eval_first); + xpath_node na = ns.first(); + + return xpath_string::from_const(namespace_uri(na)); + } + + case ast_func_string_0: + return string_value(c.n, stack.result); + + case ast_func_string_1: + return _left->eval_string(c, stack); + + case ast_func_concat: + return eval_string_concat(c, stack); + + case ast_func_substring_before: { + xpath_allocator_capture cr(stack.temp); + + xpath_stack swapped_stack = {stack.temp, stack.result}; + + xpath_string s = _left->eval_string(c, swapped_stack); + xpath_string p = _right->eval_string(c, swapped_stack); + + const char_t* pos = find_substring(s.c_str(), p.c_str()); + + return pos ? 
xpath_string::from_heap(s.c_str(), pos, stack.result) : xpath_string(); + } + + case ast_func_substring_after: { + xpath_allocator_capture cr(stack.temp); + + xpath_stack swapped_stack = {stack.temp, stack.result}; + + xpath_string s = _left->eval_string(c, swapped_stack); + xpath_string p = _right->eval_string(c, swapped_stack); + + const char_t* pos = find_substring(s.c_str(), p.c_str()); + if (!pos) return xpath_string(); + + const char_t* rbegin = pos + p.length(); + const char_t* rend = s.c_str() + s.length(); + + return s.uses_heap() ? xpath_string::from_heap(rbegin, rend, stack.result) : xpath_string::from_const(rbegin); + } + + case ast_func_substring_2: { + xpath_allocator_capture cr(stack.temp); + + xpath_stack swapped_stack = {stack.temp, stack.result}; + + xpath_string s = _left->eval_string(c, swapped_stack); + size_t s_length = s.length(); + + double first = round_nearest(_right->eval_number(c, stack)); + + if (is_nan(first)) return xpath_string(); // NaN + else if (first >= s_length + 1) return xpath_string(); + + size_t pos = first < 1 ? 1 : static_cast(first); + assert(1 <= pos && pos <= s_length + 1); + + const char_t* rbegin = s.c_str() + (pos - 1); + const char_t* rend = s.c_str() + s.length(); + + return s.uses_heap() ? xpath_string::from_heap(rbegin, rend, stack.result) : xpath_string::from_const(rbegin); + } + + case ast_func_substring_3: { + xpath_allocator_capture cr(stack.temp); + + xpath_stack swapped_stack = {stack.temp, stack.result}; + + xpath_string s = _left->eval_string(c, swapped_stack); + size_t s_length = s.length(); + + double first = round_nearest(_right->eval_number(c, stack)); + double last = first + round_nearest(_right->_next->eval_number(c, stack)); + + if (is_nan(first) || is_nan(last)) return xpath_string(); + else if (first >= s_length + 1) return xpath_string(); + else if (first >= last) return xpath_string(); + else if (last < 1) return xpath_string(); + + size_t pos = first < 1 ? 
1 : static_cast(first); + size_t end = last >= s_length + 1 ? s_length + 1 : static_cast(last); + + assert(1 <= pos && pos <= end && end <= s_length + 1); + const char_t* rbegin = s.c_str() + (pos - 1); + const char_t* rend = s.c_str() + (end - 1); + + return (end == s_length + 1 && !s.uses_heap()) ? xpath_string::from_const(rbegin) : xpath_string::from_heap(rbegin, rend, stack.result); + } + + case ast_func_normalize_space_0: { + xpath_string s = string_value(c.n, stack.result); + + char_t* begin = s.data(stack.result); + char_t* end = normalize_space(begin); + + return xpath_string::from_heap_preallocated(begin, end); + } + + case ast_func_normalize_space_1: { + xpath_string s = _left->eval_string(c, stack); + + char_t* begin = s.data(stack.result); + char_t* end = normalize_space(begin); + + return xpath_string::from_heap_preallocated(begin, end); + } + + case ast_func_translate: { + xpath_allocator_capture cr(stack.temp); + + xpath_stack swapped_stack = {stack.temp, stack.result}; + + xpath_string s = _left->eval_string(c, stack); + xpath_string from = _right->eval_string(c, swapped_stack); + xpath_string to = _right->_next->eval_string(c, swapped_stack); + + char_t* begin = s.data(stack.result); + char_t* end = translate(begin, from.c_str(), to.c_str(), to.length()); + + return xpath_string::from_heap_preallocated(begin, end); + } + + case ast_opt_translate_table: { + xpath_string s = _left->eval_string(c, stack); + + char_t* begin = s.data(stack.result); + char_t* end = translate_table(begin, _data.table); + + return xpath_string::from_heap_preallocated(begin, end); + } + + case ast_variable: { + assert(_rettype == _data.variable->type()); + + if (_rettype == xpath_type_string) + return xpath_string::from_const(_data.variable->get_string()); + + // fallthrough to type conversion + } + + default: { + switch (_rettype) { + case xpath_type_boolean: + return xpath_string::from_const(eval_boolean(c, stack) ? 
PUGIXML_TEXT("true") : PUGIXML_TEXT("false")); + + case xpath_type_number: + return convert_number_to_string(eval_number(c, stack), stack.result); + + case xpath_type_node_set: { + xpath_allocator_capture cr(stack.temp); + + xpath_stack swapped_stack = {stack.temp, stack.result}; + + xpath_node_set_raw ns = eval_node_set(c, swapped_stack, nodeset_eval_first); + return ns.empty() ? xpath_string() : string_value(ns.first(), stack.result); + } + + default: + assert(!"Wrong expression for return type string"); + return xpath_string(); + } + } + } + } + + xpath_node_set_raw eval_node_set(const xpath_context& c, const xpath_stack& stack, nodeset_eval_t eval) { + switch (_type) { + case ast_op_union: { + xpath_allocator_capture cr(stack.temp); + + xpath_stack swapped_stack = {stack.temp, stack.result}; + + xpath_node_set_raw ls = _left->eval_node_set(c, swapped_stack, eval); + xpath_node_set_raw rs = _right->eval_node_set(c, stack, eval); + + // we can optimize merging two sorted sets, but this is a very rare operation, so don't bother + rs.set_type(xpath_node_set::type_unsorted); + + rs.append(ls.begin(), ls.end(), stack.result); + rs.remove_duplicates(); + + return rs; + } + + case ast_filter: { + xpath_node_set_raw set = _left->eval_node_set(c, stack, _test == predicate_constant_one ? 
nodeset_eval_first : nodeset_eval_all); + + // either expression is a number or it contains position() call; sort by document order + if (_test != predicate_posinv) set.sort_do(); + + bool once = eval_once(set.type(), eval); + + apply_predicate(set, 0, stack, once); + + return set; + } + + case ast_func_id: + return xpath_node_set_raw(); + + case ast_step: { + switch (_axis) { + case axis_ancestor: + return step_do(c, stack, eval, axis_to_type()); + + case axis_ancestor_or_self: + return step_do(c, stack, eval, axis_to_type()); + + case axis_attribute: + return step_do(c, stack, eval, axis_to_type()); + + case axis_child: + return step_do(c, stack, eval, axis_to_type()); + + case axis_descendant: + return step_do(c, stack, eval, axis_to_type()); + + case axis_descendant_or_self: + return step_do(c, stack, eval, axis_to_type()); + + case axis_following: + return step_do(c, stack, eval, axis_to_type()); + + case axis_following_sibling: + return step_do(c, stack, eval, axis_to_type()); + + case axis_namespace: + // namespaced axis is not supported + return xpath_node_set_raw(); + + case axis_parent: + return step_do(c, stack, eval, axis_to_type()); + + case axis_preceding: + return step_do(c, stack, eval, axis_to_type()); + + case axis_preceding_sibling: + return step_do(c, stack, eval, axis_to_type()); + + case axis_self: + return step_do(c, stack, eval, axis_to_type()); + + default: + assert(!"Unknown axis"); + return xpath_node_set_raw(); + } + } + + case ast_step_root: { + assert(!_right); // root step can't have any predicates + + xpath_node_set_raw ns; + + ns.set_type(xpath_node_set::type_sorted); + + if (c.n.node()) ns.push_back(c.n.node().root(), stack.result); + else if (c.n.attribute()) ns.push_back(c.n.parent().root(), stack.result); + + return ns; + } + + case ast_variable: { + assert(_rettype == _data.variable->type()); + + if (_rettype == xpath_type_node_set) { + const xpath_node_set& s = _data.variable->get_node_set(); + + xpath_node_set_raw ns; + + 
ns.set_type(s.type()); + ns.append(s.begin(), s.end(), stack.result); + + return ns; + } + + // fallthrough to type conversion + } + + default: + assert(!"Wrong expression for return type node set"); + return xpath_node_set_raw(); + } + } + + void optimize(xpath_allocator* alloc) { + if (_left) _left->optimize(alloc); + if (_right) _right->optimize(alloc); + if (_next) _next->optimize(alloc); + + optimize_self(alloc); + } + + void optimize_self(xpath_allocator* alloc) { + // Rewrite [position()=expr] with [expr] + // Note that this step has to go before classification to recognize [position()=1] + if ((_type == ast_filter || _type == ast_predicate) && + _right->_type == ast_op_equal && _right->_left->_type == ast_func_position && _right->_right->_rettype == xpath_type_number) { + _right = _right->_right; + } + + // Classify filter/predicate ops to perform various optimizations during evaluation + if (_type == ast_filter || _type == ast_predicate) { + assert(_test == predicate_default); + + if (_right->_type == ast_number_constant && _right->_data.number == 1.0) + _test = predicate_constant_one; + else if (_right->_rettype == xpath_type_number && (_right->_type == ast_number_constant || _right->_type == ast_variable || _right->_type == ast_func_last)) + _test = predicate_constant; + else if (_right->_rettype != xpath_type_number && _right->is_posinv_expr()) + _test = predicate_posinv; + } + + // Rewrite descendant-or-self::node()/child::foo with descendant::foo + // The former is a full form of //foo, the latter is much faster since it executes the node test immediately + // Do a similar kind of rewrite for self/descendant/descendant-or-self axes + // Note that we only rewrite positionally invariant steps (//foo[1] != /descendant::foo[1]) + if (_type == ast_step && (_axis == axis_child || _axis == axis_self || _axis == axis_descendant || _axis == axis_descendant_or_self) && _left && + _left->_type == ast_step && _left->_axis == axis_descendant_or_self && 
_left->_test == nodetest_type_node && !_left->_right && + is_posinv_step()) { + if (_axis == axis_child || _axis == axis_descendant) + _axis = axis_descendant; + else + _axis = axis_descendant_or_self; + + _left = _left->_left; + } + + // Use optimized lookup table implementation for translate() with constant arguments + if (_type == ast_func_translate && _right->_type == ast_string_constant && _right->_next->_type == ast_string_constant) { + unsigned char* table = translate_table_generate(alloc, _right->_data.string, _right->_next->_data.string); + + if (table) { + _type = ast_opt_translate_table; + _data.table = table; + } + } + + // Use optimized path for @attr = 'value' or @attr = $value + if (_type == ast_op_equal && + _left->_type == ast_step && _left->_axis == axis_attribute && _left->_test == nodetest_name && !_left->_left && !_left->_right && + (_right->_type == ast_string_constant || (_right->_type == ast_variable && _right->_rettype == xpath_type_string))) { + _type = ast_opt_compare_attribute; + } + } + + bool is_posinv_expr() const { + switch (_type) { + case ast_func_position: + case ast_func_last: + return false; + + case ast_string_constant: + case ast_number_constant: + case ast_variable: + return true; + + case ast_step: + case ast_step_root: + return true; + + case ast_predicate: + case ast_filter: + return true; + + default: + if (_left && !_left->is_posinv_expr()) return false; + + for (xpath_ast_node* n = _right; n; n = n->_next) + if (!n->is_posinv_expr()) return false; + + return true; + } + } + + bool is_posinv_step() const { + assert(_type == ast_step); + + for (xpath_ast_node* n = _right; n; n = n->_next) { + assert(n->_type == ast_predicate); + + if (n->_test != predicate_posinv) + return false; + } + + return true; + } + + xpath_value_type rettype() const { + return static_cast(_rettype); + } +}; + +struct xpath_parser { + xpath_allocator* _alloc; + xpath_lexer _lexer; + + const char_t* _query; + xpath_variable_set* _variables; + + 
xpath_parse_result* _result; + + char_t _scratch[32]; + +#ifdef PUGIXML_NO_EXCEPTIONS + jmp_buf _error_handler; +#endif + + void throw_error(const char* message) { + _result->error = message; + _result->offset = _lexer.current_pos() - _query; + +#ifdef PUGIXML_NO_EXCEPTIONS + longjmp(_error_handler, 1); +#else + throw xpath_exception(*_result); +#endif + } + + void throw_error_oom() { +#ifdef PUGIXML_NO_EXCEPTIONS + throw_error("Out of memory"); +#else + throw std::bad_alloc(); +#endif + } + + void* alloc_node() { + void* result = _alloc->allocate_nothrow(sizeof(xpath_ast_node)); + + if (!result) throw_error_oom(); + + return result; + } + + const char_t* alloc_string(const xpath_lexer_string& value) { + if (value.begin) { + size_t length = static_cast(value.end - value.begin); + + char_t* c = static_cast(_alloc->allocate_nothrow((length + 1) * sizeof(char_t))); + if (!c) throw_error_oom(); + assert(c); // workaround for clang static analysis + + memcpy(c, value.begin, length * sizeof(char_t)); + c[length] = 0; + + return c; + } else return 0; + } + + xpath_ast_node* parse_function_helper(ast_type_t type0, ast_type_t type1, size_t argc, xpath_ast_node* args[2]) { + assert(argc <= 1); + + if (argc == 1 && args[0]->rettype() != xpath_type_node_set) throw_error("Function has to be applied to node set"); + + return new (alloc_node()) xpath_ast_node(argc == 0 ? 
type0 : type1, xpath_type_string, args[0]); + } + + xpath_ast_node* parse_function(const xpath_lexer_string& name, size_t argc, xpath_ast_node* args[2]) { + switch (name.begin[0]) { + case 'b': + if (name == PUGIXML_TEXT("boolean") && argc == 1) + return new (alloc_node()) xpath_ast_node(ast_func_boolean, xpath_type_boolean, args[0]); + + break; + + case 'c': + if (name == PUGIXML_TEXT("count") && argc == 1) { + if (args[0]->rettype() != xpath_type_node_set) throw_error("Function has to be applied to node set"); + return new (alloc_node()) xpath_ast_node(ast_func_count, xpath_type_number, args[0]); + } else if (name == PUGIXML_TEXT("contains") && argc == 2) + return new (alloc_node()) xpath_ast_node(ast_func_contains, xpath_type_boolean, args[0], args[1]); + else if (name == PUGIXML_TEXT("concat") && argc >= 2) + return new (alloc_node()) xpath_ast_node(ast_func_concat, xpath_type_string, args[0], args[1]); + else if (name == PUGIXML_TEXT("ceiling") && argc == 1) + return new (alloc_node()) xpath_ast_node(ast_func_ceiling, xpath_type_number, args[0]); + + break; + + case 'f': + if (name == PUGIXML_TEXT("false") && argc == 0) + return new (alloc_node()) xpath_ast_node(ast_func_false, xpath_type_boolean); + else if (name == PUGIXML_TEXT("floor") && argc == 1) + return new (alloc_node()) xpath_ast_node(ast_func_floor, xpath_type_number, args[0]); + + break; + + case 'i': + if (name == PUGIXML_TEXT("id") && argc == 1) + return new (alloc_node()) xpath_ast_node(ast_func_id, xpath_type_node_set, args[0]); + + break; + + case 'l': + if (name == PUGIXML_TEXT("last") && argc == 0) + return new (alloc_node()) xpath_ast_node(ast_func_last, xpath_type_number); + else if (name == PUGIXML_TEXT("lang") && argc == 1) + return new (alloc_node()) xpath_ast_node(ast_func_lang, xpath_type_boolean, args[0]); + else if (name == PUGIXML_TEXT("local-name") && argc <= 1) + return parse_function_helper(ast_func_local_name_0, ast_func_local_name_1, argc, args); + + break; + + case 'n': + if 
(name == PUGIXML_TEXT("name") && argc <= 1) + return parse_function_helper(ast_func_name_0, ast_func_name_1, argc, args); + else if (name == PUGIXML_TEXT("namespace-uri") && argc <= 1) + return parse_function_helper(ast_func_namespace_uri_0, ast_func_namespace_uri_1, argc, args); + else if (name == PUGIXML_TEXT("normalize-space") && argc <= 1) + return new (alloc_node()) xpath_ast_node(argc == 0 ? ast_func_normalize_space_0 : ast_func_normalize_space_1, xpath_type_string, args[0], args[1]); + else if (name == PUGIXML_TEXT("not") && argc == 1) + return new (alloc_node()) xpath_ast_node(ast_func_not, xpath_type_boolean, args[0]); + else if (name == PUGIXML_TEXT("number") && argc <= 1) + return new (alloc_node()) xpath_ast_node(argc == 0 ? ast_func_number_0 : ast_func_number_1, xpath_type_number, args[0]); + + break; + + case 'p': + if (name == PUGIXML_TEXT("position") && argc == 0) + return new (alloc_node()) xpath_ast_node(ast_func_position, xpath_type_number); + + break; + + case 'r': + if (name == PUGIXML_TEXT("round") && argc == 1) + return new (alloc_node()) xpath_ast_node(ast_func_round, xpath_type_number, args[0]); + + break; + + case 's': + if (name == PUGIXML_TEXT("string") && argc <= 1) + return new (alloc_node()) xpath_ast_node(argc == 0 ? ast_func_string_0 : ast_func_string_1, xpath_type_string, args[0]); + else if (name == PUGIXML_TEXT("string-length") && argc <= 1) + return new (alloc_node()) xpath_ast_node(argc == 0 ? 
ast_func_string_length_0 : ast_func_string_length_1, xpath_type_number, args[0]); + else if (name == PUGIXML_TEXT("starts-with") && argc == 2) + return new (alloc_node()) xpath_ast_node(ast_func_starts_with, xpath_type_boolean, args[0], args[1]); + else if (name == PUGIXML_TEXT("substring-before") && argc == 2) + return new (alloc_node()) xpath_ast_node(ast_func_substring_before, xpath_type_string, args[0], args[1]); + else if (name == PUGIXML_TEXT("substring-after") && argc == 2) + return new (alloc_node()) xpath_ast_node(ast_func_substring_after, xpath_type_string, args[0], args[1]); + else if (name == PUGIXML_TEXT("substring") && (argc == 2 || argc == 3)) + return new (alloc_node()) xpath_ast_node(argc == 2 ? ast_func_substring_2 : ast_func_substring_3, xpath_type_string, args[0], args[1]); + else if (name == PUGIXML_TEXT("sum") && argc == 1) { + if (args[0]->rettype() != xpath_type_node_set) throw_error("Function has to be applied to node set"); + return new (alloc_node()) xpath_ast_node(ast_func_sum, xpath_type_number, args[0]); + } + + break; + + case 't': + if (name == PUGIXML_TEXT("translate") && argc == 3) + return new (alloc_node()) xpath_ast_node(ast_func_translate, xpath_type_string, args[0], args[1]); + else if (name == PUGIXML_TEXT("true") && argc == 0) + return new (alloc_node()) xpath_ast_node(ast_func_true, xpath_type_boolean); + + break; + + default: + break; + } + + throw_error("Unrecognized function or wrong parameter count"); + + return 0; + } + + axis_t parse_axis_name(const xpath_lexer_string& name, bool& specified) { + specified = true; + + switch (name.begin[0]) { + case 'a': + if (name == PUGIXML_TEXT("ancestor")) + return axis_ancestor; + else if (name == PUGIXML_TEXT("ancestor-or-self")) + return axis_ancestor_or_self; + else if (name == PUGIXML_TEXT("attribute")) + return axis_attribute; + + break; + + case 'c': + if (name == PUGIXML_TEXT("child")) + return axis_child; + + break; + + case 'd': + if (name == PUGIXML_TEXT("descendant")) + 
return axis_descendant; + else if (name == PUGIXML_TEXT("descendant-or-self")) + return axis_descendant_or_self; + + break; + + case 'f': + if (name == PUGIXML_TEXT("following")) + return axis_following; + else if (name == PUGIXML_TEXT("following-sibling")) + return axis_following_sibling; + + break; + + case 'n': + if (name == PUGIXML_TEXT("namespace")) + return axis_namespace; + + break; + + case 'p': + if (name == PUGIXML_TEXT("parent")) + return axis_parent; + else if (name == PUGIXML_TEXT("preceding")) + return axis_preceding; + else if (name == PUGIXML_TEXT("preceding-sibling")) + return axis_preceding_sibling; + + break; + + case 's': + if (name == PUGIXML_TEXT("self")) + return axis_self; + + break; + + default: + break; + } + + specified = false; + return axis_child; + } + + nodetest_t parse_node_test_type(const xpath_lexer_string& name) { + switch (name.begin[0]) { + case 'c': + if (name == PUGIXML_TEXT("comment")) + return nodetest_type_comment; + + break; + + case 'n': + if (name == PUGIXML_TEXT("node")) + return nodetest_type_node; + + break; + + case 'p': + if (name == PUGIXML_TEXT("processing-instruction")) + return nodetest_type_pi; + + break; + + case 't': + if (name == PUGIXML_TEXT("text")) + return nodetest_type_text; + + break; + + default: + break; + } + + return nodetest_none; + } + + // PrimaryExpr ::= VariableReference | '(' Expr ')' | Literal | Number | FunctionCall + xpath_ast_node* parse_primary_expression() { + switch (_lexer.current()) { + case lex_var_ref: { + xpath_lexer_string name = _lexer.contents(); + + if (!_variables) + throw_error("Unknown variable: variable set is not provided"); + + xpath_variable* var = 0; + if (!get_variable_scratch(_scratch, _variables, name.begin, name.end, &var)) + throw_error_oom(); + + if (!var) + throw_error("Unknown variable: variable set does not contain the given name"); + + _lexer.next(); + + return new (alloc_node()) xpath_ast_node(ast_variable, var->type(), var); + } + + case lex_open_brace: { + 
_lexer.next(); + + xpath_ast_node* n = parse_expression(); + + if (_lexer.current() != lex_close_brace) + throw_error("Unmatched braces"); + + _lexer.next(); + + return n; + } + + case lex_quoted_string: { + const char_t* value = alloc_string(_lexer.contents()); + + xpath_ast_node* n = new (alloc_node()) xpath_ast_node(ast_string_constant, xpath_type_string, value); + _lexer.next(); + + return n; + } + + case lex_number: { + double value = 0; + + if (!convert_string_to_number_scratch(_scratch, _lexer.contents().begin, _lexer.contents().end, &value)) + throw_error_oom(); + + xpath_ast_node* n = new (alloc_node()) xpath_ast_node(ast_number_constant, xpath_type_number, value); + _lexer.next(); + + return n; + } + + case lex_string: { + xpath_ast_node* args[2] = {0}; + size_t argc = 0; + + xpath_lexer_string function = _lexer.contents(); + _lexer.next(); + + xpath_ast_node* last_arg = 0; + + if (_lexer.current() != lex_open_brace) + throw_error("Unrecognized function call"); + _lexer.next(); + + if (_lexer.current() != lex_close_brace) + args[argc++] = parse_expression(); + + while (_lexer.current() != lex_close_brace) { + if (_lexer.current() != lex_comma) + throw_error("No comma between function arguments"); + _lexer.next(); + + xpath_ast_node* n = parse_expression(); + + if (argc < 2) args[argc] = n; + else last_arg->set_next(n); + + argc++; + last_arg = n; + } + + _lexer.next(); + + return parse_function(function, argc, args); + } + + default: + throw_error("Unrecognizable primary expression"); + + return 0; + } + } + + // FilterExpr ::= PrimaryExpr | FilterExpr Predicate + // Predicate ::= '[' PredicateExpr ']' + // PredicateExpr ::= Expr + xpath_ast_node* parse_filter_expression() { + xpath_ast_node* n = parse_primary_expression(); + + while (_lexer.current() == lex_open_square_brace) { + _lexer.next(); + + xpath_ast_node* expr = parse_expression(); + + if (n->rettype() != xpath_type_node_set) throw_error("Predicate has to be applied to node set"); + + n = new 
(alloc_node()) xpath_ast_node(ast_filter, n, expr, predicate_default); + + if (_lexer.current() != lex_close_square_brace) + throw_error("Unmatched square brace"); + + _lexer.next(); + } + + return n; + } + + // Step ::= AxisSpecifier NodeTest Predicate* | AbbreviatedStep + // AxisSpecifier ::= AxisName '::' | '@'? + // NodeTest ::= NameTest | NodeType '(' ')' | 'processing-instruction' '(' Literal ')' + // NameTest ::= '*' | NCName ':' '*' | QName + // AbbreviatedStep ::= '.' | '..' + xpath_ast_node* parse_step(xpath_ast_node* set) { + if (set && set->rettype() != xpath_type_node_set) + throw_error("Step has to be applied to node set"); + + bool axis_specified = false; + axis_t axis = axis_child; // implied child axis + + if (_lexer.current() == lex_axis_attribute) { + axis = axis_attribute; + axis_specified = true; + + _lexer.next(); + } else if (_lexer.current() == lex_dot) { + _lexer.next(); + + return new (alloc_node()) xpath_ast_node(ast_step, set, axis_self, nodetest_type_node, 0); + } else if (_lexer.current() == lex_double_dot) { + _lexer.next(); + + return new (alloc_node()) xpath_ast_node(ast_step, set, axis_parent, nodetest_type_node, 0); + } + + nodetest_t nt_type = nodetest_none; + xpath_lexer_string nt_name; + + if (_lexer.current() == lex_string) { + // node name test + nt_name = _lexer.contents(); + _lexer.next(); + + // was it an axis name? 
+ if (_lexer.current() == lex_double_colon) { + // parse axis name + if (axis_specified) throw_error("Two axis specifiers in one step"); + + axis = parse_axis_name(nt_name, axis_specified); + + if (!axis_specified) throw_error("Unknown axis"); + + // read actual node test + _lexer.next(); + + if (_lexer.current() == lex_multiply) { + nt_type = nodetest_all; + nt_name = xpath_lexer_string(); + _lexer.next(); + } else if (_lexer.current() == lex_string) { + nt_name = _lexer.contents(); + _lexer.next(); + } else throw_error("Unrecognized node test"); + } + + if (nt_type == nodetest_none) { + // node type test or processing-instruction + if (_lexer.current() == lex_open_brace) { + _lexer.next(); + + if (_lexer.current() == lex_close_brace) { + _lexer.next(); + + nt_type = parse_node_test_type(nt_name); + + if (nt_type == nodetest_none) throw_error("Unrecognized node type"); + + nt_name = xpath_lexer_string(); + } else if (nt_name == PUGIXML_TEXT("processing-instruction")) { + if (_lexer.current() != lex_quoted_string) + throw_error("Only literals are allowed as arguments to processing-instruction()"); + + nt_type = nodetest_pi; + nt_name = _lexer.contents(); + _lexer.next(); + + if (_lexer.current() != lex_close_brace) + throw_error("Unmatched brace near processing-instruction()"); + _lexer.next(); + } else + throw_error("Unmatched brace near node type test"); + + } + // QName or NCName:* + else { + if (nt_name.end - nt_name.begin > 2 && nt_name.end[-2] == ':' && nt_name.end[-1] == '*') { // NCName:* + nt_name.end--; // erase * + + nt_type = nodetest_all_in_namespace; + } else nt_type = nodetest_name; + } + } + } else if (_lexer.current() == lex_multiply) { + nt_type = nodetest_all; + _lexer.next(); + } else throw_error("Unrecognized node test"); + + xpath_ast_node* n = new (alloc_node()) xpath_ast_node(ast_step, set, axis, nt_type, alloc_string(nt_name)); + + xpath_ast_node* last = 0; + + while (_lexer.current() == lex_open_square_brace) { + _lexer.next(); + + 
xpath_ast_node* expr = parse_expression(); + + xpath_ast_node* pred = new (alloc_node()) xpath_ast_node(ast_predicate, 0, expr, predicate_default); + + if (_lexer.current() != lex_close_square_brace) + throw_error("Unmatched square brace"); + _lexer.next(); + + if (last) last->set_next(pred); + else n->set_right(pred); + + last = pred; + } + + return n; + } + + // RelativeLocationPath ::= Step | RelativeLocationPath '/' Step | RelativeLocationPath '//' Step + xpath_ast_node* parse_relative_location_path(xpath_ast_node* set) { + xpath_ast_node* n = parse_step(set); + + while (_lexer.current() == lex_slash || _lexer.current() == lex_double_slash) { + lexeme_t l = _lexer.current(); + _lexer.next(); + + if (l == lex_double_slash) + n = new (alloc_node()) xpath_ast_node(ast_step, n, axis_descendant_or_self, nodetest_type_node, 0); + + n = parse_step(n); + } + + return n; + } + + // LocationPath ::= RelativeLocationPath | AbsoluteLocationPath + // AbsoluteLocationPath ::= '/' RelativeLocationPath? 
| '//' RelativeLocationPath + xpath_ast_node* parse_location_path() { + if (_lexer.current() == lex_slash) { + _lexer.next(); + + xpath_ast_node* n = new (alloc_node()) xpath_ast_node(ast_step_root, xpath_type_node_set); + + // relative location path can start from axis_attribute, dot, double_dot, multiply and string lexemes; any other lexeme means standalone root path + lexeme_t l = _lexer.current(); + + if (l == lex_string || l == lex_axis_attribute || l == lex_dot || l == lex_double_dot || l == lex_multiply) + return parse_relative_location_path(n); + else + return n; + } else if (_lexer.current() == lex_double_slash) { + _lexer.next(); + + xpath_ast_node* n = new (alloc_node()) xpath_ast_node(ast_step_root, xpath_type_node_set); + n = new (alloc_node()) xpath_ast_node(ast_step, n, axis_descendant_or_self, nodetest_type_node, 0); + + return parse_relative_location_path(n); + } + + // else clause moved outside of if because of bogus warning 'control may reach end of non-void function being inlined' in gcc 4.0.1 + return parse_relative_location_path(0); + } + + // PathExpr ::= LocationPath + // | FilterExpr + // | FilterExpr '/' RelativeLocationPath + // | FilterExpr '//' RelativeLocationPath + // UnionExpr ::= PathExpr | UnionExpr '|' PathExpr + // UnaryExpr ::= UnionExpr | '-' UnaryExpr + xpath_ast_node* parse_path_or_unary_expression() { + // Clarification. + // PathExpr begins with either LocationPath or FilterExpr. + // FilterExpr begins with PrimaryExpr + // PrimaryExpr begins with '$' in case of it being a variable reference, + // '(' in case of it being an expression, string literal, number constant or + // function call. 
+ + if (_lexer.current() == lex_var_ref || _lexer.current() == lex_open_brace || + _lexer.current() == lex_quoted_string || _lexer.current() == lex_number || + _lexer.current() == lex_string) { + if (_lexer.current() == lex_string) { + // This is either a function call, or not - if not, we shall proceed with location path + const char_t* state = _lexer.state(); + + while (PUGI__IS_CHARTYPE(*state, ct_space)) ++state; + + if (*state != '(') return parse_location_path(); + + // This looks like a function call; however this still can be a node-test. Check it. + if (parse_node_test_type(_lexer.contents()) != nodetest_none) return parse_location_path(); + } + + xpath_ast_node* n = parse_filter_expression(); + + if (_lexer.current() == lex_slash || _lexer.current() == lex_double_slash) { + lexeme_t l = _lexer.current(); + _lexer.next(); + + if (l == lex_double_slash) { + if (n->rettype() != xpath_type_node_set) throw_error("Step has to be applied to node set"); + + n = new (alloc_node()) xpath_ast_node(ast_step, n, axis_descendant_or_self, nodetest_type_node, 0); + } + + // select from location path + return parse_relative_location_path(n); + } + + return n; + } else if (_lexer.current() == lex_minus) { + _lexer.next(); + + // precedence 7+ - only parses union expressions + xpath_ast_node* expr = parse_expression_rec(parse_path_or_unary_expression(), 7); + + return new (alloc_node()) xpath_ast_node(ast_op_negate, xpath_type_number, expr); + } else + return parse_location_path(); + } + + struct binary_op_t { + ast_type_t asttype; + xpath_value_type rettype; + int precedence; + + binary_op_t(): asttype(ast_unknown), rettype(xpath_type_none), precedence(0) { + } + + binary_op_t(ast_type_t asttype_, xpath_value_type rettype_, int precedence_): asttype(asttype_), rettype(rettype_), precedence(precedence_) { + } + + static binary_op_t parse(xpath_lexer& lexer) { + switch (lexer.current()) { + case lex_string: + if (lexer.contents() == PUGIXML_TEXT("or")) + return 
binary_op_t(ast_op_or, xpath_type_boolean, 1); + else if (lexer.contents() == PUGIXML_TEXT("and")) + return binary_op_t(ast_op_and, xpath_type_boolean, 2); + else if (lexer.contents() == PUGIXML_TEXT("div")) + return binary_op_t(ast_op_divide, xpath_type_number, 6); + else if (lexer.contents() == PUGIXML_TEXT("mod")) + return binary_op_t(ast_op_mod, xpath_type_number, 6); + else + return binary_op_t(); + + case lex_equal: + return binary_op_t(ast_op_equal, xpath_type_boolean, 3); + + case lex_not_equal: + return binary_op_t(ast_op_not_equal, xpath_type_boolean, 3); + + case lex_less: + return binary_op_t(ast_op_less, xpath_type_boolean, 4); + + case lex_greater: + return binary_op_t(ast_op_greater, xpath_type_boolean, 4); + + case lex_less_or_equal: + return binary_op_t(ast_op_less_or_equal, xpath_type_boolean, 4); + + case lex_greater_or_equal: + return binary_op_t(ast_op_greater_or_equal, xpath_type_boolean, 4); + + case lex_plus: + return binary_op_t(ast_op_add, xpath_type_number, 5); + + case lex_minus: + return binary_op_t(ast_op_subtract, xpath_type_number, 5); + + case lex_multiply: + return binary_op_t(ast_op_multiply, xpath_type_number, 6); + + case lex_union: + return binary_op_t(ast_op_union, xpath_type_node_set, 7); + + default: + return binary_op_t(); + } + } + }; + + xpath_ast_node* parse_expression_rec(xpath_ast_node* lhs, int limit) { + binary_op_t op = binary_op_t::parse(_lexer); + + while (op.asttype != ast_unknown && op.precedence >= limit) { + _lexer.next(); + + xpath_ast_node* rhs = parse_path_or_unary_expression(); + + binary_op_t nextop = binary_op_t::parse(_lexer); + + while (nextop.asttype != ast_unknown && nextop.precedence > op.precedence) { + rhs = parse_expression_rec(rhs, nextop.precedence); + + nextop = binary_op_t::parse(_lexer); + } + + if (op.asttype == ast_op_union && (lhs->rettype() != xpath_type_node_set || rhs->rettype() != xpath_type_node_set)) + throw_error("Union operator has to be applied to node sets"); + + lhs = new 
(alloc_node()) xpath_ast_node(op.asttype, op.rettype, lhs, rhs); + + op = binary_op_t::parse(_lexer); + } + + return lhs; + } + + // Expr ::= OrExpr + // OrExpr ::= AndExpr | OrExpr 'or' AndExpr + // AndExpr ::= EqualityExpr | AndExpr 'and' EqualityExpr + // EqualityExpr ::= RelationalExpr + // | EqualityExpr '=' RelationalExpr + // | EqualityExpr '!=' RelationalExpr + // RelationalExpr ::= AdditiveExpr + // | RelationalExpr '<' AdditiveExpr + // | RelationalExpr '>' AdditiveExpr + // | RelationalExpr '<=' AdditiveExpr + // | RelationalExpr '>=' AdditiveExpr + // AdditiveExpr ::= MultiplicativeExpr + // | AdditiveExpr '+' MultiplicativeExpr + // | AdditiveExpr '-' MultiplicativeExpr + // MultiplicativeExpr ::= UnaryExpr + // | MultiplicativeExpr '*' UnaryExpr + // | MultiplicativeExpr 'div' UnaryExpr + // | MultiplicativeExpr 'mod' UnaryExpr + xpath_ast_node* parse_expression() { + return parse_expression_rec(parse_path_or_unary_expression(), 0); + } + + xpath_parser(const char_t* query, xpath_variable_set* variables, xpath_allocator* alloc, xpath_parse_result* result): _alloc(alloc), _lexer(query), _query(query), _variables(variables), _result(result) { + } + + xpath_ast_node* parse() { + xpath_ast_node* result = parse_expression(); + + if (_lexer.current() != lex_eof) { + // there are still unparsed tokens left, error + throw_error("Incorrect query"); + } + + return result; + } + + static xpath_ast_node* parse(const char_t* query, xpath_variable_set* variables, xpath_allocator* alloc, xpath_parse_result* result) { + xpath_parser parser(query, variables, alloc, result); + +#ifdef PUGIXML_NO_EXCEPTIONS + int error = setjmp(parser._error_handler); + + return (error == 0) ? 
parser.parse() : 0; +#else + return parser.parse(); +#endif + } +}; + +struct xpath_query_impl { + static xpath_query_impl* create() { + void* memory = xml_memory::allocate(sizeof(xpath_query_impl)); + if (!memory) return 0; + + return new (memory) xpath_query_impl(); + } + + static void destroy(xpath_query_impl* impl) { + // free all allocated pages + impl->alloc.release(); + + // free allocator memory (with the first page) + xml_memory::deallocate(impl); + } + + xpath_query_impl(): root(0), alloc(&block) { + block.next = 0; + block.capacity = sizeof(block.data); + } + + xpath_ast_node* root; + xpath_allocator alloc; + xpath_memory_block block; +}; + +PUGI__FN xpath_string evaluate_string_impl(xpath_query_impl* impl, const xpath_node& n, xpath_stack_data& sd) +{ + if (!impl) return xpath_string(); + +#ifdef PUGIXML_NO_EXCEPTIONS + if (setjmp(sd.error_handler)) return xpath_string(); +#endif + + xpath_context c(n, 1, 1); + + return impl->root->eval_string(c, sd.stack); +} + +PUGI__FN impl::xpath_ast_node* evaluate_node_set_prepare(xpath_query_impl* impl) +{ + if (!impl) return 0; + + if (impl->root->rettype() != xpath_type_node_set) { +#ifdef PUGIXML_NO_EXCEPTIONS + return 0; +#else + xpath_parse_result res; + res.error = "Expression does not evaluate to node set"; + + throw xpath_exception(res); +#endif + } + + return impl->root; +} +PUGI__NS_END + +namespace pugi +{ +#ifndef PUGIXML_NO_EXCEPTIONS +PUGI__FN xpath_exception::xpath_exception(const xpath_parse_result& result_): _result(result_) +{ + assert(_result.error); +} + +PUGI__FN const char* xpath_exception::what() const throw() +{ + return _result.error; +} + +PUGI__FN const xpath_parse_result& xpath_exception::result() const +{ + return _result; +} +#endif + +PUGI__FN xpath_node::xpath_node() +{ +} + +PUGI__FN xpath_node::xpath_node(const xml_node& node_): _node(node_) +{ +} + +PUGI__FN xpath_node::xpath_node(const xml_attribute& attribute_, const xml_node& parent_): _node(attribute_ ? 
parent_ : xml_node()), _attribute(attribute_) +{ +} + +PUGI__FN xml_node xpath_node::node() const +{ + return _attribute ? xml_node() : _node; +} + +PUGI__FN xml_attribute xpath_node::attribute() const +{ + return _attribute; +} + +PUGI__FN xml_node xpath_node::parent() const +{ + return _attribute ? _node : _node.parent(); +} + +PUGI__FN static void unspecified_bool_xpath_node(xpath_node***) +{ +} + +PUGI__FN xpath_node::operator xpath_node::unspecified_bool_type() const +{ + return (_node || _attribute) ? unspecified_bool_xpath_node : 0; +} + +PUGI__FN bool xpath_node::operator!() const +{ + return !(_node || _attribute); +} + +PUGI__FN bool xpath_node::operator==(const xpath_node& n) const +{ + return _node == n._node && _attribute == n._attribute; +} + +PUGI__FN bool xpath_node::operator!=(const xpath_node& n) const +{ + return _node != n._node || _attribute != n._attribute; +} + +#ifdef __BORLANDC__ +PUGI__FN bool operator&&(const xpath_node& lhs, bool rhs) +{ + return (bool)lhs && rhs; +} + +PUGI__FN bool operator||(const xpath_node& lhs, bool rhs) +{ + return (bool)lhs || rhs; +} +#endif + +PUGI__FN void xpath_node_set::_assign(const_iterator begin_, const_iterator end_, type_t type_) +{ + assert(begin_ <= end_); + + size_t size_ = static_cast(end_ - begin_); + + if (size_ <= 1) { + // deallocate old buffer + if (_begin != &_storage) impl::xml_memory::deallocate(_begin); + + // use internal buffer + if (begin_ != end_) _storage = *begin_; + + _begin = &_storage; + _end = &_storage + size_; + _type = type_; + } else { + // make heap copy + xpath_node* storage = static_cast(impl::xml_memory::allocate(size_ * sizeof(xpath_node))); + + if (!storage) { +#ifdef PUGIXML_NO_EXCEPTIONS + return; +#else + throw std::bad_alloc(); +#endif + } + + memcpy(storage, begin_, size_ * sizeof(xpath_node)); + + // deallocate old buffer + if (_begin != &_storage) impl::xml_memory::deallocate(_begin); + + // finalize + _begin = storage; + _end = storage + size_; + _type = type_; + 
} +} + +#if __cplusplus >= 201103 +PUGI__FN void xpath_node_set::_move(xpath_node_set& rhs) +{ + _type = rhs._type; + _storage = rhs._storage; + _begin = (rhs._begin == &rhs._storage) ? &_storage : rhs._begin; + _end = _begin + (rhs._end - rhs._begin); + + rhs._type = type_unsorted; + rhs._begin = &rhs._storage; + rhs._end = rhs._begin; +} +#endif + +PUGI__FN xpath_node_set::xpath_node_set(): _type(type_unsorted), _begin(&_storage), _end(&_storage) +{ +} + +PUGI__FN xpath_node_set::xpath_node_set(const_iterator begin_, const_iterator end_, type_t type_): _type(type_unsorted), _begin(&_storage), _end(&_storage) +{ + _assign(begin_, end_, type_); +} + +PUGI__FN xpath_node_set::~xpath_node_set() +{ + if (_begin != &_storage) + impl::xml_memory::deallocate(_begin); +} + +PUGI__FN xpath_node_set::xpath_node_set(const xpath_node_set& ns): _type(type_unsorted), _begin(&_storage), _end(&_storage) +{ + _assign(ns._begin, ns._end, ns._type); +} + +PUGI__FN xpath_node_set& xpath_node_set::operator=(const xpath_node_set& ns) +{ + if (this == &ns) return *this; + + _assign(ns._begin, ns._end, ns._type); + + return *this; +} + +#if __cplusplus >= 201103 +PUGI__FN xpath_node_set::xpath_node_set(xpath_node_set&& rhs): _type(type_unsorted), _begin(&_storage), _end(&_storage) +{ + _move(rhs); +} + +PUGI__FN xpath_node_set& xpath_node_set::operator=(xpath_node_set&& rhs) +{ + if (this == &rhs) return *this; + + if (_begin != &_storage) + impl::xml_memory::deallocate(_begin); + + _move(rhs); + + return *this; +} +#endif + +PUGI__FN xpath_node_set::type_t xpath_node_set::type() const +{ + return _type; +} + +PUGI__FN size_t xpath_node_set::size() const +{ + return _end - _begin; +} + +PUGI__FN bool xpath_node_set::empty() const +{ + return _begin == _end; +} + +PUGI__FN const xpath_node& xpath_node_set::operator[](size_t index) const +{ + assert(index < size()); + return _begin[index]; +} + +PUGI__FN xpath_node_set::const_iterator xpath_node_set::begin() const +{ + return _begin; +} + 
+PUGI__FN xpath_node_set::const_iterator xpath_node_set::end() const +{ + return _end; +} + +PUGI__FN void xpath_node_set::sort(bool reverse) +{ + _type = impl::xpath_sort(_begin, _end, _type, reverse); +} + +PUGI__FN xpath_node xpath_node_set::first() const +{ + return impl::xpath_first(_begin, _end, _type); +} + +PUGI__FN xpath_parse_result::xpath_parse_result(): error("Internal error"), offset(0) +{ +} + +PUGI__FN xpath_parse_result::operator bool() const +{ + return error == 0; +} + +PUGI__FN const char* xpath_parse_result::description() const +{ + return error ? error : "No error"; +} + +PUGI__FN xpath_variable::xpath_variable(xpath_value_type type_): _type(type_), _next(0) +{ +} + +PUGI__FN const char_t* xpath_variable::name() const +{ + switch (_type) { + case xpath_type_node_set: + return static_cast(this)->name; + + case xpath_type_number: + return static_cast(this)->name; + + case xpath_type_string: + return static_cast(this)->name; + + case xpath_type_boolean: + return static_cast(this)->name; + + default: + assert(!"Invalid variable type"); + return 0; + } +} + +PUGI__FN xpath_value_type xpath_variable::type() const +{ + return _type; +} + +PUGI__FN bool xpath_variable::get_boolean() const +{ + return (_type == xpath_type_boolean) ? static_cast(this)->value : false; +} + +PUGI__FN double xpath_variable::get_number() const +{ + return (_type == xpath_type_number) ? static_cast(this)->value : impl::gen_nan(); +} + +PUGI__FN const char_t* xpath_variable::get_string() const +{ + const char_t* value = (_type == xpath_type_string) ? static_cast(this)->value : 0; + return value ? value : PUGIXML_TEXT(""); +} + +PUGI__FN const xpath_node_set& xpath_variable::get_node_set() const +{ + return (_type == xpath_type_node_set) ? 
static_cast(this)->value : impl::dummy_node_set; +} + +PUGI__FN bool xpath_variable::set(bool value) +{ + if (_type != xpath_type_boolean) return false; + + static_cast(this)->value = value; + return true; +} + +PUGI__FN bool xpath_variable::set(double value) +{ + if (_type != xpath_type_number) return false; + + static_cast(this)->value = value; + return true; +} + +PUGI__FN bool xpath_variable::set(const char_t* value) +{ + if (_type != xpath_type_string) return false; + + impl::xpath_variable_string* var = static_cast(this); + + // duplicate string + size_t size = (impl::strlength(value) + 1) * sizeof(char_t); + + char_t* copy = static_cast(impl::xml_memory::allocate(size)); + if (!copy) return false; + + memcpy(copy, value, size); + + // replace old string + if (var->value) impl::xml_memory::deallocate(var->value); + var->value = copy; + + return true; +} + +PUGI__FN bool xpath_variable::set(const xpath_node_set& value) +{ + if (_type != xpath_type_node_set) return false; + + static_cast(this)->value = value; + return true; +} + +PUGI__FN xpath_variable_set::xpath_variable_set() +{ + for (size_t i = 0; i < sizeof(_data) / sizeof(_data[0]); ++i) + _data[i] = 0; +} + +PUGI__FN xpath_variable_set::~xpath_variable_set() +{ + for (size_t i = 0; i < sizeof(_data) / sizeof(_data[0]); ++i) + _destroy(_data[i]); +} + +PUGI__FN xpath_variable_set::xpath_variable_set(const xpath_variable_set& rhs) +{ + for (size_t i = 0; i < sizeof(_data) / sizeof(_data[0]); ++i) + _data[i] = 0; + + _assign(rhs); +} + +PUGI__FN xpath_variable_set& xpath_variable_set::operator=(const xpath_variable_set& rhs) +{ + if (this == &rhs) return *this; + + _assign(rhs); + + return *this; +} + +#if __cplusplus >= 201103 +PUGI__FN xpath_variable_set::xpath_variable_set(xpath_variable_set&& rhs) +{ + for (size_t i = 0; i < sizeof(_data) / sizeof(_data[0]); ++i) { + _data[i] = rhs._data[i]; + rhs._data[i] = 0; + } +} + +PUGI__FN xpath_variable_set& xpath_variable_set::operator=(xpath_variable_set&& 
rhs) +{ + for (size_t i = 0; i < sizeof(_data) / sizeof(_data[0]); ++i) { + _destroy(_data[i]); + + _data[i] = rhs._data[i]; + rhs._data[i] = 0; + } + + return *this; +} +#endif + +PUGI__FN void xpath_variable_set::_assign(const xpath_variable_set& rhs) +{ + xpath_variable_set temp; + + for (size_t i = 0; i < sizeof(_data) / sizeof(_data[0]); ++i) + if (rhs._data[i] && !_clone(rhs._data[i], &temp._data[i])) + return; + + _swap(temp); +} + +PUGI__FN void xpath_variable_set::_swap(xpath_variable_set& rhs) +{ + for (size_t i = 0; i < sizeof(_data) / sizeof(_data[0]); ++i) { + xpath_variable* chain = _data[i]; + + _data[i] = rhs._data[i]; + rhs._data[i] = chain; + } +} + +PUGI__FN xpath_variable* xpath_variable_set::_find(const char_t* name) const +{ + const size_t hash_size = sizeof(_data) / sizeof(_data[0]); + size_t hash = impl::hash_string(name) % hash_size; + + // look for existing variable + for (xpath_variable* var = _data[hash]; var; var = var->_next) + if (impl::strequal(var->name(), name)) + return var; + + return 0; +} + +PUGI__FN bool xpath_variable_set::_clone(xpath_variable* var, xpath_variable** out_result) +{ + xpath_variable* last = 0; + + while (var) { + // allocate storage for new variable + xpath_variable* nvar = impl::new_xpath_variable(var->_type, var->name()); + if (!nvar) return false; + + // link the variable to the result immediately to handle failures gracefully + if (last) + last->_next = nvar; + else + *out_result = nvar; + + last = nvar; + + // copy the value; this can fail due to out-of-memory conditions + if (!impl::copy_xpath_variable(nvar, var)) return false; + + var = var->_next; + } + + return true; +} + +PUGI__FN void xpath_variable_set::_destroy(xpath_variable* var) +{ + while (var) { + xpath_variable* next = var->_next; + + impl::delete_xpath_variable(var->_type, var); + + var = next; + } +} + +PUGI__FN xpath_variable* xpath_variable_set::add(const char_t* name, xpath_value_type type) +{ + const size_t hash_size = sizeof(_data) / 
sizeof(_data[0]); + size_t hash = impl::hash_string(name) % hash_size; + + // look for existing variable + for (xpath_variable* var = _data[hash]; var; var = var->_next) + if (impl::strequal(var->name(), name)) + return var->type() == type ? var : 0; + + // add new variable + xpath_variable* result = impl::new_xpath_variable(type, name); + + if (result) { + result->_next = _data[hash]; + + _data[hash] = result; + } + + return result; +} + +PUGI__FN bool xpath_variable_set::set(const char_t* name, bool value) +{ + xpath_variable* var = add(name, xpath_type_boolean); + return var ? var->set(value) : false; +} + +PUGI__FN bool xpath_variable_set::set(const char_t* name, double value) +{ + xpath_variable* var = add(name, xpath_type_number); + return var ? var->set(value) : false; +} + +PUGI__FN bool xpath_variable_set::set(const char_t* name, const char_t* value) +{ + xpath_variable* var = add(name, xpath_type_string); + return var ? var->set(value) : false; +} + +PUGI__FN bool xpath_variable_set::set(const char_t* name, const xpath_node_set& value) +{ + xpath_variable* var = add(name, xpath_type_node_set); + return var ? 
var->set(value) : false; +} + +PUGI__FN xpath_variable* xpath_variable_set::get(const char_t* name) +{ + return _find(name); +} + +PUGI__FN const xpath_variable* xpath_variable_set::get(const char_t* name) const +{ + return _find(name); +} + +PUGI__FN xpath_query::xpath_query(const char_t* query, xpath_variable_set* variables): _impl(0) +{ + impl::xpath_query_impl* qimpl = impl::xpath_query_impl::create(); + + if (!qimpl) { +#ifdef PUGIXML_NO_EXCEPTIONS + _result.error = "Out of memory"; +#else + throw std::bad_alloc(); +#endif + } else { + using impl::auto_deleter; // MSVC7 workaround + auto_deleter impl(qimpl, impl::xpath_query_impl::destroy); + + qimpl->root = impl::xpath_parser::parse(query, variables, &qimpl->alloc, &_result); + + if (qimpl->root) { + qimpl->root->optimize(&qimpl->alloc); + + _impl = impl.release(); + _result.error = 0; + } + } +} + +PUGI__FN xpath_query::xpath_query(): _impl(0) +{ +} + +PUGI__FN xpath_query::~xpath_query() +{ + if (_impl) + impl::xpath_query_impl::destroy(static_cast(_impl)); +} + +#if __cplusplus >= 201103 +PUGI__FN xpath_query::xpath_query(xpath_query&& rhs) +{ + _impl = rhs._impl; + _result = rhs._result; + rhs._impl = 0; + rhs._result = xpath_parse_result(); +} + +PUGI__FN xpath_query& xpath_query::operator=(xpath_query&& rhs) +{ + if (this == &rhs) return *this; + + if (_impl) + impl::xpath_query_impl::destroy(static_cast(_impl)); + + _impl = rhs._impl; + _result = rhs._result; + rhs._impl = 0; + rhs._result = xpath_parse_result(); + + return *this; +} +#endif + +PUGI__FN xpath_value_type xpath_query::return_type() const +{ + if (!_impl) return xpath_type_none; + + return static_cast(_impl)->root->rettype(); +} + +PUGI__FN bool xpath_query::evaluate_boolean(const xpath_node& n) const +{ + if (!_impl) return false; + + impl::xpath_context c(n, 1, 1); + impl::xpath_stack_data sd; + +#ifdef PUGIXML_NO_EXCEPTIONS + if (setjmp(sd.error_handler)) return false; +#endif + + return static_cast(_impl)->root->eval_boolean(c, 
sd.stack); +} + +PUGI__FN double xpath_query::evaluate_number(const xpath_node& n) const +{ + if (!_impl) return impl::gen_nan(); + + impl::xpath_context c(n, 1, 1); + impl::xpath_stack_data sd; + +#ifdef PUGIXML_NO_EXCEPTIONS + if (setjmp(sd.error_handler)) return impl::gen_nan(); +#endif + + return static_cast(_impl)->root->eval_number(c, sd.stack); +} + +#ifndef PUGIXML_NO_STL +PUGI__FN string_t xpath_query::evaluate_string(const xpath_node& n) const +{ + impl::xpath_stack_data sd; + + impl::xpath_string r = impl::evaluate_string_impl(static_cast(_impl), n, sd); + + return string_t(r.c_str(), r.length()); +} +#endif + +PUGI__FN size_t xpath_query::evaluate_string(char_t* buffer, size_t capacity, const xpath_node& n) const +{ + impl::xpath_stack_data sd; + + impl::xpath_string r = impl::evaluate_string_impl(static_cast(_impl), n, sd); + + size_t full_size = r.length() + 1; + + if (capacity > 0) { + size_t size = (full_size < capacity) ? full_size : capacity; + assert(size > 0); + + memcpy(buffer, r.c_str(), (size - 1) * sizeof(char_t)); + buffer[size - 1] = 0; + } + + return full_size; +} + +PUGI__FN xpath_node_set xpath_query::evaluate_node_set(const xpath_node& n) const +{ + impl::xpath_ast_node* root = impl::evaluate_node_set_prepare(static_cast(_impl)); + if (!root) return xpath_node_set(); + + impl::xpath_context c(n, 1, 1); + impl::xpath_stack_data sd; + +#ifdef PUGIXML_NO_EXCEPTIONS + if (setjmp(sd.error_handler)) return xpath_node_set(); +#endif + + impl::xpath_node_set_raw r = root->eval_node_set(c, sd.stack, impl::nodeset_eval_all); + + return xpath_node_set(r.begin(), r.end(), r.type()); +} + +PUGI__FN xpath_node xpath_query::evaluate_node(const xpath_node& n) const +{ + impl::xpath_ast_node* root = impl::evaluate_node_set_prepare(static_cast(_impl)); + if (!root) return xpath_node(); + + impl::xpath_context c(n, 1, 1); + impl::xpath_stack_data sd; + +#ifdef PUGIXML_NO_EXCEPTIONS + if (setjmp(sd.error_handler)) return xpath_node(); +#endif + + 
impl::xpath_node_set_raw r = root->eval_node_set(c, sd.stack, impl::nodeset_eval_first); + + return r.first(); +} + +PUGI__FN const xpath_parse_result& xpath_query::result() const +{ + return _result; +} + +PUGI__FN static void unspecified_bool_xpath_query(xpath_query***) +{ +} + +PUGI__FN xpath_query::operator xpath_query::unspecified_bool_type() const +{ + return _impl ? unspecified_bool_xpath_query : 0; +} + +PUGI__FN bool xpath_query::operator!() const +{ + return !_impl; +} + +PUGI__FN xpath_node xml_node::select_node(const char_t* query, xpath_variable_set* variables) const +{ + xpath_query q(query, variables); + return select_node(q); +} + +PUGI__FN xpath_node xml_node::select_node(const xpath_query& query) const +{ + return query.evaluate_node(*this); +} + +PUGI__FN xpath_node_set xml_node::select_nodes(const char_t* query, xpath_variable_set* variables) const +{ + xpath_query q(query, variables); + return select_nodes(q); +} + +PUGI__FN xpath_node_set xml_node::select_nodes(const xpath_query& query) const +{ + return query.evaluate_node_set(*this); +} + +PUGI__FN xpath_node xml_node::select_single_node(const char_t* query, xpath_variable_set* variables) const +{ + xpath_query q(query, variables); + return select_single_node(q); +} + +PUGI__FN xpath_node xml_node::select_single_node(const xpath_query& query) const +{ + return query.evaluate_node(*this); +} +} + +#endif + +#ifdef __BORLANDC__ +# pragma option pop +#endif + +// Intel C++ does not properly keep warning state for function templates, +// so popping warning state at the end of translation unit leads to warnings in the middle. 
+#if defined(_MSC_VER) && !defined(__INTEL_COMPILER) +# pragma warning(pop) +#endif + +// Undefine all local macros (makes sure we're not leaking macros in header-only mode) +#undef PUGI__NO_INLINE +#undef PUGI__UNLIKELY +#undef PUGI__STATIC_ASSERT +#undef PUGI__DMC_VOLATILE +#undef PUGI__MSVC_CRT_VERSION +#undef PUGI__NS_BEGIN +#undef PUGI__NS_END +#undef PUGI__FN +#undef PUGI__FN_NO_INLINE +#undef PUGI__GETPAGE_IMPL +#undef PUGI__GETPAGE +#undef PUGI__NODETYPE +#undef PUGI__IS_CHARTYPE_IMPL +#undef PUGI__IS_CHARTYPE +#undef PUGI__IS_CHARTYPEX +#undef PUGI__ENDSWITH +#undef PUGI__SKIPWS +#undef PUGI__OPTSET +#undef PUGI__PUSHNODE +#undef PUGI__POPNODE +#undef PUGI__SCANFOR +#undef PUGI__SCANWHILE +#undef PUGI__SCANWHILE_UNROLL +#undef PUGI__ENDSEG +#undef PUGI__THROW_ERROR +#undef PUGI__CHECK_ERROR + +#endif + +/** + * Copyright (c) 2006-2015 Arseny Kapoulkine + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. 
+ */ diff --git a/mosesdecoder/moses2/pugixml.hpp b/mosesdecoder/moses2/pugixml.hpp new file mode 100644 index 0000000000000000000000000000000000000000..13bf7917b727865b40dfeb4bb547e9add706000b --- /dev/null +++ b/mosesdecoder/moses2/pugixml.hpp @@ -0,0 +1,1391 @@ +/** + * pugixml parser - version 1.7 + * -------------------------------------------------------- + * Copyright (C) 2006-2015, by Arseny Kapoulkine (arseny.kapoulkine@gmail.com) + * Report bugs and download new versions at http://pugixml.org/ + * + * This library is distributed under the MIT License. See notice at the end + * of this file. + * + * This work is based on the pugxml parser, which is: + * Copyright (C) 2003, by Kristen Wegner (kristen@tima.net) + */ + +#ifndef PUGIXML_VERSION +// Define version macro; evaluates to major * 100 + minor so that it's safe to use in less-than comparisons +# define PUGIXML_VERSION 170 +#endif + +// Include user configuration file (this can define various configuration macros) +#include "pugiconfig.hpp" + +#ifndef HEADER_PUGIXML_HPP +#define HEADER_PUGIXML_HPP + +// Include stddef.h for size_t and ptrdiff_t +#include + +// Include exception header for XPath +#if !defined(PUGIXML_NO_XPATH) && !defined(PUGIXML_NO_EXCEPTIONS) +# include +#endif + +// Include STL headers +#ifndef PUGIXML_NO_STL +# include +# include +# include +#endif + +// Macro for deprecated features +#ifndef PUGIXML_DEPRECATED +# if defined(__GNUC__) +# define PUGIXML_DEPRECATED __attribute__((deprecated)) +# elif defined(_MSC_VER) && _MSC_VER >= 1300 +# define PUGIXML_DEPRECATED __declspec(deprecated) +# else +# define PUGIXML_DEPRECATED +# endif +#endif + +// If no API is defined, assume default +#ifndef PUGIXML_API +# define PUGIXML_API +#endif + +// If no API for classes is defined, assume default +#ifndef PUGIXML_CLASS +# define PUGIXML_CLASS PUGIXML_API +#endif + +// If no API for functions is defined, assume default +#ifndef PUGIXML_FUNCTION +# define PUGIXML_FUNCTION PUGIXML_API +#endif + 
+// If the platform is known to have long long support, enable long long functions +#ifndef PUGIXML_HAS_LONG_LONG +# if __cplusplus >= 201103 +# define PUGIXML_HAS_LONG_LONG +# elif defined(_MSC_VER) && _MSC_VER >= 1400 +# define PUGIXML_HAS_LONG_LONG +# endif +#endif + +// Character interface macros +#ifdef PUGIXML_WCHAR_MODE +# define PUGIXML_TEXT(t) L ## t +# define PUGIXML_CHAR wchar_t +#else +# define PUGIXML_TEXT(t) t +# define PUGIXML_CHAR char +#endif + +namespace pugi +{ +// Character type used for all internal storage and operations; depends on PUGIXML_WCHAR_MODE +typedef PUGIXML_CHAR char_t; + +#ifndef PUGIXML_NO_STL +// String type used for operations that work with STL string; depends on PUGIXML_WCHAR_MODE +typedef std::basic_string, std::allocator > string_t; +#endif +} + +// The PugiXML namespace +namespace pugi +{ +// Tree node types +enum xml_node_type { + node_null, // Empty (null) node handle + node_document, // A document tree's absolute root + node_element, // Element tag, i.e. '' + node_pcdata, // Plain character data, i.e. 'text' + node_cdata, // Character data, i.e. '' + node_comment, // Comment tag, i.e. '' + node_pi, // Processing instruction, i.e. '' + node_declaration, // Document declaration, i.e. '' + node_doctype // Document type declaration, i.e. '' +}; + +// Parsing options + +// Minimal parsing mode (equivalent to turning all other flags off). +// Only elements and PCDATA sections are added to the DOM tree, no text conversions are performed. +const unsigned int parse_minimal = 0x0000; + +// This flag determines if processing instructions (node_pi) are added to the DOM tree. This flag is off by default. +const unsigned int parse_pi = 0x0001; + +// This flag determines if comments (node_comment) are added to the DOM tree. This flag is off by default. +const unsigned int parse_comments = 0x0002; + +// This flag determines if CDATA sections (node_cdata) are added to the DOM tree. This flag is on by default. 
+const unsigned int parse_cdata = 0x0004; + +// This flag determines if plain character data (node_pcdata) that consist only of whitespace are added to the DOM tree. +// This flag is off by default; turning it on usually results in slower parsing and more memory consumption. +const unsigned int parse_ws_pcdata = 0x0008; + +// This flag determines if character and entity references are expanded during parsing. This flag is on by default. +const unsigned int parse_escapes = 0x0010; + +// This flag determines if EOL characters are normalized (converted to #xA) during parsing. This flag is on by default. +const unsigned int parse_eol = 0x0020; + +// This flag determines if attribute values are normalized using CDATA normalization rules during parsing. This flag is on by default. +const unsigned int parse_wconv_attribute = 0x0040; + +// This flag determines if attribute values are normalized using NMTOKENS normalization rules during parsing. This flag is off by default. +const unsigned int parse_wnorm_attribute = 0x0080; + +// This flag determines if document declaration (node_declaration) is added to the DOM tree. This flag is off by default. +const unsigned int parse_declaration = 0x0100; + +// This flag determines if document type declaration (node_doctype) is added to the DOM tree. This flag is off by default. +const unsigned int parse_doctype = 0x0200; + +// This flag determines if plain character data (node_pcdata) that is the only child of the parent node and that consists only +// of whitespace is added to the DOM tree. +// This flag is off by default; turning it on may result in slower parsing and more memory consumption. +const unsigned int parse_ws_pcdata_single = 0x0400; + +// This flag determines if leading and trailing whitespace is to be removed from plain character data. This flag is off by default. 
+const unsigned int parse_trim_pcdata = 0x0800; + +// This flag determines if plain character data that does not have a parent node is added to the DOM tree, and if an empty document +// is a valid document. This flag is off by default. +const unsigned int parse_fragment = 0x1000; + +// The default parsing mode. +// Elements, PCDATA and CDATA sections are added to the DOM tree, character/reference entities are expanded, +// End-of-Line characters are normalized, attribute values are normalized using CDATA normalization rules. +const unsigned int parse_default = parse_cdata | parse_escapes | parse_wconv_attribute | parse_eol; + +// The full parsing mode. +// Nodes of all types are added to the DOM tree, character/reference entities are expanded, +// End-of-Line characters are normalized, attribute values are normalized using CDATA normalization rules. +const unsigned int parse_full = parse_default | parse_pi | parse_comments | parse_declaration | parse_doctype; + +// These flags determine the encoding of input data for XML document +enum xml_encoding { + encoding_auto, // Auto-detect input encoding using BOM or < / class xml_object_range +{ +public: + typedef It const_iterator; + typedef It iterator; + + xml_object_range(It b, It e): _begin(b), _end(e) { + } + + It begin() const { + return _begin; + } + It end() const { + return _end; + } + +private: + It _begin, _end; +}; + +// Writer interface for node printing (see xml_node::print) +class PUGIXML_CLASS xml_writer +{ +public: + virtual ~xml_writer() {} + + // Write memory chunk into stream/file/whatever + virtual void write(const void* data, size_t size) = 0; +}; + +// xml_writer implementation for FILE* +class PUGIXML_CLASS xml_writer_file: public xml_writer +{ +public: + // Construct writer from a FILE* object; void* is used to avoid header dependencies on stdio + xml_writer_file(void* file); + + virtual void write(const void* data, size_t size); + +private: + void* file; +}; + +#ifndef PUGIXML_NO_STL +// 
xml_writer implementation for streams +class PUGIXML_CLASS xml_writer_stream: public xml_writer +{ +public: + // Construct writer from an output stream object + xml_writer_stream(std::basic_ostream >& stream); + xml_writer_stream(std::basic_ostream >& stream); + + virtual void write(const void* data, size_t size); + +private: + std::basic_ostream >* narrow_stream; + std::basic_ostream >* wide_stream; +}; +#endif + +// A light-weight handle for manipulating attributes in DOM tree +class PUGIXML_CLASS xml_attribute +{ + friend class xml_attribute_iterator; + friend class xml_node; + +private: + xml_attribute_struct* _attr; + + typedef void (*unspecified_bool_type)(xml_attribute***); + +public: + // Default constructor. Constructs an empty attribute. + xml_attribute(); + + // Constructs attribute from internal pointer + explicit xml_attribute(xml_attribute_struct* attr); + + // Safe bool conversion operator + operator unspecified_bool_type() const; + + // Borland C++ workaround + bool operator!() const; + + // Comparison operators (compares wrapped attribute pointers) + bool operator==(const xml_attribute& r) const; + bool operator!=(const xml_attribute& r) const; + bool operator<(const xml_attribute& r) const; + bool operator>(const xml_attribute& r) const; + bool operator<=(const xml_attribute& r) const; + bool operator>=(const xml_attribute& r) const; + + // Check if attribute is empty + bool empty() const; + + // Get attribute name/value, or "" if attribute is empty + const char_t* name() const; + const char_t* value() const; + + // Get attribute value, or the default value if attribute is empty + const char_t* as_string(const char_t* def = PUGIXML_TEXT("")) const; + + // Get attribute value as a number, or the default value if conversion did not succeed or attribute is empty + int as_int(int def = 0) const; + unsigned int as_uint(unsigned int def = 0) const; + double as_double(double def = 0) const; + float as_float(float def = 0) const; + +#ifdef 
PUGIXML_HAS_LONG_LONG + long long as_llong(long long def = 0) const; + unsigned long long as_ullong(unsigned long long def = 0) const; +#endif + + // Get attribute value as bool (returns true if first character is in '1tTyY' set), or the default value if attribute is empty + bool as_bool(bool def = false) const; + + // Set attribute name/value (returns false if attribute is empty or there is not enough memory) + bool set_name(const char_t* rhs); + bool set_value(const char_t* rhs); + + // Set attribute value with type conversion (numbers are converted to strings, boolean is converted to "true"/"false") + bool set_value(int rhs); + bool set_value(unsigned int rhs); + bool set_value(double rhs); + bool set_value(float rhs); + bool set_value(bool rhs); + +#ifdef PUGIXML_HAS_LONG_LONG + bool set_value(long long rhs); + bool set_value(unsigned long long rhs); +#endif + + // Set attribute value (equivalent to set_value without error checking) + xml_attribute& operator=(const char_t* rhs); + xml_attribute& operator=(int rhs); + xml_attribute& operator=(unsigned int rhs); + xml_attribute& operator=(double rhs); + xml_attribute& operator=(float rhs); + xml_attribute& operator=(bool rhs); + +#ifdef PUGIXML_HAS_LONG_LONG + xml_attribute& operator=(long long rhs); + xml_attribute& operator=(unsigned long long rhs); +#endif + + // Get next/previous attribute in the attribute list of the parent node + xml_attribute next_attribute() const; + xml_attribute previous_attribute() const; + + // Get hash value (unique for handles to the same object) + size_t hash_value() const; + + // Get internal pointer + xml_attribute_struct* internal_object() const; +}; + +#ifdef __BORLANDC__ +// Borland C++ workaround +bool PUGIXML_FUNCTION operator&&(const xml_attribute& lhs, bool rhs); +bool PUGIXML_FUNCTION operator||(const xml_attribute& lhs, bool rhs); +#endif + +// A light-weight handle for manipulating nodes in DOM tree +class PUGIXML_CLASS xml_node +{ + friend class xml_attribute_iterator; 
+ friend class xml_node_iterator; + friend class xml_named_node_iterator; + +protected: + xml_node_struct* _root; + + typedef void (*unspecified_bool_type)(xml_node***); + +public: + // Default constructor. Constructs an empty node. + xml_node(); + + // Constructs node from internal pointer + explicit xml_node(xml_node_struct* p); + + // Safe bool conversion operator + operator unspecified_bool_type() const; + + // Borland C++ workaround + bool operator!() const; + + // Comparison operators (compares wrapped node pointers) + bool operator==(const xml_node& r) const; + bool operator!=(const xml_node& r) const; + bool operator<(const xml_node& r) const; + bool operator>(const xml_node& r) const; + bool operator<=(const xml_node& r) const; + bool operator>=(const xml_node& r) const; + + // Check if node is empty. + bool empty() const; + + // Get node type + xml_node_type type() const; + + // Get node name, or "" if node is empty or it has no name + const char_t* name() const; + + // Get node value, or "" if node is empty or it has no value + // Note: For text node.value() does not return "text"! Use child_value() or text() methods to access text inside nodes. 
+ const char_t* value() const; + + // Get attribute list + xml_attribute first_attribute() const; + xml_attribute last_attribute() const; + + // Get children list + xml_node first_child() const; + xml_node last_child() const; + + // Get next/previous sibling in the children list of the parent node + xml_node next_sibling() const; + xml_node previous_sibling() const; + + // Get parent node + xml_node parent() const; + + // Get root of DOM tree this node belongs to + xml_node root() const; + + // Get text object for the current node + xml_text text() const; + + // Get child, attribute or next/previous sibling with the specified name + xml_node child(const char_t* name) const; + xml_attribute attribute(const char_t* name) const; + xml_node next_sibling(const char_t* name) const; + xml_node previous_sibling(const char_t* name) const; + + // Get attribute, starting the search from a hint (and updating hint so that searching for a sequence of attributes is fast) + xml_attribute attribute(const char_t* name, xml_attribute& hint) const; + + // Get child value of current node; that is, value of the first child node of type PCDATA/CDATA + const char_t* child_value() const; + + // Get child value of child with specified name. Equivalent to child(name).child_value(). + const char_t* child_value(const char_t* name) const; + + // Set node name/value (returns false if node is empty, there is not enough memory, or node can not have name/value) + bool set_name(const char_t* rhs); + bool set_value(const char_t* rhs); + + // Add attribute with specified name. Returns added attribute, or empty attribute on errors. + xml_attribute append_attribute(const char_t* name); + xml_attribute prepend_attribute(const char_t* name); + xml_attribute insert_attribute_after(const char_t* name, const xml_attribute& attr); + xml_attribute insert_attribute_before(const char_t* name, const xml_attribute& attr); + + // Add a copy of the specified attribute. 
Returns added attribute, or empty attribute on errors. + xml_attribute append_copy(const xml_attribute& proto); + xml_attribute prepend_copy(const xml_attribute& proto); + xml_attribute insert_copy_after(const xml_attribute& proto, const xml_attribute& attr); + xml_attribute insert_copy_before(const xml_attribute& proto, const xml_attribute& attr); + + // Add child node with specified type. Returns added node, or empty node on errors. + xml_node append_child(xml_node_type type = node_element); + xml_node prepend_child(xml_node_type type = node_element); + xml_node insert_child_after(xml_node_type type, const xml_node& node); + xml_node insert_child_before(xml_node_type type, const xml_node& node); + + // Add child element with specified name. Returns added node, or empty node on errors. + xml_node append_child(const char_t* name); + xml_node prepend_child(const char_t* name); + xml_node insert_child_after(const char_t* name, const xml_node& node); + xml_node insert_child_before(const char_t* name, const xml_node& node); + + // Add a copy of the specified node as a child. Returns added node, or empty node on errors. + xml_node append_copy(const xml_node& proto); + xml_node prepend_copy(const xml_node& proto); + xml_node insert_copy_after(const xml_node& proto, const xml_node& node); + xml_node insert_copy_before(const xml_node& proto, const xml_node& node); + + // Move the specified node to become a child of this node. Returns moved node, or empty node on errors. 
+ xml_node append_move(const xml_node& moved); + xml_node prepend_move(const xml_node& moved); + xml_node insert_move_after(const xml_node& moved, const xml_node& node); + xml_node insert_move_before(const xml_node& moved, const xml_node& node); + + // Remove specified attribute + bool remove_attribute(const xml_attribute& a); + bool remove_attribute(const char_t* name); + + // Remove specified child + bool remove_child(const xml_node& n); + bool remove_child(const char_t* name); + + // Parses buffer as an XML document fragment and appends all nodes as children of the current node. + // Copies/converts the buffer, so it may be deleted or changed after the function returns. + // Note: append_buffer allocates memory that has the lifetime of the owning document; removing the appended nodes does not immediately reclaim that memory. + xml_parse_result append_buffer(const void* contents, size_t size, unsigned int options = parse_default, xml_encoding encoding = encoding_auto); + + // Find attribute using predicate. Returns first attribute for which predicate returned true. + template xml_attribute find_attribute(Predicate pred) const { + if (!_root) return xml_attribute(); + + for (xml_attribute attrib = first_attribute(); attrib; attrib = attrib.next_attribute()) + if (pred(attrib)) + return attrib; + + return xml_attribute(); + } + + // Find child node using predicate. Returns first child for which predicate returned true. + template xml_node find_child(Predicate pred) const { + if (!_root) return xml_node(); + + for (xml_node node = first_child(); node; node = node.next_sibling()) + if (pred(node)) + return node; + + return xml_node(); + } + + // Find node from subtree using predicate. Returns first node from subtree (depth-first), for which predicate returned true. 
+ template xml_node find_node(Predicate pred) const { + if (!_root) return xml_node(); + + xml_node cur = first_child(); + + while (cur._root && cur._root != _root) { + if (pred(cur)) return cur; + + if (cur.first_child()) cur = cur.first_child(); + else if (cur.next_sibling()) cur = cur.next_sibling(); + else { + while (!cur.next_sibling() && cur._root != _root) cur = cur.parent(); + + if (cur._root != _root) cur = cur.next_sibling(); + } + } + + return xml_node(); + } + + // Find child node by attribute name/value + xml_node find_child_by_attribute(const char_t* name, const char_t* attr_name, const char_t* attr_value) const; + xml_node find_child_by_attribute(const char_t* attr_name, const char_t* attr_value) const; + +#ifndef PUGIXML_NO_STL + // Get the absolute node path from root as a text string. + string_t path(char_t delimiter = '/') const; +#endif + + // Search for a node by path consisting of node names and . or .. elements. + xml_node first_element_by_path(const char_t* path, char_t delimiter = '/') const; + + // Recursively traverse subtree with xml_tree_walker + bool traverse(xml_tree_walker& walker); + +#ifndef PUGIXML_NO_XPATH + // Select single node by evaluating XPath query. Returns first node from the resulting node set. + xpath_node select_node(const char_t* query, xpath_variable_set* variables = 0) const; + xpath_node select_node(const xpath_query& query) const; + + // Select node set by evaluating XPath query + xpath_node_set select_nodes(const char_t* query, xpath_variable_set* variables = 0) const; + xpath_node_set select_nodes(const xpath_query& query) const; + + // (deprecated: use select_node instead) Select single node by evaluating XPath query. 
+ xpath_node select_single_node(const char_t* query, xpath_variable_set* variables = 0) const; + xpath_node select_single_node(const xpath_query& query) const; + +#endif + + // Print subtree using a writer object + void print(xml_writer& writer, const char_t* indent = PUGIXML_TEXT("\t"), unsigned int flags = format_default, xml_encoding encoding = encoding_auto, unsigned int depth = 0) const; + +#ifndef PUGIXML_NO_STL + // Print subtree to stream + void print(std::basic_ostream >& os, const char_t* indent = PUGIXML_TEXT("\t"), unsigned int flags = format_default, xml_encoding encoding = encoding_auto, unsigned int depth = 0) const; + void print(std::basic_ostream >& os, const char_t* indent = PUGIXML_TEXT("\t"), unsigned int flags = format_default, unsigned int depth = 0) const; +#endif + + // Child nodes iterators + typedef xml_node_iterator iterator; + + iterator begin() const; + iterator end() const; + + // Attribute iterators + typedef xml_attribute_iterator attribute_iterator; + + attribute_iterator attributes_begin() const; + attribute_iterator attributes_end() const; + + // Range-based for support + xml_object_range children() const; + xml_object_range children(const char_t* name) const; + xml_object_range attributes() const; + + // Get node offset in parsed file/string (in char_t units) for debugging purposes + ptrdiff_t offset_debug() const; + + // Get hash value (unique for handles to the same object) + size_t hash_value() const; + + // Get internal pointer + xml_node_struct* internal_object() const; +}; + +#ifdef __BORLANDC__ +// Borland C++ workaround +bool PUGIXML_FUNCTION operator&&(const xml_node& lhs, bool rhs); +bool PUGIXML_FUNCTION operator||(const xml_node& lhs, bool rhs); +#endif + +// A helper for working with text inside PCDATA nodes +class PUGIXML_CLASS xml_text +{ + friend class xml_node; + + xml_node_struct* _root; + + typedef void (*unspecified_bool_type)(xml_text***); + + explicit xml_text(xml_node_struct* root); + + xml_node_struct* 
_data_new(); + xml_node_struct* _data() const; + +public: + // Default constructor. Constructs an empty object. + xml_text(); + + // Safe bool conversion operator + operator unspecified_bool_type() const; + + // Borland C++ workaround + bool operator!() const; + + // Check if text object is empty + bool empty() const; + + // Get text, or "" if object is empty + const char_t* get() const; + + // Get text, or the default value if object is empty + const char_t* as_string(const char_t* def = PUGIXML_TEXT("")) const; + + // Get text as a number, or the default value if conversion did not succeed or object is empty + int as_int(int def = 0) const; + unsigned int as_uint(unsigned int def = 0) const; + double as_double(double def = 0) const; + float as_float(float def = 0) const; + +#ifdef PUGIXML_HAS_LONG_LONG + long long as_llong(long long def = 0) const; + unsigned long long as_ullong(unsigned long long def = 0) const; +#endif + + // Get text as bool (returns true if first character is in '1tTyY' set), or the default value if object is empty + bool as_bool(bool def = false) const; + + // Set text (returns false if object is empty or there is not enough memory) + bool set(const char_t* rhs); + + // Set text with type conversion (numbers are converted to strings, boolean is converted to "true"/"false") + bool set(int rhs); + bool set(unsigned int rhs); + bool set(double rhs); + bool set(float rhs); + bool set(bool rhs); + +#ifdef PUGIXML_HAS_LONG_LONG + bool set(long long rhs); + bool set(unsigned long long rhs); +#endif + + // Set text (equivalent to set without error checking) + xml_text& operator=(const char_t* rhs); + xml_text& operator=(int rhs); + xml_text& operator=(unsigned int rhs); + xml_text& operator=(double rhs); + xml_text& operator=(float rhs); + xml_text& operator=(bool rhs); + +#ifdef PUGIXML_HAS_LONG_LONG + xml_text& operator=(long long rhs); + xml_text& operator=(unsigned long long rhs); +#endif + + // Get the data node (node_pcdata or node_cdata) for 
this object + xml_node data() const; +}; + +#ifdef __BORLANDC__ +// Borland C++ workaround +bool PUGIXML_FUNCTION operator&&(const xml_text& lhs, bool rhs); +bool PUGIXML_FUNCTION operator||(const xml_text& lhs, bool rhs); +#endif + +// Child node iterator (a bidirectional iterator over a collection of xml_node) +class PUGIXML_CLASS xml_node_iterator +{ + friend class xml_node; + +private: + mutable xml_node _wrap; + xml_node _parent; + + xml_node_iterator(xml_node_struct* ref, xml_node_struct* parent); + +public: + // Iterator traits + typedef ptrdiff_t difference_type; + typedef xml_node value_type; + typedef xml_node* pointer; + typedef xml_node& reference; + +#ifndef PUGIXML_NO_STL + typedef std::bidirectional_iterator_tag iterator_category; +#endif + + // Default constructor + xml_node_iterator(); + + // Construct an iterator which points to the specified node + xml_node_iterator(const xml_node& node); + + // Iterator operators + bool operator==(const xml_node_iterator& rhs) const; + bool operator!=(const xml_node_iterator& rhs) const; + + xml_node& operator*() const; + xml_node* operator->() const; + + const xml_node_iterator& operator++(); + xml_node_iterator operator++(int); + + const xml_node_iterator& operator--(); + xml_node_iterator operator--(int); +}; + +// Attribute iterator (a bidirectional iterator over a collection of xml_attribute) +class PUGIXML_CLASS xml_attribute_iterator +{ + friend class xml_node; + +private: + mutable xml_attribute _wrap; + xml_node _parent; + + xml_attribute_iterator(xml_attribute_struct* ref, xml_node_struct* parent); + +public: + // Iterator traits + typedef ptrdiff_t difference_type; + typedef xml_attribute value_type; + typedef xml_attribute* pointer; + typedef xml_attribute& reference; + +#ifndef PUGIXML_NO_STL + typedef std::bidirectional_iterator_tag iterator_category; +#endif + + // Default constructor + xml_attribute_iterator(); + + // Construct an iterator which points to the specified attribute + 
xml_attribute_iterator(const xml_attribute& attr, const xml_node& parent); + + // Iterator operators + bool operator==(const xml_attribute_iterator& rhs) const; + bool operator!=(const xml_attribute_iterator& rhs) const; + + xml_attribute& operator*() const; + xml_attribute* operator->() const; + + const xml_attribute_iterator& operator++(); + xml_attribute_iterator operator++(int); + + const xml_attribute_iterator& operator--(); + xml_attribute_iterator operator--(int); +}; + +// Named node range helper +class PUGIXML_CLASS xml_named_node_iterator +{ + friend class xml_node; + +public: + // Iterator traits + typedef ptrdiff_t difference_type; + typedef xml_node value_type; + typedef xml_node* pointer; + typedef xml_node& reference; + +#ifndef PUGIXML_NO_STL + typedef std::bidirectional_iterator_tag iterator_category; +#endif + + // Default constructor + xml_named_node_iterator(); + + // Construct an iterator which points to the specified node + xml_named_node_iterator(const xml_node& node, const char_t* name); + + // Iterator operators + bool operator==(const xml_named_node_iterator& rhs) const; + bool operator!=(const xml_named_node_iterator& rhs) const; + + xml_node& operator*() const; + xml_node* operator->() const; + + const xml_named_node_iterator& operator++(); + xml_named_node_iterator operator++(int); + + const xml_named_node_iterator& operator--(); + xml_named_node_iterator operator--(int); + +private: + mutable xml_node _wrap; + xml_node _parent; + const char_t* _name; + + xml_named_node_iterator(xml_node_struct* ref, xml_node_struct* parent, const char_t* name); +}; + +// Abstract tree walker class (see xml_node::traverse) +class PUGIXML_CLASS xml_tree_walker +{ + friend class xml_node; + +private: + int _depth; + +protected: + // Get current traversal depth + int depth() const; + +public: + xml_tree_walker(); + virtual ~xml_tree_walker(); + + // Callback that is called when traversal begins + virtual bool begin(xml_node& node); + + // Callback that is 
called for each node traversed + virtual bool for_each(xml_node& node) = 0; + + // Callback that is called when traversal ends + virtual bool end(xml_node& node); +}; + +// Parsing status, returned as part of xml_parse_result object +enum xml_parse_status { + status_ok = 0, // No error + + status_file_not_found, // File was not found during load_file() + status_io_error, // Error reading from file/stream + status_out_of_memory, // Could not allocate memory + status_internal_error, // Internal error occurred + + status_unrecognized_tag, // Parser could not determine tag type + + status_bad_pi, // Parsing error occurred while parsing document declaration/processing instruction + status_bad_comment, // Parsing error occurred while parsing comment + status_bad_cdata, // Parsing error occurred while parsing CDATA section + status_bad_doctype, // Parsing error occurred while parsing document type declaration + status_bad_pcdata, // Parsing error occurred while parsing PCDATA section + status_bad_start_element, // Parsing error occurred while parsing start element tag + status_bad_attribute, // Parsing error occurred while parsing element attribute + status_bad_end_element, // Parsing error occurred while parsing end element tag + status_end_element_mismatch,// There was a mismatch of start-end tags (closing tag had incorrect name, some tag was not closed or there was an excessive closing tag) + + status_append_invalid_root, // Unable to append nodes since root type is not node_element or node_document (exclusive to xml_node::append_buffer) + + status_no_document_element // Parsing resulted in a document without element nodes +}; + +// Parsing result +struct PUGIXML_CLASS xml_parse_result { + // Parsing status (see xml_parse_status) + xml_parse_status status; + + // Last parsed offset (in char_t units from start of input data) + ptrdiff_t offset; + + // Source document encoding + xml_encoding encoding; + + // Default constructor, initializes object to failed state + 
xml_parse_result(); + + // Cast to bool operator + operator bool() const; + + // Get error description + const char* description() const; +}; + +// Document class (DOM tree root) +class PUGIXML_CLASS xml_document: public xml_node +{ +private: + char_t* _buffer; + + char _memory[192]; + + // Non-copyable semantics + xml_document(const xml_document&); + xml_document& operator=(const xml_document&); + + void create(); + void destroy(); + +public: + // Default constructor, makes empty document + xml_document(); + + // Destructor, invalidates all node/attribute handles to this document + ~xml_document(); + + // Removes all nodes, leaving the empty document + void reset(); + + // Removes all nodes, then copies the entire contents of the specified document + void reset(const xml_document& proto); + +#ifndef PUGIXML_NO_STL + // Load document from stream. + xml_parse_result load(std::basic_istream >& stream, unsigned int options = parse_default, xml_encoding encoding = encoding_auto); + xml_parse_result load(std::basic_istream >& stream, unsigned int options = parse_default); +#endif + + // (deprecated: use load_string instead) Load document from zero-terminated string. No encoding conversions are applied. + xml_parse_result load(const char_t* contents, unsigned int options = parse_default); + + // Load document from zero-terminated string. No encoding conversions are applied. + xml_parse_result load_string(const char_t* contents, unsigned int options = parse_default); + + // Load document from file + xml_parse_result load_file(const char* path, unsigned int options = parse_default, xml_encoding encoding = encoding_auto); + xml_parse_result load_file(const wchar_t* path, unsigned int options = parse_default, xml_encoding encoding = encoding_auto); + + // Load document from buffer. Copies/converts the buffer, so it may be deleted or changed after the function returns. 
+ xml_parse_result load_buffer(const void* contents, size_t size, unsigned int options = parse_default, xml_encoding encoding = encoding_auto); + + // Load document from buffer, using the buffer for in-place parsing (the buffer is modified and used for storage of document data). + // You should ensure that buffer data will persist throughout the document's lifetime, and free the buffer memory manually once document is destroyed. + xml_parse_result load_buffer_inplace(void* contents, size_t size, unsigned int options = parse_default, xml_encoding encoding = encoding_auto); + + // Load document from buffer, using the buffer for in-place parsing (the buffer is modified and used for storage of document data). + // You should allocate the buffer with pugixml allocation function; document will free the buffer when it is no longer needed (you can't use it anymore). + xml_parse_result load_buffer_inplace_own(void* contents, size_t size, unsigned int options = parse_default, xml_encoding encoding = encoding_auto); + + // Save XML document to writer (semantics is slightly different from xml_node::print, see documentation for details). + void save(xml_writer& writer, const char_t* indent = PUGIXML_TEXT("\t"), unsigned int flags = format_default, xml_encoding encoding = encoding_auto) const; + +#ifndef PUGIXML_NO_STL + // Save XML document to stream (semantics is slightly different from xml_node::print, see documentation for details). 
+ void save(std::basic_ostream >& stream, const char_t* indent = PUGIXML_TEXT("\t"), unsigned int flags = format_default, xml_encoding encoding = encoding_auto) const; + void save(std::basic_ostream >& stream, const char_t* indent = PUGIXML_TEXT("\t"), unsigned int flags = format_default) const; +#endif + + // Save XML to file + bool save_file(const char* path, const char_t* indent = PUGIXML_TEXT("\t"), unsigned int flags = format_default, xml_encoding encoding = encoding_auto) const; + bool save_file(const wchar_t* path, const char_t* indent = PUGIXML_TEXT("\t"), unsigned int flags = format_default, xml_encoding encoding = encoding_auto) const; + + // Get document element + xml_node document_element() const; +}; + +#ifndef PUGIXML_NO_XPATH +// XPath query return type +enum xpath_value_type { + xpath_type_none, // Unknown type (query failed to compile) + xpath_type_node_set, // Node set (xpath_node_set) + xpath_type_number, // Number + xpath_type_string, // String + xpath_type_boolean // Boolean +}; + +// XPath parsing result +struct PUGIXML_CLASS xpath_parse_result { + // Error message (0 if no error) + const char* error; + + // Last parsed offset (in char_t units from string start) + ptrdiff_t offset; + + // Default constructor, initializes object to failed state + xpath_parse_result(); + + // Cast to bool operator + operator bool() const; + + // Get error description + const char* description() const; +}; + +// A single XPath variable +class PUGIXML_CLASS xpath_variable +{ + friend class xpath_variable_set; + +protected: + xpath_value_type _type; + xpath_variable* _next; + + xpath_variable(xpath_value_type type); + + // Non-copyable semantics + xpath_variable(const xpath_variable&); + xpath_variable& operator=(const xpath_variable&); + +public: + // Get variable name + const char_t* name() const; + + // Get variable type + xpath_value_type type() const; + + // Get variable value; no type conversion is performed, default value (false, NaN, empty string, empty 
node set) is returned on type mismatch error + bool get_boolean() const; + double get_number() const; + const char_t* get_string() const; + const xpath_node_set& get_node_set() const; + + // Set variable value; no type conversion is performed, false is returned on type mismatch error + bool set(bool value); + bool set(double value); + bool set(const char_t* value); + bool set(const xpath_node_set& value); +}; + +// A set of XPath variables +class PUGIXML_CLASS xpath_variable_set +{ +private: + xpath_variable* _data[64]; + + void _assign(const xpath_variable_set& rhs); + void _swap(xpath_variable_set& rhs); + + xpath_variable* _find(const char_t* name) const; + + static bool _clone(xpath_variable* var, xpath_variable** out_result); + static void _destroy(xpath_variable* var); + +public: + // Default constructor/destructor + xpath_variable_set(); + ~xpath_variable_set(); + + // Copy constructor/assignment operator + xpath_variable_set(const xpath_variable_set& rhs); + xpath_variable_set& operator=(const xpath_variable_set& rhs); + +#if __cplusplus >= 201103 + // Move semantics support + xpath_variable_set(xpath_variable_set&& rhs); + xpath_variable_set& operator=(xpath_variable_set&& rhs); +#endif + + // Add a new variable or get the existing one, if the types match + xpath_variable* add(const char_t* name, xpath_value_type type); + + // Set value of an existing variable; no type conversion is performed, false is returned if there is no such variable or if types mismatch + bool set(const char_t* name, bool value); + bool set(const char_t* name, double value); + bool set(const char_t* name, const char_t* value); + bool set(const char_t* name, const xpath_node_set& value); + + // Get existing variable by name + xpath_variable* get(const char_t* name); + const xpath_variable* get(const char_t* name) const; +}; + +// A compiled XPath query object +class PUGIXML_CLASS xpath_query +{ +private: + void* _impl; + xpath_parse_result _result; + + typedef void 
(*unspecified_bool_type)(xpath_query***); + + // Non-copyable semantics + xpath_query(const xpath_query&); + xpath_query& operator=(const xpath_query&); + +public: + // Construct a compiled object from XPath expression. + // If PUGIXML_NO_EXCEPTIONS is not defined, throws xpath_exception on compilation errors. + explicit xpath_query(const char_t* query, xpath_variable_set* variables = 0); + + // Constructor + xpath_query(); + + // Destructor + ~xpath_query(); + +#if __cplusplus >= 201103 + // Move semantics support + xpath_query(xpath_query&& rhs); + xpath_query& operator=(xpath_query&& rhs); +#endif + + // Get query expression return type + xpath_value_type return_type() const; + + // Evaluate expression as boolean value in the specified context; performs type conversion if necessary. + // If PUGIXML_NO_EXCEPTIONS is not defined, throws std::bad_alloc on out of memory errors. + bool evaluate_boolean(const xpath_node& n) const; + + // Evaluate expression as double value in the specified context; performs type conversion if necessary. + // If PUGIXML_NO_EXCEPTIONS is not defined, throws std::bad_alloc on out of memory errors. + double evaluate_number(const xpath_node& n) const; + +#ifndef PUGIXML_NO_STL + // Evaluate expression as string value in the specified context; performs type conversion if necessary. + // If PUGIXML_NO_EXCEPTIONS is not defined, throws std::bad_alloc on out of memory errors. + string_t evaluate_string(const xpath_node& n) const; +#endif + + // Evaluate expression as string value in the specified context; performs type conversion if necessary. + // At most capacity characters are written to the destination buffer, full result size is returned (includes terminating zero). + // If PUGIXML_NO_EXCEPTIONS is not defined, throws std::bad_alloc on out of memory errors. + // If PUGIXML_NO_EXCEPTIONS is defined, returns empty set instead. 
+ size_t evaluate_string(char_t* buffer, size_t capacity, const xpath_node& n) const; + + // Evaluate expression as node set in the specified context. + // If PUGIXML_NO_EXCEPTIONS is not defined, throws xpath_exception on type mismatch and std::bad_alloc on out of memory errors. + // If PUGIXML_NO_EXCEPTIONS is defined, returns empty node set instead. + xpath_node_set evaluate_node_set(const xpath_node& n) const; + + // Evaluate expression as node set in the specified context. + // Return first node in document order, or empty node if node set is empty. + // If PUGIXML_NO_EXCEPTIONS is not defined, throws xpath_exception on type mismatch and std::bad_alloc on out of memory errors. + // If PUGIXML_NO_EXCEPTIONS is defined, returns empty node instead. + xpath_node evaluate_node(const xpath_node& n) const; + + // Get parsing result (used to get compilation errors in PUGIXML_NO_EXCEPTIONS mode) + const xpath_parse_result& result() const; + + // Safe bool conversion operator + operator unspecified_bool_type() const; + + // Borland C++ workaround + bool operator!() const; +}; + +#ifndef PUGIXML_NO_EXCEPTIONS +// XPath exception class +class PUGIXML_CLASS xpath_exception: public std::exception +{ +private: + xpath_parse_result _result; + +public: + // Construct exception from parse result + explicit xpath_exception(const xpath_parse_result& result); + + // Get error message + virtual const char* what() const throw(); + + // Get parse result + const xpath_parse_result& result() const; +}; +#endif + +// XPath node class (either xml_node or xml_attribute) +class PUGIXML_CLASS xpath_node +{ +private: + xml_node _node; + xml_attribute _attribute; + + typedef void (*unspecified_bool_type)(xpath_node***); + +public: + // Default constructor; constructs empty XPath node + xpath_node(); + + // Construct XPath node from XML node/attribute + xpath_node(const xml_node& node); + xpath_node(const xml_attribute& attribute, const xml_node& parent); + + // Get node/attribute, if any + 
xml_node node() const; + xml_attribute attribute() const; + + // Get parent of contained node/attribute + xml_node parent() const; + + // Safe bool conversion operator + operator unspecified_bool_type() const; + + // Borland C++ workaround + bool operator!() const; + + // Comparison operators + bool operator==(const xpath_node& n) const; + bool operator!=(const xpath_node& n) const; +}; + +#ifdef __BORLANDC__ +// Borland C++ workaround +bool PUGIXML_FUNCTION operator&&(const xpath_node& lhs, bool rhs); +bool PUGIXML_FUNCTION operator||(const xpath_node& lhs, bool rhs); +#endif + +// A fixed-size collection of XPath nodes +class PUGIXML_CLASS xpath_node_set +{ +public: + // Collection type + enum type_t { + type_unsorted, // Not ordered + type_sorted, // Sorted by document order (ascending) + type_sorted_reverse // Sorted by document order (descending) + }; + + // Constant iterator type + typedef const xpath_node* const_iterator; + + // We define non-constant iterator to be the same as constant iterator so that various generic algorithms (i.e. boost foreach) work + typedef const xpath_node* iterator; + + // Default constructor. Constructs empty set. 
+ xpath_node_set(); + + // Constructs a set from iterator range; data is not checked for duplicates and is not sorted according to provided type, so be careful + xpath_node_set(const_iterator begin, const_iterator end, type_t type = type_unsorted); + + // Destructor + ~xpath_node_set(); + + // Copy constructor/assignment operator + xpath_node_set(const xpath_node_set& ns); + xpath_node_set& operator=(const xpath_node_set& ns); + +#if __cplusplus >= 201103 + // Move semantics support + xpath_node_set(xpath_node_set&& rhs); + xpath_node_set& operator=(xpath_node_set&& rhs); +#endif + + // Get collection type + type_t type() const; + + // Get collection size + size_t size() const; + + // Indexing operator + const xpath_node& operator[](size_t index) const; + + // Collection iterators + const_iterator begin() const; + const_iterator end() const; + + // Sort the collection in ascending/descending order by document order + void sort(bool reverse = false); + + // Get first node in the collection by document order + xpath_node first() const; + + // Check if collection is empty + bool empty() const; + +private: + type_t _type; + + xpath_node _storage; + + xpath_node* _begin; + xpath_node* _end; + + void _assign(const_iterator begin, const_iterator end, type_t type); + void _move(xpath_node_set& rhs); +}; +#endif + +#ifndef PUGIXML_NO_STL +// Convert wide string to UTF8 +std::basic_string, std::allocator > PUGIXML_FUNCTION as_utf8(const wchar_t* str); +std::basic_string, std::allocator > PUGIXML_FUNCTION as_utf8(const std::basic_string, std::allocator >& str); + +// Convert UTF8 to wide string +std::basic_string, std::allocator > PUGIXML_FUNCTION as_wide(const char* str); +std::basic_string, std::allocator > PUGIXML_FUNCTION as_wide(const std::basic_string, std::allocator >& str); +#endif + +// Memory allocation function interface; returns pointer to allocated memory or NULL on failure +typedef void* (*allocation_function)(size_t size); + +// Memory deallocation function 
interface +typedef void (*deallocation_function)(void* ptr); + +// Override default memory management functions. All subsequent allocations/deallocations will be performed via supplied functions. +void PUGIXML_FUNCTION set_memory_management_functions(allocation_function allocate, deallocation_function deallocate); + +// Get current memory management functions +allocation_function PUGIXML_FUNCTION get_memory_allocation_function(); +deallocation_function PUGIXML_FUNCTION get_memory_deallocation_function(); +} + +#if !defined(PUGIXML_NO_STL) && (defined(_MSC_VER) || defined(__ICC)) +namespace std +{ +// Workarounds for (non-standard) iterator category detection for older versions (MSVC7/IC8 and earlier) +std::bidirectional_iterator_tag PUGIXML_FUNCTION _Iter_cat(const pugi::xml_node_iterator&); +std::bidirectional_iterator_tag PUGIXML_FUNCTION _Iter_cat(const pugi::xml_attribute_iterator&); +std::bidirectional_iterator_tag PUGIXML_FUNCTION _Iter_cat(const pugi::xml_named_node_iterator&); +} +#endif + +#if !defined(PUGIXML_NO_STL) && defined(__SUNPRO_CC) +namespace std +{ +// Workarounds for (non-standard) iterator category detection +std::bidirectional_iterator_tag PUGIXML_FUNCTION __iterator_category(const pugi::xml_node_iterator&); +std::bidirectional_iterator_tag PUGIXML_FUNCTION __iterator_category(const pugi::xml_attribute_iterator&); +std::bidirectional_iterator_tag PUGIXML_FUNCTION __iterator_category(const pugi::xml_named_node_iterator&); +} +#endif + +#endif + +// Make sure implementation is included in header-only mode +// Use macro expansion in #include to work around QMake (QTBUG-11923) +#if defined(PUGIXML_HEADER_ONLY) && !defined(PUGIXML_SOURCE) +# define PUGIXML_SOURCE "pugixml.cpp" +# include PUGIXML_SOURCE +#endif + +/** + * Copyright (c) 2006-2015 Arseny Kapoulkine + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software 
without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ diff --git a/mosesdecoder/moses2/server/Server.cpp b/mosesdecoder/moses2/server/Server.cpp new file mode 100644 index 0000000000000000000000000000000000000000..de3542eb6a3a5d5b005077735b90fea141951c5e --- /dev/null +++ b/mosesdecoder/moses2/server/Server.cpp @@ -0,0 +1,72 @@ +/* + * Server.cpp + * + * Created on: 1 Apr 2016 + * Author: hieu + */ +#include +#include "../System.h" +#include "Server.h" +#include "Translator.h" +#include "../parameters/ServerOptions.h" + +using namespace std; + +namespace Moses2 +{ + +Server::Server(ServerOptions &server_options, System &system) + :m_server_options(server_options) + ,m_translator(new Translator(*this, system)) +{ + m_registry.addMethod("translate", m_translator); +} + +Server::~Server() +{ + unlink(m_pidfile.c_str()); +} + +void Server::run(System &system) +{ + xmlrpc_c::serverAbyss myAbyssServer + (xmlrpc_c::serverAbyss::constrOpt() + .registryP(&m_registry) + .portNumber(m_server_options.port) // TCP port on which to listen + .logFileName(m_server_options.logfile) + .allowOrigin("*") + 
.maxConn(m_server_options.maxConn) + .maxConnBacklog(m_server_options.maxConnBacklog) + .keepaliveTimeout(m_server_options.keepaliveTimeout) + .keepaliveMaxConn(m_server_options.keepaliveMaxConn) + .timeout(m_server_options.timeout) + ); + std::ostringstream pidfilename; + pidfilename << "/tmp/moses-server." << m_server_options.port << ".pid"; + m_pidfile = pidfilename.str(); + std::ofstream pidfile(m_pidfile.c_str()); + +#ifdef _WIN32 + int thePid = GetCurrentProcessId(); +#else + int thePid = getpid(); +#endif + pidfile << thePid << std::endl; + pidfile.close(); + cerr << "Listening on port " << m_server_options.port << std::endl; + if (m_server_options.is_serial) { + cerr << "Running server in serial mode." << std::endl; + while(true) myAbyssServer.runOnce(); + } else myAbyssServer.run(); + + std::cerr << "xmlrpc_c::serverAbyss.run() returned but it should not." + << std::endl; +} + +ServerOptions const&Server::options() const +{ + return m_server_options; +} + + +} /* namespace Moses2 */ diff --git a/mosesdecoder/moses2/server/Server.h b/mosesdecoder/moses2/server/Server.h new file mode 100644 index 0000000000000000000000000000000000000000..d19ef75d2f0150713242d05db480ef5739657171 --- /dev/null +++ b/mosesdecoder/moses2/server/Server.h @@ -0,0 +1,39 @@ +/* + * Server.h + * + * Created on: 1 Apr 2016 + * Author: hieu + */ +#pragma once + +#include +#include +#include + +namespace Moses2 +{ +class System; +class ServerOptions; +class Manager; + +class Server +{ +public: + Server(ServerOptions &server_options, System &system); + virtual ~Server(); + + void run(System &system); + + ServerOptions const& + options() const; + +protected: + ServerOptions &m_server_options; + std::string m_pidfile; + xmlrpc_c::registry m_registry; + xmlrpc_c::methodPtr const m_translator; + +}; + +} /* namespace Moses2 */ + diff --git a/mosesdecoder/moses2/server/TranslationRequest.cpp b/mosesdecoder/moses2/server/TranslationRequest.cpp new file mode 100644 index 
0000000000000000000000000000000000000000..2d50835a64ffe6d3ee06c90d518b87911409a0b2 --- /dev/null +++ b/mosesdecoder/moses2/server/TranslationRequest.cpp @@ -0,0 +1,68 @@ +#include +#include "TranslationRequest.h" +#include "../ManagerBase.h" +#include "../System.h" + +using namespace std; + +namespace Moses2 +{ +TranslationRequest:: +TranslationRequest(xmlrpc_c::paramList const& paramList, + boost::condition_variable& cond, + boost::mutex& mut, + System &system, + const std::string &line, + long translationId) + :TranslationTask(system, line, translationId) + ,m_cond(cond) + ,m_mutex(mut) + ,m_done(false) +{ + +} + +boost::shared_ptr +TranslationRequest:: +create(Translator* translator, + xmlrpc_c::paramList const& paramList, + boost::condition_variable& cond, + boost::mutex& mut, + System &system, + const std::string &line, + long translationId) +{ + boost::shared_ptr ret; + TranslationRequest *request = new TranslationRequest(paramList, cond, mut, system, line, translationId); + ret.reset(request); + ret->m_translator = translator; + return ret; +} + +void +TranslationRequest:: +Run() +{ + m_mgr->Decode(); + + string out; + out = m_mgr->OutputBest(); + m_retData["text"] = xmlrpc_c::value_string(out); + + { + boost::lock_guard lock(m_mutex); + m_done = true; + } + m_cond.notify_one(); + + delete m_mgr; +} + +void TranslationRequest::pack_hypothesis(const Manager& manager, Hypothesis const* h, + std::string const& key, + std::map & dest) const +{ + +} + +} diff --git a/mosesdecoder/moses2/server/TranslationRequest.h b/mosesdecoder/moses2/server/TranslationRequest.h new file mode 100644 index 0000000000000000000000000000000000000000..822cde153a8496a64fd5405f1a4cfed6edcbe036 --- /dev/null +++ b/mosesdecoder/moses2/server/TranslationRequest.h @@ -0,0 +1,81 @@ +// -*- c++ -*- +#pragma once + +#include +#include +#include + +#ifdef WITH_THREADS +#include +#endif + +#include +#include +#include "../TranslationTask.h" + +#include "Translator.h" + +namespace Moses2 +{ 
+class Hypothesis; +class System; +class Manager; + +class + TranslationRequest : public virtual TranslationTask +{ +protected: + std::map m_retData; + Translator* m_translator; + + boost::condition_variable& m_cond; + boost::mutex& m_mutex; + bool m_done; + + TranslationRequest(xmlrpc_c::paramList const& paramList, + boost::condition_variable& cond, + boost::mutex& mut, + System &system, + const std::string &line, + long translationId); + + void + pack_hypothesis(const Manager& manager, Hypothesis const* h, + std::string const& key, + std::map & dest) const; + +public: + + static + boost::shared_ptr + create(Translator* translator, + xmlrpc_c::paramList const& paramList, + boost::condition_variable& cond, + boost::mutex& mut, + System &system, + const std::string &line, + long translationId); + + + virtual bool + DeleteAfterExecution() { + return false; + } + + bool + IsDone() const { + return m_done; + } + + std::map const& + GetRetData() { + return m_retData; + } + + void + Run(); + + +}; + +} diff --git a/mosesdecoder/moses2/server/Translator.cpp b/mosesdecoder/moses2/server/Translator.cpp new file mode 100644 index 0000000000000000000000000000000000000000..6f62123233854d7fb3eed5d542696e76bcb40dbd --- /dev/null +++ b/mosesdecoder/moses2/server/Translator.cpp @@ -0,0 +1,68 @@ +/* + * Translator.cpp + * + * Created on: 1 Apr 2016 + * Author: hieu + */ +#include +#include "Translator.h" +#include "TranslationRequest.h" +#include "Server.h" +#include "../parameters/ServerOptions.h" + +using namespace std; + +namespace Moses2 +{ + +Translator::Translator(Server& server, System &system) + : m_server(server), + m_threadPool(server.options().numThreads), + m_system(system), + m_translationId(0) +{ + // signature and help strings are documentation -- the client + // can query this information with a system.methodSignature and + // system.methodHelp RPC. 
+ this->_signature = "S:S"; + this->_help = "Does translation"; +} + +Translator::~Translator() +{ + // TODO Auto-generated destructor stub +} + +void Translator::execute(xmlrpc_c::paramList const& paramList, + xmlrpc_c::value *const retvalP) +{ + typedef std::map param_t; + param_t const& params = paramList.getStruct(0); + param_t::const_iterator si; + si = params.find("text"); + if (si == params.end()) { + throw xmlrpc_c::fault("Missing source text", xmlrpc_c::fault::CODE_PARSE); + } + + string line = xmlrpc_c::value_string(si->second); + long translationId; + + // get unique id. Thread safe + { + boost::unique_lock lock(m_accessLock); + translationId = m_translationId++; + } + + boost::condition_variable cond; + boost::mutex mut; + boost::shared_ptr task; + task = TranslationRequest::create(this, paramList,cond,mut, m_system, line, translationId); + m_threadPool.Submit(task); + boost::unique_lock lock(mut); + while (!task->IsDone()) { + cond.wait(lock); + } + *retvalP = xmlrpc_c::value_struct(task->GetRetData()); +} + +} /* namespace Moses2 */ diff --git a/mosesdecoder/moses2/server/Translator.h b/mosesdecoder/moses2/server/Translator.h new file mode 100644 index 0000000000000000000000000000000000000000..bb84c70b11c2ab75a0489e2d1384f242cefe64e2 --- /dev/null +++ b/mosesdecoder/moses2/server/Translator.h @@ -0,0 +1,40 @@ +/* + * Translator.h + * + * Created on: 1 Apr 2016 + * Author: hieu + */ + +#pragma once +#include +#include +#include +#include +#include "../legacy/ThreadPool.h" + +namespace Moses2 +{ +class Server; +class System; +class Manager; + +class Translator : public xmlrpc_c::method +{ +public: + Translator(Server& server, System &system); + virtual ~Translator(); + + void execute(xmlrpc_c::paramList const& paramList, + xmlrpc_c::value * const retvalP); + +protected: + Server& m_server; + Moses2::ThreadPool m_threadPool; + System &m_system; + long m_translationId; + boost::shared_mutex m_accessLock; + +}; + +} /* namespace Moses2 */ +