Add files using upload-large-folder tool
Browse files
This view is limited to 50 files because it contains too many changes.
See raw diff
- mosesdecoder/moses/TranslationModel/CompactPT/CmphStringVectorAdapter.cpp +94 -0
- mosesdecoder/moses/TranslationModel/CompactPT/CmphStringVectorAdapter.h +105 -0
- mosesdecoder/moses/TranslationModel/CompactPT/LexicalReorderingTableCompact.cpp +195 -0
- mosesdecoder/moses/TranslationModel/CompactPT/LexicalReorderingTableCompact.h +94 -0
- mosesdecoder/moses/TranslationModel/CompactPT/LexicalReorderingTableCreator.h +143 -0
- mosesdecoder/moses/TranslationModel/CompactPT/ListCoders.h +387 -0
- mosesdecoder/moses/TranslationModel/CompactPT/PackedArray.h +187 -0
- mosesdecoder/moses/TranslationModel/CompactPT/PhraseDictionaryCompact.cpp +194 -0
- mosesdecoder/moses/TranslationModel/CompactPT/StringVectorTemp.h +430 -0
- mosesdecoder/moses/TranslationModel/CompactPT/TargetPhraseCollectionCache.h +163 -0
- mosesdecoder/moses/TranslationModel/CompactPT/ThrowingFwrite.cpp +30 -0
- mosesdecoder/moses/TranslationModel/CompactPT/ThrowingFwrite.h +31 -0
- mosesdecoder/moses/TranslationModel/RuleTable/LoaderFactory.cpp +65 -0
- mosesdecoder/moses/TranslationModel/RuleTable/LoaderHiero.cpp +33 -0
- mosesdecoder/moses/TranslationModel/RuleTable/LoaderStandard.cpp +260 -0
- mosesdecoder/moses/TranslationModel/RuleTable/PhraseDictionaryFuzzyMatch.cpp +417 -0
- mosesdecoder/moses/TranslationModel/RuleTable/PhraseDictionaryOnDisk.cpp +398 -0
- mosesdecoder/moses/TranslationModel/RuleTable/Trie.h +63 -0
- mosesdecoder/moses/TranslationModel/RuleTable/UTrie.cpp +96 -0
- mosesdecoder/moses/TranslationModel/RuleTable/UTrie.h +73 -0
- mosesdecoder/moses/TranslationModel/fuzzy-match/Alignments.h +20 -0
- mosesdecoder/moses/TranslationModel/fuzzy-match/FuzzyMatchWrapper.cpp +1029 -0
- mosesdecoder/moses/TranslationModel/fuzzy-match/FuzzyMatchWrapper.h +91 -0
- mosesdecoder/moses/TranslationModel/fuzzy-match/Match.h +34 -0
- mosesdecoder/moses/TranslationModel/fuzzy-match/SentenceAlignment.cpp +25 -0
- mosesdecoder/moses/TranslationModel/fuzzy-match/SentenceAlignment.h +43 -0
- mosesdecoder/moses/TranslationModel/fuzzy-match/SuffixArray.cpp +240 -0
- mosesdecoder/moses/TranslationModel/fuzzy-match/SuffixArray.h +69 -0
- mosesdecoder/moses/TranslationModel/fuzzy-match/Vocabulary.cpp +71 -0
- mosesdecoder/moses/TranslationModel/fuzzy-match/Vocabulary.h +46 -0
- mosesdecoder/moses/TranslationModel/fuzzy-match/create_xml.cpp +387 -0
- mosesdecoder/moses/TranslationModel/fuzzy-match/create_xml.h +5 -0
- mosesdecoder/moses/server/Hypothesis_4server.cpp +37 -0
- mosesdecoder/moses/server/Optimizer.h +17 -0
- mosesdecoder/moses/server/PackScores.cpp +45 -0
- mosesdecoder/moses/server/PackScores.h +10 -0
- mosesdecoder/moses/server/Server.h +46 -0
- mosesdecoder/moses/server/Session.h +75 -0
- mosesdecoder/moses/server/TranslationRequest.cpp +524 -0
- mosesdecoder/moses/server/Updater.cpp +58 -0
- mosesdecoder/moses/server/Updater.h +44 -0
- mosesdecoder/util/bit_packing_test.cc +59 -0
- mosesdecoder/util/ersatz_progress.hh +57 -0
- mosesdecoder/util/exception.hh +165 -0
- mosesdecoder/util/fake_ostream.hh +111 -0
- mosesdecoder/util/file_piece.hh +175 -0
- mosesdecoder/util/file_piece_test.cc +154 -0
- mosesdecoder/util/generator.hh +34 -0
- mosesdecoder/util/getopt.c +78 -0
- mosesdecoder/util/integer_to_string_test.cc +81 -0
mosesdecoder/moses/TranslationModel/CompactPT/CmphStringVectorAdapter.cpp
ADDED
|
@@ -0,0 +1,94 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
// $Id$
|
| 2 |
+
// vim:tabstop=2
|
| 3 |
+
/***********************************************************************
|
| 4 |
+
Moses - factored phrase-based language decoder
|
| 5 |
+
Copyright (C) 2006 University of Edinburgh
|
| 6 |
+
|
| 7 |
+
This library is free software; you can redistribute it and/or
|
| 8 |
+
modify it under the terms of the GNU Lesser General Public
|
| 9 |
+
License as published by the Free Software Foundation; either
|
| 10 |
+
version 2.1 of the License, or (at your option) any later version.
|
| 11 |
+
|
| 12 |
+
This library is distributed in the hope that it will be useful,
|
| 13 |
+
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
| 14 |
+
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
| 15 |
+
Lesser General Public License for more details.
|
| 16 |
+
|
| 17 |
+
You should have received a copy of the GNU Lesser General Public
|
| 18 |
+
License along with this library; if not, write to the Free Software
|
| 19 |
+
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
| 20 |
+
***********************************************************************/
|
| 21 |
+
|
| 22 |
+
#ifdef HAVE_CMPH
|
| 23 |
+
|
| 24 |
+
#include "CmphStringVectorAdapter.h"
|
| 25 |
+
|
| 26 |
+
namespace Moses
|
| 27 |
+
{
|
| 28 |
+
|
| 29 |
+
// Dispose callback for the StringVector-backed key source: releases a key
// buffer that was allocated with new[].
// `data` and `keylen` belong to the callback signature but are not needed.
void CmphStringVectorAdapterDispose(void *data, char *key, cmph_uint32 keylen)
{
  (void)data;
  (void)keylen;
  delete[] key;
}
|
| 33 |
+
|
| 34 |
+
void CmphStringVectorAdapterRewind(void *data)
|
| 35 |
+
{
|
| 36 |
+
cmph_vector_t *cmph_vector = (cmph_vector_t *)data;
|
| 37 |
+
cmph_vector->position = 0;
|
| 38 |
+
}
|
| 39 |
+
|
| 40 |
+
//************************************************************************//
|
| 41 |
+
|
| 42 |
+
// Allocates a cmph key-source adapter whose cursor refers to `v`.
//
// Only `data` and `nkeys` are filled in here; the read/dispose/rewind
// callbacks are left for the caller to install. The adapter keeps a pointer
// to `v`, so `v` has to stay alive for as long as the adapter is used.
// Both objects are obtained with malloc().
cmph_io_adapter_t *CmphVectorAdapterNew(std::vector<std::string>& v)
{
  cmph_io_adapter_t *adapter =
    static_cast<cmph_io_adapter_t *>(malloc(sizeof(cmph_io_adapter_t)));
  cmph_vector_t *cursor =
    static_cast<cmph_vector_t *>(malloc(sizeof(cmph_vector_t)));
  assert(adapter);
  assert(cursor);

  // Cursor state: which container to read from and the next index to serve.
  cursor->vector = static_cast<void *>(&v);
  cursor->position = 0;

  adapter->data = static_cast<void *>(cursor);
  adapter->nkeys = v.size();

  return adapter;
}
|
| 56 |
+
|
| 57 |
+
int CmphVectorAdapterRead(void *data, char **key, cmph_uint32 *keylen)
|
| 58 |
+
{
|
| 59 |
+
cmph_vector_t *cmph_vector = (cmph_vector_t *)data;
|
| 60 |
+
std::vector<std::string>* v = (std::vector<std::string>*)cmph_vector->vector;
|
| 61 |
+
size_t size;
|
| 62 |
+
*keylen = (*v)[cmph_vector->position].size();
|
| 63 |
+
size = *keylen;
|
| 64 |
+
*key = new char[size + 1];
|
| 65 |
+
std::string temp = (*v)[cmph_vector->position];
|
| 66 |
+
strcpy(*key, temp.c_str());
|
| 67 |
+
cmph_vector->position = cmph_vector->position + 1;
|
| 68 |
+
return (int)(*keylen);
|
| 69 |
+
}
|
| 70 |
+
|
| 71 |
+
// Dispose callback for the std::vector<std::string>-backed key source:
// releases a key buffer that was allocated with new[].
// `data` and `keylen` belong to the callback signature but are not needed.
void CmphVectorAdapterDispose(void *data, char *key, cmph_uint32 keylen)
{
  (void)data;
  (void)keylen;
  delete[] key;
}
|
| 75 |
+
|
| 76 |
+
void CmphVectorAdapterRewind(void *data)
|
| 77 |
+
{
|
| 78 |
+
cmph_vector_t *cmph_vector = (cmph_vector_t *)data;
|
| 79 |
+
cmph_vector->position = 0;
|
| 80 |
+
}
|
| 81 |
+
|
| 82 |
+
// Builds a ready-to-use cmph key source over `v`: allocates the adapter via
// CmphVectorAdapterNew() and installs the three std::vector callbacks.
cmph_io_adapter_t* CmphVectorAdapter(std::vector<std::string>& v)
{
  cmph_io_adapter_t *adapter = CmphVectorAdapterNew(v);

  adapter->rewind = CmphVectorAdapterRewind;
  adapter->dispose = CmphVectorAdapterDispose;
  adapter->read = CmphVectorAdapterRead;

  return adapter;
}
|
| 91 |
+
|
| 92 |
+
}
|
| 93 |
+
|
| 94 |
+
#endif
|
mosesdecoder/moses/TranslationModel/CompactPT/CmphStringVectorAdapter.h
ADDED
|
@@ -0,0 +1,105 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
// $Id$
|
| 2 |
+
// vim:tabstop=2
|
| 3 |
+
/***********************************************************************
|
| 4 |
+
Moses - factored phrase-based language decoder
|
| 5 |
+
Copyright (C) 2006 University of Edinburgh
|
| 6 |
+
|
| 7 |
+
This library is free software; you can redistribute it and/or
|
| 8 |
+
modify it under the terms of the GNU Lesser General Public
|
| 9 |
+
License as published by the Free Software Foundation; either
|
| 10 |
+
version 2.1 of the License, or (at your option) any later version.
|
| 11 |
+
|
| 12 |
+
This library is distributed in the hope that it will be useful,
|
| 13 |
+
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
| 14 |
+
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
| 15 |
+
Lesser General Public License for more details.
|
| 16 |
+
|
| 17 |
+
You should have received a copy of the GNU Lesser General Public
|
| 18 |
+
License along with this library; if not, write to the Free Software
|
| 19 |
+
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
| 20 |
+
***********************************************************************/
|
| 21 |
+
|
| 22 |
+
#ifndef moses_CmphStringVectorAdapterNew_h
|
| 23 |
+
#define moses_CmphStringVectorAdapterNew_h
|
| 24 |
+
|
| 25 |
+
#include <cassert>
|
| 26 |
+
#include <cstring>
|
| 27 |
+
|
| 28 |
+
#ifdef HAVE_CMPH
|
| 29 |
+
#include "cmph.h"
|
| 30 |
+
|
| 31 |
+
#include "StringVector.h"
|
| 32 |
+
|
| 33 |
+
namespace Moses
|
| 34 |
+
{
|
| 35 |
+
|
| 36 |
+
typedef struct {
|
| 37 |
+
void *vector;
|
| 38 |
+
cmph_uint32 position;
|
| 39 |
+
}
|
| 40 |
+
cmph_vector_t;
|
| 41 |
+
|
| 42 |
+
|
| 43 |
+
template <typename ValueT, typename PosT, template <typename> class Allocator>
|
| 44 |
+
cmph_io_adapter_t *CmphStringVectorAdapterNew(StringVector<ValueT, PosT, Allocator>& sv)
|
| 45 |
+
{
|
| 46 |
+
cmph_io_adapter_t * key_source = (cmph_io_adapter_t *)malloc(sizeof(cmph_io_adapter_t));
|
| 47 |
+
cmph_vector_t * cmph_vector = (cmph_vector_t *)malloc(sizeof(cmph_vector_t));
|
| 48 |
+
assert(key_source);
|
| 49 |
+
assert(cmph_vector);
|
| 50 |
+
|
| 51 |
+
cmph_vector->vector = (void *)&sv;
|
| 52 |
+
cmph_vector->position = 0;
|
| 53 |
+
key_source->data = (void *)cmph_vector;
|
| 54 |
+
key_source->nkeys = sv.size();
|
| 55 |
+
|
| 56 |
+
return key_source;
|
| 57 |
+
}
|
| 58 |
+
|
| 59 |
+
template <typename ValueT, typename PosT, template <typename> class Allocator>
|
| 60 |
+
int CmphStringVectorAdapterRead(void *data, char **key, cmph_uint32 *keylen)
|
| 61 |
+
{
|
| 62 |
+
cmph_vector_t *cmph_vector = (cmph_vector_t *)data;
|
| 63 |
+
StringVector<ValueT, PosT, Allocator>* sv = (StringVector<ValueT, PosT, Allocator>*)cmph_vector->vector;
|
| 64 |
+
size_t size;
|
| 65 |
+
*keylen = (*sv)[cmph_vector->position].size();
|
| 66 |
+
size = *keylen;
|
| 67 |
+
*key = new char[size + 1];
|
| 68 |
+
std::string temp = (*sv)[cmph_vector->position];
|
| 69 |
+
std::strcpy(*key, temp.c_str());
|
| 70 |
+
cmph_vector->position = cmph_vector->position + 1;
|
| 71 |
+
return (int)(*keylen);
|
| 72 |
+
}
|
| 73 |
+
|
| 74 |
+
// Dispose callback installed by CmphStringVectorAdapter(): frees a key buffer
// produced by CmphStringVectorAdapterRead().
void CmphStringVectorAdapterDispose(void *data, char *key, cmph_uint32 keylen);

// Rewind callback installed by CmphStringVectorAdapter(): resets the cursor
// held in `data` to the first key.
void CmphStringVectorAdapterRewind(void *data);
|
| 77 |
+
|
| 78 |
+
template <typename ValueT, typename PosT, template <typename> class Allocator>
|
| 79 |
+
cmph_io_adapter_t* CmphStringVectorAdapter(StringVector<ValueT, PosT, Allocator>& sv)
|
| 80 |
+
{
|
| 81 |
+
cmph_io_adapter_t * key_source = CmphStringVectorAdapterNew(sv);
|
| 82 |
+
|
| 83 |
+
key_source->read = CmphStringVectorAdapterRead<ValueT, PosT, Allocator>;
|
| 84 |
+
key_source->dispose = CmphStringVectorAdapterDispose;
|
| 85 |
+
key_source->rewind = CmphStringVectorAdapterRewind;
|
| 86 |
+
return key_source;
|
| 87 |
+
}
|
| 88 |
+
|
| 89 |
+
//************************************************************************//
|
| 90 |
+
|
| 91 |
+
cmph_io_adapter_t *CmphVectorAdapterNew(std::vector<std::string>& v);
|
| 92 |
+
|
| 93 |
+
int CmphVectorAdapterRead(void *data, char **key, cmph_uint32 *keylen);
|
| 94 |
+
|
| 95 |
+
void CmphVectorAdapterDispose(void *data, char *key, cmph_uint32 keylen);
|
| 96 |
+
|
| 97 |
+
void CmphVectorAdapterRewind(void *data);
|
| 98 |
+
|
| 99 |
+
cmph_io_adapter_t* CmphVectorAdapter(std::vector<std::string>& v);
|
| 100 |
+
|
| 101 |
+
}
|
| 102 |
+
|
| 103 |
+
#endif
|
| 104 |
+
|
| 105 |
+
#endif
|
mosesdecoder/moses/TranslationModel/CompactPT/LexicalReorderingTableCompact.cpp
ADDED
|
@@ -0,0 +1,195 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
// -*- c++ -*-
|
| 2 |
+
// vim:tabstop=2
|
| 3 |
+
// $Id$
|
| 4 |
+
/***********************************************************************
|
| 5 |
+
Moses - factored phrase-based language decoder
|
| 6 |
+
Copyright (C) 2006 University of Edinburgh
|
| 7 |
+
|
| 8 |
+
This library is free software; you can redistribute it and/or
|
| 9 |
+
modify it under the terms of the GNU Lesser General Public
|
| 10 |
+
License as published by the Free Software Foundation; either
|
| 11 |
+
version 2.1 of the License, or (at your option) any later version.
|
| 12 |
+
|
| 13 |
+
This library is distributed in the hope that it will be useful,
|
| 14 |
+
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
| 15 |
+
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
| 16 |
+
Lesser General Public License for more details.
|
| 17 |
+
|
| 18 |
+
You should have received a copy of the GNU Lesser General Public
|
| 19 |
+
License along with this library; if not, write to the Free Software
|
| 20 |
+
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
| 21 |
+
***********************************************************************/
|
| 22 |
+
|
| 23 |
+
#include "LexicalReorderingTableCompact.h"
|
| 24 |
+
#include "moses/parameters/OOVHandlingOptions.h"
|
| 25 |
+
|
| 26 |
+
namespace Moses
|
| 27 |
+
{
|
| 28 |
+
// Process-wide default copied into m_inMemory by every constructor; it can be
// overridden through SetStaticDefaultParameters() ("minlexr-memory").
bool LexicalReorderingTableCompact::s_inMemoryByDefault = false;
|
| 29 |
+
|
| 30 |
+
// Constructs the table and immediately loads it from `filePath`.
//
// The member initialisers are placeholders until Load() runs: Load()
// overwrites m_numScoreComponent and m_multipleScoreTrees from the file and
// resizes m_scoreTrees accordingly.
// NOTE(review): m_hash(10, 16) presumably means 10 order bits and 16
// fingerprint bits - confirm against BlockHashIndex.
LexicalReorderingTableCompact::LexicalReorderingTableCompact(
  const std::string& filePath,
  const std::vector<FactorType>& f_factors,
  const std::vector<FactorType>& e_factors,
  const std::vector<FactorType>& c_factors)
  : LexicalReorderingTable(f_factors, e_factors, c_factors),
    m_inMemory(s_inMemoryByDefault),
    m_numScoreComponent(6),
    m_multipleScoreTrees(true),
    m_hash(10, 16),
    m_scoreTrees(1)
{
  Load(filePath);
}
|
| 44 |
+
|
| 45 |
+
// Constructs an empty table without reading any file; Load() can be called
// later. Uses the same default member values as the loading constructor.
LexicalReorderingTableCompact::LexicalReorderingTableCompact(
  const std::vector<FactorType>& f_factors,
  const std::vector<FactorType>& e_factors,
  const std::vector<FactorType>& c_factors)
  : LexicalReorderingTable(f_factors, e_factors, c_factors),
    m_inMemory(s_inMemoryByDefault),
    m_numScoreComponent(6),
    m_multipleScoreTrees(true),
    m_hash(10, 16),
    m_scoreTrees(1)
{
}
|
| 56 |
+
|
| 57 |
+
// Releases every Huffman score tree owned by this table. Entries may be null
// (the constructors create one null slot), which delete handles safely.
LexicalReorderingTableCompact::~LexicalReorderingTableCompact()
{
  typedef std::vector<ScoreTree*>::iterator TreeIter;
  for(TreeIter tree = m_scoreTrees.begin(); tree != m_scoreTrees.end(); ++tree) {
    delete *tree;
  }
}
|
| 63 |
+
|
| 64 |
+
std::vector<float>
|
| 65 |
+
LexicalReorderingTableCompact::
|
| 66 |
+
GetScore(const Phrase& f, const Phrase& e, const Phrase& c)
|
| 67 |
+
{
|
| 68 |
+
std::string key;
|
| 69 |
+
Scores scores;
|
| 70 |
+
|
| 71 |
+
if(0 == c.GetSize())
|
| 72 |
+
key = MakeKey(f, e, c);
|
| 73 |
+
else
|
| 74 |
+
for(size_t i = 0; i <= c.GetSize(); ++i) {
|
| 75 |
+
Phrase sub_c(c.GetSubString(Range(i,c.GetSize()-1)));
|
| 76 |
+
key = MakeKey(f,e,sub_c);
|
| 77 |
+
}
|
| 78 |
+
|
| 79 |
+
size_t index = m_hash[key];
|
| 80 |
+
if(m_hash.GetSize() != index) {
|
| 81 |
+
std::string scoresString;
|
| 82 |
+
if(m_inMemory)
|
| 83 |
+
scoresString = m_scoresMemory[index].str();
|
| 84 |
+
else
|
| 85 |
+
scoresString = m_scoresMapped[index].str();
|
| 86 |
+
|
| 87 |
+
BitWrapper<> bitStream(scoresString);
|
| 88 |
+
for(size_t i = 0; i < m_numScoreComponent; i++)
|
| 89 |
+
scores.push_back(m_scoreTrees[m_multipleScoreTrees ? i : 0]->Read(bitStream));
|
| 90 |
+
|
| 91 |
+
return scores;
|
| 92 |
+
}
|
| 93 |
+
|
| 94 |
+
return Scores();
|
| 95 |
+
}
|
| 96 |
+
|
| 97 |
+
std::string
|
| 98 |
+
LexicalReorderingTableCompact::
|
| 99 |
+
MakeKey(const Phrase& f,
|
| 100 |
+
const Phrase& e,
|
| 101 |
+
const Phrase& c) const
|
| 102 |
+
{
|
| 103 |
+
return MakeKey(Trim(f.GetStringRep(m_FactorsF)),
|
| 104 |
+
Trim(e.GetStringRep(m_FactorsE)),
|
| 105 |
+
Trim(c.GetStringRep(m_FactorsC)));
|
| 106 |
+
}
|
| 107 |
+
|
| 108 |
+
std::string
|
| 109 |
+
LexicalReorderingTableCompact::
|
| 110 |
+
MakeKey(const std::string& f,
|
| 111 |
+
const std::string& e,
|
| 112 |
+
const std::string& c) const
|
| 113 |
+
{
|
| 114 |
+
std::string key;
|
| 115 |
+
if(!f.empty()) key += f;
|
| 116 |
+
if(!m_FactorsE.empty()) {
|
| 117 |
+
if(!key.empty()) key += " ||| ";
|
| 118 |
+
key += e;
|
| 119 |
+
}
|
| 120 |
+
if(!m_FactorsC.empty()) {
|
| 121 |
+
if(!key.empty()) key += " ||| ";
|
| 122 |
+
key += c;
|
| 123 |
+
}
|
| 124 |
+
key += " ||| ";
|
| 125 |
+
return key;
|
| 126 |
+
}
|
| 127 |
+
|
| 128 |
+
// Returns a newly allocated compact table if a ".minlexr" file can be found
// for `filePath`, or 0 otherwise (always 0 when built without HAVE_CMPH).
// The caller owns the returned object.
//
// Two spellings are accepted:
//  1. `filePath` without the suffix, where `filePath + ".minlexr"` exists;
//  2. `filePath` already ending in ".minlexr" and existing as given.
//
// The suffix test checks the length first: for a path shorter than the suffix,
// `length() - suffix.length()` would wrap around and make substr() throw
// std::out_of_range.
LexicalReorderingTable*
LexicalReorderingTableCompact::
CheckAndLoad
(const std::string& filePath,
 const std::vector<FactorType>& f_factors,
 const std::vector<FactorType>& e_factors,
 const std::vector<FactorType>& c_factors)
{
#ifdef HAVE_CMPH
  const std::string minlexr = ".minlexr";

  // file name is specified without suffix
  if(FileExists(filePath + minlexr)) {
    //there exists a compact binary version use that
    VERBOSE(2,"Using compact lexical reordering table" << std::endl);
    return new LexicalReorderingTableCompact(filePath + minlexr, f_factors, e_factors, c_factors);
  }

  // file name is specified with suffix
  const bool hasSuffix =
    filePath.length() >= minlexr.length()
    && filePath.compare(filePath.length() - minlexr.length(),
                        minlexr.length(), minlexr) == 0;
  if(hasSuffix && FileExists(filePath)) {
    //there exists a compact binary version use that
    VERBOSE(2,"Using compact lexical reordering table" << std::endl);
    return new LexicalReorderingTableCompact(filePath, f_factors, e_factors, c_factors);
  }
#endif
  return 0;
}
|
| 154 |
+
|
| 155 |
+
void
|
| 156 |
+
LexicalReorderingTableCompact::
|
| 157 |
+
Load(std::string filePath)
|
| 158 |
+
{
|
| 159 |
+
std::FILE* pFile = std::fopen(filePath.c_str(), "r");
|
| 160 |
+
UTIL_THROW_IF2(pFile == NULL, "File " << filePath << " could not be opened");
|
| 161 |
+
|
| 162 |
+
//if(m_inMemory)
|
| 163 |
+
m_hash.Load(pFile);
|
| 164 |
+
//else
|
| 165 |
+
//m_hash.LoadIndex(pFile);
|
| 166 |
+
|
| 167 |
+
size_t read = 0;
|
| 168 |
+
read += std::fread(&m_numScoreComponent, sizeof(m_numScoreComponent), 1, pFile);
|
| 169 |
+
read += std::fread(&m_multipleScoreTrees,
|
| 170 |
+
sizeof(m_multipleScoreTrees), 1, pFile);
|
| 171 |
+
|
| 172 |
+
if(m_multipleScoreTrees) {
|
| 173 |
+
m_scoreTrees.resize(m_numScoreComponent);
|
| 174 |
+
for(size_t i = 0; i < m_numScoreComponent; i++)
|
| 175 |
+
m_scoreTrees[i] = new CanonicalHuffman<float>(pFile);
|
| 176 |
+
} else {
|
| 177 |
+
m_scoreTrees.resize(1);
|
| 178 |
+
m_scoreTrees[0] = new CanonicalHuffman<float>(pFile);
|
| 179 |
+
}
|
| 180 |
+
|
| 181 |
+
if(m_inMemory)
|
| 182 |
+
m_scoresMemory.load(pFile, false);
|
| 183 |
+
else
|
| 184 |
+
m_scoresMapped.load(pFile, true);
|
| 185 |
+
}
|
| 186 |
+
|
| 187 |
+
void
|
| 188 |
+
LexicalReorderingTableCompact::
|
| 189 |
+
SetStaticDefaultParameters(Parameter const& param)
|
| 190 |
+
{
|
| 191 |
+
param.SetParameter(s_inMemoryByDefault, "minlexr-memory", false);
|
| 192 |
+
}
|
| 193 |
+
|
| 194 |
+
|
| 195 |
+
}
|
mosesdecoder/moses/TranslationModel/CompactPT/LexicalReorderingTableCompact.h
ADDED
|
@@ -0,0 +1,94 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
// $Id$
|
| 2 |
+
// vim:tabstop=2
|
| 3 |
+
/***********************************************************************
|
| 4 |
+
Moses - factored phrase-based language decoder
|
| 5 |
+
Copyright (C) 2006 University of Edinburgh
|
| 6 |
+
|
| 7 |
+
This library is free software; you can redistribute it and/or
|
| 8 |
+
modify it under the terms of the GNU Lesser General Public
|
| 9 |
+
License as published by the Free Software Foundation; either
|
| 10 |
+
version 2.1 of the License, or (at your option) any later version.
|
| 11 |
+
|
| 12 |
+
This library is distributed in the hope that it will be useful,
|
| 13 |
+
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
| 14 |
+
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
| 15 |
+
Lesser General Public License for more details.
|
| 16 |
+
|
| 17 |
+
You should have received a copy of the GNU Lesser General Public
|
| 18 |
+
License along with this library; if not, write to the Free Software
|
| 19 |
+
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
| 20 |
+
***********************************************************************/
|
| 21 |
+
|
| 22 |
+
#ifndef moses_LexicalReorderingTableCompact_h
|
| 23 |
+
#define moses_LexicalReorderingTableCompact_h
|
| 24 |
+
|
| 25 |
+
#include "moses/FF/LexicalReordering/LexicalReorderingTable.h"
|
| 26 |
+
#include "moses/StaticData.h"
|
| 27 |
+
#include "moses/TranslationModel/PhraseDictionary.h"
|
| 28 |
+
#include "moses/GenerationDictionary.h"
|
| 29 |
+
#include "moses/TargetPhrase.h"
|
| 30 |
+
#include "moses/TargetPhraseCollection.h"
|
| 31 |
+
|
| 32 |
+
#include "BlockHashIndex.h"
|
| 33 |
+
#include "CanonicalHuffman.h"
|
| 34 |
+
#include "StringVector.h"
|
| 35 |
+
|
| 36 |
+
namespace Moses
|
| 37 |
+
{
|
| 38 |
+
|
| 39 |
+
class LexicalReorderingTableCompact:
|
| 40 |
+
public LexicalReorderingTable
|
| 41 |
+
{
|
| 42 |
+
private:
|
| 43 |
+
static bool s_inMemoryByDefault;
|
| 44 |
+
bool m_inMemory;
|
| 45 |
+
|
| 46 |
+
size_t m_numScoreComponent;
|
| 47 |
+
bool m_multipleScoreTrees;
|
| 48 |
+
|
| 49 |
+
BlockHashIndex m_hash;
|
| 50 |
+
|
| 51 |
+
typedef CanonicalHuffman<float> ScoreTree;
|
| 52 |
+
std::vector<ScoreTree*> m_scoreTrees;
|
| 53 |
+
|
| 54 |
+
StringVector<unsigned char, unsigned long, MmapAllocator> m_scoresMapped;
|
| 55 |
+
StringVector<unsigned char, unsigned long, std::allocator> m_scoresMemory;
|
| 56 |
+
|
| 57 |
+
std::string MakeKey(const Phrase& f, const Phrase& e, const Phrase& c) const;
|
| 58 |
+
std::string MakeKey(const std::string& f, const std::string& e, const std::string& c) const;
|
| 59 |
+
|
| 60 |
+
public:
|
| 61 |
+
LexicalReorderingTableCompact(const std::string& filePath,
|
| 62 |
+
const std::vector<FactorType>& f_factors,
|
| 63 |
+
const std::vector<FactorType>& e_factors,
|
| 64 |
+
const std::vector<FactorType>& c_factors);
|
| 65 |
+
|
| 66 |
+
LexicalReorderingTableCompact(const std::vector<FactorType>& f_factors,
|
| 67 |
+
const std::vector<FactorType>& e_factors,
|
| 68 |
+
const std::vector<FactorType>& c_factors);
|
| 69 |
+
|
| 70 |
+
virtual
|
| 71 |
+
~LexicalReorderingTableCompact();
|
| 72 |
+
|
| 73 |
+
virtual
|
| 74 |
+
std::vector<float>
|
| 75 |
+
GetScore(const Phrase& f, const Phrase& e, const Phrase& c);
|
| 76 |
+
|
| 77 |
+
static
|
| 78 |
+
LexicalReorderingTable*
|
| 79 |
+
CheckAndLoad(const std::string& filePath,
|
| 80 |
+
const std::vector<FactorType>& f_factors,
|
| 81 |
+
const std::vector<FactorType>& e_factors,
|
| 82 |
+
const std::vector<FactorType>& c_factors);
|
| 83 |
+
|
| 84 |
+
void
|
| 85 |
+
Load(std::string filePath);
|
| 86 |
+
|
| 87 |
+
static void
|
| 88 |
+
SetStaticDefaultParameters(Parameter const& param);
|
| 89 |
+
|
| 90 |
+
};
|
| 91 |
+
|
| 92 |
+
}
|
| 93 |
+
|
| 94 |
+
#endif
|
mosesdecoder/moses/TranslationModel/CompactPT/LexicalReorderingTableCreator.h
ADDED
|
@@ -0,0 +1,143 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
// $Id$
|
| 2 |
+
// vim:tabstop=2
|
| 3 |
+
/***********************************************************************
|
| 4 |
+
Moses - factored phrase-based language decoder
|
| 5 |
+
Copyright (C) 2006 University of Edinburgh
|
| 6 |
+
|
| 7 |
+
This library is free software; you can redistribute it and/or
|
| 8 |
+
modify it under the terms of the GNU Lesser General Public
|
| 9 |
+
License as published by the Free Software Foundation; either
|
| 10 |
+
version 2.1 of the License, or (at your option) any later version.
|
| 11 |
+
|
| 12 |
+
This library is distributed in the hope that it will be useful,
|
| 13 |
+
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
| 14 |
+
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
| 15 |
+
Lesser General Public License for more details.
|
| 16 |
+
|
| 17 |
+
You should have received a copy of the GNU Lesser General Public
|
| 18 |
+
License along with this library; if not, write to the Free Software
|
| 19 |
+
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
| 20 |
+
***********************************************************************/
|
| 21 |
+
|
| 22 |
+
#ifndef moses_LexicalReorderingTableCreator_h
|
| 23 |
+
#define moses_LexicalReorderingTableCreator_h
|
| 24 |
+
|
| 25 |
+
#include "PhraseTableCreator.h"
|
| 26 |
+
|
| 27 |
+
namespace Moses
|
| 28 |
+
{
|
| 29 |
+
|
| 30 |
+
class LexicalReorderingTableCreator
|
| 31 |
+
{
|
| 32 |
+
private:
|
| 33 |
+
std::string m_inPath;
|
| 34 |
+
std::string m_outPath;
|
| 35 |
+
std::string m_tempfilePath;
|
| 36 |
+
|
| 37 |
+
std::FILE* m_outFile;
|
| 38 |
+
|
| 39 |
+
size_t m_orderBits;
|
| 40 |
+
size_t m_fingerPrintBits;
|
| 41 |
+
|
| 42 |
+
size_t m_numScoreComponent;
|
| 43 |
+
|
| 44 |
+
bool m_multipleScoreTrees;
|
| 45 |
+
bool m_quantize;
|
| 46 |
+
|
| 47 |
+
std::string m_separator;
|
| 48 |
+
|
| 49 |
+
BlockHashIndex m_hash;
|
| 50 |
+
|
| 51 |
+
typedef Counter<float> ScoreCounter;
|
| 52 |
+
typedef CanonicalHuffman<float> ScoreTree;
|
| 53 |
+
|
| 54 |
+
std::vector<ScoreCounter*> m_scoreCounters;
|
| 55 |
+
std::vector<ScoreTree*> m_scoreTrees;
|
| 56 |
+
|
| 57 |
+
StringVector<unsigned char, unsigned long, MmapAllocator>* m_encodedScores;
|
| 58 |
+
StringVector<unsigned char, unsigned long, MmapAllocator>* m_compressedScores;
|
| 59 |
+
|
| 60 |
+
std::priority_queue<PackedItem> m_queue;
|
| 61 |
+
long m_lastFlushedLine;
|
| 62 |
+
long m_lastFlushedSourceNum;
|
| 63 |
+
std::string m_lastFlushedSourcePhrase;
|
| 64 |
+
std::vector<std::string> m_lastRange;
|
| 65 |
+
|
| 66 |
+
#ifdef WITH_THREADS
|
| 67 |
+
size_t m_threads;
|
| 68 |
+
#endif
|
| 69 |
+
|
| 70 |
+
void PrintInfo();
|
| 71 |
+
|
| 72 |
+
void EncodeScores();
|
| 73 |
+
void CalcHuffmanCodes();
|
| 74 |
+
void CompressScores();
|
| 75 |
+
void Save();
|
| 76 |
+
|
| 77 |
+
std::string MakeSourceTargetKey(std::string&, std::string&);
|
| 78 |
+
|
| 79 |
+
std::string EncodeLine(std::vector<std::string>& tokens);
|
| 80 |
+
void AddEncodedLine(PackedItem& pi);
|
| 81 |
+
void FlushEncodedQueue(bool force = false);
|
| 82 |
+
|
| 83 |
+
std::string CompressEncodedScores(std::string &encodedScores);
|
| 84 |
+
void AddCompressedScores(PackedItem& pi);
|
| 85 |
+
void FlushCompressedQueue(bool force = false);
|
| 86 |
+
|
| 87 |
+
public:
|
| 88 |
+
LexicalReorderingTableCreator(std::string inPath,
|
| 89 |
+
std::string outPath,
|
| 90 |
+
std::string tempfilePath,
|
| 91 |
+
size_t orderBits = 10,
|
| 92 |
+
size_t fingerPrintBits = 16,
|
| 93 |
+
bool multipleScoreTrees = true,
|
| 94 |
+
size_t quantize = 0
|
| 95 |
+
#ifdef WITH_THREADS
|
| 96 |
+
, size_t threads = 2
|
| 97 |
+
#endif
|
| 98 |
+
);
|
| 99 |
+
|
| 100 |
+
~LexicalReorderingTableCreator();
|
| 101 |
+
|
| 102 |
+
friend class EncodingTaskReordering;
|
| 103 |
+
friend class CompressionTaskReordering;
|
| 104 |
+
};
|
| 105 |
+
|
| 106 |
+
class EncodingTaskReordering
|
| 107 |
+
{
|
| 108 |
+
private:
|
| 109 |
+
#ifdef WITH_THREADS
|
| 110 |
+
static boost::mutex m_mutex;
|
| 111 |
+
static boost::mutex m_fileMutex;
|
| 112 |
+
#endif
|
| 113 |
+
static size_t m_lineNum;
|
| 114 |
+
static size_t m_sourcePhraseNum;
|
| 115 |
+
static std::string m_lastSourcePhrase;
|
| 116 |
+
|
| 117 |
+
InputFileStream& m_inFile;
|
| 118 |
+
LexicalReorderingTableCreator& m_creator;
|
| 119 |
+
|
| 120 |
+
public:
|
| 121 |
+
EncodingTaskReordering(InputFileStream& inFile, LexicalReorderingTableCreator& creator);
|
| 122 |
+
void operator()();
|
| 123 |
+
};
|
| 124 |
+
|
| 125 |
+
class CompressionTaskReordering
|
| 126 |
+
{
|
| 127 |
+
private:
|
| 128 |
+
#ifdef WITH_THREADS
|
| 129 |
+
static boost::mutex m_mutex;
|
| 130 |
+
#endif
|
| 131 |
+
static size_t m_scoresNum;
|
| 132 |
+
StringVector<unsigned char, unsigned long, MmapAllocator> &m_encodedScores;
|
| 133 |
+
LexicalReorderingTableCreator &m_creator;
|
| 134 |
+
|
| 135 |
+
public:
|
| 136 |
+
CompressionTaskReordering(StringVector<unsigned char, unsigned long, MmapAllocator>&
|
| 137 |
+
m_encodedScores, LexicalReorderingTableCreator& creator);
|
| 138 |
+
void operator()();
|
| 139 |
+
};
|
| 140 |
+
|
| 141 |
+
}
|
| 142 |
+
|
| 143 |
+
#endif
|
mosesdecoder/moses/TranslationModel/CompactPT/ListCoders.h
ADDED
|
@@ -0,0 +1,387 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
// $Id$
|
| 2 |
+
// vim:tabstop=2
|
| 3 |
+
/***********************************************************************
|
| 4 |
+
Moses - factored phrase-based language decoder
|
| 5 |
+
Copyright (C) 2006 University of Edinburgh
|
| 6 |
+
|
| 7 |
+
This library is free software; you can redistribute it and/or
|
| 8 |
+
modify it under the terms of the GNU Lesser General Public
|
| 9 |
+
License as published by the Free Software Foundation; either
|
| 10 |
+
version 2.1 of the License, or (at your option) any later version.
|
| 11 |
+
|
| 12 |
+
This library is distributed in the hope that it will be useful,
|
| 13 |
+
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
| 14 |
+
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
| 15 |
+
Lesser General Public License for more details.
|
| 16 |
+
|
| 17 |
+
You should have received a copy of the GNU Lesser General Public
|
| 18 |
+
License along with this library; if not, write to the Free Software
|
| 19 |
+
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
| 20 |
+
***********************************************************************/
|
| 21 |
+
|
| 22 |
+
#ifndef moses_ListCoders_h
|
| 23 |
+
#define moses_ListCoders_h
|
| 24 |
+
|
| 25 |
+
#include <cmath>
|
| 26 |
+
#include <cassert>
|
| 27 |
+
|
| 28 |
+
namespace Moses
|
| 29 |
+
{
|
| 30 |
+
|
| 31 |
+
/**
 * Variable-length integer coder that works on code units of type T.
 *
 * Each unit carries sizeof(T)*8-1 payload bits. The most significant bit of a
 * unit is set when more units belonging to the same number follow. Payload
 * groups are written least-significant group first.
 */
template <typename T = unsigned int>
class VarIntType
{
private:
  // Continuation flag: the top bit of one code unit. Built from T(1) so the
  // shift is never carried out on a signed int that is too narrow for it.
  static T ContinuationBit() {
    return static_cast<T>(T(1) << (sizeof(T) * 8 - 1));
  }

  // Writes one number to `output` and advances it past the written units.
  // The iterator is taken by reference: Encode() calls this once per number
  // and relies on the position being carried over, which also makes plain
  // (non-inserting) output iterators work.
  template <typename IntType, typename OutIt>
  static void EncodeSymbol(IntType input, OutIt &output) {
    const T msb = ContinuationBit();
    const T payloadMask = static_cast<T>(~msb);
    const size_t shift = sizeof(T) * 8 - 1;

    // do/while: a zero still produces exactly one (all-zero) unit.
    do {
      T unit = static_cast<T>(input & payloadMask);
      input >>= shift;
      if(input)
        unit |= msb;   // more payload follows
      *output = unit;
      ++output;
    } while(input);
  }

  // Reads one number starting at `it` into `output` and leaves `it` just
  // behind the last unit consumed.
  template <typename InIt, typename IntType>
  static void DecodeSymbol(InIt &it, InIt end, IntType &output) {
    const T msb = ContinuationBit();
    const T payloadMask = static_cast<T>(~msb);
    const size_t shift = sizeof(T) * 8 - 1;

    output = 0;
    size_t i = 0;
    // All units except the last one have the continuation bit set.
    while(it != end && (*it & msb)) {
      output |= static_cast<IntType>(*it & payloadMask) << (shift * i);
      ++it;
      ++i;
    }

    // A well-formed stream always ends a number with a unit whose
    // continuation bit is clear.
    assert(it != end);
    if(it == end)
      return;   // truncated input in a build without asserts: do not read past the end

    output |= static_cast<IntType>(*it) << (shift * i);
    ++it;
  }

public:

  // Encodes every number in [it, end) and writes the code units to outIt.
  template <typename InIt, typename OutIt>
  static void Encode(InIt it, InIt end, OutIt outIt) {
    for(; it != end; ++it)
      EncodeSymbol(*it, outIt);
  }

  // Decodes numbers until `it` reaches `end`, writing each one to outIt.
  // `it` is advanced in place.
  template <typename InIt, typename OutIt>
  static void Decode(InIt &it, InIt end, OutIt outIt) {
    while(it != end) {
      size_t value;
      DecodeSymbol(it, end, value);
      *outIt = value;
      ++outIt;
    }
  }

  // Decodes at most `num` numbers (fewer if the input ends first) and returns
  // their sum. `it` is advanced past the units that were consumed.
  template <typename InIt>
  static size_t DecodeAndSum(InIt &it, InIt end, size_t num) {
    size_t sum = 0;

    for(size_t curr = 0; it != end && curr < num; ++curr) {
      size_t value;
      DecodeSymbol(it, end, value);
      sum += value;
    }

    return sum;
  }

};

// Coders for 8-, 16- and 32-bit code units.
typedef VarIntType<unsigned char> VarByte;

typedef VarByte VarInt8;
typedef VarIntType<unsigned short> VarInt16;
typedef VarIntType<unsigned int> VarInt32;
|
| 121 |
+
|
| 122 |
+
/**
 * Simple-9 style word-aligned coder.
 *
 * Every 32-bit word consists of a 4-bit selector (top bits) and 28 data bits.
 * The selector picks one of nine layouts that split the data bits into
 * equally wide fields: 1x28, 2x14, 3x9, 4x7, 5x5, 7x4, 9x3, 14x2 or 28x1.
 * Values are stored most-significant field first.
 */
class Simple9
{
private:
  typedef unsigned int uint;

  // One way of dividing the 28 data bits of a word.
  struct Layout {
    uint count;  // number of values stored in the word
    uint bits;   // width of each value in bits
  };

  // Layout that belongs to a selector. Selectors outside 1..9 are not
  // produced by the encoder; they map to a layout that decodes as a single
  // zero, which is what the decoder has always returned for them.
  static Layout LayoutForSelector(uint selector) {
    static const Layout table[] = {
      { 1,  0 },
      { 1, 28 }, { 2, 14 }, { 3, 9 }, { 4, 7 }, { 5, 5 },
      { 7,  4 }, { 9,  3 }, { 14, 2 }, { 28, 1 }
    };
    return table[selector <= 9 ? selector : 0];
  }

  // Number of bits needed to store `value`; zero still occupies one bit.
  // Integer arithmetic replaces ceil(log(value+1)/log(2)), which yielded 0 for
  // a zero value (and a division by zero further down) and depended on
  // floating-point rounding.
  static uint RequiredBits(uint value) {
    uint bits = 1;
    while(value >>= 1)
      ++bits;
    return bits;
  }

  // Packs the values in [it, end) into one word. The number of values must be
  // one of the nine layout sizes; Encode() only ever passes such lengths.
  template <typename InIt>
  inline static void EncodeSymbol(uint &output, InIt it, InIt end) {
    uint length = end - it;

    uint type = 0;
    uint bitlength = 0;
    for(uint selector = 1; selector <= 9; ++selector) {
      const Layout layout = LayoutForSelector(selector);
      if(layout.count == length) {
        type = selector;
        bitlength = layout.bits;
        break;
      }
    }

    output = type << 28;

    uint i = 0;
    while(it != end) {
      UTIL_THROW_IF2(*it > 268435455, "You are trying to encode " << *it
                     << " with Simple9. Cannot encode numbers larger than 268435455 (2^28-1)");

      // first value goes into the highest field, last value into the lowest
      uint l = bitlength * (length-i-1);
      output |= *it << l;
      it++;
      i++;
    }
  }

  // Unpacks every value of one word to outIt. The iterator is taken by
  // reference so that Decode() continues behind the values written here,
  // which also makes plain (non-inserting) output iterators work.
  template <typename OutIt>
  static inline void DecodeSymbol(uint input, OutIt &outIt) {
    const Layout layout = LayoutForSelector(input >> 28);
    const uint mask = (1u << layout.bits) - 1;

    for(uint shift = layout.bits * (layout.count - 1); shift > 0;
        shift -= layout.bits) {
      *outIt = (input >> shift) & mask;
      ++outIt;
    }
    *outIt = input & mask;
    ++outIt;
  }

  // Adds up the values of one word, stopping as soon as `curr` (the running
  // count of values consumed so far) reaches `num`.
  static inline size_t DecodeAndSumSymbol(uint input, size_t num, size_t &curr) {
    const Layout layout = LayoutForSelector(input >> 28);
    const uint mask = (1u << layout.bits) - 1;

    size_t sum = 0;
    for(uint shift = layout.bits * (layout.count - 1); shift > 0;
        shift -= layout.bits) {
      sum += (input >> shift) & mask;
      if(++curr == num)
        return sum;
    }
    sum += input & mask;
    curr++;
    return sum;
  }

public:
  // Encodes [it, end) into 32-bit words written to outIt. InIt must support
  // random access (i + n, i < end). For each word the loop looks ahead at up
  // to 28 values, tracks the widest one seen so far and remembers the longest
  // prefix whose length is a layout size and whose values all fit that
  // layout; that prefix is packed into the word.
  template <typename InIt, typename OutIt>
  static void Encode(InIt it, InIt end, OutIt outIt) {
    uint parts[] = { 1, 2, 3, 4, 5, 7, 9, 14, 28 };

    uint buffer[28];
    for(InIt i = it; i < end; i++) {
      uint lastbit = 1;   // field width required by the widest value so far
      uint lastpos = 0;   // look-ahead position inside the current window
      uint lastyes = 0;   // index of the last value of the best prefix found
      uint j = 0;         // index into parts[] of the layout size being tried

      while(j < 9 && lastpos < 28 && (i+lastpos) < end) {
        if(lastpos >= parts[j])
          j++;

        buffer[lastpos] = *(i + lastpos);

        uint reqbit = RequiredBits(buffer[lastpos]);
        assert(reqbit <= 28);
        // Without asserts an oversized value must not cause a division by
        // zero below; it ends up alone in a word and EncodeSymbol reports it.
        if(reqbit > 28)
          reqbit = 28;

        // round the width up to the nearest width a layout actually offers
        uint bit = 28 / (28 / reqbit);
        if(lastbit < bit)
          lastbit = bit;

        if(parts[j] > 28/lastbit)
          break;
        else if(lastpos == parts[j]-1)
          lastyes = lastpos;

        lastpos++;
      }
      i += lastyes;

      uint length = lastyes + 1;
      uint output;
      EncodeSymbol(output, buffer, buffer + length);

      *outIt = output;
      outIt++;
    }
  }

  // Decodes every word in [it, end) and writes all values to outIt.
  // `it` is advanced in place.
  template <typename InIt, typename OutIt>
  static void Decode(InIt &it, InIt end, OutIt outIt) {
    while(it != end) {
      DecodeSymbol(*it, outIt);
      it++;
    }
  }

  // Returns the sum of the first `num` values stored in [it, end). `it` is
  // advanced past every word that was touched. The input is expected to hold
  // at least `num` values (checked with an assert).
  template <typename InIt>
  static size_t DecodeAndSum(InIt &it, InIt end, size_t num) {
    size_t sum = 0;
    size_t curr = 0;
    while(it != end && curr < num) {
      sum += DecodeAndSumSymbol(*it, num, curr);
      it++;
    }
    assert(curr == num);
    return sum;
  }
};
|
| 384 |
+
|
| 385 |
+
}
|
| 386 |
+
|
| 387 |
+
#endif
|
mosesdecoder/moses/TranslationModel/CompactPT/PackedArray.h
ADDED
|
@@ -0,0 +1,187 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
// $Id$
|
| 2 |
+
// vim:tabstop=2
|
| 3 |
+
/***********************************************************************
|
| 4 |
+
Moses - factored phrase-based language decoder
|
| 5 |
+
Copyright (C) 2006 University of Edinburgh
|
| 6 |
+
|
| 7 |
+
This library is free software; you can redistribute it and/or
|
| 8 |
+
modify it under the terms of the GNU Lesser General Public
|
| 9 |
+
License as published by the Free Software Foundation; either
|
| 10 |
+
version 2.1 of the License, or (at your option) any later version.
|
| 11 |
+
|
| 12 |
+
This library is distributed in the hope that it will be useful,
|
| 13 |
+
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
| 14 |
+
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
| 15 |
+
Lesser General Public License for more details.
|
| 16 |
+
|
| 17 |
+
You should have received a copy of the GNU Lesser General Public
|
| 18 |
+
License along with this library; if not, write to the Free Software
|
| 19 |
+
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
| 20 |
+
***********************************************************************/
|
| 21 |
+
|
| 22 |
+
#ifndef moses_PackedArray_h
|
| 23 |
+
#define moses_PackedArray_h
|
| 24 |
+
|
| 25 |
+
#include <vector>
|
| 26 |
+
#include <cmath>
|
| 27 |
+
#include <cstring>
|
| 28 |
+
#include <cstdio>
|
| 29 |
+
|
| 30 |
+
#include "ThrowingFwrite.h"
|
| 31 |
+
|
| 32 |
+
namespace Moses
|
| 33 |
+
{
|
| 34 |
+
|
| 35 |
+
template <typename T = size_t, typename D = unsigned char>
|
| 36 |
+
class PackedArray
|
| 37 |
+
{
|
| 38 |
+
protected:
|
| 39 |
+
static size_t m_dataBits;
|
| 40 |
+
|
| 41 |
+
size_t m_size;
|
| 42 |
+
size_t m_storageSize;
|
| 43 |
+
D* m_storage;
|
| 44 |
+
|
| 45 |
+
public:
|
| 46 |
+
PackedArray() {
|
| 47 |
+
m_size = 0;
|
| 48 |
+
m_storageSize = 0;
|
| 49 |
+
m_storage = new D[0];
|
| 50 |
+
}
|
| 51 |
+
|
| 52 |
+
PackedArray(size_t size, size_t bits) : m_size(size) {
|
| 53 |
+
m_storageSize = ceil(float(bits * size) / float(m_dataBits));
|
| 54 |
+
m_storage = new D[m_storageSize];
|
| 55 |
+
}
|
| 56 |
+
|
| 57 |
+
PackedArray(const PackedArray<T, D> &c) {
|
| 58 |
+
m_size = c.m_size;
|
| 59 |
+
|
| 60 |
+
m_storageSize = c.m_storageSize;
|
| 61 |
+
m_storage = new D[m_storageSize];
|
| 62 |
+
|
| 63 |
+
std::memcpy(m_storage, c.m_storage, m_storageSize * sizeof(D));
|
| 64 |
+
}
|
| 65 |
+
|
| 66 |
+
virtual ~PackedArray() {
|
| 67 |
+
delete [] m_storage;
|
| 68 |
+
m_size = 0;
|
| 69 |
+
m_storageSize = 0;
|
| 70 |
+
m_storage = 0;
|
| 71 |
+
}
|
| 72 |
+
|
| 73 |
+
T Get(size_t i, size_t bits) const {
|
| 74 |
+
T out = 0;
|
| 75 |
+
|
| 76 |
+
size_t bitstart = (i * bits);
|
| 77 |
+
size_t bitpos = bitstart;
|
| 78 |
+
|
| 79 |
+
size_t zero = ((1ul << (bits)) - 1);
|
| 80 |
+
|
| 81 |
+
while(bitpos - bitstart < bits) {
|
| 82 |
+
size_t pos = bitpos / m_dataBits;
|
| 83 |
+
size_t off = bitpos % m_dataBits;
|
| 84 |
+
|
| 85 |
+
out |= (T(m_storage[pos]) << (bitpos - bitstart)) >> off;
|
| 86 |
+
|
| 87 |
+
bitpos += (m_dataBits - off);
|
| 88 |
+
}
|
| 89 |
+
|
| 90 |
+
out &= zero;
|
| 91 |
+
return out;
|
| 92 |
+
}
|
| 93 |
+
|
| 94 |
+
void Set(size_t i, T v, size_t bits) {
|
| 95 |
+
size_t bitstart = (i * bits);
|
| 96 |
+
size_t bitpos = bitstart;
|
| 97 |
+
|
| 98 |
+
while(bitpos - bitstart < bits) {
|
| 99 |
+
size_t pos = bitpos / m_dataBits;
|
| 100 |
+
size_t off = bitpos % m_dataBits;
|
| 101 |
+
|
| 102 |
+
size_t rest = bits - (bitpos - bitstart);
|
| 103 |
+
D zero = ~((1ul << (rest + off)) - 1) | ((1ul << off) - 1);
|
| 104 |
+
|
| 105 |
+
m_storage[pos] &= zero;
|
| 106 |
+
m_storage[pos] |= v << off;
|
| 107 |
+
v = v >> (m_dataBits - off);
|
| 108 |
+
bitpos += (m_dataBits - off);
|
| 109 |
+
}
|
| 110 |
+
}
|
| 111 |
+
|
| 112 |
+
virtual D*& GetStorage() {
|
| 113 |
+
return m_storage;
|
| 114 |
+
}
|
| 115 |
+
|
| 116 |
+
virtual size_t GetStorageSize() const {
|
| 117 |
+
return m_storageSize;
|
| 118 |
+
}
|
| 119 |
+
|
| 120 |
+
virtual size_t Size() const {
|
| 121 |
+
return m_size;
|
| 122 |
+
}
|
| 123 |
+
|
| 124 |
+
virtual size_t Load(std::FILE* in) {
|
| 125 |
+
size_t a1 = std::ftell(in);
|
| 126 |
+
|
| 127 |
+
size_t read = 0;
|
| 128 |
+
read += std::fread(&m_size, sizeof(m_size), 1, in);
|
| 129 |
+
read += std::fread(&m_storageSize, sizeof(m_storageSize), 1, in);
|
| 130 |
+
delete [] m_storage;
|
| 131 |
+
m_storage = new D[m_storageSize];
|
| 132 |
+
read += std::fread(m_storage, sizeof(D), m_storageSize, in);
|
| 133 |
+
|
| 134 |
+
size_t a2 = std::ftell(in);
|
| 135 |
+
return a2 - a1;
|
| 136 |
+
}
|
| 137 |
+
|
| 138 |
+
virtual size_t Save(std::FILE* out) {
|
| 139 |
+
size_t a1 = std::ftell(out);
|
| 140 |
+
|
| 141 |
+
ThrowingFwrite(&m_size, sizeof(m_size), 1, out);
|
| 142 |
+
ThrowingFwrite(&m_storageSize, sizeof(m_storageSize), 1, out);
|
| 143 |
+
ThrowingFwrite(m_storage, sizeof(D), m_storageSize, out);
|
| 144 |
+
|
| 145 |
+
size_t a2 = std::ftell(out);
|
| 146 |
+
return a2 - a1;
|
| 147 |
+
}
|
| 148 |
+
|
| 149 |
+
};
|
| 150 |
+
|
| 151 |
+
template <typename T, typename D>
|
| 152 |
+
size_t PackedArray<T, D>::m_dataBits = sizeof(D)*8;
|
| 153 |
+
|
| 154 |
+
/**************************************************************************/
|
| 155 |
+
|
| 156 |
+
template <typename T = size_t, typename D = unsigned char>
|
| 157 |
+
class PairedPackedArray : public PackedArray<T,D>
|
| 158 |
+
{
|
| 159 |
+
public:
|
| 160 |
+
PairedPackedArray() : PackedArray<T,D>() {}
|
| 161 |
+
|
| 162 |
+
PairedPackedArray(size_t size, size_t bits1, size_t bits2)
|
| 163 |
+
: PackedArray<T, D>(size, bits1 + bits2) { }
|
| 164 |
+
|
| 165 |
+
void Set(size_t i, T a, T b, size_t bits1, size_t bits2) {
|
| 166 |
+
T c = 0;
|
| 167 |
+
c = a | (b << bits1);
|
| 168 |
+
PackedArray<T,D>::Set(i, c, bits1 + bits2);
|
| 169 |
+
}
|
| 170 |
+
|
| 171 |
+
void Set(size_t i, std::pair<T,T> p, size_t bits1, size_t bits2) {
|
| 172 |
+
T c = 0;
|
| 173 |
+
c = p.second | (p.first << bits1);
|
| 174 |
+
PackedArray<T, D>::Set(i, c);
|
| 175 |
+
}
|
| 176 |
+
|
| 177 |
+
std::pair<T, T> Get(size_t i, size_t bits1, size_t bits2) {
|
| 178 |
+
T v = PackedArray<T, D>::Get(i, bits1 + bits2);
|
| 179 |
+
T a = v & ((1 << bits1) - 1);
|
| 180 |
+
T b = v >> bits1;
|
| 181 |
+
return std::pair<T, T>(a, b);
|
| 182 |
+
}
|
| 183 |
+
};
|
| 184 |
+
|
| 185 |
+
}
|
| 186 |
+
|
| 187 |
+
#endif
|
mosesdecoder/moses/TranslationModel/CompactPT/PhraseDictionaryCompact.cpp
ADDED
|
@@ -0,0 +1,194 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
// $Id$
|
| 2 |
+
// vim:tabstop=2
|
| 3 |
+
/***********************************************************************
|
| 4 |
+
Moses - factored phrase-based language decoder
|
| 5 |
+
Copyright (C) 2006 University of Edinburgh
|
| 6 |
+
|
| 7 |
+
This library is free software; you can redistribute it and/or
|
| 8 |
+
modify it under the terms of the GNU Lesser General Public
|
| 9 |
+
License as published by the Free Software Foundation; either
|
| 10 |
+
version 2.1 of the License, or (at your option) any later version.
|
| 11 |
+
|
| 12 |
+
This library is distributed in the hope that it will be useful,
|
| 13 |
+
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
| 14 |
+
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
| 15 |
+
Lesser General Public License for more details.
|
| 16 |
+
|
| 17 |
+
You should have received a copy of the GNU Lesser General Public
|
| 18 |
+
License along with this library; if not, write to the Free Software
|
| 19 |
+
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
| 20 |
+
***********************************************************************/
|
| 21 |
+
|
| 22 |
+
#include <fstream>
|
| 23 |
+
#include <string>
|
| 24 |
+
#include <iterator>
|
| 25 |
+
#include <queue>
|
| 26 |
+
#include <algorithm>
|
| 27 |
+
#include <sys/stat.h>
|
| 28 |
+
#include <boost/algorithm/string/predicate.hpp>
|
| 29 |
+
#include <boost/thread/tss.hpp>
|
| 30 |
+
|
| 31 |
+
#include "PhraseDictionaryCompact.h"
|
| 32 |
+
#include "moses/FactorCollection.h"
|
| 33 |
+
#include "moses/Word.h"
|
| 34 |
+
#include "moses/Util.h"
|
| 35 |
+
#include "moses/InputFileStream.h"
|
| 36 |
+
#include "moses/StaticData.h"
|
| 37 |
+
#include "moses/Range.h"
|
| 38 |
+
#include "moses/ThreadPool.h"
|
| 39 |
+
#include "util/exception.hh"
|
| 40 |
+
|
| 41 |
+
using namespace std;
|
| 42 |
+
using namespace boost::algorithm;
|
| 43 |
+
|
| 44 |
+
namespace Moses
|
| 45 |
+
{
|
| 46 |
+
|
| 47 |
+
// Storage for the static sentence cache. It is used like a pointer further
// down (get()/reset()) and is filled lazily by CacheForCleanup() and
// CleanUpAfterSentenceProcessing().
PhraseDictionaryCompact::SentenceCache PhraseDictionaryCompact::m_sentenceCache;

// Sets up the members with their defaults and then lets ReadParameters()
// process the configuration line. The phrase decoder stays null until Load().
PhraseDictionaryCompact::PhraseDictionaryCompact(const std::string &line)
  : PhraseDictionary(line, true)
  , m_inMemory(s_inMemoryByDefault)
  , m_useAlignmentInfo(true)
  , m_hash(10, 16)
  , m_phraseDecoder(0)
{
  ReadParameters();
}
|
| 58 |
+
|
| 59 |
+
// Loads the compact phrase table from m_filePath (".minphr" is appended when
// missing). The file holds, in this order, the source phrase index, the phrase
// decoder data and the target phrase collections.
// Throws std::runtime_error if the file does not exist or cannot be opened,
// and via UTIL_THROW_IF2 if any of the three parts reports a size of 0.
void PhraseDictionaryCompact::Load(AllOptions::ptr const& opts)
{
  m_options = opts;

  SetFeaturesToApply();

  std::string tFilePath = m_filePath;

  std::string suffix = ".minphr";
  if (!ends_with(tFilePath, suffix)) tFilePath += suffix;
  if (!FileExists(tFilePath))
    throw runtime_error("Error: File " + tFilePath + " does not exist.");

  m_phraseDecoder
  = new PhraseDecoder(*this, &m_input, &m_output, m_numScoreComponents);

  // The table is binary data: open it in binary mode so that platforms which
  // translate line endings in text mode read it unchanged.
  std::FILE* pFile = std::fopen(tFilePath.c_str() , "rb");
  // The file can exist and still not be readable (e.g. permissions).
  if (pFile == NULL)
    throw runtime_error("Error: File " + tFilePath + " could not be opened.");
  // NOTE(review): pFile is never closed in this function; the mapped variant
  // below may still depend on it - confirm before adding an fclose.

  // The source phrase index is always loaded into memory; the variant that
  // keeps it on disk (m_hash.LoadIndex) is disabled.
  size_t indexSize = m_hash.Load(pFile);

  size_t coderSize = m_phraseDecoder->Load(pFile);

  size_t phraseSize;
  if(m_inMemory)
    // Load target phrase collections into memory
    phraseSize = m_targetPhrasesMemory.load(pFile, false);
  else
    // Keep target phrase collections on disk
    phraseSize = m_targetPhrasesMapped.load(pFile, true);

  UTIL_THROW_IF2(indexSize == 0 || coderSize == 0 || phraseSize == 0,
                 "Not successfully loaded");
}
|
| 99 |
+
|
| 100 |
+
// Returns the target phrases for sourcePhrase, limited to the m_tableLimit
// best ones when a limit is set, or an empty pointer when the phrase table has
// no entry. Every collection handed out is also registered with
// CacheForCleanup().
TargetPhraseCollection::shared_ptr
PhraseDictionaryCompact::
GetTargetPhraseCollectionNonCacheLEGACY(const Phrase &sourcePhrase) const
{
  //cerr << "sourcePhrase=" << sourcePhrase << endl;

  // There is no such source phrase if it is longer than the longest source
  // phrase observed during compilation
  if(sourcePhrase.GetSize() > m_phraseDecoder->GetMaxSourcePhraseLength())
    return TargetPhraseCollection::shared_ptr();

  // Retrieve target phrase collection from phrase table
  TargetPhraseVectorPtr decoded
  = m_phraseDecoder->CreateTargetPhraseCollection(sourcePhrase, true, true);

  if(!(decoded != NULL && decoded->size()))
    return TargetPhraseCollection::shared_ptr();

  // Work on a private copy: the partial sort below reorders the vector.
  TargetPhraseVectorPtr candidates(new TargetPhraseVector(*decoded));
  TargetPhraseCollection::shared_ptr result(new TargetPhraseCollection);

  // Score phrases and if possible apply ttable_limit
  TargetPhraseVector::iterator cut = candidates->end();
  if(m_tableLimit != 0 && candidates->size() >= m_tableLimit)
    cut = candidates->begin() + m_tableLimit;
  NTH_ELEMENT4(candidates->begin(), cut, candidates->end(), CompareTargetPhrase());

  for(TargetPhraseVector::iterator it = candidates->begin(); it != cut; it++)
    result->Add(new TargetPhrase(*it));

  // Cache phrase pair for clean-up or retrieval with PREnc
  const_cast<PhraseDictionaryCompact*>(this)->CacheForCleanup(result);

  return result;
}
|
| 137 |
+
|
| 138 |
+
// Returns the decoder's target phrase vector for sourcePhrase as is (no table
// limit, no clean-up registration), or an empty pointer when the phrase is
// longer than anything seen during compilation.
TargetPhraseVectorPtr
PhraseDictionaryCompact::
GetTargetPhraseCollectionRaw(const Phrase &sourcePhrase) const
{
  const bool mayExist
  = sourcePhrase.GetSize() <= m_phraseDecoder->GetMaxSourcePhraseLength();

  // A source phrase longer than the longest one observed during compilation
  // cannot be in the table.
  if(!mayExist)
    return TargetPhraseVectorPtr();

  // Retrieve target phrase collection from phrase table
  return m_phraseDecoder->CreateTargetPhraseCollection(sourcePhrase, true, false);
}
|
| 151 |
+
|
| 152 |
+
// Releases the phrase decoder created in Load().
PhraseDictionaryCompact::
~PhraseDictionaryCompact()
{
  // deleting a null pointer is a no-op, so no check is needed
  delete m_phraseDecoder;
}
|
| 158 |
+
|
| 159 |
+
void
|
| 160 |
+
PhraseDictionaryCompact::
|
| 161 |
+
CacheForCleanup(TargetPhraseCollection::shared_ptr tpc)
|
| 162 |
+
{
|
| 163 |
+
if(!m_sentenceCache.get())
|
| 164 |
+
m_sentenceCache.reset(new PhraseCache());
|
| 165 |
+
m_sentenceCache->push_back(tpc);
|
| 166 |
+
}
|
| 167 |
+
|
| 168 |
+
void
|
| 169 |
+
PhraseDictionaryCompact::
|
| 170 |
+
AddEquivPhrase(const Phrase &source, const TargetPhrase &targetPhrase)
|
| 171 |
+
{ }
|
| 172 |
+
|
| 173 |
+
void
|
| 174 |
+
PhraseDictionaryCompact::
|
| 175 |
+
CleanUpAfterSentenceProcessing(const InputType &source)
|
| 176 |
+
{
|
| 177 |
+
if(!m_sentenceCache.get())
|
| 178 |
+
m_sentenceCache.reset(new PhraseCache());
|
| 179 |
+
|
| 180 |
+
m_phraseDecoder->PruneCache();
|
| 181 |
+
m_sentenceCache->clear();
|
| 182 |
+
|
| 183 |
+
ReduceCache();
|
| 184 |
+
}
|
| 185 |
+
|
| 186 |
+
bool PhraseDictionaryCompact::s_inMemoryByDefault = false;
|
| 187 |
+
void
|
| 188 |
+
PhraseDictionaryCompact::
|
| 189 |
+
SetStaticDefaultParameters(Parameter const& param)
|
| 190 |
+
{
|
| 191 |
+
param.SetParameter(s_inMemoryByDefault, "minphr-memory", false);
|
| 192 |
+
}
|
| 193 |
+
}
|
| 194 |
+
|
mosesdecoder/moses/TranslationModel/CompactPT/StringVectorTemp.h
ADDED
|
@@ -0,0 +1,430 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
// $Id$
|
| 2 |
+
// vim:tabstop=2
|
| 3 |
+
/***********************************************************************
|
| 4 |
+
Moses - factored phrase-based language decoder
|
| 5 |
+
Copyright (C) 2006 University of Edinburgh
|
| 6 |
+
|
| 7 |
+
This library is free software; you can redistribute it and/or
|
| 8 |
+
modify it under the terms of the GNU Lesser General Public
|
| 9 |
+
License as published by the Free Software Foundation; either
|
| 10 |
+
version 2.1 of the License, or (at your option) any later version.
|
| 11 |
+
|
| 12 |
+
This library is distributed in the hope that it will be useful,
|
| 13 |
+
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
| 14 |
+
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
| 15 |
+
Lesser General Public License for more details.
|
| 16 |
+
|
| 17 |
+
You should have received a copy of the GNU Lesser General Public
|
| 18 |
+
License along with this library; if not, write to the Free Software
|
| 19 |
+
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
| 20 |
+
***********************************************************************/
|
| 21 |
+
|
| 22 |
+
#ifndef moses_StringVectorTemp_h
|
| 23 |
+
#define moses_StringVectorTemp_h
|
| 24 |
+
|
| 25 |
+
#include <vector>
|
| 26 |
+
#include <algorithm>
|
| 27 |
+
#include <string>
|
| 28 |
+
#include <iterator>
|
| 29 |
+
#include <cstdio>
|
| 30 |
+
#include <cassert>
|
| 31 |
+
|
| 32 |
+
#include <boost/iterator/iterator_facade.hpp>
|
| 33 |
+
|
| 34 |
+
#include "ThrowingFwrite.h"
|
| 35 |
+
#include "StringVector.h"
|
| 36 |
+
|
| 37 |
+
#include "MmapAllocator.h"
|
| 38 |
+
|
| 39 |
+
namespace Moses
|
| 40 |
+
{
|
| 41 |
+
|
| 42 |
+
|
| 43 |
+
// ********** StringVectorTemp **********
|
| 44 |
+
|
| 45 |
+
template <typename ValueT = unsigned char, typename PosT = unsigned int,
|
| 46 |
+
template <typename> class Allocator = std::allocator>
|
| 47 |
+
class StringVectorTemp
|
| 48 |
+
{
|
| 49 |
+
protected:
|
| 50 |
+
bool m_sorted;
|
| 51 |
+
bool m_memoryMapped;
|
| 52 |
+
|
| 53 |
+
std::vector<ValueT, Allocator<ValueT> >* m_charArray;
|
| 54 |
+
std::vector<PosT> m_positions;
|
| 55 |
+
|
| 56 |
+
virtual const ValueT* value_ptr(PosT i) const;
|
| 57 |
+
|
| 58 |
+
public:
|
| 59 |
+
//typedef ValueIteratorRange<typename std::vector<ValueT, Allocator<ValueT> >::const_iterator> range;
|
| 60 |
+
typedef ValueIteratorRange<const ValueT *> range;
|
| 61 |
+
|
| 62 |
+
// ********** RangeIterator **********
|
| 63 |
+
|
| 64 |
+
class RangeIterator : public boost::iterator_facade<RangeIterator,
|
| 65 |
+
range, std::random_access_iterator_tag, range, PosT>
|
| 66 |
+
{
|
| 67 |
+
|
| 68 |
+
private:
|
| 69 |
+
PosT m_index;
|
| 70 |
+
StringVectorTemp<ValueT, PosT, Allocator>* m_container;
|
| 71 |
+
|
| 72 |
+
public:
|
| 73 |
+
RangeIterator();
|
| 74 |
+
RangeIterator(StringVectorTemp<ValueT, PosT, Allocator> &sv, PosT index=0);
|
| 75 |
+
|
| 76 |
+
PosT get_index();
|
| 77 |
+
|
| 78 |
+
private:
|
| 79 |
+
friend class boost::iterator_core_access;
|
| 80 |
+
|
| 81 |
+
range dereference() const;
|
| 82 |
+
bool equal(RangeIterator const& other) const;
|
| 83 |
+
void increment();
|
| 84 |
+
void decrement();
|
| 85 |
+
void advance(PosT n);
|
| 86 |
+
|
| 87 |
+
PosT distance_to(RangeIterator const& other) const;
|
| 88 |
+
};
|
| 89 |
+
|
| 90 |
+
// ********** StringIterator **********
|
| 91 |
+
|
| 92 |
+
class StringIterator : public boost::iterator_facade<StringIterator,
|
| 93 |
+
std::string, std::random_access_iterator_tag, const std::string, PosT>
|
| 94 |
+
{
|
| 95 |
+
|
| 96 |
+
private:
|
| 97 |
+
PosT m_index;
|
| 98 |
+
StringVectorTemp<ValueT, PosT, Allocator>* m_container;
|
| 99 |
+
|
| 100 |
+
public:
|
| 101 |
+
StringIterator();
|
| 102 |
+
StringIterator(StringVectorTemp<ValueT, PosT, Allocator> &sv, PosT index=0);
|
| 103 |
+
|
| 104 |
+
PosT get_index();
|
| 105 |
+
|
| 106 |
+
private:
|
| 107 |
+
friend class boost::iterator_core_access;
|
| 108 |
+
|
| 109 |
+
const std::string dereference() const;
|
| 110 |
+
bool equal(StringIterator const& other) const;
|
| 111 |
+
void increment();
|
| 112 |
+
void decrement();
|
| 113 |
+
void advance(PosT n);
|
| 114 |
+
PosT distance_to(StringIterator const& other) const;
|
| 115 |
+
};
|
| 116 |
+
|
| 117 |
+
typedef RangeIterator iterator;
|
| 118 |
+
typedef StringIterator string_iterator;
|
| 119 |
+
|
| 120 |
+
StringVectorTemp();
|
| 121 |
+
StringVectorTemp(Allocator<ValueT> alloc);
|
| 122 |
+
|
| 123 |
+
virtual ~StringVectorTemp() {
|
| 124 |
+
delete m_charArray;
|
| 125 |
+
}
|
| 126 |
+
|
| 127 |
+
void swap(StringVectorTemp<ValueT, PosT, Allocator> &c) {
|
| 128 |
+
m_positions.swap(c.m_positions);
|
| 129 |
+
m_charArray->swap(*c.m_charArray);
|
| 130 |
+
|
| 131 |
+
bool temp = m_sorted;
|
| 132 |
+
m_sorted = c.m_sorted;
|
| 133 |
+
c.m_sorted = temp;
|
| 134 |
+
}
|
| 135 |
+
|
| 136 |
+
bool is_sorted() const;
|
| 137 |
+
PosT size() const;
|
| 138 |
+
virtual PosT size2() const;
|
| 139 |
+
|
| 140 |
+
template<class Iterator> Iterator begin() const;
|
| 141 |
+
template<class Iterator> Iterator end() const;
|
| 142 |
+
|
| 143 |
+
iterator begin() const;
|
| 144 |
+
iterator end() const;
|
| 145 |
+
|
| 146 |
+
PosT length(PosT i) const;
|
| 147 |
+
//typename std::vector<ValueT, Allocator<ValueT> >::const_iterator begin(PosT i) const;
|
| 148 |
+
//typename std::vector<ValueT, Allocator<ValueT> >::const_iterator end(PosT i) const;
|
| 149 |
+
const ValueT* begin(PosT i) const;
|
| 150 |
+
const ValueT* end(PosT i) const;
|
| 151 |
+
|
| 152 |
+
void clear() {
|
| 153 |
+
m_charArray->clear();
|
| 154 |
+
m_sorted = true;
|
| 155 |
+
m_positions.clear();
|
| 156 |
+
}
|
| 157 |
+
|
| 158 |
+
range at(PosT i) const;
|
| 159 |
+
range operator[](PosT i) const;
|
| 160 |
+
range back() const;
|
| 161 |
+
|
| 162 |
+
template <typename StringT>
|
| 163 |
+
void push_back(StringT s);
|
| 164 |
+
void push_back(const char* c);
|
| 165 |
+
|
| 166 |
+
template <typename StringT>
|
| 167 |
+
PosT find(StringT &s) const;
|
| 168 |
+
PosT find(const char* c) const;
|
| 169 |
+
};
|
| 170 |
+
|
| 171 |
+
// ********** Implementation **********
|
| 172 |
+
|
| 173 |
+
// StringVectorTemp
|
| 174 |
+
|
| 175 |
+
template<typename ValueT, typename PosT, template <typename> class Allocator>
|
| 176 |
+
StringVectorTemp<ValueT, PosT, Allocator>::StringVectorTemp()
|
| 177 |
+
: m_sorted(true), m_memoryMapped(false), m_charArray(new std::vector<ValueT, Allocator<ValueT> >()) { }
|
| 178 |
+
|
| 179 |
+
template<typename ValueT, typename PosT, template <typename> class Allocator>
|
| 180 |
+
StringVectorTemp<ValueT, PosT, Allocator>::StringVectorTemp(Allocator<ValueT> alloc)
|
| 181 |
+
: m_sorted(true), m_memoryMapped(false), m_charArray(new std::vector<ValueT, Allocator<ValueT> >(alloc)) { }
|
| 182 |
+
|
| 183 |
+
template<typename ValueT, typename PosT, template <typename> class Allocator>
|
| 184 |
+
template <typename StringT>
|
| 185 |
+
void StringVectorTemp<ValueT, PosT, Allocator>::push_back(StringT s)
|
| 186 |
+
{
|
| 187 |
+
if(is_sorted() && size() && !(back() < s))
|
| 188 |
+
m_sorted = false;
|
| 189 |
+
|
| 190 |
+
m_positions.push_back(size2());
|
| 191 |
+
std::copy(s.begin(), s.end(), std::back_inserter(*m_charArray));
|
| 192 |
+
}
|
| 193 |
+
|
| 194 |
+
template<typename ValueT, typename PosT, template <typename> class Allocator>
|
| 195 |
+
void StringVectorTemp<ValueT, PosT, Allocator>::push_back(const char* c)
|
| 196 |
+
{
|
| 197 |
+
std::string dummy(c);
|
| 198 |
+
push_back(dummy);
|
| 199 |
+
}
|
| 200 |
+
|
| 201 |
+
template<typename ValueT, typename PosT, template <typename> class Allocator>
|
| 202 |
+
template <typename Iterator>
|
| 203 |
+
Iterator StringVectorTemp<ValueT, PosT, Allocator>::begin() const
|
| 204 |
+
{
|
| 205 |
+
return Iterator(const_cast<StringVectorTemp<ValueT, PosT, Allocator>&>(*this), 0);
|
| 206 |
+
}
|
| 207 |
+
|
| 208 |
+
template<typename ValueT, typename PosT, template <typename> class Allocator>
|
| 209 |
+
template <typename Iterator>
|
| 210 |
+
Iterator StringVectorTemp<ValueT, PosT, Allocator>::end() const
|
| 211 |
+
{
|
| 212 |
+
return Iterator(const_cast<StringVectorTemp<ValueT, PosT, Allocator>&>(*this), size());
|
| 213 |
+
}
|
| 214 |
+
|
| 215 |
+
template<typename ValueT, typename PosT, template <typename> class Allocator>
|
| 216 |
+
typename StringVectorTemp<ValueT, PosT, Allocator>::iterator StringVectorTemp<ValueT, PosT, Allocator>::begin() const
|
| 217 |
+
{
|
| 218 |
+
return begin<iterator>();
|
| 219 |
+
};
|
| 220 |
+
|
| 221 |
+
template<typename ValueT, typename PosT, template <typename> class Allocator>
|
| 222 |
+
typename StringVectorTemp<ValueT, PosT, Allocator>::iterator StringVectorTemp<ValueT, PosT, Allocator>::end() const
|
| 223 |
+
{
|
| 224 |
+
return end<iterator>();
|
| 225 |
+
};
|
| 226 |
+
|
| 227 |
+
template<typename ValueT, typename PosT, template <typename> class Allocator>
|
| 228 |
+
bool StringVectorTemp<ValueT, PosT, Allocator>::is_sorted() const
|
| 229 |
+
{
|
| 230 |
+
return m_sorted;
|
| 231 |
+
}
|
| 232 |
+
|
| 233 |
+
template<typename ValueT, typename PosT, template <typename> class Allocator>
|
| 234 |
+
PosT StringVectorTemp<ValueT, PosT, Allocator>::size() const
|
| 235 |
+
{
|
| 236 |
+
return m_positions.size();
|
| 237 |
+
}
|
| 238 |
+
|
| 239 |
+
template<typename ValueT, typename PosT, template <typename> class Allocator>
|
| 240 |
+
PosT StringVectorTemp<ValueT, PosT, Allocator>::size2() const
|
| 241 |
+
{
|
| 242 |
+
return m_charArray->size();
|
| 243 |
+
}
|
| 244 |
+
|
| 245 |
+
template<typename ValueT, typename PosT, template <typename> class Allocator>
|
| 246 |
+
typename StringVectorTemp<ValueT, PosT, Allocator>::range StringVectorTemp<ValueT, PosT, Allocator>::at(PosT i) const
|
| 247 |
+
{
|
| 248 |
+
return range(begin(i), end(i));
|
| 249 |
+
}
|
| 250 |
+
|
| 251 |
+
template<typename ValueT, typename PosT, template <typename> class Allocator>
|
| 252 |
+
typename StringVectorTemp<ValueT, PosT, Allocator>::range StringVectorTemp<ValueT, PosT, Allocator>::operator[](PosT i) const
|
| 253 |
+
{
|
| 254 |
+
return at(i);
|
| 255 |
+
}
|
| 256 |
+
|
| 257 |
+
template<typename ValueT, typename PosT, template <typename> class Allocator>
|
| 258 |
+
typename StringVectorTemp<ValueT, PosT, Allocator>::range StringVectorTemp<ValueT, PosT, Allocator>::back() const
|
| 259 |
+
{
|
| 260 |
+
return at(size()-1);
|
| 261 |
+
}
|
| 262 |
+
|
| 263 |
+
template<typename ValueT, typename PosT, template <typename> class Allocator>
|
| 264 |
+
PosT StringVectorTemp<ValueT, PosT, Allocator>::length(PosT i) const
|
| 265 |
+
{
|
| 266 |
+
if(i+1 < size())
|
| 267 |
+
return m_positions[i+1] - m_positions[i];
|
| 268 |
+
else
|
| 269 |
+
return size2() - m_positions[i];
|
| 270 |
+
}
|
| 271 |
+
|
| 272 |
+
template<typename ValueT, typename PosT, template <typename> class Allocator>
|
| 273 |
+
const ValueT* StringVectorTemp<ValueT, PosT, Allocator>::value_ptr(PosT i) const
|
| 274 |
+
{
|
| 275 |
+
return &(*m_charArray)[m_positions[i]];
|
| 276 |
+
}
|
| 277 |
+
|
| 278 |
+
template<typename ValueT, typename PosT, template <typename> class Allocator>
|
| 279 |
+
//typename std::vector<ValueT, Allocator<ValueT> >::const_iterator StringVectorTemp<ValueT, PosT, Allocator>::begin(PosT i) const
|
| 280 |
+
const ValueT* StringVectorTemp<ValueT, PosT, Allocator>::begin(PosT i) const
|
| 281 |
+
{
|
| 282 |
+
//return typename std::vector<ValueT, Allocator<ValueT> >::const_iterator(value_ptr(i));
|
| 283 |
+
return value_ptr(i);
|
| 284 |
+
}
|
| 285 |
+
|
| 286 |
+
template<typename ValueT, typename PosT, template <typename> class Allocator>
|
| 287 |
+
//typename std::vector<ValueT, Allocator<ValueT> >::const_iterator StringVectorTemp<ValueT, PosT, Allocator>::end(PosT i) const
|
| 288 |
+
const ValueT* StringVectorTemp<ValueT, PosT, Allocator>::end(PosT i) const
|
| 289 |
+
{
|
| 290 |
+
//return typename std::vector<ValueT, Allocator<ValueT> >::const_iterator(value_ptr(i) + length(i));
|
| 291 |
+
return value_ptr(i) + length(i);
|
| 292 |
+
}
|
| 293 |
+
|
| 294 |
+
template<typename ValueT, typename PosT, template <typename> class Allocator>
|
| 295 |
+
template <typename StringT>
|
| 296 |
+
PosT StringVectorTemp<ValueT, PosT, Allocator>::find(StringT &s) const
|
| 297 |
+
{
|
| 298 |
+
if(m_sorted)
|
| 299 |
+
return std::distance(begin(), std::lower_bound(begin(), end(), s));
|
| 300 |
+
return std::distance(begin(), std::find(begin(), end(), s));
|
| 301 |
+
}
|
| 302 |
+
|
| 303 |
+
template<typename ValueT, typename PosT, template <typename> class Allocator>
|
| 304 |
+
PosT StringVectorTemp<ValueT, PosT, Allocator>::find(const char* c) const
|
| 305 |
+
{
|
| 306 |
+
std::string s(c);
|
| 307 |
+
return find(s);
|
| 308 |
+
}
|
| 309 |
+
|
| 310 |
+
// RangeIterator
|
| 311 |
+
|
| 312 |
+
template<typename ValueT, typename PosT, template <typename> class Allocator>
|
| 313 |
+
StringVectorTemp<ValueT, PosT, Allocator>::RangeIterator::RangeIterator() : m_index(0), m_container(0) { }
|
| 314 |
+
|
| 315 |
+
template<typename ValueT, typename PosT, template <typename> class Allocator>
|
| 316 |
+
StringVectorTemp<ValueT, PosT, Allocator>::RangeIterator::RangeIterator(StringVectorTemp<ValueT, PosT, Allocator> &sv, PosT index)
|
| 317 |
+
: m_index(index), m_container(&sv) { }
|
| 318 |
+
|
| 319 |
+
template<typename ValueT, typename PosT, template <typename> class Allocator>
|
| 320 |
+
PosT StringVectorTemp<ValueT, PosT, Allocator>::RangeIterator::get_index()
|
| 321 |
+
{
|
| 322 |
+
return m_index;
|
| 323 |
+
}
|
| 324 |
+
|
| 325 |
+
template<typename ValueT, typename PosT, template <typename> class Allocator>
|
| 326 |
+
typename StringVectorTemp<ValueT, PosT, Allocator>::range
|
| 327 |
+
StringVectorTemp<ValueT, PosT, Allocator>::RangeIterator::dereference() const
|
| 328 |
+
{
|
| 329 |
+
return typename StringVectorTemp<ValueT, PosT, Allocator>::range(
|
| 330 |
+
m_container->begin(m_index),
|
| 331 |
+
m_container->end(m_index)
|
| 332 |
+
);
|
| 333 |
+
}
|
| 334 |
+
|
| 335 |
+
template<typename ValueT, typename PosT, template <typename> class Allocator>
|
| 336 |
+
bool StringVectorTemp<ValueT, PosT, Allocator>::RangeIterator::equal(
|
| 337 |
+
StringVectorTemp<ValueT, PosT, Allocator>::RangeIterator const& other) const
|
| 338 |
+
{
|
| 339 |
+
return m_index == other.m_index && m_container == other.m_container;
|
| 340 |
+
}
|
| 341 |
+
|
| 342 |
+
template<typename ValueT, typename PosT, template <typename> class Allocator>
|
| 343 |
+
void StringVectorTemp<ValueT, PosT, Allocator>::RangeIterator::increment()
|
| 344 |
+
{
|
| 345 |
+
m_index++;
|
| 346 |
+
}
|
| 347 |
+
|
| 348 |
+
template<typename ValueT, typename PosT, template <typename> class Allocator>
|
| 349 |
+
void StringVectorTemp<ValueT, PosT, Allocator>::RangeIterator::decrement()
|
| 350 |
+
{
|
| 351 |
+
m_index--;
|
| 352 |
+
}
|
| 353 |
+
|
| 354 |
+
template<typename ValueT, typename PosT, template <typename> class Allocator>
|
| 355 |
+
void StringVectorTemp<ValueT, PosT, Allocator>::RangeIterator::advance(PosT n)
|
| 356 |
+
{
|
| 357 |
+
m_index += n;
|
| 358 |
+
}
|
| 359 |
+
|
| 360 |
+
template<typename ValueT, typename PosT, template <typename> class Allocator>
|
| 361 |
+
PosT StringVectorTemp<ValueT, PosT, Allocator>::RangeIterator::distance_to(
|
| 362 |
+
StringVectorTemp<ValueT, PosT, Allocator>::RangeIterator const& other) const
|
| 363 |
+
{
|
| 364 |
+
return other.m_index - m_index;
|
| 365 |
+
}
|
| 366 |
+
|
| 367 |
+
// StringIterator
|
| 368 |
+
|
| 369 |
+
template<typename ValueT, typename PosT, template <typename> class Allocator>
|
| 370 |
+
StringVectorTemp<ValueT, PosT, Allocator>::StringIterator::StringIterator()
|
| 371 |
+
: m_index(0), m_container(0) { }
|
| 372 |
+
|
| 373 |
+
template<typename ValueT, typename PosT, template <typename> class Allocator>
|
| 374 |
+
StringVectorTemp<ValueT, PosT, Allocator>::StringIterator::StringIterator(
|
| 375 |
+
StringVectorTemp<ValueT, PosT, Allocator> &sv, PosT index) : m_index(index),
|
| 376 |
+
m_container(&sv) { }
|
| 377 |
+
|
| 378 |
+
template<typename ValueT, typename PosT, template <typename> class Allocator>
|
| 379 |
+
PosT StringVectorTemp<ValueT, PosT, Allocator>::StringIterator::get_index()
|
| 380 |
+
{
|
| 381 |
+
return m_index;
|
| 382 |
+
}
|
| 383 |
+
|
| 384 |
+
template<typename ValueT, typename PosT, template <typename> class Allocator>
|
| 385 |
+
const std::string StringVectorTemp<ValueT, PosT, Allocator>::StringIterator::dereference() const
|
| 386 |
+
{
|
| 387 |
+
return StringVectorTemp<ValueT, PosT, Allocator>::range(m_container->begin(m_index),
|
| 388 |
+
m_container->end(m_index)).str();
|
| 389 |
+
}
|
| 390 |
+
|
| 391 |
+
template<typename ValueT, typename PosT, template <typename> class Allocator>
|
| 392 |
+
bool StringVectorTemp<ValueT, PosT, Allocator>::StringIterator::equal(
|
| 393 |
+
StringVectorTemp<ValueT, PosT, Allocator>::StringIterator const& other) const
|
| 394 |
+
{
|
| 395 |
+
return m_index == other.m_index && m_container == other.m_container;
|
| 396 |
+
}
|
| 397 |
+
|
| 398 |
+
template<typename ValueT, typename PosT, template <typename> class Allocator>
|
| 399 |
+
void StringVectorTemp<ValueT, PosT, Allocator>::StringIterator::increment()
|
| 400 |
+
{
|
| 401 |
+
m_index++;
|
| 402 |
+
}
|
| 403 |
+
|
| 404 |
+
template<typename ValueT, typename PosT, template <typename> class Allocator>
|
| 405 |
+
void StringVectorTemp<ValueT, PosT, Allocator>::StringIterator::decrement()
|
| 406 |
+
{
|
| 407 |
+
m_index--;
|
| 408 |
+
}
|
| 409 |
+
|
| 410 |
+
template<typename ValueT, typename PosT, template <typename> class Allocator>
|
| 411 |
+
void StringVectorTemp<ValueT, PosT, Allocator>::StringIterator::advance(PosT n)
|
| 412 |
+
{
|
| 413 |
+
m_index += n;
|
| 414 |
+
}
|
| 415 |
+
|
| 416 |
+
template<typename ValueT, typename PosT, template <typename> class Allocator>
|
| 417 |
+
PosT StringVectorTemp<ValueT, PosT, Allocator>::StringIterator::distance_to(
|
| 418 |
+
StringVectorTemp<ValueT, PosT, Allocator>::StringIterator const& other) const
|
| 419 |
+
{
|
| 420 |
+
return other.m_index - m_index;
|
| 421 |
+
}
|
| 422 |
+
|
| 423 |
+
// ********** Some typedefs **********
|
| 424 |
+
|
| 425 |
+
typedef StringVectorTemp<unsigned char, unsigned int> MediumStringVectorTemp;
|
| 426 |
+
typedef StringVectorTemp<unsigned char, unsigned long> LongStringVectorTemp;
|
| 427 |
+
|
| 428 |
+
}
|
| 429 |
+
|
| 430 |
+
#endif
|
mosesdecoder/moses/TranslationModel/CompactPT/TargetPhraseCollectionCache.h
ADDED
|
@@ -0,0 +1,163 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
// $Id$
|
| 2 |
+
// vim:tabstop=2
|
| 3 |
+
/***********************************************************************
|
| 4 |
+
Moses - factored phrase-based language decoder
|
| 5 |
+
Copyright (C) 2006 University of Edinburgh
|
| 6 |
+
|
| 7 |
+
This library is free software; you can redistribute it and/or
|
| 8 |
+
modify it under the terms of the GNU Lesser General Public
|
| 9 |
+
License as published by the Free Software Foundation; either
|
| 10 |
+
version 2.1 of the License, or (at your option) any later version.
|
| 11 |
+
|
| 12 |
+
This library is distributed in the hope that it will be useful,
|
| 13 |
+
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
| 14 |
+
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
| 15 |
+
Lesser General Public License for more details.
|
| 16 |
+
|
| 17 |
+
You should have received a copy of the GNU Lesser General Public
|
| 18 |
+
License along with this library; if not, write to the Free Software
|
| 19 |
+
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
| 20 |
+
***********************************************************************/
|
| 21 |
+
|
| 22 |
+
#ifndef moses_TargetPhraseCollectionCache_h
|
| 23 |
+
#define moses_TargetPhraseCollectionCache_h
|
| 24 |
+
|
| 25 |
+
#include <map>
|
| 26 |
+
#include <set>
|
| 27 |
+
#include <vector>
|
| 28 |
+
|
| 29 |
+
#include <boost/thread/tss.hpp>
|
| 30 |
+
#include <boost/shared_ptr.hpp>
|
| 31 |
+
|
| 32 |
+
#include "moses/Phrase.h"
|
| 33 |
+
#include "moses/TargetPhraseCollection.h"
|
| 34 |
+
|
| 35 |
+
namespace Moses
|
| 36 |
+
{
|
| 37 |
+
|
| 38 |
+
// Avoid using new due to locking
|
| 39 |
+
typedef std::vector<TargetPhrase> TargetPhraseVector;
|
| 40 |
+
typedef boost::shared_ptr<TargetPhraseVector> TargetPhraseVectorPtr;
|
| 41 |
+
|
| 42 |
+
/** Implementation of Persistent Cache **/
|
| 43 |
+
class TargetPhraseCollectionCache
|
| 44 |
+
{
|
| 45 |
+
private:
|
| 46 |
+
size_t m_max;
|
| 47 |
+
float m_tolerance;
|
| 48 |
+
|
| 49 |
+
struct LastUsed {
|
| 50 |
+
clock_t m_clock;
|
| 51 |
+
TargetPhraseVectorPtr m_tpv;
|
| 52 |
+
size_t m_bitsLeft;
|
| 53 |
+
|
| 54 |
+
LastUsed() : m_clock(0), m_bitsLeft(0) {}
|
| 55 |
+
|
| 56 |
+
LastUsed(clock_t clock, TargetPhraseVectorPtr tpv, size_t bitsLeft = 0)
|
| 57 |
+
: m_clock(clock), m_tpv(tpv), m_bitsLeft(bitsLeft) {}
|
| 58 |
+
};
|
| 59 |
+
|
| 60 |
+
typedef std::map<Phrase, LastUsed> CacheMap;
|
| 61 |
+
static boost::thread_specific_ptr<CacheMap> m_phraseCache;
|
| 62 |
+
|
| 63 |
+
public:
|
| 64 |
+
|
| 65 |
+
typedef CacheMap::iterator iterator;
|
| 66 |
+
typedef CacheMap::const_iterator const_iterator;
|
| 67 |
+
|
| 68 |
+
TargetPhraseCollectionCache(size_t max = 5000, float tolerance = 0.2)
|
| 69 |
+
: m_max(max), m_tolerance(tolerance) {
|
| 70 |
+
}
|
| 71 |
+
|
| 72 |
+
iterator Begin() {
|
| 73 |
+
if(!m_phraseCache.get())
|
| 74 |
+
m_phraseCache.reset(new CacheMap());
|
| 75 |
+
return m_phraseCache->begin();
|
| 76 |
+
}
|
| 77 |
+
|
| 78 |
+
const_iterator Begin() const {
|
| 79 |
+
if(!m_phraseCache.get())
|
| 80 |
+
m_phraseCache.reset(new CacheMap());
|
| 81 |
+
return m_phraseCache->begin();
|
| 82 |
+
}
|
| 83 |
+
|
| 84 |
+
iterator End() {
|
| 85 |
+
if(!m_phraseCache.get())
|
| 86 |
+
m_phraseCache.reset(new CacheMap());
|
| 87 |
+
return m_phraseCache->end();
|
| 88 |
+
}
|
| 89 |
+
|
| 90 |
+
const_iterator End() const {
|
| 91 |
+
if(!m_phraseCache.get())
|
| 92 |
+
m_phraseCache.reset(new CacheMap());
|
| 93 |
+
return m_phraseCache->end();
|
| 94 |
+
}
|
| 95 |
+
|
| 96 |
+
/** retrieve translations for source phrase from persistent cache **/
|
| 97 |
+
void Cache(const Phrase &sourcePhrase, TargetPhraseVectorPtr tpv,
|
| 98 |
+
size_t bitsLeft = 0, size_t maxRank = 0) {
|
| 99 |
+
if(!m_phraseCache.get())
|
| 100 |
+
m_phraseCache.reset(new CacheMap());
|
| 101 |
+
// check if source phrase is already in cache
|
| 102 |
+
iterator it = m_phraseCache->find(sourcePhrase);
|
| 103 |
+
if(it != m_phraseCache->end())
|
| 104 |
+
// if found, just update clock
|
| 105 |
+
it->second.m_clock = clock();
|
| 106 |
+
else {
|
| 107 |
+
// else, add to cache
|
| 108 |
+
if(maxRank && tpv->size() > maxRank) {
|
| 109 |
+
TargetPhraseVectorPtr tpv_temp(new TargetPhraseVector());
|
| 110 |
+
tpv_temp->resize(maxRank);
|
| 111 |
+
std::copy(tpv->begin(), tpv->begin() + maxRank, tpv_temp->begin());
|
| 112 |
+
(*m_phraseCache)[sourcePhrase] = LastUsed(clock(), tpv_temp, bitsLeft);
|
| 113 |
+
} else
|
| 114 |
+
(*m_phraseCache)[sourcePhrase] = LastUsed(clock(), tpv, bitsLeft);
|
| 115 |
+
}
|
| 116 |
+
}
|
| 117 |
+
|
| 118 |
+
std::pair<TargetPhraseVectorPtr, size_t> Retrieve(const Phrase &sourcePhrase) {
|
| 119 |
+
if(!m_phraseCache.get())
|
| 120 |
+
m_phraseCache.reset(new CacheMap());
|
| 121 |
+
iterator it = m_phraseCache->find(sourcePhrase);
|
| 122 |
+
if(it != m_phraseCache->end()) {
|
| 123 |
+
LastUsed &lu = it->second;
|
| 124 |
+
lu.m_clock = clock();
|
| 125 |
+
return std::make_pair(lu.m_tpv, lu.m_bitsLeft);
|
| 126 |
+
} else
|
| 127 |
+
return std::make_pair(TargetPhraseVectorPtr(), 0);
|
| 128 |
+
}
|
| 129 |
+
|
| 130 |
+
// if cache full, reduce
|
| 131 |
+
void Prune() {
|
| 132 |
+
if(!m_phraseCache.get())
|
| 133 |
+
m_phraseCache.reset(new CacheMap());
|
| 134 |
+
if(m_phraseCache->size() > m_max * (1 + m_tolerance)) {
|
| 135 |
+
typedef std::set<std::pair<clock_t, Phrase> > Cands;
|
| 136 |
+
Cands cands;
|
| 137 |
+
for(CacheMap::iterator it = m_phraseCache->begin();
|
| 138 |
+
it != m_phraseCache->end(); it++) {
|
| 139 |
+
LastUsed &lu = it->second;
|
| 140 |
+
cands.insert(std::make_pair(lu.m_clock, it->first));
|
| 141 |
+
}
|
| 142 |
+
|
| 143 |
+
for(Cands::iterator it = cands.begin(); it != cands.end(); it++) {
|
| 144 |
+
const Phrase& p = it->second;
|
| 145 |
+
m_phraseCache->erase(p);
|
| 146 |
+
|
| 147 |
+
if(m_phraseCache->size() < (m_max * (1 - m_tolerance)))
|
| 148 |
+
break;
|
| 149 |
+
}
|
| 150 |
+
}
|
| 151 |
+
}
|
| 152 |
+
|
| 153 |
+
void CleanUp() {
|
| 154 |
+
if(!m_phraseCache.get())
|
| 155 |
+
m_phraseCache.reset(new CacheMap());
|
| 156 |
+
m_phraseCache->clear();
|
| 157 |
+
}
|
| 158 |
+
|
| 159 |
+
};
|
| 160 |
+
|
| 161 |
+
}
|
| 162 |
+
|
| 163 |
+
#endif
|
mosesdecoder/moses/TranslationModel/CompactPT/ThrowingFwrite.cpp
ADDED
|
@@ -0,0 +1,30 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
// $Id$
|
| 2 |
+
// vim:tabstop=2
|
| 3 |
+
/***********************************************************************
|
| 4 |
+
Moses - factored phrase-based language decoder
|
| 5 |
+
Copyright (C) 2006 University of Edinburgh
|
| 6 |
+
|
| 7 |
+
This library is free software; you can redistribute it and/or
|
| 8 |
+
modify it under the terms of the GNU Lesser General Public
|
| 9 |
+
License as published by the Free Software Foundation; either
|
| 10 |
+
version 2.1 of the License, or (at your option) any later version.
|
| 11 |
+
|
| 12 |
+
This library is distributed in the hope that it will be useful,
|
| 13 |
+
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
| 14 |
+
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
| 15 |
+
Lesser General Public License for more details.
|
| 16 |
+
|
| 17 |
+
You should have received a copy of the GNU Lesser General Public
|
| 18 |
+
License along with this library; if not, write to the Free Software
|
| 19 |
+
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
| 20 |
+
***********************************************************************/
|
| 21 |
+
|
| 22 |
+
#include "ThrowingFwrite.h"
|
| 23 |
+
|
| 24 |
+
size_t ThrowingFwrite(const void *ptr, size_t size, size_t count, FILE* stream)
|
| 25 |
+
{
|
| 26 |
+
assert(size);
|
| 27 |
+
size_t returnValue = std::fwrite(ptr, size, count, stream);
|
| 28 |
+
UTIL_THROW_IF2(count != returnValue, "Short fwrite; requested size " << size);
|
| 29 |
+
return returnValue;
|
| 30 |
+
}
|
mosesdecoder/moses/TranslationModel/CompactPT/ThrowingFwrite.h
ADDED
|
@@ -0,0 +1,31 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
// $Id$
|
| 2 |
+
// vim:tabstop=2
|
| 3 |
+
/***********************************************************************
|
| 4 |
+
Moses - factored phrase-based language decoder
|
| 5 |
+
Copyright (C) 2006 University of Edinburgh
|
| 6 |
+
|
| 7 |
+
This library is free software; you can redistribute it and/or
|
| 8 |
+
modify it under the terms of the GNU Lesser General Public
|
| 9 |
+
License as published by the Free Software Foundation; either
|
| 10 |
+
version 2.1 of the License, or (at your option) any later version.
|
| 11 |
+
|
| 12 |
+
This library is distributed in the hope that it will be useful,
|
| 13 |
+
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
| 14 |
+
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
| 15 |
+
Lesser General Public License for more details.
|
| 16 |
+
|
| 17 |
+
You should have received a copy of the GNU Lesser General Public
|
| 18 |
+
License along with this library; if not, write to the Free Software
|
| 19 |
+
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
| 20 |
+
***********************************************************************/
|
| 21 |
+
|
| 22 |
+
#ifndef moses_ThrowingFwrite_h
|
| 23 |
+
#define moses_ThrowingFwrite_h
|
| 24 |
+
|
| 25 |
+
#include <cassert>
|
| 26 |
+
#include <cstdio>
|
| 27 |
+
#include "util/exception.hh"
|
| 28 |
+
|
| 29 |
+
size_t ThrowingFwrite(const void *ptr, size_t size, size_t count, FILE* stream);
|
| 30 |
+
|
| 31 |
+
#endif
|
mosesdecoder/moses/TranslationModel/RuleTable/LoaderFactory.cpp
ADDED
|
@@ -0,0 +1,65 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/***********************************************************************
|
| 2 |
+
Moses - statistical machine translation system
|
| 3 |
+
Copyright (C) 2006-2011 University of Edinburgh
|
| 4 |
+
|
| 5 |
+
This library is free software; you can redistribute it and/or
|
| 6 |
+
modify it under the terms of the GNU Lesser General Public
|
| 7 |
+
License as published by the Free Software Foundation; either
|
| 8 |
+
version 2.1 of the License, or (at your option) any later version.
|
| 9 |
+
|
| 10 |
+
This library is distributed in the hope that it will be useful,
|
| 11 |
+
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
| 12 |
+
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
| 13 |
+
Lesser General Public License for more details.
|
| 14 |
+
|
| 15 |
+
You should have received a copy of the GNU Lesser General Public
|
| 16 |
+
License along with this library; if not, write to the Free Software
|
| 17 |
+
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
| 18 |
+
***********************************************************************/
|
| 19 |
+
|
| 20 |
+
#include "LoaderFactory.h"
|
| 21 |
+
|
| 22 |
+
#include "moses/Util.h"
|
| 23 |
+
#include "moses/InputFileStream.h"
|
| 24 |
+
#include "LoaderCompact.h"
|
| 25 |
+
#include "LoaderHiero.h"
|
| 26 |
+
#include "LoaderStandard.h"
|
| 27 |
+
|
| 28 |
+
#include <sstream>
|
| 29 |
+
#include <iostream>
|
| 30 |
+
|
| 31 |
+
using namespace std;
|
| 32 |
+
|
| 33 |
+
namespace Moses
|
| 34 |
+
{
|
| 35 |
+
|
| 36 |
+
// Determines the rule table type by peeking inside the file then creates
|
| 37 |
+
// a suitable RuleTableLoader object.
|
| 38 |
+
std::auto_ptr<RuleTableLoader>
|
| 39 |
+
RuleTableLoaderFactory::
|
| 40 |
+
Create(const std::string &path)
|
| 41 |
+
{
|
| 42 |
+
InputFileStream input(path);
|
| 43 |
+
std::string line;
|
| 44 |
+
|
| 45 |
+
if (std::getline(input, line)) {
|
| 46 |
+
std::vector<std::string> tokens;
|
| 47 |
+
Tokenize(tokens, line);
|
| 48 |
+
if (tokens.size() == 1) {
|
| 49 |
+
if (tokens[0] == "1") {
|
| 50 |
+
return std::auto_ptr<RuleTableLoader>(new RuleTableLoaderCompact());
|
| 51 |
+
}
|
| 52 |
+
std::cerr << "Unsupported compact rule table format: " << tokens[0];
|
| 53 |
+
return std::auto_ptr<RuleTableLoader>();
|
| 54 |
+
} else if (tokens[0] == "[X]" && tokens[1] == "|||") {
|
| 55 |
+
return std::auto_ptr<RuleTableLoader>(new RuleTableLoaderHiero());
|
| 56 |
+
}
|
| 57 |
+
|
| 58 |
+
return std::auto_ptr<RuleTableLoader>(new RuleTableLoaderStandard());
|
| 59 |
+
} else {
|
| 60 |
+
// empty phrase table
|
| 61 |
+
return std::auto_ptr<RuleTableLoader>(new RuleTableLoaderStandard());
|
| 62 |
+
}
|
| 63 |
+
}
|
| 64 |
+
|
| 65 |
+
} // namespace Moses
|
mosesdecoder/moses/TranslationModel/RuleTable/LoaderHiero.cpp
ADDED
|
@@ -0,0 +1,33 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
//
|
| 2 |
+
// RuleTableLoaderHiero.cpp
|
| 3 |
+
// moses
|
| 4 |
+
//
|
| 5 |
+
// Created by Hieu Hoang on 04/11/2011.
|
| 6 |
+
// Copyright 2011 __MyCompanyName__. All rights reserved.
|
| 7 |
+
//
|
| 8 |
+
|
| 9 |
+
#include <iostream>
|
| 10 |
+
#include "LoaderHiero.h"
|
| 11 |
+
|
| 12 |
+
using namespace std;
|
| 13 |
+
|
| 14 |
+
namespace Moses
|
| 15 |
+
{
|
| 16 |
+
|
| 17 |
+
// Loads a rule table written in Hiero format. All work is delegated to the
// format-aware RuleTableLoaderStandard::Load overload with the format fixed
// to HieroFormat; its result is returned unchanged.
bool RuleTableLoaderHiero::Load(AllOptions const& opts,
                                const std::vector<FactorType> &input,
                                const std::vector<FactorType> &output,
                                const std::string &inFile,
                                size_t tableLimit,
                                RuleTableTrie &ruleTable)
{
  return RuleTableLoaderStandard::Load(opts, HieroFormat, input, output,
                                       inFile, tableLimit, ruleTable);
}
|
| 31 |
+
|
| 32 |
+
}
|
| 33 |
+
|
mosesdecoder/moses/TranslationModel/RuleTable/LoaderStandard.cpp
ADDED
|
@@ -0,0 +1,260 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/***********************************************************************
|
| 2 |
+
Moses - statistical machine translation system
|
| 3 |
+
Copyright (C) 2006-2011 University of Edinburgh
|
| 4 |
+
|
| 5 |
+
This library is free software; you can redistribute it and/or
|
| 6 |
+
modify it under the terms of the GNU Lesser General Public
|
| 7 |
+
License as published by the Free Software Foundation; either
|
| 8 |
+
version 2.1 of the License, or (at your option) any later version.
|
| 9 |
+
|
| 10 |
+
This library is distributed in the hope that it will be useful,
|
| 11 |
+
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
| 12 |
+
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
| 13 |
+
Lesser General Public License for more details.
|
| 14 |
+
|
| 15 |
+
You should have received a copy of the GNU Lesser General Public
|
| 16 |
+
License along with this library; if not, write to the Free Software
|
| 17 |
+
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
| 18 |
+
***********************************************************************/
|
| 19 |
+
|
| 20 |
+
#include "LoaderStandard.h"
|
| 21 |
+
|
| 22 |
+
#include <fstream>
|
| 23 |
+
#include <string>
|
| 24 |
+
#include <iterator>
|
| 25 |
+
#include <algorithm>
|
| 26 |
+
#include <iostream>
|
| 27 |
+
#include <sys/stat.h>
|
| 28 |
+
#include <cstdlib>
|
| 29 |
+
#include <boost/algorithm/string/predicate.hpp>
|
| 30 |
+
#include "Trie.h"
|
| 31 |
+
#include "moses/FactorCollection.h"
|
| 32 |
+
#include "moses/Word.h"
|
| 33 |
+
#include "moses/Util.h"
|
| 34 |
+
#include "moses/InputFileStream.h"
|
| 35 |
+
#include "moses/StaticData.h"
|
| 36 |
+
#include "moses/Range.h"
|
| 37 |
+
#include "moses/ChartTranslationOptionList.h"
|
| 38 |
+
#include "moses/FactorCollection.h"
|
| 39 |
+
#include "util/file_piece.hh"
|
| 40 |
+
#include "util/string_piece.hh"
|
| 41 |
+
#include "util/tokenize_piece.hh"
|
| 42 |
+
#include "util/double-conversion/double-conversion.h"
|
| 43 |
+
#include "util/exception.hh"
|
| 44 |
+
|
| 45 |
+
using namespace std;
|
| 46 |
+
using namespace boost::algorithm;
|
| 47 |
+
|
| 48 |
+
namespace Moses
|
| 49 |
+
{
|
| 50 |
+
|
| 51 |
+
bool
|
| 52 |
+
RuleTableLoaderStandard::
|
| 53 |
+
Load(AllOptions const& opts
|
| 54 |
+
, const std::vector<FactorType> &input
|
| 55 |
+
, const std::vector<FactorType> &output
|
| 56 |
+
, const std::string &inFile
|
| 57 |
+
, size_t tableLimit
|
| 58 |
+
, RuleTableTrie &ruleTable)
|
| 59 |
+
{
|
| 60 |
+
return Load(opts, MosesFormat,input, output ,inFile ,tableLimit ,ruleTable);
|
| 61 |
+
}
|
| 62 |
+
|
| 63 |
+
void ReformatHieroRule(int sourceTarget, string &phrase, map<size_t, pair<size_t, size_t> > &ntAlign)
|
| 64 |
+
{
|
| 65 |
+
vector<string> toks;
|
| 66 |
+
Tokenize(toks, phrase, " ");
|
| 67 |
+
|
| 68 |
+
for (size_t i = 0; i < toks.size(); ++i) {
|
| 69 |
+
string &tok = toks[i];
|
| 70 |
+
if (starts_with(tok, "[") && ends_with(tok, "]")) {
|
| 71 |
+
// no-term
|
| 72 |
+
vector<string> split = Tokenize(tok, ",");
|
| 73 |
+
UTIL_THROW_IF2(split.size() != 2,
|
| 74 |
+
"Incorrectly formmatted non-terminal: " << tok);
|
| 75 |
+
|
| 76 |
+
tok = "[X]" + split[0] + "]";
|
| 77 |
+
size_t coIndex = Scan<size_t>(split[1]);
|
| 78 |
+
|
| 79 |
+
pair<size_t, size_t> &alignPoint = ntAlign[coIndex];
|
| 80 |
+
if (sourceTarget == 0) {
|
| 81 |
+
alignPoint.first = i;
|
| 82 |
+
} else {
|
| 83 |
+
alignPoint.second = i;
|
| 84 |
+
}
|
| 85 |
+
}
|
| 86 |
+
}
|
| 87 |
+
|
| 88 |
+
phrase = Join(" ", toks) + " [X]";
|
| 89 |
+
|
| 90 |
+
}
|
| 91 |
+
|
| 92 |
+
// Converts a Hiero score field in place. The input is a space-separated
// list of name=value items; each item is replaced by exp(-value) printed
// via SPrint, and the names are dropped. An item that does not split on "="
// into exactly two parts raises an exception.
void ReformateHieroScore(string &scoreString)
{
  vector<string> fields;
  Tokenize(fields, scoreString, " ");

  for (vector<string>::iterator field = fields.begin();
       field != fields.end(); ++field) {
    vector<string> nameValue = Tokenize(*field, "=");
    UTIL_THROW_IF2(nameValue.size() != 2,
                   "Incorrectly formatted score: " << *field);

    const float raw = Scan<float>(nameValue[1]);
    // keep the intermediate in a float so SPrint formats the same type
    const float converted = exp(-raw);
    *field = SPrint(converted);
  }

  scoreString = Join(" ", fields);
}
|
| 110 |
+
|
| 111 |
+
void ReformatHieroRule(const string &lineOrig, string &out)
|
| 112 |
+
{
|
| 113 |
+
vector<string> tokens;
|
| 114 |
+
vector<float> scoreVector;
|
| 115 |
+
|
| 116 |
+
TokenizeMultiCharSeparator(tokens, lineOrig, "|||" );
|
| 117 |
+
|
| 118 |
+
string &sourcePhraseString = tokens[1]
|
| 119 |
+
, &targetPhraseString = tokens[2]
|
| 120 |
+
, &scoreString = tokens[3];
|
| 121 |
+
|
| 122 |
+
map<size_t, pair<size_t, size_t> > ntAlign;
|
| 123 |
+
ReformatHieroRule(0, sourcePhraseString, ntAlign);
|
| 124 |
+
ReformatHieroRule(1, targetPhraseString, ntAlign);
|
| 125 |
+
ReformateHieroScore(scoreString);
|
| 126 |
+
|
| 127 |
+
util::StringStream align;
|
| 128 |
+
map<size_t, pair<size_t, size_t> >::const_iterator iterAlign;
|
| 129 |
+
for (iterAlign = ntAlign.begin(); iterAlign != ntAlign.end(); ++iterAlign) {
|
| 130 |
+
const pair<size_t, size_t> &alignPoint = iterAlign->second;
|
| 131 |
+
align << alignPoint.first << "-" << alignPoint.second << " ";
|
| 132 |
+
}
|
| 133 |
+
|
| 134 |
+
util::StringStream ret;
|
| 135 |
+
ret << sourcePhraseString << " ||| "
|
| 136 |
+
<< targetPhraseString << " ||| "
|
| 137 |
+
<< scoreString << " ||| "
|
| 138 |
+
<< align.str();
|
| 139 |
+
|
| 140 |
+
out = ret.str();
|
| 141 |
+
}
|
| 142 |
+
|
| 143 |
+
// Reads a text rule table from inFile and inserts every rule into ruleTable.
//
// Each line is split on "|||" into, in order: source phrase, target phrase,
// dense scores, optional alignment, one field that is skipped (counts),
// optional sparse scores and optional properties. When format is
// HieroFormat the line is first rewritten by ReformatHieroRule().
//
// A line whose source field contains only spaces/tabs is skipped with a
// trace message unless opts.unk.word_deletion_enabled is set. A score that
// converts to NaN, or a score count different from
// ruleTable.GetNumScoreComponents(), raises an exception. `count` is the
// number of rules added so far and is what the messages report.
//
// The tableLimit argument is unused here. Returns true after the whole file
// has been read and SortAndPrune() has run.
bool RuleTableLoaderStandard::Load(AllOptions const& opts, FormatType format
                                   , const std::vector<FactorType> &input
                                   , const std::vector<FactorType> &output
                                   , const std::string &inFile
                                   , size_t /* tableLimit */
                                   , RuleTableTrie &ruleTable)
{
  const char *formatName = (format == MosesFormat) ? "Moses" : "Hiero";
  PrintUserTime(string("Start loading text phrase table. ") + formatName + " format");

  // const StaticData &staticData = StaticData::Instance();

  size_t count = 0;

  // FilePiece reports progress to this stream; it stays NULL unless
  // IFVERBOSE(1) holds.
  std::ostream *progress = NULL;
  IFVERBOSE(1) progress = &std::cerr;
  util::FilePiece in(inFile.c_str(), progress);

  // buffers reused across lines
  vector<float> scoreVector;
  StringPiece line;
  std::string hiero_before, hiero_after;

  // NOTE(review): the two NAN arguments look like the values returned for
  // empty and junk input, which is what the isnan() check below relies on.
  double_conversion::StringToDoubleConverter converter(double_conversion::StringToDoubleConverter::NO_FLAGS, NAN, NAN, "inf", "nan");

  for (;;) {
    // end of input is signalled by an exception
    try {
      line = in.ReadLine();
    } catch (const util::EndOfFileException &) {
      break;
    }

    if (format == HieroFormat) { // inefficiently reformat line
      hiero_before.assign(line.data(), line.size());
      ReformatHieroRule(hiero_before, hiero_after);
      line = hiero_after;
    }

    util::TokenIter<util::MultiCharacter> pipes(line, "|||");
    StringPiece sourcePhraseString(*pipes);
    ++pipes;
    StringPiece targetPhraseString(*pipes);
    ++pipes;
    StringPiece scoreString(*pipes);

    // the alignment field is optional
    StringPiece alignString;
    if (++pipes) {
      alignString = StringPiece(*pipes);
    }

    const bool sourceIsBlank = (sourcePhraseString.find_first_not_of(" \t", 0) == string::npos);
    if (sourceIsBlank && !opts.unk.word_deletion_enabled) {
      TRACE_ERR( ruleTable.GetFilePath() << ":" << count << ": pt entry contains empty target, skipping\n");
      continue;
    }

    // dense scores: parse, transform, floor
    scoreVector.clear();
    util::TokenIter<util::AnyCharacter, true> s(scoreString, " \t");
    while (s) {
      int processed;
      float score = converter.StringToFloat(s->data(), s->length(), &processed);
      UTIL_THROW_IF2(isnan(score), "Bad score " << *s << " on line " << count);
      scoreVector.push_back(FloorScore(TransformScore(score)));
      ++s;
    }
    const size_t numScoreComponents = ruleTable.GetNumScoreComponents();
    if (scoreVector.size() != numScoreComponents) {
      UTIL_THROW2("Size of scoreVector != number (" << scoreVector.size() << "!="
                  << numScoreComponents << ") of score components on line " << count);
    }

    // constituent labels, filled in by the CreateFromString calls below
    Word *sourceLHS = NULL;
    Word *targetLHS;

    // target side is built first, then the source side
    TargetPhrase *targetPhrase = new TargetPhrase(&ruleTable);
    targetPhrase->CreateFromString(Output, output, targetPhraseString, &targetLHS);

    Phrase sourcePhrase;
    sourcePhrase.CreateFromString(Input, input, sourcePhraseString, &sourceLHS);

    // rest of target phrase
    targetPhrase->SetAlignmentInfo(alignString);
    targetPhrase->SetTargetLHS(targetLHS);

    ++pipes; // skip over counts field

    if (++pipes) {
      targetPhrase->SetSparseScore(&ruleTable, StringPiece(*pipes));
    }

    if (++pipes) {
      targetPhrase->SetProperties(StringPiece(*pipes));
    }

    targetPhrase->GetScoreBreakdown().Assign(&ruleTable, scoreVector);
    targetPhrase->EvaluateInIsolation(sourcePhrase, ruleTable.GetFeaturesToApply());

    TargetPhraseCollection::shared_ptr phraseColl
      = GetOrCreateTargetPhraseCollection(ruleTable, sourcePhrase,
                                          *targetPhrase, sourceLHS);
    phraseColl->Add(targetPhrase);

    // not implemented correctly in memory pt. just delete it for now
    delete sourceLHS;

    ++count;
  }

  // sort and prune each target phrase collection
  SortAndPrune(ruleTable);

  return true;
}
|
| 259 |
+
|
| 260 |
+
}
|
mosesdecoder/moses/TranslationModel/RuleTable/PhraseDictionaryFuzzyMatch.cpp
ADDED
|
@@ -0,0 +1,417 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
// vim:tabstop=2
|
| 2 |
+
|
| 3 |
+
/***********************************************************************
|
| 4 |
+
Moses - factored phrase-based language decoder
|
| 5 |
+
Copyright (C) 2006 University of Edinburgh
|
| 6 |
+
|
| 7 |
+
This library is free software; you can redistribute it and/or
|
| 8 |
+
modify it under the terms of the GNU Lesser General Public
|
| 9 |
+
License as published by the Free Software Foundation; either
|
| 10 |
+
version 2.1 of the License, or (at your option) any later version.
|
| 11 |
+
|
| 12 |
+
This library is distributed in the hope that it will be useful,
|
| 13 |
+
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
| 14 |
+
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
| 15 |
+
Lesser General Public License for more details.
|
| 16 |
+
|
| 17 |
+
You should have received a copy of the GNU Lesser General Public
|
| 18 |
+
License along with this library; if not, write to the Free Software
|
| 19 |
+
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
| 20 |
+
***********************************************************************/
|
| 21 |
+
|
| 22 |
+
#include <cstdio>
|
| 23 |
+
#include <cstdlib>
|
| 24 |
+
#include <cstring>
|
| 25 |
+
#include <climits>
|
| 26 |
+
#include <sys/types.h>
|
| 27 |
+
#include <unistd.h>
|
| 28 |
+
#include <dirent.h>
|
| 29 |
+
|
| 30 |
+
#include <fstream>
|
| 31 |
+
#include <string>
|
| 32 |
+
#include <iterator>
|
| 33 |
+
#include <algorithm>
|
| 34 |
+
#include "Loader.h"
|
| 35 |
+
#include "LoaderFactory.h"
|
| 36 |
+
#include "PhraseDictionaryFuzzyMatch.h"
|
| 37 |
+
#include "moses/FactorCollection.h"
|
| 38 |
+
#include "moses/Word.h"
|
| 39 |
+
#include "moses/Util.h"
|
| 40 |
+
#include "moses/InputFileStream.h"
|
| 41 |
+
#include "moses/StaticData.h"
|
| 42 |
+
#include "moses/Range.h"
|
| 43 |
+
#include "moses/TranslationModel/CYKPlusParser/ChartRuleLookupManagerMemoryPerSentence.h"
|
| 44 |
+
#include "moses/TranslationModel/fuzzy-match/FuzzyMatchWrapper.h"
|
| 45 |
+
#include "moses/TranslationModel/fuzzy-match/SentenceAlignment.h"
|
| 46 |
+
#include "moses/TranslationTask.h"
|
| 47 |
+
#include "util/file.hh"
|
| 48 |
+
#include "util/exception.hh"
|
| 49 |
+
#include "util/random.hh"
|
| 50 |
+
|
| 51 |
+
using namespace std;
|
| 52 |
+
|
| 53 |
+
#if defined __MINGW32__ && !defined mkdtemp
#include <windows.h>
#include <cerrno>
// Minimal stand-in for POSIX mkdtemp() on MinGW.
//
// tempbuf must be a writable string ending in "XXXXXX". The last path
// component of the template has its "XXXXXX" replaced by a random number,
// the system temp path is written into tempbuf followed by that component,
// and the directory is created.
//
// Returns tempbuf on success. Returns NULL with errno set when the template
// is malformed (EINVAL), its last component is too long (ENAMETOOLONG), or
// the directory could not be created (EEXIST / EACCES).
//
// NOTE(review): GetTempPath() is told that tempbuf holds MAX_PATH characters
// and strcat() then appends to it, but callers pass a buffer sized for the
// template only (InitializeForInput passes a 13-byte array). This can write
// past the end of the caller's buffer and needs a caller-side fix.
char *mkdtemp(char *tempbuf)
{
  int rand_value = 0;
  char* tempbase = NULL;
  char tempbasebuf[MAX_PATH] = "";

  // Check the length first so a short template is never indexed before its
  // first character.
  const size_t templateLen = strlen(tempbuf);
  if (templateLen < 6 || strcmp(&tempbuf[templateLen-6], "XXXXXX")) {
    errno = EINVAL;
    return NULL;
  }

  util::rand_init();
  rand_value = util::rand_excl(1e6);
  tempbase = strrchr(tempbuf, '/');
  tempbase = tempbase ? tempbase+1 : tempbuf;

  // tempbasebuf is a fixed MAX_PATH array; refuse components that would not fit.
  if (strlen(tempbase) >= MAX_PATH) {
    errno = ENAMETOOLONG;
    return NULL;
  }
  strcpy(tempbasebuf, tempbase);
  // overwrite the "XXXXXX" suffix; the number has at most six digits
  sprintf(&tempbasebuf[strlen(tempbasebuf)-6], "%d", rand_value);
  ::GetTempPath(MAX_PATH, tempbuf);
  strcat(tempbuf, tempbasebuf);

  // Report failure instead of handing back a path that does not exist.
  if (!::CreateDirectory(tempbuf, NULL)) {
    errno = (::GetLastError() == ERROR_ALREADY_EXISTS) ? EEXIST : EACCES;
    return NULL;
  }
  return tempbuf;
}
#endif
|
| 79 |
+
|
| 80 |
+
namespace Moses
|
| 81 |
+
{
|
| 82 |
+
|
| 83 |
+
// Builds the dictionary from its configuration line. m_config gets three
// slots (filled by SetParameter for the keys "source", "target" and
// "alignment"), and the fuzzy-match wrapper stays unset until Load().
PhraseDictionaryFuzzyMatch::PhraseDictionaryFuzzyMatch(const std::string &line)
  : PhraseDictionary(line, true)
  , m_config(3)
  , m_FuzzyMatchWrapper(NULL)
{
  ReadParameters();
}
|
| 90 |
+
|
| 91 |
+
// Releases the fuzzy-match wrapper allocated in Load(); deleting a NULL
// pointer (Load() never called) is a no-op.
PhraseDictionaryFuzzyMatch::~PhraseDictionaryFuzzyMatch()
{
  delete m_FuzzyMatchWrapper;
}
|
| 95 |
+
|
| 96 |
+
// Stores the decoder options, selects the features to apply and creates the
// fuzzy-match wrapper from the three configured values held in m_config
// (set through the "source", "target" and "alignment" parameters).
void PhraseDictionaryFuzzyMatch::Load(AllOptions::ptr const& opts)
{
  m_options = opts;
  SetFeaturesToApply();

  m_FuzzyMatchWrapper
    = new tmmt::FuzzyMatchWrapper(m_config[0], m_config[1], m_config[2]);
}
|
| 103 |
+
|
| 104 |
+
// Creates a per-sentence, in-memory rule lookup manager bound to this
// dictionary. The maxChartSpan argument is ignored. The manager is allocated
// with new and returned as a raw pointer.
ChartRuleLookupManager *PhraseDictionaryFuzzyMatch::CreateRuleLookupManager(
  const ChartParser &parser,
  const ChartCellCollectionBase &cellCollection,
  std::size_t /*maxChartSpan*/)
{
  ChartRuleLookupManager *manager
    = new ChartRuleLookupManagerMemoryPerSentence(parser, cellCollection, *this);
  return manager;
}
|
| 111 |
+
|
| 112 |
+
void
|
| 113 |
+
PhraseDictionaryFuzzyMatch::
|
| 114 |
+
SetParameter(const std::string& key, const std::string& value)
|
| 115 |
+
{
|
| 116 |
+
if (key == "source") {
|
| 117 |
+
m_config[0] = value;
|
| 118 |
+
} else if (key == "target") {
|
| 119 |
+
m_config[1] = value;
|
| 120 |
+
} else if (key == "alignment") {
|
| 121 |
+
m_config[2] = value;
|
| 122 |
+
} else {
|
| 123 |
+
PhraseDictionary::SetParameter(key, value);
|
| 124 |
+
}
|
| 125 |
+
}
|
| 126 |
+
|
| 127 |
+
// Deletes the directory `dirname` together with everything inside it.
//
// Every entry other than "." and ".." is removed with remove(); entries
// that readdir() reports as directories are emptied recursively first.
// Finally the directory itself is removed with rmdir(). Failures of the
// individual remove()/rmdir() calls are not reported.
//
// Returns 0 when the directory cannot be opened (after printing the reason
// with perror), 1 otherwise. On MinGW nothing is deleted and 1 is returned.
//
// WARNING: this really deletes files; pass only directories owned by the
// caller.
int removedirectoryrecursively(const char *dirname)
{
#if defined __MINGW32__
  //TODO(jie): replace this function with boost implementation
#else
  DIR *dir = opendir(dirname);
  if (dir == NULL) {
    perror("Error opendir()");
    return 0;
  }

  struct dirent *entry;
  char path[PATH_MAX];

  while ((entry = readdir(dir)) != NULL) {
    if (strcmp(entry->d_name, ".") == 0 || strcmp(entry->d_name, "..") == 0) {
      continue;
    }

    // A truncated path would name a different file than the one found, so
    // never act on it.
    const int written = snprintf(path, (size_t) PATH_MAX, "%s/%s", dirname, entry->d_name);
    if (written < 0 || written >= PATH_MAX) {
      continue;
    }

    // NOTE(review): d_type may be DT_UNKNOWN on some file systems, in which
    // case sub-directories are not descended into — confirm whether that
    // matters for the directories this is used on.
    if (entry->d_type == DT_DIR) {
      removedirectoryrecursively(path);
    }

    // Removes files; for a directory handled above this is a harmless
    // second attempt.
    remove(path);
  }
  closedir(dir);

  // the directory should be empty now, so remove the directory itself
  rmdir(dirname);
#endif
  return 1;
}
|
| 176 |
+
|
| 177 |
+
// Builds the sentence-specific rule table for the input held by ttask.
//
// Steps:
//  1. create a fresh temporary directory and write the input words (all but
//     the first and last word) to "<dir>/in";
//  2. ask the fuzzy-match wrapper to extract a phrase table for this
//     translation id into that directory;
//  3. read the extracted file line by line ("source ||| target ||| scores
//     ||| alignment", with an optional fifth field) and insert each rule
//     under the root node stored in m_collection for the translation id;
//  4. sort and prune the resulting collections.
//
// Throws if the temporary directory cannot be created, a line does not have
// four or five fields, or the number of scores differs from
// GetNumScoreComponents(). Lines with a blank source are skipped with a
// trace message unless word deletion is enabled in the task options.
// The temporary directory is left in place (removal is disabled below).
void PhraseDictionaryFuzzyMatch::InitializeForInput(ttasksptr const& ttask)
{
  InputType const& inputSentence = *ttask->GetSource();
#if defined __MINGW32__
  char dirName[] = "moses.XXXXXX";
#else
  char dirName[] = "/tmp/moses.XXXXXX";
#endif // defined
  char *temp = mkdtemp(dirName);
  UTIL_THROW_IF2(temp == NULL,
                 "Couldn't create temporary directory " << dirName);

  string dirNameStr(dirName);

  string inFileName(dirNameStr + "/in");

  ofstream inFile(inFileName.c_str());

  // Written as i + 1 < size rather than i < size - 1 so that an input of
  // size 0 does not wrap around to a huge unsigned bound.
  for (size_t i = 1; i + 1 < inputSentence.GetSize(); ++i) {
    inFile << inputSentence.GetWord(i);
  }
  inFile << endl;
  inFile.close();

  long translationId = inputSentence.GetTranslationId();
  string ptFileName = m_FuzzyMatchWrapper->Extract(translationId, dirNameStr);

  // populate with rules for this sentence
  PhraseDictionaryNodeMemory &rootNode = m_collection[translationId];

  // data from file
  InputFileStream inStream(ptFileName);

  // copied from class LoaderStandard
  PrintUserTime("Start loading fuzzy-match phrase model");

  string lineOrig;
  size_t count = 0; // rules added so far; used in messages

  while(getline(inStream, lineOrig)) {
    vector<string> tokens;
    vector<float> scoreVector;

    TokenizeMultiCharSeparator(tokens, lineOrig , "|||" );

    if (tokens.size() != 4 && tokens.size() != 5) {
      UTIL_THROW2("Syntax error at " << ptFileName << ":" << count);
    }

    const string &sourcePhraseString = tokens[0]
                                       , &targetPhraseString = tokens[1]
                                           , &scoreString = tokens[2]
                                               , &alignString = tokens[3];

    bool isLHSEmpty = (sourcePhraseString.find_first_not_of(" \t", 0) == string::npos);
    if (isLHSEmpty && !ttask->options()->unk.word_deletion_enabled) {
      TRACE_ERR( ptFileName << ":" << count << ": pt entry contains empty target, skipping\n");
      continue;
    }

    Tokenize<float>(scoreVector, scoreString);
    const size_t numScoreComponents = GetNumScoreComponents();
    if (scoreVector.size() != numScoreComponents) {
      UTIL_THROW2("Size of scoreVector != number (" << scoreVector.size() << "!="
                  << numScoreComponents << ") of score components on line " << count);
    }

    // parse source & find pt node

    // constituent labels, filled in by the CreateFromString calls below
    Word *sourceLHS = NULL;
    Word *targetLHS;

    // source
    Phrase sourcePhrase( 0);
    sourcePhrase.CreateFromString(Input, m_input, sourcePhraseString, &sourceLHS);

    // create target phrase obj
    TargetPhrase *targetPhrase = new TargetPhrase(this);
    targetPhrase->CreateFromString(Output, m_output, targetPhraseString, &targetLHS);

    // rest of target phrase
    targetPhrase->SetAlignmentInfo(alignString);
    targetPhrase->SetTargetLHS(targetLHS);
    //targetPhrase->SetDebugOutput(string("New Format pt ") + line);

    // component score, for n-best output
    std::transform(scoreVector.begin(),scoreVector.end(),scoreVector.begin(),TransformScore);
    std::transform(scoreVector.begin(),scoreVector.end(),scoreVector.begin(),FloorScore);

    targetPhrase->GetScoreBreakdown().Assign(this, scoreVector);
    targetPhrase->EvaluateInIsolation(sourcePhrase, GetFeaturesToApply());

    TargetPhraseCollection::shared_ptr phraseColl
      = GetOrCreateTargetPhraseCollection(rootNode, sourcePhrase,
                                          *targetPhrase, sourceLHS);
    phraseColl->Add(targetPhrase);

    // GetOrCreateNode() does not keep sourceLHS, so release it here, as
    // RuleTableLoaderStandard::Load does after the same sequence of calls.
    delete sourceLHS;

    count++;
  }

  // sort and prune each target phrase collection
  SortAndPrune(rootNode);

  //removedirectoryrecursively(dirName);
}
|
| 306 |
+
|
| 307 |
+
// Returns the target phrase collection stored at the trie node for `source`
// below rootNode; the node lookup/creation is done by GetOrCreateNode().
TargetPhraseCollection::shared_ptr
PhraseDictionaryFuzzyMatch::
GetOrCreateTargetPhraseCollection(PhraseDictionaryNodeMemory &rootNode
                                  , const Phrase &source
                                  , const TargetPhrase &target
                                  , const Word *sourceLHS)
{
  return GetOrCreateNode(rootNode, source, target, sourceLHS)
         .GetTargetPhraseCollection();
}
|
| 317 |
+
|
| 318 |
+
// Walks (creating nodes as needed) from rootNode along the words of
// `source` and returns the node reached after the last word.
//
// Terminals are followed by word. For a non-terminal, the next entry of the
// target's non-terminal alignment must refer to the current source position;
// its target index selects the target non-terminal used for the child
// lookup. A missing or mismatching alignment entry, or a NULL child, raises
// an exception. sourceLHS is currently unused.
//
// Note: the source and target phrases are printed to stderr on every call.
PhraseDictionaryNodeMemory &PhraseDictionaryFuzzyMatch::GetOrCreateNode(PhraseDictionaryNodeMemory &rootNode
    , const Phrase &source
    , const TargetPhrase &target
    , const Word *sourceLHS)
{
  cerr << source << endl << target << endl;

  const AlignmentInfo &alignmentInfo = target.GetAlignNonTerm();
  AlignmentInfo::const_iterator iterAlign = alignmentInfo.begin();

  PhraseDictionaryNodeMemory *currNode = &rootNode;
  const size_t numWords = source.GetSize();
  for (size_t pos = 0; pos != numWords; ++pos) {
    const Word& word = source.GetWord(pos);

    if (!word.IsNonTerminal()) {
      currNode = currNode->GetOrCreateChild(word);
    } else {
      // indexed by source label 1st
      const Word &sourceNonTerm = word;

      UTIL_THROW_IF2(iterAlign == alignmentInfo.end(),
                     "No alignment for non-term at position " << pos);
      UTIL_THROW_IF2(iterAlign->first != pos,
                     "Alignment info incorrect at position " << pos);

      // consume this alignment point and fetch the aligned target word
      const size_t targetNonTermInd = iterAlign->second;
      ++iterAlign;
      const Word &targetNonTerm = target.GetWord(targetNonTermInd);

#if defined(UNLABELLED_SOURCE)
      currNode = currNode->GetOrCreateNonTerminalChild(targetNonTerm);
#else
      currNode = currNode->GetOrCreateChild(sourceNonTerm, targetNonTerm);
#endif
    }

    UTIL_THROW_IF2(currNode == NULL,
                   "Node not found at position " << pos);
  }

  // finally, the source LHS
  //currNode = currNode->GetOrCreateChild(sourceLHS);

  return *currNode;
}
|
| 365 |
+
|
| 366 |
+
// Sorts the collections below rootNode using the table limit as argument.
// Nothing is done when the table limit is zero.
void PhraseDictionaryFuzzyMatch::SortAndPrune(PhraseDictionaryNodeMemory &rootNode)
{
  if (!GetTableLimit()) {
    return;
  }
  rootNode.Sort(GetTableLimit());
}
|
| 372 |
+
|
| 373 |
+
void PhraseDictionaryFuzzyMatch::CleanUpAfterSentenceProcessing(const InputType &source)
|
| 374 |
+
{
|
| 375 |
+
m_collection.erase(source.GetTranslationId());
|
| 376 |
+
}
|
| 377 |
+
|
| 378 |
+
/** Look up the root node stored for a translation id (read-only access).
 *  Throws via UTIL_THROW_IF2 when no node is stored under that id.
 */
const PhraseDictionaryNodeMemory &PhraseDictionaryFuzzyMatch::GetRootNode(long translationId) const
{
  typedef std::map<long, PhraseDictionaryNodeMemory> Collection;

  const Collection::const_iterator entry = m_collection.find(translationId);
  UTIL_THROW_IF2(entry == m_collection.end(),
                 "Couldn't find root node for input: " << translationId);
  return entry->second;
}
|
| 385 |
+
/** Look up the mutable root node stored for the translation id of the given
 *  input. Throws via UTIL_THROW_IF2 when no node is stored under that id.
 */
PhraseDictionaryNodeMemory &PhraseDictionaryFuzzyMatch::GetRootNode(const InputType &source)
{
  typedef std::map<long, PhraseDictionaryNodeMemory> Collection;

  const long transId = source.GetTranslationId();
  const Collection::iterator entry = m_collection.find(transId);
  UTIL_THROW_IF2(entry == m_collection.end(),
                 "Couldn't find root node for input: " << transId);
  return entry->second;
}
|
| 393 |
+
|
| 394 |
+
TO_STRING_BODY(PhraseDictionaryFuzzyMatch);
|
| 395 |
+
|
| 396 |
+
// friend
|
| 397 |
+
/** Stream operator for PhraseDictionaryFuzzyMatch. Currently writes nothing:
 *  the code that dumped the terminal / non-terminal maps is disabled below,
 *  so the stream is returned unchanged.
 */
ostream& operator<<(ostream& out, const PhraseDictionaryFuzzyMatch& phraseDict)
{
  /* Disabled dump of the node maps:
  typedef PhraseDictionaryNodeMemory::TerminalMap TermMap;
  typedef PhraseDictionaryNodeMemory::NonTerminalMap NonTermMap;

  const PhraseDictionaryNodeMemory &coll = phraseDict.m_collection;
  for (NonTermMap::const_iterator p = coll.m_nonTermMap.begin(); p != coll.m_nonTermMap.end(); ++p) {
    const Word &sourceNonTerm = p->first.first;
    out << sourceNonTerm;
  }
  for (TermMap::const_iterator p = coll.m_sourceTermMap.begin(); p != coll.m_sourceTermMap.end(); ++p) {
    const Word &sourceTerm = p->first;
    out << sourceTerm;
  }
  */

  return out;
}
|
| 416 |
+
|
| 417 |
+
}
|
mosesdecoder/moses/TranslationModel/RuleTable/PhraseDictionaryOnDisk.cpp
ADDED
|
@@ -0,0 +1,398 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
// vim:tabstop=2
|
| 2 |
+
/***********************************************************************
|
| 3 |
+
Moses - factored phrase-based language decoder
|
| 4 |
+
Copyright (C) 2010 Hieu Hoang
|
| 5 |
+
|
| 6 |
+
This library is free software; you can redistribute it and/or
|
| 7 |
+
modify it under the terms of the GNU Lesser General Public
|
| 8 |
+
License as published by the Free Software Foundation; either
|
| 9 |
+
version 2.1 of the License, or (at your option) any later version.
|
| 10 |
+
|
| 11 |
+
This library is distributed in the hope that it will be useful,
|
| 12 |
+
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
| 13 |
+
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
| 14 |
+
Lesser General Public License for more details.
|
| 15 |
+
|
| 16 |
+
You should have received a copy of the GNU Lesser General Public
|
| 17 |
+
License along with this library; if not, write to the Free Software
|
| 18 |
+
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
| 19 |
+
***********************************************************************/
|
| 20 |
+
|
| 21 |
+
#include "PhraseDictionaryOnDisk.h"
|
| 22 |
+
#include "moses/InputFileStream.h"
|
| 23 |
+
#include "moses/StaticData.h"
|
| 24 |
+
#include "moses/TargetPhraseCollection.h"
|
| 25 |
+
#include "moses/InputPath.h"
|
| 26 |
+
#include "moses/TranslationModel/CYKPlusParser/DotChartOnDisk.h"
|
| 27 |
+
#include "moses/TranslationModel/CYKPlusParser/ChartRuleLookupManagerOnDisk.h"
|
| 28 |
+
#include "moses/TranslationTask.h"
|
| 29 |
+
|
| 30 |
+
#include "OnDiskPt/OnDiskWrapper.h"
|
| 31 |
+
#include "OnDiskPt/Word.h"
|
| 32 |
+
|
| 33 |
+
#include "util/tokenize_piece.hh"
|
| 34 |
+
|
| 35 |
+
using namespace std;
|
| 36 |
+
|
| 37 |
+
|
| 38 |
+
namespace Moses
|
| 39 |
+
{
|
| 40 |
+
/** Construct from a feature-function configuration line. Both span limits
 *  start out as NOT_FOUND; ReadParameters() then processes the settings of
 *  the line (see SetParameter for the keys handled by this class).
 */
PhraseDictionaryOnDisk::PhraseDictionaryOnDisk(const std::string &line)
  : MyBase(line, true)
  , m_maxSpanDefault(NOT_FOUND)
  , m_maxSpanLabelled(NOT_FOUND)
{
  ReadParameters();
}
|
| 47 |
+
|
| 48 |
+
/** Destructor; no explicit clean-up is performed here. */
PhraseDictionaryOnDisk::~PhraseDictionaryOnDisk()
{
}
|
| 51 |
+
|
| 52 |
+
/** Store the decoder options and set up the features to apply. The on-disk
 *  table itself is not opened here (see InitializeForInput).
 */
void PhraseDictionaryOnDisk::Load(AllOptions::ptr const& opts)
{
  m_options = opts;
  SetFeaturesToApply();
}
|
| 57 |
+
|
| 58 |
+
/** Create a chart rule lookup manager backed by the current on-disk
 *  implementation. The caller receives a newly allocated object; the
 *  maxChartSpan argument is ignored.
 */
ChartRuleLookupManager *PhraseDictionaryOnDisk::CreateRuleLookupManager(
  const ChartParser &parser,
  const ChartCellCollectionBase &cellCollection,
  std::size_t /*maxChartSpan*/)
{
  // Throws if InitializeForInput() has not yet installed a wrapper.
  OnDiskPt::OnDiskWrapper &implementation = GetImplementation();

  return new ChartRuleLookupManagerOnDisk(parser, cellCollection, *this,
                                          implementation, m_input, m_output);
}
|
| 68 |
+
|
| 69 |
+
/** Return the on-disk wrapper held in m_implementation.
 *  Throws via UTIL_THROW_IF2 if none has been installed yet.
 */
OnDiskPt::OnDiskWrapper &PhraseDictionaryOnDisk::GetImplementation()
{
  OnDiskPt::OnDiskWrapper *const wrapper = m_implementation.get();
  UTIL_THROW_IF2(wrapper == NULL, "Dictionary object not yet created for this thread");
  return *wrapper;
}
|
| 76 |
+
|
| 77 |
+
/** Const overload: return the on-disk wrapper held in m_implementation.
 *  Throws via UTIL_THROW_IF2 if none has been installed yet.
 */
const OnDiskPt::OnDiskWrapper &PhraseDictionaryOnDisk::GetImplementation() const
{
  OnDiskPt::OnDiskWrapper *const wrapper = m_implementation.get();
  UTIL_THROW_IF2(wrapper == NULL, "Dictionary object not yet created for this thread");
  return *wrapper;
}
|
| 84 |
+
|
| 85 |
+
/** Prepare the dictionary for a new input: trim the cache, open the on-disk
 *  phrase table at m_filePath and check that its version, factor counts and
 *  score count agree with the configuration.
 *
 *  On success the freshly opened wrapper replaces the one previously held in
 *  m_implementation. If opening or any compatibility check throws, the new
 *  wrapper is destroyed before the exception propagates and m_implementation
 *  is left untouched.
 *
 *  The translation task argument is not used.
 */
void PhraseDictionaryOnDisk::InitializeForInput(ttasksptr const& /*ttask*/)
{
  ReduceCache();

  OnDiskPt::OnDiskWrapper *obj = new OnDiskPt::OnDiskWrapper();
  try {
    obj->BeginLoad(m_filePath);

    UTIL_THROW_IF2(obj->GetMisc("Version") != OnDiskPt::OnDiskWrapper::VERSION_NUM,
                   "On-disk phrase table is version " << obj->GetMisc("Version")
                   << ". It is not compatible with version " << OnDiskPt::OnDiskWrapper::VERSION_NUM);

    UTIL_THROW_IF2(obj->GetMisc("NumSourceFactors") != m_input.size(),
                   "On-disk phrase table has " << obj->GetMisc("NumSourceFactors") << " source factors"
                   << ". The ini file specified " << m_input.size() << " source factors");

    UTIL_THROW_IF2(obj->GetMisc("NumTargetFactors") != m_output.size(),
                   "On-disk phrase table has " << obj->GetMisc("NumTargetFactors") << " target factors"
                   << ". The ini file specified " << m_output.size() << " target factors");

    UTIL_THROW_IF2(obj->GetMisc("NumScores") != m_numScoreComponents,
                   "On-disk phrase table has " << obj->GetMisc("NumScores") << " scores"
                   << ". The ini file specified " << m_numScoreComponents << " scores");
  } catch (...) {
    // Ownership has not been handed over yet; release the wrapper ourselves.
    delete obj;
    throw;
  }

  // From here on m_implementation owns the wrapper.
  m_implementation.reset(obj);
}
|
| 111 |
+
|
| 112 |
+
void PhraseDictionaryOnDisk::GetTargetPhraseCollectionBatch(const InputPathList &inputPathQueue) const
|
| 113 |
+
{
|
| 114 |
+
InputPathList::const_iterator iter;
|
| 115 |
+
for (iter = inputPathQueue.begin(); iter != inputPathQueue.end(); ++iter) {
|
| 116 |
+
InputPath &inputPath = **iter;
|
| 117 |
+
GetTargetPhraseCollectionBatch(inputPath);
|
| 118 |
+
}
|
| 119 |
+
|
| 120 |
+
// delete nodes that's been saved
|
| 121 |
+
for (iter = inputPathQueue.begin(); iter != inputPathQueue.end(); ++iter) {
|
| 122 |
+
InputPath &inputPath = **iter;
|
| 123 |
+
const OnDiskPt::PhraseNode *ptNode = static_cast<const OnDiskPt::PhraseNode*>(inputPath.GetPtNode(*this));
|
| 124 |
+
delete ptNode;
|
| 125 |
+
}
|
| 126 |
+
|
| 127 |
+
}
|
| 128 |
+
|
| 129 |
+
/** Look up the target phrases for a single input path and record them, with
 *  the phrase-table node reached, on the path via SetTargetPhrases().
 *
 *  The lookup extends the node stored on the previous path (or the root source
 *  node for a starting sub-phrase) by the last word of the path's phrase.
 *  Nothing is recorded when the back-off condition is not satisfied or when
 *  the previous path has no node.
 */
void PhraseDictionaryOnDisk::GetTargetPhraseCollectionBatch(InputPath &inputPath) const
{
  OnDiskPt::OnDiskWrapper &wrapper = const_cast<OnDiskPt::OnDiskWrapper&>(GetImplementation());
  const Phrase &phrase = inputPath.GetPhrase();
  const InputPath *prevInputPath = inputPath.GetPrevPath();

  // Node from which this path's lookup continues.
  const OnDiskPt::PhraseNode *prevPtNode = NULL;
  if (prevInputPath == NULL) {
    // Starting subphrase.
    assert(phrase.GetSize() == 1);
    prevPtNode = &wrapper.GetRootSourceNode();
  } else {
    prevPtNode = static_cast<const OnDiskPt::PhraseNode*>(prevInputPath->GetPtNode(*this));
  }

  // backoff
  if (!SatisfyBackoff(inputPath)) {
    return;
  }

  if (prevPtNode == NULL) {
    return;
  }

  // Work on a copy of the last word: OnlyTheseFactors() modifies it.
  Word lastWord = phrase.GetWord(phrase.GetSize() - 1);
  lastWord.OnlyTheseFactors(m_inputFactors);
  OnDiskPt::Word *lastWordOnDisk = ConvertFromMoses(wrapper, m_input, lastWord);

  TargetPhraseCollection::shared_ptr tpc;
  if (lastWordOnDisk == NULL) {
    // OOV according to this phrase table. Not possible to extend
    inputPath.SetTargetPhrases(*this, tpc, NULL);
    return;
  }

  const OnDiskPt::PhraseNode *ptNode = prevPtNode->GetChild(*lastWordOnDisk, wrapper);
  if (ptNode) {
    tpc = GetTargetPhraseCollection(ptNode);
  }
  inputPath.SetTargetPhrases(*this, tpc, ptNode);

  delete lastWordOnDisk;
}
|
| 169 |
+
|
| 170 |
+
/** Return the target phrase collection for a phrase-table node, using the
 *  cache keyed by the node's file position. A cache hit refreshes the entry's
 *  timestamp; a miss loads the collection and stores it with the current
 *  clock() value.
 */
TargetPhraseCollection::shared_ptr
PhraseDictionaryOnDisk::
GetTargetPhraseCollection(const OnDiskPt::PhraseNode *ptNode) const
{
  CacheColl &cache = GetCache();
  const size_t hash = (size_t) ptNode->GetFilePos();

  const CacheColl::iterator hit = cache.find(hash);
  if (hit != cache.end()) {
    // in cache. just use it
    hit->second.second = clock();
    return hit->second.first;
  }

  // not in cache, need to look up from phrase table
  TargetPhraseCollection::shared_ptr ret = GetTargetPhraseCollectionNonCache(ptNode);
  cache[hash] = std::pair<TargetPhraseCollection::shared_ptr , clock_t>(ret, clock());
  return ret;
}
|
| 197 |
+
|
| 198 |
+
/** Load the target phrases stored at a phrase-table node from disk (limited
 *  by m_tableLimit) and convert them to Moses target phrases, bypassing the
 *  cache. Conversion is done in non-syntax mode.
 */
TargetPhraseCollection::shared_ptr
PhraseDictionaryOnDisk::
GetTargetPhraseCollectionNonCache(const OnDiskPt::PhraseNode *ptNode) const
{
  OnDiskPt::OnDiskWrapper &wrapper
    = const_cast<OnDiskPt::OnDiskWrapper&>(GetImplementation());

  const std::vector<float> weights = StaticData::Instance().GetWeights(this);
  OnDiskPt::Vocab &vocab = wrapper.GetVocab();

  const OnDiskPt::TargetPhraseCollection::shared_ptr onDiskPhrases
    = ptNode->GetTargetPhraseCollection(m_tableLimit, wrapper);

  return ConvertToMoses(onDiskPhrases, m_input, m_output, *this,
                        weights, vocab, false);
}
|
| 218 |
+
|
| 219 |
+
/** Convert a collection of on-disk target phrases to a Moses
 *  TargetPhraseCollection. Each phrase is converted individually and added to
 *  the result, which is finally sorted with the dictionary's table limit.
 */
Moses::TargetPhraseCollection::shared_ptr
PhraseDictionaryOnDisk::ConvertToMoses(
  const OnDiskPt::TargetPhraseCollection::shared_ptr targetPhrasesOnDisk
  , const std::vector<Moses::FactorType> &inputFactors
  , const std::vector<Moses::FactorType> &outputFactors
  , const Moses::PhraseDictionary &phraseDict
  , const std::vector<float> &weightT
  , OnDiskPt::Vocab &vocab
  , bool isSyntax) const
{
  Moses::TargetPhraseCollection::shared_ptr converted(new Moses::TargetPhraseCollection);

  const size_t numPhrases = targetPhrasesOnDisk->GetSize();
  for (size_t idx = 0; idx < numPhrases; ++idx) {
    const OnDiskPt::TargetPhrase &onDiskPhrase = targetPhrasesOnDisk->GetTargetPhrase(idx);
    converted->Add(ConvertToMoses(onDiskPhrase, inputFactors, outputFactors, vocab,
                                  phraseDict, weightT, isSyntax));
  }

  converted->Sort(true, phraseDict.GetTableLimit());

  return converted;
}
|
| 252 |
+
|
| 253 |
+
/** Convert one on-disk target phrase to a newly allocated Moses::TargetPhrase.
 *
 *  Copies the words, splits the alignment points into terminal and
 *  non-terminal alignments (according to the aligned target word), assigns
 *  dense scores, sparse features and properties, and evaluates the phrase in
 *  isolation against the converted source phrase.
 *
 *  @param isSyntax when true, the last on-disk word is treated as the target
 *                  LHS rather than as part of the phrase.
 *  @return the new target phrase, as a raw pointer to a heap object.
 *  @throws via UTIL_THROW_IF2 if the on-disk phrase is empty; exceptions from
 *          word conversion propagate. The partially built phrase is freed
 *          before any exception leaves this function.
 */
Moses::TargetPhrase *PhraseDictionaryOnDisk::ConvertToMoses(const OnDiskPt::TargetPhrase &targetPhraseOnDisk
    , const std::vector<Moses::FactorType> &inputFactors
    , const std::vector<Moses::FactorType> &outputFactors
    , const OnDiskPt::Vocab &vocab
    , const Moses::PhraseDictionary &phraseDict
    , const std::vector<float> &weightT
    , bool isSyntax) const
{
  // Validate before allocating so that a bad entry cannot leak the result.
  const size_t sizeOnDisk = targetPhraseOnDisk.GetSize();
  UTIL_THROW_IF2(sizeOnDisk == 0, "Target phrase cannot be empty"); // last word is lhs
  const size_t phraseSize = isSyntax ? sizeOnDisk - 1 : sizeOnDisk;

  Moses::TargetPhrase *ret = new Moses::TargetPhrase(&phraseDict);
  try {
    // words
    for (size_t pos = 0; pos < phraseSize; ++pos) {
      const OnDiskPt::Word &wordOnDisk = targetPhraseOnDisk.GetWord(pos);
      ConvertToMoses(wordOnDisk, outputFactors, vocab, ret->AddWord());
    }

    // alignments
    Moses::AlignmentInfo::CollType alignTerm, alignNonTerm;
    const OnDiskPt::PhrasePtr sp = targetPhraseOnDisk.GetSourcePhrase();
    const size_t numAlign = targetPhraseOnDisk.GetAlign().size();
    for (size_t ind = 0; ind < numAlign; ++ind) {
      const std::pair<size_t, size_t> &entry = targetPhraseOnDisk.GetAlign()[ind];
      const size_t sourcePos = entry.first;
      const size_t targetPos = entry.second;

      if (targetPhraseOnDisk.GetWord(targetPos).IsNonTerminal()) {
        alignNonTerm.insert(std::pair<size_t,size_t>(sourcePos, targetPos));
      } else {
        alignTerm.insert(std::pair<size_t,size_t>(sourcePos, targetPos));
      }
    }
    ret->SetAlignTerm(alignTerm);
    ret->SetAlignNonTerm(alignNonTerm);

    if (isSyntax) {
      // Convert into a local first: a failed conversion then cannot leak the
      // heap copy that is handed to SetTargetLHS().
      Moses::Word lhsTarget(true);
      const OnDiskPt::Word &lhsOnDisk = targetPhraseOnDisk.GetWord(sizeOnDisk - 1);
      ConvertToMoses(lhsOnDisk, outputFactors, vocab, lhsTarget);
      ret->SetTargetLHS(new Moses::Word(lhsTarget));
    }

    // set source phrase
    Moses::Phrase mosesSP(Moses::Input);
    for (size_t pos = 0; pos < sp->GetSize(); ++pos) {
      ConvertToMoses(sp->GetWord(pos), inputFactors, vocab, mosesSP.AddWord());
    }

    // scores
    ret->GetScoreBreakdown().Assign(&phraseDict, targetPhraseOnDisk.GetScores());

    // sparse features
    ret->GetScoreBreakdown().Assign(&phraseDict, targetPhraseOnDisk.GetSparseFeatures());

    // property
    ret->SetProperties(targetPhraseOnDisk.GetProperty());

    ret->EvaluateInIsolation(mosesSP, phraseDict.GetFeaturesToApply());
  } catch (...) {
    delete ret;
    throw;
  }

  return ret;
}
|
| 322 |
+
|
| 323 |
+
void PhraseDictionaryOnDisk::ConvertToMoses(
|
| 324 |
+
const OnDiskPt::Word &wordOnDisk,
|
| 325 |
+
const std::vector<Moses::FactorType> &outputFactorsVec,
|
| 326 |
+
const OnDiskPt::Vocab &vocab,
|
| 327 |
+
Moses::Word &overwrite) const
|
| 328 |
+
{
|
| 329 |
+
Moses::FactorCollection &factorColl = Moses::FactorCollection::Instance();
|
| 330 |
+
overwrite = Moses::Word(wordOnDisk.IsNonTerminal());
|
| 331 |
+
|
| 332 |
+
if (wordOnDisk.IsNonTerminal()) {
|
| 333 |
+
const std::string &tok = vocab.GetString(wordOnDisk.GetVocabId());
|
| 334 |
+
overwrite.SetFactor(0, factorColl.AddFactor(tok, wordOnDisk.IsNonTerminal()));
|
| 335 |
+
} else {
|
| 336 |
+
// TODO: this conversion should have been done at load time.
|
| 337 |
+
util::TokenIter<util::SingleCharacter> tok(vocab.GetString(wordOnDisk.GetVocabId()), '|');
|
| 338 |
+
|
| 339 |
+
for (std::vector<Moses::FactorType>::const_iterator t = outputFactorsVec.begin(); t != outputFactorsVec.end(); ++t, ++tok) {
|
| 340 |
+
UTIL_THROW_IF2(!tok, "Too few factors in \"" << vocab.GetString(wordOnDisk.GetVocabId()) << "\"; was expecting " << outputFactorsVec.size());
|
| 341 |
+
overwrite.SetFactor(*t, factorColl.AddFactor(*tok, wordOnDisk.IsNonTerminal()));
|
| 342 |
+
}
|
| 343 |
+
UTIL_THROW_IF2(tok, "Too many factors in \"" << vocab.GetString(wordOnDisk.GetVocabId()) << "\"; was expecting " << outputFactorsVec.size());
|
| 344 |
+
}
|
| 345 |
+
}
|
| 346 |
+
|
| 347 |
+
/** Convert a Moses word into a newly allocated on-disk word.
 *
 *  The factors listed in factorsVec are joined with '|' to form the vocabulary
 *  string. The first factor is mandatory; joining stops at the first later
 *  factor that the word does not have.
 *
 *  @return a heap-allocated on-disk word, or NULL when the vocabulary string
 *          is not in the phrase table's vocabulary. Nothing is allocated in
 *          the NULL case.
 *  @throws via UTIL_THROW_IF2 if factorsVec is empty or the word lacks the
 *          first factor.
 */
OnDiskPt::Word *PhraseDictionaryOnDisk::ConvertFromMoses(OnDiskPt::OnDiskWrapper &wrapper, const std::vector<Moses::FactorType> &factorsVec
    , const Moses::Word &origWord) const
{
  UTIL_THROW_IF2(factorsVec.empty(), "Expecting at least one factor type");

  util::StringStream strme;

  const size_t firstFactorType = factorsVec[0];
  const Moses::Factor *factor = origWord.GetFactor(firstFactorType);
  UTIL_THROW_IF2(factor == NULL, "Expecting factor " << firstFactorType);
  strme << factor->GetString();

  for (size_t ind = 1 ; ind < factorsVec.size() ; ++ind) {
    factor = origWord.GetFactor(factorsVec[ind]);
    if (factor == NULL) {
      // can have less factors than factorType.size()
      break;
    }
    strme << "|" << factor->GetString();
  }

  bool found = false;
  const uint64_t vocabId = wrapper.GetVocab().GetVocabId(strme.str(), found);
  if (!found) {
    // factor not in phrase table -> phrase definitely not in. exit
    return NULL;
  }

  // Allocate only once nothing can fail any more, so nothing can leak.
  OnDiskPt::Word *newWord = new OnDiskPt::Word(origWord.IsNonTerminal());
  newWord->SetVocabId(vocabId);
  return newWord;
}
|
| 384 |
+
|
| 385 |
+
void PhraseDictionaryOnDisk::SetParameter(const std::string& key, const std::string& value)
|
| 386 |
+
{
|
| 387 |
+
if (key == "max-span-default") {
|
| 388 |
+
m_maxSpanDefault = Scan<size_t>(value);
|
| 389 |
+
} else if (key == "max-span-labelled") {
|
| 390 |
+
m_maxSpanLabelled = Scan<size_t>(value);
|
| 391 |
+
} else {
|
| 392 |
+
PhraseDictionary::SetParameter(key, value);
|
| 393 |
+
}
|
| 394 |
+
}
|
| 395 |
+
|
| 396 |
+
|
| 397 |
+
} // namespace
|
| 398 |
+
|
mosesdecoder/moses/TranslationModel/RuleTable/Trie.h
ADDED
|
@@ -0,0 +1,63 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/***********************************************************************
|
| 2 |
+
Moses - statistical machine translation system
|
| 3 |
+
Copyright (C) 2006-2012 University of Edinburgh
|
| 4 |
+
|
| 5 |
+
This library is free software; you can redistribute it and/or
|
| 6 |
+
modify it under the terms of the GNU Lesser General Public
|
| 7 |
+
License as published by the Free Software Foundation; either
|
| 8 |
+
version 2.1 of the License, or (at your option) any later version.
|
| 9 |
+
|
| 10 |
+
This library is distributed in the hope that it will be useful,
|
| 11 |
+
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
| 12 |
+
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
| 13 |
+
Lesser General Public License for more details.
|
| 14 |
+
|
| 15 |
+
You should have received a copy of the GNU Lesser General Public
|
| 16 |
+
License along with this library; if not, write to the Free Software
|
| 17 |
+
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
| 18 |
+
***********************************************************************/
|
| 19 |
+
|
| 20 |
+
#pragma once
|
| 21 |
+
|
| 22 |
+
#include "moses/TranslationModel/PhraseDictionary.h"
|
| 23 |
+
#include "moses/TypeDef.h"
|
| 24 |
+
|
| 25 |
+
#include <string>
|
| 26 |
+
#include <vector>
|
| 27 |
+
|
| 28 |
+
namespace Moses
|
| 29 |
+
{
|
| 30 |
+
|
| 31 |
+
class Phrase;
|
| 32 |
+
class TargetPhrase;
|
| 33 |
+
class TargetPhraseCollection;
|
| 34 |
+
class Word;
|
| 35 |
+
|
| 36 |
+
/*** Implementation of a SCFG rule table in a trie. Looking up a rule of
|
| 37 |
+
* length n symbols requires n look-ups to find the TargetPhraseCollection.
|
| 38 |
+
* @todo why need this and PhraseDictionaryMemory?
|
| 39 |
+
*/
|
| 40 |
+
class RuleTableTrie : public PhraseDictionary
|
| 41 |
+
{
|
| 42 |
+
public:
|
| 43 |
+
RuleTableTrie(const std::string &line)
|
| 44 |
+
: PhraseDictionary(line, true) {
|
| 45 |
+
}
|
| 46 |
+
|
| 47 |
+
virtual ~RuleTableTrie();
|
| 48 |
+
|
| 49 |
+
void Load(AllOptions::ptr const& opts);
|
| 50 |
+
|
| 51 |
+
private:
|
| 52 |
+
friend class RuleTableLoader;
|
| 53 |
+
|
| 54 |
+
virtual TargetPhraseCollection::shared_ptr
|
| 55 |
+
GetOrCreateTargetPhraseCollection(const Phrase &source,
|
| 56 |
+
const TargetPhrase &target,
|
| 57 |
+
const Word *sourceLHS) = 0;
|
| 58 |
+
|
| 59 |
+
virtual void SortAndPrune() = 0;
|
| 60 |
+
|
| 61 |
+
};
|
| 62 |
+
|
| 63 |
+
} // namespace Moses
|
mosesdecoder/moses/TranslationModel/RuleTable/UTrie.cpp
ADDED
|
@@ -0,0 +1,96 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/***********************************************************************
|
| 2 |
+
Moses - statistical machine translation system
|
| 3 |
+
Copyright (C) 2006-2012 University of Edinburgh
|
| 4 |
+
|
| 5 |
+
This library is free software; you can redistribute it and/or
|
| 6 |
+
modify it under the terms of the GNU Lesser General Public
|
| 7 |
+
License as published by the Free Software Foundation; either
|
| 8 |
+
version 2.1 of the License, or (at your option) any later version.
|
| 9 |
+
|
| 10 |
+
This library is distributed in the hope that it will be useful,
|
| 11 |
+
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
| 12 |
+
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
| 13 |
+
Lesser General Public License for more details.
|
| 14 |
+
|
| 15 |
+
You should have received a copy of the GNU Lesser General Public
|
| 16 |
+
License along with this library; if not, write to the Free Software
|
| 17 |
+
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
| 18 |
+
***********************************************************************/
|
| 19 |
+
|
| 20 |
+
#include "moses/NonTerminal.h"
|
| 21 |
+
#include "moses/TranslationModel/Scope3Parser/Parser.h"
|
| 22 |
+
#include "moses/StaticData.h"
|
| 23 |
+
#include "moses/TargetPhrase.h"
|
| 24 |
+
#include "moses/TargetPhraseCollection.h"
|
| 25 |
+
#include "moses/Util.h"
|
| 26 |
+
#include "moses/Word.h"
|
| 27 |
+
#include "UTrie.h"
|
| 28 |
+
#include "Trie.h"
|
| 29 |
+
#include "UTrieNode.h"
|
| 30 |
+
|
| 31 |
+
#include <boost/functional/hash.hpp>
|
| 32 |
+
#include <boost/unordered_map.hpp>
|
| 33 |
+
#include <boost/version.hpp>
|
| 34 |
+
|
| 35 |
+
#include <map>
|
| 36 |
+
#include <vector>
|
| 37 |
+
|
| 38 |
+
namespace Moses
|
| 39 |
+
{
|
| 40 |
+
|
| 41 |
+
/** Find (or create) the trie node for the rule's source side and return the
 *  target phrase collection that node keeps for the given target phrase.
 */
TargetPhraseCollection::shared_ptr
RuleTableUTrie::
GetOrCreateTargetPhraseCollection(const Phrase &source,
                                  const TargetPhrase &target,
                                  const Word *sourceLHS)
{
  return GetOrCreateNode(source, target, sourceLHS)
         .GetOrCreateTargetPhraseCollection(target);
}
|
| 50 |
+
|
| 51 |
+
/** Walk the trie from the root along the source symbols, creating missing
 *  nodes, and return the node reached.
 *
 *  Terminals are followed through terminal children. For a source
 *  non-terminal, the aligned target word (taken from the target phrase's
 *  non-terminal alignment, consumed in iteration order) selects the
 *  non-terminal child. The source LHS argument is not used.
 */
UTrieNode &RuleTableUTrie::GetOrCreateNode(const Phrase &source,
    const TargetPhrase &target,
    const Word */*sourceLHS*/)
{
  const AlignmentInfo &alignmentInfo = target.GetAlignNonTerm();
  AlignmentInfo::const_iterator iterAlign = alignmentInfo.begin();

  UTrieNode *node = &m_root;
  const size_t numSymbols = source.GetSize();
  for (size_t pos = 0; pos < numSymbols; ++pos) {
    const Word &symbol = source.GetWord(pos);

    if (!symbol.IsNonTerminal()) {
      node = node->GetOrCreateTerminalChild(symbol);
    } else {
      // The next alignment point must belong to this source position.
      assert(iterAlign != alignmentInfo.end());
      assert(iterAlign->first == pos);
      const size_t targetNonTermInd = iterAlign->second;
      ++iterAlign;
      const Word &targetNonTerm = target.GetWord(targetNonTermInd);
      node = node->GetOrCreateNonTerminalChild(targetNonTerm);
    }

    assert(node != NULL);
  }

  return *node;
}
|
| 80 |
+
|
| 81 |
+
/** Create the rule lookup manager for this table: a newly allocated
 *  Scope3Parser bound to the given parser, cell collection and span limit.
 */
ChartRuleLookupManager *RuleTableUTrie::CreateRuleLookupManager(
  const ChartParser &parser,
  const ChartCellCollectionBase &cellCollection,
  std::size_t maxChartSpan)
{
  ChartRuleLookupManager *manager =
    new Scope3Parser(parser, cellCollection, *this, maxChartSpan);
  return manager;
}
|
| 88 |
+
|
| 89 |
+
void RuleTableUTrie::SortAndPrune()
|
| 90 |
+
{
|
| 91 |
+
if (GetTableLimit()) {
|
| 92 |
+
m_root.Sort(GetTableLimit());
|
| 93 |
+
}
|
| 94 |
+
}
|
| 95 |
+
|
| 96 |
+
} // namespace Moses
|
mosesdecoder/moses/TranslationModel/RuleTable/UTrie.h
ADDED
|
@@ -0,0 +1,73 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/***********************************************************************
|
| 2 |
+
Moses - statistical machine translation system
|
| 3 |
+
Copyright (C) 2006-2012 University of Edinburgh
|
| 4 |
+
|
| 5 |
+
This library is free software; you can redistribute it and/or
|
| 6 |
+
modify it under the terms of the GNU Lesser General Public
|
| 7 |
+
License as published by the Free Software Foundation; either
|
| 8 |
+
version 2.1 of the License, or (at your option) any later version.
|
| 9 |
+
|
| 10 |
+
This library is distributed in the hope that it will be useful,
|
| 11 |
+
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
| 12 |
+
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
| 13 |
+
Lesser General Public License for more details.
|
| 14 |
+
|
| 15 |
+
You should have received a copy of the GNU Lesser General Public
|
| 16 |
+
License along with this library; if not, write to the Free Software
|
| 17 |
+
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
| 18 |
+
***********************************************************************/
|
| 19 |
+
|
| 20 |
+
#pragma once
|
| 21 |
+
|
| 22 |
+
#include "Trie.h"
|
| 23 |
+
#include "UTrieNode.h"
|
| 24 |
+
#include "moses/TargetPhraseCollection.h"
|
| 25 |
+
|
| 26 |
+
namespace Moses
|
| 27 |
+
{
|
| 28 |
+
|
| 29 |
+
class Phrase;
|
| 30 |
+
class TargetPhrase;
|
| 31 |
+
class Word;
|
| 32 |
+
class ChartParser;
|
| 33 |
+
|
| 34 |
+
/** Implementation of RuleTableTrie. A RuleTableUTrie is designed to store
|
| 35 |
+
* string-to-tree SCFG grammars only (i.e. rules can have distinct labels on
|
| 36 |
+
* the target side, but only a generic non-terminal on the source side).
|
| 37 |
+
* A key is the source RHS (one symbol per edge) of a rule and a mapped value
|
| 38 |
+
* is the collection of grammar rules that share the same source RHS.
|
| 39 |
+
*
|
| 40 |
+
* (The 'U' in UTrie stands for 'unlabelled' -- the keys are unlabelled and
|
| 41 |
+
* the target labels are stored on the node values, as opposed to the grammar
|
| 42 |
+
* being a monolingual projection with target labels projected onto the source
|
| 43 |
+
* side.)
|
| 44 |
+
*/
|
| 45 |
+
class RuleTableUTrie : public RuleTableTrie
|
| 46 |
+
{
|
| 47 |
+
public:
|
| 48 |
+
RuleTableUTrie(const std::string &line)
|
| 49 |
+
: RuleTableTrie(line) {
|
| 50 |
+
}
|
| 51 |
+
|
| 52 |
+
const UTrieNode &GetRootNode() const {
|
| 53 |
+
return m_root;
|
| 54 |
+
}
|
| 55 |
+
|
| 56 |
+
ChartRuleLookupManager *CreateRuleLookupManager(const ChartParser &,
|
| 57 |
+
const ChartCellCollectionBase &, std::size_t);
|
| 58 |
+
|
| 59 |
+
private:
|
| 60 |
+
TargetPhraseCollection::shared_ptr
|
| 61 |
+
GetOrCreateTargetPhraseCollection(const Phrase &source,
|
| 62 |
+
const TargetPhrase &target,
|
| 63 |
+
const Word *sourceLHS);
|
| 64 |
+
|
| 65 |
+
UTrieNode &GetOrCreateNode(const Phrase &source, const TargetPhrase &target,
|
| 66 |
+
const Word *sourceLHS);
|
| 67 |
+
|
| 68 |
+
void SortAndPrune();
|
| 69 |
+
|
| 70 |
+
UTrieNode m_root;
|
| 71 |
+
};
|
| 72 |
+
|
| 73 |
+
} // namespace Moses
|
mosesdecoder/moses/TranslationModel/fuzzy-match/Alignments.h
ADDED
|
@@ -0,0 +1,20 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#pragma once
|
| 2 |
+
|
| 3 |
+
#include <string>
|
| 4 |
+
#include <vector>
|
| 5 |
+
#include <map>
|
| 6 |
+
|
| 7 |
+
/**
 * Word-alignment lookup tables built from an alignment string.
 *
 * Both tables are vectors of int->int maps. Judging by the member names and
 * the constructor's size arguments they are presumably indexed by source
 * position (S2T) and target position (T2S) -- TODO confirm against the
 * constructor definition, which is not part of this header.
 */
class Alignments
{
public:
  std::vector< std::map<int, int> > m_alignS2T;
  std::vector< std::map<int, int> > m_alignT2S;

  /**
   * @param align       textual alignment to parse
   * @param sourceSize  size of the source side
   * @param targetSize  size of the target side
   */
  Alignments(const std::string &align, size_t sourceSize, size_t targetSize);
};
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
|
mosesdecoder/moses/TranslationModel/fuzzy-match/FuzzyMatchWrapper.cpp
ADDED
|
@@ -0,0 +1,1029 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
//
|
| 2 |
+
// FuzzyMatchWrapper.cpp
|
| 3 |
+
// moses
|
| 4 |
+
//
|
| 5 |
+
// Created by Hieu Hoang on 26/07/2012.
|
| 6 |
+
// Copyright 2012 __MyCompanyName__. All rights reserved.
|
| 7 |
+
//
|
| 8 |
+
|
| 9 |
+
#include <iostream>
|
| 10 |
+
#include "FuzzyMatchWrapper.h"
|
| 11 |
+
#include "SentenceAlignment.h"
|
| 12 |
+
#include "Match.h"
|
| 13 |
+
#include "create_xml.h"
|
| 14 |
+
#include "moses/Util.h"
|
| 15 |
+
#include "moses/StaticData.h"
|
| 16 |
+
#include "util/file.hh"
|
| 17 |
+
|
| 18 |
+
using namespace std;
|
| 19 |
+
|
| 20 |
+
namespace tmmt
|
| 21 |
+
{
|
| 22 |
+
|
| 23 |
+
/**
 * Sets the matching options to their defaults and loads the translation
 * memory: a suffix array over the source corpus, then the target sentences
 * and their word alignments (both stored in targetAndAlignment).
 *
 * @param sourcePath     file handed to the SuffixArray constructor
 * @param targetPath     file read by load_target()
 * @param alignmentPath  file read by load_alignment()
 */
FuzzyMatchWrapper::FuzzyMatchWrapper(const std::string &sourcePath,
                                     const std::string &targetPath,
                                     const std::string &alignmentPath)
  : basic_flag(false),
    lsed_flag(true),
    refined_flag(true),
    length_filter_flag(true),
    parse_flag(true),
    min_match(70),
    multiple_flag(true),
    multiple_slack(0),
    multiple_max(100)
{
  // The source side is kept only inside the suffix array; ExtractTM() reads
  // it back through suffixArray->GetCorpus().
  cerr << "creating suffix array" << endl;
  suffixArray = new tmmt::SuffixArray(sourcePath);

  cerr << "loading target data" << endl;
  load_target(targetPath, targetAndAlignment);

  // Must run after load_target(): the alignments are attached to the
  // entries that load_target() created.
  cerr << "loading alignment" << endl;
  load_alignment(alignmentPath, targetAndAlignment);

  cerr << "loading completed" << endl;
}
|
| 51 |
+
|
| 52 |
+
string FuzzyMatchWrapper::Extract(long translationId, const string &dirNameStr)
|
| 53 |
+
{
|
| 54 |
+
const Moses::StaticData &staticData = Moses::StaticData::Instance();
|
| 55 |
+
|
| 56 |
+
WordIndex wordIndex;
|
| 57 |
+
|
| 58 |
+
string fuzzyMatchFile = ExtractTM(wordIndex, translationId, dirNameStr);
|
| 59 |
+
|
| 60 |
+
// create extrac files
|
| 61 |
+
create_xml(fuzzyMatchFile);
|
| 62 |
+
|
| 63 |
+
// create phrase table with usual Moses scoring and consolidate programs
|
| 64 |
+
string cmd;
|
| 65 |
+
cmd = "LC_ALL=C sort " + fuzzyMatchFile + ".extract | gzip -c > "
|
| 66 |
+
+ fuzzyMatchFile + ".extract.sorted.gz";
|
| 67 |
+
system(cmd.c_str());
|
| 68 |
+
cmd = "LC_ALL=C sort " + fuzzyMatchFile + ".extract.inv | gzip -c > "
|
| 69 |
+
+ fuzzyMatchFile + ".extract.inv.sorted.gz";
|
| 70 |
+
system(cmd.c_str());
|
| 71 |
+
|
| 72 |
+
#ifdef IS_XCODE
|
| 73 |
+
cmd = "/Users/hieuhoang/unison/workspace/github/moses-smt/bin";
|
| 74 |
+
#elif IS_ECLIPSE
|
| 75 |
+
cmd = "/home/hieu/workspace/github/moses-smt/bin";
|
| 76 |
+
#else
|
| 77 |
+
cmd = staticData.GetBinDirectory();
|
| 78 |
+
#endif
|
| 79 |
+
|
| 80 |
+
cmd += string("/../scripts/training/train-model.perl -dont-zip -first-step 6 -last-step 6 -f en -e fr -hierarchical ")
|
| 81 |
+
+ " -extract-file " + fuzzyMatchFile + ".extract -lexical-file - -score-options \"--NoLex\" "
|
| 82 |
+
+ " -phrase-translation-table " + fuzzyMatchFile + ".pt";
|
| 83 |
+
system(cmd.c_str());
|
| 84 |
+
|
| 85 |
+
|
| 86 |
+
return fuzzyMatchFile + ".pt.gz";
|
| 87 |
+
}
|
| 88 |
+
|
| 89 |
+
/**
 * Finds the translation-memory sentences closest to the single input
 * sentence stored in "<dirNameStr>/in" and writes the corresponding extract
 * data to "<dirNameStr>/fuzzyMatchFile".
 *
 * Outline:
 *  1. For every start position of the input, look up progressively longer
 *     substrings in the suffix array and record the matching ranges.
 *  2. Walk the matches longest-first, turning each suffix-array hit into a
 *     Match with lower/upper cost bounds; hits that cannot beat best_cost
 *     are dropped and best_cost is tightened as better bounds appear.
 *  3. For every TM sentence with matches, add short matches, filter by the
 *     number of matched words, then validate with either parse_matches()
 *     or a plain word-level sed(); keep the ids whose cost equals the best.
 *  4. Emit the result through create_extract(): all best candidates when
 *     multiple_flag is set, otherwise only the single best one.
 *
 * @param wordIndex      forwarded to init_short_matches()/add_short_matches()
 * @param translationId  forwarded to init_short_matches()/add_short_matches()
 * @param dirNameStr     directory holding the input file "in"; the output
 *                       file is created in the same directory
 * @return path of the file that was written
 * @throws util::Exception (via UTIL_THROW_IF2) if the output file cannot be
 *         opened, if the input file does not contain exactly one sentence,
 *         or if no match exists and the source corpus is empty
 */
string FuzzyMatchWrapper::ExtractTM(WordIndex &wordIndex, long translationId, const string &dirNameStr)
{
  const std::vector< std::vector< WORD_ID > > &source = suffixArray->GetCorpus();

  string inputPath = dirNameStr + "/in";
  string fuzzyMatchFile = dirNameStr + "/fuzzyMatchFile";
  ofstream fuzzyMatchStream(fuzzyMatchFile.c_str());
  UTIL_THROW_IF2(!fuzzyMatchStream, "Cannot open " << fuzzyMatchFile << " for writing");

  vector< vector< WORD_ID > > input;
  load_corpus(inputPath, input);

  // Checked unconditionally: an assert would vanish in release builds and
  // the indexing below would then read past the end of `input`.
  UTIL_THROW_IF2(input.size() != 1,
                 "Expected exactly one input sentence in " << inputPath
                 << ", found " << input.size());
  size_t sentenceInd = 0;

  clock_t start_clock = clock();

  // establish some basic statistics
  int input_length = input[sentenceInd].size();
  int best_cost = input_length * (100-min_match) / 100 + 1;

  int match_count = 0; // how many substring matches to be considered

  // find match ranges in suffix array
  // match_range[start][len-1] is the suffix-array range of the substring of
  // length len beginning at input position start (present only if it matched)
  vector< vector< pair< SuffixArray::INDEX, SuffixArray::INDEX > > > match_range;
  for(int start=0; start<input_length; start++) {
    SuffixArray::INDEX prior_first_match = 0;
    SuffixArray::INDEX prior_last_match = suffixArray->GetSize()-1;
    vector< string > substring;
    bool stillMatched = true;
    vector< pair< SuffixArray::INDEX, SuffixArray::INDEX > > matchedAtThisStart;
    for(size_t word=start; stillMatched && word<input[sentenceInd].size(); word++) {
      substring.push_back( GetVocabulary().GetWord( input[sentenceInd][word] ) );

      // each lookup is restricted to the range found for the shorter prefix
      SuffixArray::INDEX first_match, last_match;
      stillMatched = false;
      if (suffixArray->FindMatches( substring, first_match, last_match, prior_first_match, prior_last_match ) ) {
        stillMatched = true;
        matchedAtThisStart.push_back( make_pair( first_match, last_match ) );
        prior_first_match = first_match;
        prior_last_match = last_match;
      }
    }
    match_range.push_back( matchedAtThisStart );
  }

  clock_t clock_range = clock();

  map< int, vector< Match > > sentence_match;

  // go through all matches, longest first
  for(int length = input_length; length >= 1; length--) {
    // do not create matches, if these are handled by the short match function
    if (length <= short_match_max_length( input_length ) ) {
      continue;
    }

    for(int start = 0; start <= input_length - length; start++) {
      if (static_cast<int>(match_range[start].size()) >= length) {
        pair< SuffixArray::INDEX, SuffixArray::INDEX > &range = match_range[start][length-1];

        for(SuffixArray::INDEX i=range.first; i<=range.second; i++) {
          size_t position = suffixArray->GetPosition( i );

          // sentence length mismatch
          size_t sentence_id = suffixArray->GetSentence( position );
          int sentence_length = suffixArray->GetSentenceLength( sentence_id );
          int diff = abs( (int)sentence_length - (int)input_length );

          if (diff > best_cost)
            continue;

          // compute minimal cost
          int start_pos = suffixArray->GetWordInSentence( position );
          int end_pos = start_pos + length-1;

          // different number of prior words -> cost is at least diff
          int min_cost = abs( start - start_pos );

          // same number of words, but not sent. start -> cost is at least 1
          if (start == start_pos && start>0)
            min_cost++;

          // different number of remaining words -> cost is at least diff
          min_cost += abs( ( sentence_length-1 - end_pos ) -
                           ( input_length-1 - (start+length-1) ) );

          // same number of words, but not sent. end -> cost is at least 1
          if ( sentence_length-1 - end_pos ==
               input_length-1 - (start+length-1)
               && end_pos != sentence_length-1 )
            min_cost++;

          if (min_cost > best_cost)
            continue;

          // valid match
          match_count++;

          // compute maximal cost
          int max_cost = max( start, start_pos )
                         + max( sentence_length-1 - end_pos,
                                input_length-1 - (start+length-1) );

          Match m = Match( start, start+length-1,
                           start_pos, start_pos+length-1,
                           min_cost, max_cost, 0);
          sentence_match[ sentence_id ].push_back( m );

          if (max_cost < best_cost) {
            best_cost = max_cost;
            if (best_cost == 0) break;
          }
        }
      }
      // a zero-cost (exact) match cannot be improved on: leave all loops
      if (best_cost == 0) break;
    }

    if (best_cost == 0) break;
  }
  cerr << match_count << " matches in " << sentence_match.size() << " sentences." << endl;

  clock_t clock_matches = clock();

  // consider each sentence for which we have matches
  int old_best_cost = best_cost;
  int tm_count_word_match = 0;
  int tm_count_word_match2 = 0;
  int pruned_match_count = 0;
  if (short_match_max_length( input_length )) {
    init_short_matches(wordIndex, translationId, input[sentenceInd] );
  }
  vector< int > best_tm;
  typedef map< int, vector< Match > >::iterator I;

  clock_t clock_validation_sum = 0;

  for(I tm=sentence_match.begin(); tm!=sentence_match.end(); tm++) {
    int tmID = tm->first;
    int tm_length = suffixArray->GetSentenceLength(tmID);
    vector< Match > &match = tm->second;
    add_short_matches(wordIndex, translationId, match, source[tmID], input_length, best_cost );

    // quick look: how many words are matched
    int words_matched = 0;
    for(size_t m=0; m<match.size(); m++) {
      if (match[m].min_cost <= best_cost) // makes no difference
        words_matched += match[m].input_end - match[m].input_start + 1;
    }
    if (max(input_length,tm_length) - words_matched > best_cost) {
      if (length_filter_flag) continue;
    }
    tm_count_word_match++;

    // prune, check again how many words are matched
    vector< Match > pruned = prune_matches( match, best_cost );
    words_matched = 0;
    for(size_t p=0; p<pruned.size(); p++) {
      words_matched += pruned[p].input_end - pruned[p].input_start + 1;
    }
    if (max(input_length,tm_length) - words_matched > best_cost) {
      if (length_filter_flag) continue;
    }
    tm_count_word_match2++;

    pruned_match_count += pruned.size();
    int prior_best_cost = best_cost;
    int cost;

    clock_t clock_validation_start = clock();
    if (! parse_flag ||
        pruned.size()>=10) { // to prevent worst cases
      string path;
      cost = sed( input[sentenceInd], source[tmID], path, false );
      if (cost < best_cost) {
        best_cost = cost;
      }
    } else {
      // NOTE(review): best_cost is compared against prior_best_cost right
      // after this call, so parse_matches() presumably lowers it through a
      // reference parameter -- confirm against its declaration.
      cost = parse_matches( pruned, input_length, tm_length, best_cost );
    }
    clock_validation_sum += clock() - clock_validation_start;

    // Whichever path lowered best_cost, the candidates collected so far
    // belong to the old (higher) cost and must be discarded. Previously
    // this was only done on the parse_matches() path, so the sed() fallback
    // could leave stale entries in best_tm.
    if (prior_best_cost != best_cost) {
      best_tm.clear();
    }
    if (cost == best_cost) {
      best_tm.push_back( tmID );
    }
  }
  cerr << "reduced best cost from " << old_best_cost << " to " << best_cost << endl;
  cerr << "tm considered: " << sentence_match.size()
       << " word-matched: " << tm_count_word_match
       << " word-matched2: " << tm_count_word_match2
       << " best: " << best_tm.size() << endl;

  cerr << "pruned matches: " << ((float)pruned_match_count/(float)tm_count_word_match2) << endl;

  // create xml and extract files
  string inputStr, sourceStr;
  for (int pos = 0; pos < input_length; ++pos) {
    inputStr += GetVocabulary().GetWord(input[sentenceInd][pos]) + " ";
  }

  // do not try to find the best ... report multiple matches
  if (multiple_flag) {
    for(size_t si=0; si<best_tm.size(); si++) {
      int s = best_tm[si];
      string path;
      sed( input[sentenceInd], source[s], path, true );
      const vector<WORD_ID> &sourceSentence = source[s];
      vector<SentenceAlignment> &targets = targetAndAlignment[s];
      create_extract(sentenceInd, best_cost, sourceSentence, targets, inputStr, path, fuzzyMatchStream);
    }
  } // if (multiple_flag)
  else {

    // find the best matches according to letter sed
    string best_path = "";
    int best_match = -1;
    if (lsed_flag) {
      unsigned int best_letter_cost = compute_length( input[sentenceInd] ) * min_match / 100 + 1;
      for(size_t si=0; si<best_tm.size(); si++) {
        int s = best_tm[si];
        string path;
        unsigned int letter_cost = sed( input[sentenceInd], source[s], path, true );
        if (letter_cost < best_letter_cost) {
          best_letter_cost = letter_cost;
          best_path = path;
          best_match = s;
        }
      }
    }
    // if letter sed turned off, just compute path for first match
    else {
      if (best_tm.size() > 0) {
        string path;
        sed( input[sentenceInd], source[best_tm[0]], path, false );
        best_path = path;
        best_match = best_tm[0];
      }
    }
    cerr << "elapsed: " << (1000 * (clock()-start_clock) / CLOCKS_PER_SEC)
         << " ( range: " << (1000 * (clock_range-start_clock) / CLOCKS_PER_SEC)
         << " match: " << (1000 * (clock_matches-clock_range) / CLOCKS_PER_SEC)
         << " tm: " << (1000 * (clock()-clock_matches) / CLOCKS_PER_SEC)
         << " (validation: " << (1000 * (clock_validation_sum) / CLOCKS_PER_SEC) << ")"
         << " )" << endl;

    // nothing qualified: fall back to the first TM sentence (empty path)
    if (best_match == -1) {
      UTIL_THROW_IF2(source.size() == 0, "Empty source phrase");
      best_match = 0;
    }

    // create xml & extracts
    const vector<WORD_ID> &sourceSentence = source[best_match];
    vector<SentenceAlignment> &targets = targetAndAlignment[best_match];
    create_extract(sentenceInd, best_cost, sourceSentence, targets, inputStr, best_path, fuzzyMatchStream);

  } // else if (multiple_flag)

  fuzzyMatchStream.close();

  return fuzzyMatchFile;
}
|
| 395 |
+
|
| 396 |
+
void FuzzyMatchWrapper::load_corpus( const std::string &fileName, vector< vector< WORD_ID > > &corpus )
|
| 397 |
+
{
|
| 398 |
+
// source
|
| 399 |
+
ifstream fileStream;
|
| 400 |
+
fileStream.open(fileName.c_str());
|
| 401 |
+
if (!fileStream) {
|
| 402 |
+
cerr << "file not found: " << fileName << endl;
|
| 403 |
+
exit(1);
|
| 404 |
+
}
|
| 405 |
+
cerr << "loading " << fileName << endl;
|
| 406 |
+
|
| 407 |
+
istream *fileStreamP = &fileStream;
|
| 408 |
+
|
| 409 |
+
string line;
|
| 410 |
+
while(getline(*fileStreamP, line)) {
|
| 411 |
+
corpus.push_back( GetVocabulary().Tokenize( line.c_str() ) );
|
| 412 |
+
}
|
| 413 |
+
}
|
| 414 |
+
|
| 415 |
+
void FuzzyMatchWrapper::load_target(const std::string &fileName, vector< vector< SentenceAlignment > > &corpus)
|
| 416 |
+
{
|
| 417 |
+
ifstream fileStream;
|
| 418 |
+
fileStream.open(fileName.c_str());
|
| 419 |
+
if (!fileStream) {
|
| 420 |
+
cerr << "file not found: " << fileName << endl;
|
| 421 |
+
exit(1);
|
| 422 |
+
}
|
| 423 |
+
cerr << "loading " << fileName << endl;
|
| 424 |
+
|
| 425 |
+
istream *fileStreamP = &fileStream;
|
| 426 |
+
|
| 427 |
+
WORD_ID delimiter = GetVocabulary().StoreIfNew("|||");
|
| 428 |
+
|
| 429 |
+
int lineNum = 0;
|
| 430 |
+
string line;
|
| 431 |
+
while(getline(*fileStreamP, line)) {
|
| 432 |
+
vector<WORD_ID> toks = GetVocabulary().Tokenize( line.c_str() );
|
| 433 |
+
|
| 434 |
+
corpus.push_back(vector< SentenceAlignment >());
|
| 435 |
+
vector< SentenceAlignment > &vec = corpus.back();
|
| 436 |
+
|
| 437 |
+
vec.push_back(SentenceAlignment());
|
| 438 |
+
SentenceAlignment *sentence = &vec.back();
|
| 439 |
+
|
| 440 |
+
const WORD &countStr = GetVocabulary().GetWord(toks[0]);
|
| 441 |
+
sentence->count = atoi(countStr.c_str());
|
| 442 |
+
|
| 443 |
+
for (size_t i = 1; i < toks.size(); ++i) {
|
| 444 |
+
WORD_ID wordId = toks[i];
|
| 445 |
+
|
| 446 |
+
if (wordId == delimiter) {
|
| 447 |
+
// target and alignments can have multiple sentences.
|
| 448 |
+
vec.push_back(SentenceAlignment());
|
| 449 |
+
sentence = &vec.back();
|
| 450 |
+
|
| 451 |
+
// count
|
| 452 |
+
++i;
|
| 453 |
+
|
| 454 |
+
const WORD &countStr = GetVocabulary().GetWord(toks[i]);
|
| 455 |
+
sentence->count = atoi(countStr.c_str());
|
| 456 |
+
} else {
|
| 457 |
+
// just a normal word, add
|
| 458 |
+
sentence->target.push_back(wordId);
|
| 459 |
+
}
|
| 460 |
+
}
|
| 461 |
+
|
| 462 |
+
++lineNum;
|
| 463 |
+
|
| 464 |
+
}
|
| 465 |
+
|
| 466 |
+
}
|
| 467 |
+
|
| 468 |
+
|
| 469 |
+
void FuzzyMatchWrapper::load_alignment(const std::string &fileName, vector< vector< SentenceAlignment > > &corpus )
|
| 470 |
+
{
|
| 471 |
+
ifstream fileStream;
|
| 472 |
+
fileStream.open(fileName.c_str());
|
| 473 |
+
if (!fileStream) {
|
| 474 |
+
cerr << "file not found: " << fileName << endl;
|
| 475 |
+
exit(1);
|
| 476 |
+
}
|
| 477 |
+
cerr << "loading " << fileName << endl;
|
| 478 |
+
|
| 479 |
+
istream *fileStreamP = &fileStream;
|
| 480 |
+
|
| 481 |
+
string delimiter = "|||";
|
| 482 |
+
|
| 483 |
+
int lineNum = 0;
|
| 484 |
+
string line;
|
| 485 |
+
while(getline(*fileStreamP, line)) {
|
| 486 |
+
vector< SentenceAlignment > &vec = corpus[lineNum];
|
| 487 |
+
size_t targetInd = 0;
|
| 488 |
+
SentenceAlignment *sentence = &vec[targetInd];
|
| 489 |
+
|
| 490 |
+
vector<string> toks = Moses::Tokenize(line);
|
| 491 |
+
|
| 492 |
+
for (size_t i = 0; i < toks.size(); ++i) {
|
| 493 |
+
string &tok = toks[i];
|
| 494 |
+
|
| 495 |
+
if (tok == delimiter) {
|
| 496 |
+
// target and alignments can have multiple sentences.
|
| 497 |
+
++targetInd;
|
| 498 |
+
sentence = &vec[targetInd];
|
| 499 |
+
|
| 500 |
+
++i;
|
| 501 |
+
} else {
|
| 502 |
+
// just a normal alignment, add
|
| 503 |
+
vector<int> alignPoint = Moses::Tokenize<int>(tok, "-");
|
| 504 |
+
assert(alignPoint.size() == 2);
|
| 505 |
+
sentence->alignment.push_back(pair<int,int>(alignPoint[0], alignPoint[1]));
|
| 506 |
+
}
|
| 507 |
+
}
|
| 508 |
+
|
| 509 |
+
++lineNum;
|
| 510 |
+
|
| 511 |
+
}
|
| 512 |
+
}
|
| 513 |
+
|
| 514 |
+
bool FuzzyMatchWrapper::GetLSEDCache(const std::pair< WORD_ID, WORD_ID > &key, unsigned int &value) const
|
| 515 |
+
{
|
| 516 |
+
#ifdef WITH_THREADS
|
| 517 |
+
boost::shared_lock<boost::shared_mutex> read_lock(m_accessLock);
|
| 518 |
+
#endif
|
| 519 |
+
map< pair< WORD_ID, WORD_ID >, unsigned int >::const_iterator lookup = m_lsed.find( key );
|
| 520 |
+
if (lookup != m_lsed.end()) {
|
| 521 |
+
value = lookup->second;
|
| 522 |
+
return true;
|
| 523 |
+
}
|
| 524 |
+
|
| 525 |
+
return false;
|
| 526 |
+
}
|
| 527 |
+
|
| 528 |
+
void FuzzyMatchWrapper::SetLSEDCache(const std::pair< WORD_ID, WORD_ID > &key, const unsigned int &value)
|
| 529 |
+
{
|
| 530 |
+
#ifdef WITH_THREADS
|
| 531 |
+
boost::unique_lock<boost::shared_mutex> lock(m_accessLock);
|
| 532 |
+
#endif
|
| 533 |
+
m_lsed[ key ] = value;
|
| 534 |
+
}
|
| 535 |
+
|
| 536 |
+
/* Letter string edit distance, e.g. sub 'their' to 'there' costs 2 */
|
| 537 |
+
|
| 538 |
+
unsigned int FuzzyMatchWrapper::letter_sed( WORD_ID aIdx, WORD_ID bIdx )
|
| 539 |
+
{
|
| 540 |
+
// check if already computed -> lookup in cache
|
| 541 |
+
pair< WORD_ID, WORD_ID > pIdx = make_pair( aIdx, bIdx );
|
| 542 |
+
unsigned int value;
|
| 543 |
+
bool ret = GetLSEDCache(pIdx, value);
|
| 544 |
+
if (ret) {
|
| 545 |
+
return value;
|
| 546 |
+
}
|
| 547 |
+
|
| 548 |
+
// get surface strings for word indices
|
| 549 |
+
const string &a = GetVocabulary().GetWord( aIdx );
|
| 550 |
+
const string &b = GetVocabulary().GetWord( bIdx );
|
| 551 |
+
|
| 552 |
+
// initialize cost matrix
|
| 553 |
+
unsigned int **cost = (unsigned int**) calloc( sizeof( unsigned int* ), a.size()+1 );
|
| 554 |
+
for( unsigned int i=0; i<=a.size(); i++ ) {
|
| 555 |
+
cost[i] = (unsigned int*) calloc( sizeof(unsigned int), b.size()+1 );
|
| 556 |
+
cost[i][0] = i;
|
| 557 |
+
}
|
| 558 |
+
for( unsigned int j=0; j<=b.size(); j++ ) {
|
| 559 |
+
cost[0][j] = j;
|
| 560 |
+
}
|
| 561 |
+
|
| 562 |
+
// core string edit distance loop
|
| 563 |
+
for( unsigned int i=1; i<=a.size(); i++ ) {
|
| 564 |
+
for( unsigned int j=1; j<=b.size(); j++ ) {
|
| 565 |
+
|
| 566 |
+
unsigned int ins = cost[i-1][j] + 1;
|
| 567 |
+
unsigned int del = cost[i][j-1] + 1;
|
| 568 |
+
bool match = (a.substr(i-1,1).compare( b.substr(j-1,1) ) == 0);
|
| 569 |
+
unsigned int diag = cost[i-1][j-1] + (match ? 0 : 1);
|
| 570 |
+
|
| 571 |
+
unsigned int min = (ins < del) ? ins : del;
|
| 572 |
+
min = (diag < min) ? diag : min;
|
| 573 |
+
|
| 574 |
+
cost[i][j] = min;
|
| 575 |
+
}
|
| 576 |
+
}
|
| 577 |
+
|
| 578 |
+
// clear out memory
|
| 579 |
+
unsigned int final = cost[a.size()][b.size()];
|
| 580 |
+
for( unsigned int i=0; i<=a.size(); i++ ) {
|
| 581 |
+
free( cost[i] );
|
| 582 |
+
}
|
| 583 |
+
free( cost );
|
| 584 |
+
|
| 585 |
+
// cache and return result
|
| 586 |
+
SetLSEDCache(pIdx, final);
|
| 587 |
+
return final;
|
| 588 |
+
}
|
| 589 |
+
|
| 590 |
+
/* string edit distance implementation */
|
| 591 |
+
|
| 592 |
+
unsigned int FuzzyMatchWrapper::sed( const vector< WORD_ID > &a, const vector< WORD_ID > &b, string &best_path, bool use_letter_sed )
|
| 593 |
+
{
|
| 594 |
+
|
| 595 |
+
// initialize cost and path matrices
|
| 596 |
+
unsigned int **cost = (unsigned int**) calloc( sizeof( unsigned int* ), a.size()+1 );
|
| 597 |
+
char **path = (char**) calloc( sizeof( char* ), a.size()+1 );
|
| 598 |
+
|
| 599 |
+
for( unsigned int i=0; i<=a.size(); i++ ) {
|
| 600 |
+
cost[i] = (unsigned int*) calloc( sizeof(unsigned int), b.size()+1 );
|
| 601 |
+
path[i] = (char*) calloc( sizeof(char), b.size()+1 );
|
| 602 |
+
if (i>0) {
|
| 603 |
+
cost[i][0] = cost[i-1][0];
|
| 604 |
+
if (use_letter_sed) {
|
| 605 |
+
cost[i][0] += GetVocabulary().GetWord( a[i-1] ).size();
|
| 606 |
+
} else {
|
| 607 |
+
cost[i][0]++;
|
| 608 |
+
}
|
| 609 |
+
} else {
|
| 610 |
+
cost[i][0] = 0;
|
| 611 |
+
}
|
| 612 |
+
path[i][0] = 'I';
|
| 613 |
+
}
|
| 614 |
+
|
| 615 |
+
for( unsigned int j=0; j<=b.size(); j++ ) {
|
| 616 |
+
if (j>0) {
|
| 617 |
+
cost[0][j] = cost[0][j-1];
|
| 618 |
+
if (use_letter_sed) {
|
| 619 |
+
cost[0][j] += GetVocabulary().GetWord( b[j-1] ).size();
|
| 620 |
+
} else {
|
| 621 |
+
cost[0][j]++;
|
| 622 |
+
}
|
| 623 |
+
} else {
|
| 624 |
+
cost[0][j] = 0;
|
| 625 |
+
}
|
| 626 |
+
path[0][j] = 'D';
|
| 627 |
+
}
|
| 628 |
+
|
| 629 |
+
// core string edit distance algorithm
|
| 630 |
+
for( unsigned int i=1; i<=a.size(); i++ ) {
|
| 631 |
+
for( unsigned int j=1; j<=b.size(); j++ ) {
|
| 632 |
+
unsigned int ins = cost[i-1][j];
|
| 633 |
+
unsigned int del = cost[i][j-1];
|
| 634 |
+
unsigned int match;
|
| 635 |
+
if (use_letter_sed) {
|
| 636 |
+
ins += GetVocabulary().GetWord( a[i-1] ).size();
|
| 637 |
+
del += GetVocabulary().GetWord( b[j-1] ).size();
|
| 638 |
+
match = letter_sed( a[i-1], b[j-1] );
|
| 639 |
+
} else {
|
| 640 |
+
ins++;
|
| 641 |
+
del++;
|
| 642 |
+
match = ( a[i-1] == b[j-1] ) ? 0 : 1;
|
| 643 |
+
}
|
| 644 |
+
unsigned int diag = cost[i-1][j-1] + match;
|
| 645 |
+
|
| 646 |
+
char action = (ins < del) ? 'I' : 'D';
|
| 647 |
+
unsigned int min = (ins < del) ? ins : del;
|
| 648 |
+
if (diag < min) {
|
| 649 |
+
action = (match>0) ? 'S' : 'M';
|
| 650 |
+
min = diag;
|
| 651 |
+
}
|
| 652 |
+
|
| 653 |
+
cost[i][j] = min;
|
| 654 |
+
path[i][j] = action;
|
| 655 |
+
}
|
| 656 |
+
}
|
| 657 |
+
|
| 658 |
+
// construct string for best path
|
| 659 |
+
unsigned int i = a.size();
|
| 660 |
+
unsigned int j = b.size();
|
| 661 |
+
best_path = "";
|
| 662 |
+
while( i>0 || j>0 ) {
|
| 663 |
+
best_path = path[i][j] + best_path;
|
| 664 |
+
if (path[i][j] == 'I') {
|
| 665 |
+
i--;
|
| 666 |
+
} else if (path[i][j] == 'D') {
|
| 667 |
+
j--;
|
| 668 |
+
} else {
|
| 669 |
+
i--;
|
| 670 |
+
j--;
|
| 671 |
+
}
|
| 672 |
+
}
|
| 673 |
+
|
| 674 |
+
|
| 675 |
+
// clear out memory
|
| 676 |
+
unsigned int final = cost[a.size()][b.size()];
|
| 677 |
+
|
| 678 |
+
for( unsigned int i=0; i<=a.size(); i++ ) {
|
| 679 |
+
free( cost[i] );
|
| 680 |
+
free( path[i] );
|
| 681 |
+
}
|
| 682 |
+
free( cost );
|
| 683 |
+
free( path );
|
| 684 |
+
|
| 685 |
+
// return result
|
| 686 |
+
return final;
|
| 687 |
+
}
|
| 688 |
+
|
| 689 |
+
/* utility function: compute length of sentence in characters
|
| 690 |
+
(spaces do not count) */
|
| 691 |
+
|
| 692 |
+
unsigned int FuzzyMatchWrapper::compute_length( const vector< WORD_ID > &sentence )
|
| 693 |
+
{
|
| 694 |
+
unsigned int length = 0;
|
| 695 |
+
for( unsigned int i=0; i<sentence.size(); i++ ) {
|
| 696 |
+
length += GetVocabulary().GetWord( sentence[i] ).size();
|
| 697 |
+
}
|
| 698 |
+
return length;
|
| 699 |
+
}
|
| 700 |
+
|
| 701 |
+
/* brute force method: compare input to all corpus sentences */
|
| 702 |
+
|
| 703 |
+
/**
 * Brute force search: for every input sentence, run sed() against every
 * corpus sentence and track the lowest cost and its edit path.
 *
 * The result is only kept in local variables (the output statement is
 * commented out), so the function currently has no visible effect.
 *
 * \param source  corpus sentences (word ids)
 * \param input   input sentences (word ids)
 */
void FuzzyMatchWrapper::basic_fuzzy_match( vector< vector< WORD_ID > > source,
    vector< vector< WORD_ID > > input )
{
  for(unsigned int i=0; i<input.size(); i++) {
    // letter-based costs are switched off here
    bool use_letter_sed = false;

    // sentence length (in characters or in words) and worst allowed cost
    unsigned int input_length = use_letter_sed ? compute_length( input[i] )
                                : input[i].size();
    unsigned int best_cost = input_length * (100-min_match) / 100 + 2;
    string best_path = "";
    //int best_match = -1;

    for(unsigned int s=0; s<source.size(); s++) {
      int source_length = use_letter_sed ? compute_length( source[s] )
                          : source[s].size();

      // the length difference is a lower bound on the edit distance
      int diff = abs((int)source_length - (int)input_length);
      if (length_filter_flag && (diff >= best_cost)) {
        continue;
      }

      string path;
      unsigned int cost = sed( input[i], source[s], path, use_letter_sed );
      if (cost >= best_cost) {
        continue;
      }

      // new best
      best_cost = cost;
      best_path = path;
      //best_match = s;
    }
    //cout << best_cost << " ||| " << best_match << " ||| " << best_path << endl;
  }
}
|
| 748 |
+
|
| 749 |
+
/* definition of short matches
|
| 750 |
+
very short n-gram matches (1-grams) will not be looked up in
|
| 751 |
+
the suffix array, since there are too many matches
|
| 752 |
+
and for longer sentences, at least one 2-gram match must occur */
|
| 753 |
+
|
| 754 |
+
int FuzzyMatchWrapper::short_match_max_length( int input_length )
|
| 755 |
+
{
|
| 756 |
+
if ( ! refined_flag )
|
| 757 |
+
return 0;
|
| 758 |
+
if ( input_length >= 5 )
|
| 759 |
+
return 1;
|
| 760 |
+
return 0;
|
| 761 |
+
}
|
| 762 |
+
|
| 763 |
+
|
| 764 |
+
/* if we have non-short matches in a sentence, we need to
|
| 765 |
+
take a closer look at it.
|
| 766 |
+
this function creates a hash map for all input words and their positions
|
| 767 |
+
(to be used by the next function)
|
| 768 |
+
(done here, because this has to be done only once for an input sentence) */
|
| 769 |
+
|
| 770 |
+
/**
 * Build the word -> input positions index used by add_short_matches().
 *
 * Nothing is done (wordIndex is left untouched) when short matches are
 * disabled for this input length, see short_match_max_length().
 *
 * \param wordIndex      output: cleared and refilled with the positions of
 *                       every input word, in increasing order
 * \param translationId  not used by this function
 * \param input          input sentence (word ids)
 */
void FuzzyMatchWrapper::init_short_matches(WordIndex &wordIndex, long translationId, const vector< WORD_ID > &input )
{
  if (short_match_max_length( input.size() ) == 0)
    return;

  wordIndex.clear();

  for(size_t pos=0; pos<input.size(); pos++) {
    // operator[] creates an empty position list the first time a word is seen
    wordIndex[ input[pos] ].push_back( pos );
  }
}
|
| 787 |
+
|
| 788 |
+
/* add all short matches to list of matches for a sentence */
|
| 789 |
+
|
| 790 |
+
/**
 * Append a one-word Match for every position pair where a word of the
 * translation memory sentence tm also occurs in the input.
 *
 * \param wordIndex      word -> input positions, see init_short_matches()
 * \param translationId  not used by this function
 * \param match          output: matches with min_cost <= best_cost are appended
 * \param tm             translation memory sentence (word ids)
 * \param input_length   number of words in the input sentence
 * \param best_cost      matches whose lower bound exceeds this are dropped
 */
void FuzzyMatchWrapper::add_short_matches(WordIndex &wordIndex, long translationId, vector< Match > &match, const vector< WORD_ID > &tm, int input_length, int best_cost )
{
  if (short_match_max_length( input_length ) == 0)
    return;

  const int tm_length = tm.size();
  for(int t_pos=0; t_pos<tm_length; t_pos++) {
    WordIndex::iterator hit = wordIndex.find( tm[t_pos] );
    if (hit == wordIndex.end())
      continue;

    const vector< int > &input_positions = hit->second;
    for(size_t k=0; k<input_positions.size(); k++) {
      const int i_pos = input_positions[k];

      // NOTE(review): these remainders include the matched word itself,
      // whereas parse_matches() uses length-1 - end; confirm this is intended
      const int input_rest = input_length - i_pos;
      const int tm_rest = tm_length - t_pos;

      // words before the match
      int max_cost = max( i_pos , t_pos );
      int min_cost = abs( i_pos - t_pos );
      if ( i_pos>0 && i_pos == t_pos )
        min_cost++;

      // words after the match
      max_cost += max( input_rest , tm_rest );
      min_cost += abs( input_rest - tm_rest );
      if ( i_pos != input_length-1 && input_rest == tm_rest )
        min_cost++;

      if (min_cost <= best_cost) {
        match.push_back( Match( i_pos,i_pos, t_pos,t_pos, min_cost,max_cost,0 ) );
      }
    }
  }
}
|
| 825 |
+
|
| 826 |
+
/* remove matches that are subsumed by a larger match */
|
| 827 |
+
|
| 828 |
+
/**
 * Remove matches that are subsumed by another match or are too expensive.
 *
 * A match i is subsumed when a different match j spans at least as many
 * input words and shares either the start point (same input_start and
 * tm_start) or the end point (same input_end and tm_end) with it.
 *
 * \param match      candidate matches
 * \param best_cost  matches with min_cost above this value are dropped
 * \return surviving matches, in reverse order of the input vector
 */
vector< Match > FuzzyMatchWrapper::prune_matches( const vector< Match > &match, int best_cost )
{
  vector< Match > pruned;
  for(int i=match.size()-1; i>=0; i--) {
    const Match &candidate = match[i];

    // cheap test first: a too expensive match is dropped whether or not it
    // is subsumed, so the scan below can be skipped for it
    if (candidate.min_cost > best_cost)
      continue;

    const int candidate_span = candidate.input_end - candidate.input_start;

    bool subsumed = false;
    for(int j=match.size()-1; j>=0; j--) {
      if (i == j) // do not compare match with itself
        continue;

      const Match &other = match[j];
      if (candidate_span > other.input_end - other.input_start) // other is shorter
        continue;

      const bool same_start = candidate.input_start == other.input_start
                              && candidate.tm_start == other.tm_start;
      const bool same_end = candidate.input_end == other.input_end
                            && candidate.tm_end == other.tm_end;
      if (same_start || same_end) {
        subsumed = true;
        break; // one subsuming match is enough
      }
    }

    if (! subsumed) {
      pruned.push_back( candidate );
    }
  }
  return pruned;
}
|
| 860 |
+
|
| 861 |
+
/* A* parsing method to compute string edit distance */
|
| 862 |
+
|
| 863 |
+
/**
 * Combine matches bottom-up into larger spans and return the lowest
 * max_cost found for this sentence pair.
 *
 * Level 0 holds the given matches; level n holds combinations built from
 * one match of level k and one of level n-k-1. Levels are built until one
 * of them stays empty.
 *
 * \param match         matches between input and translation memory sentence
 * \param input_length  number of words in the input sentence
 * \param tm_length     number of words in the translation memory sentence
 * \param best_cost     in/out: combinations whose min_cost exceeds it are
 *                      discarded; lowered when a combination's max_cost
 *                      beats both the sentence-local best and this value
 * \return lowest max_cost over all given and combined matches;
 *         input_length+tm_length when there is no match at all
 */
int FuzzyMatchWrapper::parse_matches( vector< Match > &match, int input_length, int tm_length, int &best_cost )
{
  // trivial cases: nothing to combine
  if (match.size() == 1)
    return match[0].max_cost;
  if (match.size() == 0)
    return input_length+tm_length;

  // best cost among the uncombined matches
  int this_best_cost = input_length + tm_length;
  for(size_t m=0; m<match.size(); m++) {
    this_best_cost = min( this_best_cost, match[m].max_cost );
  }

  // bottom up combination of spans
  vector< vector< Match > > multi_match;
  multi_match.push_back( match );

  for(int match_level = 1; multi_match[ match_level-1 ].size()>0; match_level++) {
    // new, initially empty level
    multi_match.push_back( vector< Match >() );

    for(int first_level = 0; first_level <= (match_level-1)/2; first_level++) {
      const int second_level = match_level - first_level -1;

      // both levels are below match_level, so appending to the new level
      // does not invalidate these references
      vector< Match > &first_match = multi_match[ first_level ];
      vector< Match > &second_match = multi_match[ second_level ];

      for(size_t i1 = 0; i1 < first_match.size(); i1++) {
        for(size_t i2 = 0; i2 < second_match.size(); i2++) {

          // do not combine the same pair twice
          if (first_level == second_level && i2 <= i1) {
            continue;
          }

          // order the pair by input position (first is before second)
          const bool in_order = first_match[i1].input_start < second_match[i2].input_start;
          const Match &first = in_order ? first_match[i1] : second_match[i2];
          const Match &second = in_order ? second_match[i2] : first_match[i1];

          // the spans must not overlap in the input ...
          if (first.input_end >= second.input_start) {
            continue;
          }
          // ... and must appear in the same order, without overlap, in the tm
          if (first.tm_end >= second.tm_start) {
            continue;
          }

          // cost of the words before the first span
          int min_cost = abs( first.input_start - first.tm_start );
          int max_cost = max( first.input_start, first.tm_start );

          // same number of words, but not sent. start -> cost is at least 1
          if (first.input_start == first.tm_start && first.input_start > 0) {
            min_cost++;
          }

          // cost between the two spans plus the cost inside them
          const int skipped_words = second.input_start - first.input_end -1;
          const int skipped_words_tm = second.tm_start - first.tm_end -1;
          const int internal_cost = max( skipped_words, skipped_words_tm )
                                    + first.internal_cost + second.internal_cost;
          min_cost += internal_cost;
          max_cost += internal_cost;

          // cost of the words after the second span
          const int tm_tail = tm_length-1 - second.tm_end;
          const int input_tail = input_length-1 - second.input_end;
          min_cost += abs( tm_tail - input_tail );
          max_cost += max( tm_tail, input_tail );

          // same number of words, but not sent. end -> cost is at least 1
          if ( input_tail == tm_tail && input_tail != 0 ) {
            min_cost++;
          }

          // if worse than best cost, forget it
          if (min_cost > best_cost) {
            continue;
          }

          // store the combined span
          multi_match[ match_level ].push_back(
            Match( first.input_start, second.input_end,
                   first.tm_start, second.tm_end,
                   min_cost, max_cost, internal_cost ) );

          // possibly update the sentence-local and the overall best cost
          if (max_cost < this_best_cost) {
            this_best_cost = max_cost;
            if (max_cost < best_cost) {
              best_cost = max_cost;
            }
          }
        }
      }
    }
  }
  return this_best_cost;
}
|
| 1001 |
+
|
| 1002 |
+
|
| 1003 |
+
void FuzzyMatchWrapper::create_extract(int sentenceInd, int cost, const vector< WORD_ID > &sourceSentence, const vector<SentenceAlignment> &targets, const string &inputStr, const string &path, ofstream &outputFile)
|
| 1004 |
+
{
|
| 1005 |
+
string sourceStr;
|
| 1006 |
+
for (size_t pos = 0; pos < sourceSentence.size(); ++pos) {
|
| 1007 |
+
WORD_ID wordId = sourceSentence[pos];
|
| 1008 |
+
sourceStr += GetVocabulary().GetWord(wordId) + " ";
|
| 1009 |
+
}
|
| 1010 |
+
|
| 1011 |
+
for (size_t targetInd = 0; targetInd < targets.size(); ++targetInd) {
|
| 1012 |
+
const SentenceAlignment &sentenceAlignment = targets[targetInd];
|
| 1013 |
+
string targetStr = sentenceAlignment.getTargetString(GetVocabulary());
|
| 1014 |
+
string alignStr = sentenceAlignment.getAlignmentString();
|
| 1015 |
+
|
| 1016 |
+
outputFile
|
| 1017 |
+
<< sentenceInd << endl
|
| 1018 |
+
<< cost << endl
|
| 1019 |
+
<< sourceStr << endl
|
| 1020 |
+
<< inputStr << endl
|
| 1021 |
+
<< targetStr << endl
|
| 1022 |
+
<< alignStr << endl
|
| 1023 |
+
<< path << endl
|
| 1024 |
+
<< sentenceAlignment.count << endl;
|
| 1025 |
+
|
| 1026 |
+
}
|
| 1027 |
+
}
|
| 1028 |
+
|
| 1029 |
+
} // namespace
|
mosesdecoder/moses/TranslationModel/fuzzy-match/FuzzyMatchWrapper.h
ADDED
|
@@ -0,0 +1,91 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
//
|
| 2 |
+
// FuzzyMatchWrapper.h
|
| 3 |
+
// moses
|
| 4 |
+
//
|
| 5 |
+
// Created by Hieu Hoang on 26/07/2012.
|
| 6 |
+
// Copyright 2012 __MyCompanyName__. All rights reserved.
|
| 7 |
+
//
|
| 8 |
+
|
| 9 |
+
#ifndef moses_FuzzyMatchWrapper_h
|
| 10 |
+
#define moses_FuzzyMatchWrapper_h
|
| 11 |
+
|
| 12 |
+
#ifdef WITH_THREADS
|
| 13 |
+
#include <boost/thread/shared_mutex.hpp>
|
| 14 |
+
#endif
|
| 15 |
+
|
| 16 |
+
#include <fstream>
|
| 17 |
+
#include <string>
|
| 18 |
+
#include "SuffixArray.h"
|
| 19 |
+
#include "Vocabulary.h"
|
| 20 |
+
#include "Match.h"
|
| 21 |
+
#include "moses/InputType.h"
|
| 22 |
+
|
| 23 |
+
namespace tmmt
|
| 24 |
+
{
|
| 25 |
+
class Match;
|
| 26 |
+
struct SentenceAlignment;
|
| 27 |
+
|
| 28 |
+
class FuzzyMatchWrapper
|
| 29 |
+
{
|
| 30 |
+
public:
|
| 31 |
+
FuzzyMatchWrapper(const std::string &source, const std::string &target, const std::string &alignment);
|
| 32 |
+
|
| 33 |
+
std::string Extract(long translationId, const std::string &dirNameStr);
|
| 34 |
+
|
| 35 |
+
protected:
|
| 36 |
+
// tm-mt
|
| 37 |
+
std::vector< std::vector< tmmt::SentenceAlignment > > targetAndAlignment;
|
| 38 |
+
tmmt::SuffixArray *suffixArray;
|
| 39 |
+
int basic_flag;
|
| 40 |
+
int lsed_flag;
|
| 41 |
+
int refined_flag;
|
| 42 |
+
int length_filter_flag;
|
| 43 |
+
int parse_flag;
|
| 44 |
+
int min_match;
|
| 45 |
+
int multiple_flag;
|
| 46 |
+
int multiple_slack;
|
| 47 |
+
int multiple_max;
|
| 48 |
+
|
| 49 |
+
typedef std::map< WORD_ID,std::vector< int > > WordIndex;
|
| 50 |
+
|
| 51 |
+
// global cache for word pairs
|
| 52 |
+
std::map< std::pair< WORD_ID, WORD_ID >, unsigned int > m_lsed;
|
| 53 |
+
#ifdef WITH_THREADS
|
| 54 |
+
//reader-writer lock
|
| 55 |
+
mutable boost::shared_mutex m_accessLock;
|
| 56 |
+
#endif
|
| 57 |
+
|
| 58 |
+
void load_corpus( const std::string &fileName, std::vector< std::vector< tmmt::WORD_ID > > &corpus );
|
| 59 |
+
void load_target( const std::string &fileName, std::vector< std::vector< tmmt::SentenceAlignment > > &corpus);
|
| 60 |
+
void load_alignment( const std::string &fileName, std::vector< std::vector< tmmt::SentenceAlignment > > &corpus );
|
| 61 |
+
|
| 62 |
+
/** brute force method: compare input to all corpus sentences */
|
| 63 |
+
void basic_fuzzy_match( std::vector< std::vector< tmmt::WORD_ID > > source,
|
| 64 |
+
std::vector< std::vector< tmmt::WORD_ID > > input ) ;
|
| 65 |
+
|
| 66 |
+
/** utlility function: compute length of sentence in characters
|
| 67 |
+
(spaces do not count) */
|
| 68 |
+
unsigned int compute_length( const std::vector< tmmt::WORD_ID > &sentence );
|
| 69 |
+
unsigned int letter_sed( WORD_ID aIdx, WORD_ID bIdx );
|
| 70 |
+
unsigned int sed( const std::vector< WORD_ID > &a, const std::vector< WORD_ID > &b, std::string &best_path, bool use_letter_sed );
|
| 71 |
+
void init_short_matches(WordIndex &wordIndex, long translationId, const std::vector< WORD_ID > &input );
|
| 72 |
+
int short_match_max_length( int input_length );
|
| 73 |
+
void add_short_matches(WordIndex &wordIndex, long translationId, std::vector< Match > &match, const std::vector< WORD_ID > &tm, int input_length, int best_cost );
|
| 74 |
+
std::vector< Match > prune_matches( const std::vector< Match > &match, int best_cost );
|
| 75 |
+
int parse_matches( std::vector< Match > &match, int input_length, int tm_length, int &best_cost );
|
| 76 |
+
|
| 77 |
+
void create_extract(int sentenceInd, int cost, const std::vector< WORD_ID > &sourceSentence, const std::vector<SentenceAlignment> &targets, const std::string &inputStr, const std::string &path, std::ofstream &outputFile);
|
| 78 |
+
|
| 79 |
+
std::string ExtractTM(WordIndex &wordIndex, long translationId, const std::string &inputPath);
|
| 80 |
+
Vocabulary &GetVocabulary() {
|
| 81 |
+
return suffixArray->GetVocabulary();
|
| 82 |
+
}
|
| 83 |
+
|
| 84 |
+
bool GetLSEDCache(const std::pair< WORD_ID, WORD_ID > &key, unsigned int &value) const;
|
| 85 |
+
void SetLSEDCache(const std::pair< WORD_ID, WORD_ID > &key, const unsigned int &value);
|
| 86 |
+
|
| 87 |
+
};
|
| 88 |
+
|
| 89 |
+
}
|
| 90 |
+
|
| 91 |
+
#endif
|
mosesdecoder/moses/TranslationModel/fuzzy-match/Match.h
ADDED
|
@@ -0,0 +1,34 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
//
|
| 2 |
+
// Match.h
|
| 3 |
+
// fuzzy-match
|
| 4 |
+
//
|
| 5 |
+
// Created by Hieu Hoang on 25/07/2012.
|
| 6 |
+
// Copyright 2012 __MyCompanyName__. All rights reserved.
|
| 7 |
+
//
|
| 8 |
+
|
| 9 |
+
#ifndef fuzzy_match_Match_h
|
| 10 |
+
#define fuzzy_match_Match_h
|
| 11 |
+
|
| 12 |
+
namespace tmmt
|
| 13 |
+
{
|
| 14 |
+
|
| 15 |
+
/* data structure for n-gram match between input and corpus */
|
| 16 |
+
|
| 17 |
+
/**
 * One match between a span of the input sentence and a span of a
 * translation memory sentence. A one-word match has start == end.
 */
class Match
{
public:
  int input_start;   // first matched position in the input
  int input_end;     // last matched position in the input
  int tm_start;      // first matched position in the tm sentence
  int tm_end;        // last matched position in the tm sentence
  int min_cost;      // lower bound on the cost of the sentence pair
  int max_cost;      // upper bound on the cost of the sentence pair
  int internal_cost; // cost inside the span (0 for a plain n-gram match)

  /** Store the seven values in the members, in declaration order. */
  Match( int is, int ie, int ts, int te, int min, int max, int i )
    : input_start(is)
    , input_end(ie)
    , tm_start(ts)
    , tm_end(te)
    , min_cost(min)
    , max_cost(max)
    , internal_cost(i)
  {}
};
|
| 31 |
+
|
| 32 |
+
}
|
| 33 |
+
|
| 34 |
+
#endif
|
mosesdecoder/moses/TranslationModel/fuzzy-match/SentenceAlignment.cpp
ADDED
|
@@ -0,0 +1,25 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
//
|
| 2 |
+
// SentenceAlignment.cpp
|
| 3 |
+
// moses
|
| 4 |
+
//
|
| 5 |
+
// Created by Hieu Hoang on 26/07/2012.
|
| 6 |
+
// Copyright 2012 __MyCompanyName__. All rights reserved.
|
| 7 |
+
//
|
| 8 |
+
|
| 9 |
+
#include <iostream>
|
| 10 |
+
#include "util/string_stream.hh"
|
| 11 |
+
#include "SentenceAlignment.h"
|
| 12 |
+
|
| 13 |
+
namespace tmmt
|
| 14 |
+
{
|
| 15 |
+
std::string SentenceAlignment::getTargetString(const Vocabulary &vocab) const
|
| 16 |
+
{
|
| 17 |
+
util::StringStream strme;
|
| 18 |
+
for (size_t i = 0; i < target.size(); ++i) {
|
| 19 |
+
const WORD &word = vocab.GetWord(target[i]);
|
| 20 |
+
strme << word << " ";
|
| 21 |
+
}
|
| 22 |
+
return strme.str();
|
| 23 |
+
}
|
| 24 |
+
|
| 25 |
+
}
|
mosesdecoder/moses/TranslationModel/fuzzy-match/SentenceAlignment.h
ADDED
|
@@ -0,0 +1,43 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
//
|
| 2 |
+
// SentenceAlignment.h
|
| 3 |
+
// fuzzy-match
|
| 4 |
+
//
|
| 5 |
+
// Created by Hieu Hoang on 25/07/2012.
|
| 6 |
+
// Copyright 2012 __MyCompanyName__. All rights reserved.
|
| 7 |
+
//
|
| 8 |
+
|
| 9 |
+
#ifndef fuzzy_match_SentenceAlignment_h
|
| 10 |
+
#define fuzzy_match_SentenceAlignment_h
|
| 11 |
+
|
| 12 |
+
#include <sstream>
|
| 13 |
+
#include <vector>
|
| 14 |
+
#include "Vocabulary.h"
|
| 15 |
+
#include "util/string_stream.hh"
|
| 16 |
+
|
| 17 |
+
namespace tmmt
|
| 18 |
+
{
|
| 19 |
+
|
| 20 |
+
struct SentenceAlignment {
|
| 21 |
+
int count;
|
| 22 |
+
std::vector< WORD_ID > target;
|
| 23 |
+
std::vector< std::pair<int,int> > alignment;
|
| 24 |
+
|
| 25 |
+
SentenceAlignment() {
|
| 26 |
+
}
|
| 27 |
+
|
| 28 |
+
std::string getTargetString(const Vocabulary &vocab) const;
|
| 29 |
+
|
| 30 |
+
std::string getAlignmentString() const {
|
| 31 |
+
util::StringStream strme;
|
| 32 |
+
for (size_t i = 0; i < alignment.size(); ++i) {
|
| 33 |
+
const std::pair<int,int> &alignPair = alignment[i];
|
| 34 |
+
strme << alignPair.first << "-" << alignPair.second << " ";
|
| 35 |
+
}
|
| 36 |
+
return strme.str();
|
| 37 |
+
}
|
| 38 |
+
|
| 39 |
+
};
|
| 40 |
+
|
| 41 |
+
}
|
| 42 |
+
|
| 43 |
+
#endif
|
mosesdecoder/moses/TranslationModel/fuzzy-match/SuffixArray.cpp
ADDED
|
@@ -0,0 +1,240 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#include "SuffixArray.h"
|
| 2 |
+
#include <string>
|
| 3 |
+
#include <stdlib.h>
|
| 4 |
+
#include <cstring>
|
| 5 |
+
|
| 6 |
+
using namespace std;
|
| 7 |
+
|
| 8 |
+
namespace tmmt
|
| 9 |
+
{
|
| 10 |
+
|
| 11 |
+
/**
 * Build the suffix array over the corpus stored in fileName (one sentence
 * per line).
 *
 * The file is read twice: once to count the words, so that the arrays can
 * be allocated in one go, and once to fill them. Every sentence is followed
 * by an end-of-sentence marker in the word array.
 */
SuffixArray::SuffixArray( string fileName )
{
  m_vcb.StoreIfNew( "<uNk>" );
  m_endOfSentence = m_vcb.StoreIfNew( "<s>" );

  ifstream extractFile;

  // first pass: count the number of words (plus one marker per sentence)
  extractFile.open(fileName.c_str());
  m_size = 0;
  size_t sentenceCount = 0;
  string line;
  while(getline(extractFile, line)) {
    vector< WORD_ID > words = m_vcb.Tokenize( line.c_str() );
    m_size += words.size() + 1;
    sentenceCount++;
  }
  extractFile.close();
  // the loop above ended with eofbit/failbit set; before C++11 open() does
  // not reset them, which would make the second pass read nothing
  extractFile.clear();
  cerr << m_size << " words (incl. sentence boundaries)" << endl;

  // allocate memory
  m_array = (WORD_ID*) calloc( sizeof( WORD_ID ), m_size );
  m_index = (INDEX*) calloc( sizeof( INDEX ), m_size );
  m_wordInSentence = (char*) calloc( sizeof( char ), m_size );
  m_sentence = (size_t*) calloc( sizeof( size_t ), m_size );
  m_sentenceLength = (char*) calloc( sizeof( char ), sentenceCount );

  // second pass: fill the arrays
  // (counters have index width, so they cannot overflow before m_size does)
  INDEX wordIndex = 0;
  size_t sentenceId = 0;
  extractFile.open(fileName.c_str());
  while(getline(extractFile, line)) {
    vector< WORD_ID > words = m_vcb.Tokenize( line.c_str() );

    // add to corpus vector
    corpus.push_back(words);

    // create SA
    // NOTE(review): m_wordInSentence and m_sentenceLength are char arrays,
    // so positions/lengths beyond the range of char do not fit -- confirm
    // that the sentence length is limited elsewhere
    for( size_t pos=0; pos<words.size(); pos++ ) {
      m_index[ wordIndex ] = wordIndex;
      m_sentence[ wordIndex ] = sentenceId;
      m_wordInSentence[ wordIndex ] = pos;
      m_array[ wordIndex++ ] = words[pos];
    }
    m_index[ wordIndex ] = wordIndex;
    m_array[ wordIndex++ ] = m_endOfSentence;
    m_sentenceLength[ sentenceId++ ] = words.size();
  }
  extractFile.close();
  cerr << "done reading " << wordIndex << " words, " << sentenceId << " sentences." << endl;
  // List(0,9);

  // sort; with an empty corpus m_size-1 would wrap around, so skip it
  m_buffer = NULL;
  if (m_size > 0) {
    m_buffer = (INDEX*) calloc( sizeof( INDEX ), m_size );
    Sort( 0, m_size-1 );
    free( m_buffer );
    m_buffer = NULL; // do not keep a pointer to freed memory
  }
  cerr << "done sorting" << endl;
}
|
| 74 |
+
|
| 75 |
+
// good ol' quick sort
|
| 76 |
+
/**
 * Sort m_index[start..end] (both inclusive) by the suffixes the entries
 * point to: both halves are sorted recursively, merged into m_buffer and
 * copied back.
 */
void SuffixArray::Sort(INDEX start, INDEX end)
{
  // a single element is already sorted
  if (start == end) return;

  const INDEX mid = (start+end+1)/2;
  Sort( start, mid-1 );
  Sort( mid, end );

  // merge [start,mid) and [mid,end] into the front of m_buffer
  const size_t length = end-start+1;
  size_t left = start;
  size_t right = mid;
  for( size_t out=0; out<length; out++ ) {
    bool take_left;
    if (left == mid) {
      take_left = false;        // left half used up
    } else if (right > end) {
      take_left = true;         // right half used up
    } else {
      take_left = CompareIndex( m_index[left], m_index[right] ) < 0;
    }
    m_buffer[ out ] = take_left ? m_index[ left++ ] : m_index[ right++ ];
  }

  // copy the merged run back into place
  memcpy( m_index + start, m_buffer, sizeof( INDEX ) * length );
}
|
| 105 |
+
|
| 106 |
+
/**
 * Release every array allocated by the constructor.
 * (m_buffer is already freed at the end of the constructor.)
 */
SuffixArray::~SuffixArray()
{
  free(m_index);
  free(m_array);
  // these three are also calloc'ed in the constructor and were leaked before
  free(m_wordInSentence);
  free(m_sentence);
  free(m_sentenceLength);
}
|
| 111 |
+
|
| 112 |
+
/**
 * Compare the suffixes of the word array that start at positions a and b.
 *
 * \return -1 when the suffix at a runs into the end of the array first,
 *         1 when the suffix at b does, otherwise the CompareWord() result
 *         for the first pair of words that differ
 */
int SuffixArray::CompareIndex( INDEX a, INDEX b ) const
{
  // walk both suffixes in parallel while their words are identical
  INDEX posA = a;
  INDEX posB = b;
  while( posA < m_size && posB < m_size && m_array[ posA ] == m_array[ posB ] ) {
    posA++;
    posB++;
  }

  if( posA == m_size ) return -1;
  if( posB == m_size ) return 1;
  return CompareWord( m_array[ posA ], m_array[ posB ] );
}
|
| 126 |
+
|
| 127 |
+
/**
 * Compare two words by their surface strings.
 * \return result of string compare(): negative, zero or positive
 */
inline int SuffixArray::CompareWord( WORD_ID a, WORD_ID b ) const
{
  const WORD &left = m_vcb.GetWord(a);
  const WORD &right = m_vcb.GetWord(b);
  return left.compare( right );
}
|
| 132 |
+
|
| 133 |
+
int SuffixArray::Count( const vector< WORD > &phrase )
|
| 134 |
+
{
|
| 135 |
+
INDEX dummy;
|
| 136 |
+
return LimitedCount( phrase, m_size, dummy, dummy, 0, m_size-1 );
|
| 137 |
+
}
|
| 138 |
+
|
| 139 |
+
bool SuffixArray::MinCount( const vector< WORD > &phrase, INDEX min )
|
| 140 |
+
{
|
| 141 |
+
INDEX dummy;
|
| 142 |
+
return LimitedCount( phrase, min, dummy, dummy, 0, m_size-1 ) >= min;
|
| 143 |
+
}
|
| 144 |
+
|
| 145 |
+
bool SuffixArray::Exists( const vector< WORD > &phrase )
|
| 146 |
+
{
|
| 147 |
+
INDEX dummy;
|
| 148 |
+
return LimitedCount( phrase, 1, dummy, dummy, 0, m_size-1 ) == 1;
|
| 149 |
+
}
|
| 150 |
+
|
| 151 |
+
int SuffixArray::FindMatches( const vector< WORD > &phrase, INDEX &firstMatch, INDEX &lastMatch, INDEX search_start, INDEX search_end )
|
| 152 |
+
{
|
| 153 |
+
return LimitedCount( phrase, m_size, firstMatch, lastMatch, search_start, search_end );
|
| 154 |
+
}
|
| 155 |
+
|
| 156 |
+
int SuffixArray::LimitedCount( const vector< WORD > &phrase, INDEX min, INDEX &firstMatch, INDEX &lastMatch, INDEX search_start, INDEX search_end )
|
| 157 |
+
{
|
| 158 |
+
// cerr << "FindFirst\n";
|
| 159 |
+
INDEX start = search_start;
|
| 160 |
+
INDEX end = (search_end == -1) ? (m_size-1) : search_end;
|
| 161 |
+
INDEX mid = FindFirst( phrase, start, end );
|
| 162 |
+
// cerr << "done\n";
|
| 163 |
+
if (mid == m_size) return 0; // no matches
|
| 164 |
+
if (min == 1) return 1; // only existance check
|
| 165 |
+
|
| 166 |
+
int matchCount = 1;
|
| 167 |
+
|
| 168 |
+
//cerr << "before...\n";
|
| 169 |
+
firstMatch = FindLast( phrase, mid, start, -1 );
|
| 170 |
+
matchCount += mid - firstMatch;
|
| 171 |
+
|
| 172 |
+
//cerr << "after...\n";
|
| 173 |
+
lastMatch = FindLast( phrase, mid, end, 1 );
|
| 174 |
+
matchCount += lastMatch - mid;
|
| 175 |
+
|
| 176 |
+
return matchCount;
|
| 177 |
+
}
|
| 178 |
+
|
| 179 |
+
// Starting from a slot known to match (`start`), binary-searches towards
// `end` for the outermost slot that still matches `phrase`.
//   start     - slot for which Match() == 0
//   end       - last slot of the search window in the search direction
//   direction - +1 to look for the highest matching slot, -1 for the lowest
// Returns the outermost matching slot inside the window.
//
// The slot one step beyond `end` acts as a sentinel that is treated as
// "no match" without being read. Reading it would touch m_index[m_size]
// (direction +1 on the full array) or m_index[(INDEX)-1] (direction -1 with
// end == 0), and on a restricted window a matching sentinel would keep the
// loop from ever terminating.
SuffixArray::INDEX SuffixArray::FindLast( const vector< WORD > &phrase, INDEX start, INDEX end, int direction )
{
  // may wrap to the maximum INDEX value when end == 0 and direction < 0;
  // the midpoint arithmetic below relies on the same modular wrap
  const INDEX sentinel = end + direction;
  end = sentinel;

  while(true) {
    const INDEX mid = ( start + end + (direction>0 ? 0 : 1) )/2;
    const INDEX next = mid + direction;

    const int match = Match( phrase, mid );
    const bool nextMatches = (next != sentinel) && (Match( phrase, next ) == 0);

    // mid matches and its outward neighbour does not: mid is the boundary
    if (match == 0 && !nextMatches) return mid;

    if (match == 0) // mid point is a match, boundary lies further out
      start = mid;
    else            // overshot, boundary lies between start and mid
      end = mid;
  }
}
|
| 197 |
+
|
| 198 |
+
// Binary-searches the window [start, end] of the sorted index for any slot
// whose suffix matches `phrase`.
//   start, end - in/out: narrowed in place as the search proceeds
// Returns a matching slot, or m_size when the window contains none.
//
// The window is checked before every probe: once `start` has moved past
// `end` there is nothing left to examine, and probing would read the slot
// after `end` (out of bounds when end == m_size-1, outside the requested
// range otherwise).
SuffixArray::INDEX SuffixArray::FindFirst( const vector< WORD > &phrase, INDEX &start, INDEX &end )
{
  if (m_size == 0) return m_size;  // nothing indexed, nothing to probe

  while(true) {
    if (start > end) return m_size;

    // upper midpoint, computed without overflowing start + end
    const INDEX span = end - start;
    const INDEX mid = start + span/2 + span%2;

    const int match = Match( phrase, mid );
    if (match == 0) return mid;
    if (start >= end) return m_size;

    if (match > 0)   // phrase sorts after the suffix at mid
      start = mid+1;
    else             // phrase sorts before it; mid > start here, so no wrap
      end = mid-1;
  }
}
|
| 214 |
+
|
| 215 |
+
int SuffixArray::Match( const vector< WORD > &phrase, INDEX index )
|
| 216 |
+
{
|
| 217 |
+
INDEX pos = m_index[ index ];
|
| 218 |
+
for(INDEX i=0; i<phrase.size() && i+pos<m_size; i++) {
|
| 219 |
+
int match = CompareWord( m_vcb.GetWordID( phrase[i] ), m_array[ pos+i ] );
|
| 220 |
+
// cerr << "{" << index << "+" << i << "," << pos+i << ":" << match << "}" << endl;
|
| 221 |
+
if (match != 0)
|
| 222 |
+
return match;
|
| 223 |
+
}
|
| 224 |
+
return 0;
|
| 225 |
+
}
|
| 226 |
+
|
| 227 |
+
// Debug helper that walks slots [start, end] and up to five words of each
// suffix. All output statements are disabled, so it currently prints nothing.
void SuffixArray::List(INDEX start, INDEX end)
{
  for(INDEX slot=start; slot<=end; slot++) {
    INDEX pos = m_index[ slot ];
    int shown = 0;
    while ( shown<5 && shown+pos<m_size ) {
      // (printing of m_vcb.GetWord( m_array[ pos+shown ] ) is disabled)
      shown++;
    }
  }
}
|
| 238 |
+
|
| 239 |
+
}
|
| 240 |
+
|
mosesdecoder/moses/TranslationModel/fuzzy-match/SuffixArray.h
ADDED
|
@@ -0,0 +1,69 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#include "Vocabulary.h"
|
| 2 |
+
|
| 3 |
+
#pragma once
|
| 4 |
+
|
| 5 |
+
#define LINE_MAX_LENGTH 10000
|
| 6 |
+
|
| 7 |
+
namespace tmmt
|
| 8 |
+
{
|
| 9 |
+
|
| 10 |
+
class SuffixArray
|
| 11 |
+
{
|
| 12 |
+
public:
|
| 13 |
+
typedef unsigned int INDEX;
|
| 14 |
+
|
| 15 |
+
private:
|
| 16 |
+
std::vector< std::vector< WORD_ID > > corpus;
|
| 17 |
+
|
| 18 |
+
WORD_ID *m_array;
|
| 19 |
+
INDEX *m_index;
|
| 20 |
+
INDEX *m_buffer;
|
| 21 |
+
char *m_wordInSentence;
|
| 22 |
+
size_t *m_sentence;
|
| 23 |
+
char *m_sentenceLength;
|
| 24 |
+
WORD_ID m_endOfSentence;
|
| 25 |
+
Vocabulary m_vcb;
|
| 26 |
+
INDEX m_size;
|
| 27 |
+
|
| 28 |
+
public:
|
| 29 |
+
SuffixArray( std::string fileName );
|
| 30 |
+
~SuffixArray();
|
| 31 |
+
|
| 32 |
+
void Sort(INDEX start, INDEX end);
|
| 33 |
+
int CompareIndex( INDEX a, INDEX b ) const;
|
| 34 |
+
inline int CompareWord( WORD_ID a, WORD_ID b ) const;
|
| 35 |
+
int Count( const std::vector< WORD > &phrase );
|
| 36 |
+
bool MinCount( const std::vector< WORD > &phrase, INDEX min );
|
| 37 |
+
bool Exists( const std::vector< WORD > &phrase );
|
| 38 |
+
int FindMatches( const std::vector< WORD > &phrase, INDEX &firstMatch, INDEX &lastMatch, INDEX search_start = 0, INDEX search_end = -1 );
|
| 39 |
+
int LimitedCount( const std::vector< WORD > &phrase, INDEX min, INDEX &firstMatch, INDEX &lastMatch, INDEX search_start = -1, INDEX search_end = 0 );
|
| 40 |
+
INDEX FindFirst( const std::vector< WORD > &phrase, INDEX &start, INDEX &end );
|
| 41 |
+
INDEX FindLast( const std::vector< WORD > &phrase, INDEX start, INDEX end, int direction );
|
| 42 |
+
int Match( const std::vector< WORD > &phrase, INDEX index );
|
| 43 |
+
void List( INDEX start, INDEX end );
|
| 44 |
+
inline INDEX GetPosition( INDEX index ) {
|
| 45 |
+
return m_index[ index ];
|
| 46 |
+
}
|
| 47 |
+
inline size_t GetSentence( INDEX position ) {
|
| 48 |
+
return m_sentence[position];
|
| 49 |
+
}
|
| 50 |
+
inline char GetWordInSentence( INDEX position ) {
|
| 51 |
+
return m_wordInSentence[position];
|
| 52 |
+
}
|
| 53 |
+
inline char GetSentenceLength( size_t sentenceId ) {
|
| 54 |
+
return m_sentenceLength[sentenceId];
|
| 55 |
+
}
|
| 56 |
+
inline INDEX GetSize() {
|
| 57 |
+
return m_size;
|
| 58 |
+
}
|
| 59 |
+
|
| 60 |
+
Vocabulary &GetVocabulary() {
|
| 61 |
+
return m_vcb;
|
| 62 |
+
}
|
| 63 |
+
const std::vector< std::vector< WORD_ID > > &GetCorpus() const {
|
| 64 |
+
return corpus;
|
| 65 |
+
}
|
| 66 |
+
};
|
| 67 |
+
|
| 68 |
+
}
|
| 69 |
+
|
mosesdecoder/moses/TranslationModel/fuzzy-match/Vocabulary.cpp
ADDED
|
@@ -0,0 +1,71 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
// $Id: Vocabulary.cpp 1565 2008-02-22 14:42:01Z bojar $
|
| 2 |
+
#include "Vocabulary.h"
|
| 3 |
+
#ifdef WITH_THREADS
|
| 4 |
+
#include <boost/thread/locks.hpp>
|
| 5 |
+
#endif
|
| 6 |
+
|
| 7 |
+
using namespace std;
|
| 8 |
+
|
| 9 |
+
namespace tmmt
|
| 10 |
+
{
|
| 11 |
+
|
| 12 |
+
// as in beamdecoder/tables.cpp
|
| 13 |
+
// Splits a NUL-terminated line on spaces and tabs and returns the id of each
// word, registering words that are not yet in the vocabulary.
vector<WORD_ID> Vocabulary::Tokenize( const char input[] )
{
  vector< WORD_ID > ids;
  int pos = 0;

  while (input[pos] != '\0') {
    // skip the separators in front of the next word
    while (input[pos] == ' ' || input[pos] == '\t') {
      ++pos;
    }
    if (input[pos] == '\0') break;

    // consume the word itself
    const int wordBegin = pos;
    while (input[pos] != '\0' && input[pos] != ' ' && input[pos] != '\t') {
      ++pos;
    }
    ids.push_back( StoreIfNew ( string( input+wordBegin, pos-wordBegin ) ) );
  }
  return ids;
}
|
| 34 |
+
|
| 35 |
+
// Returns the id of `word`, adding it to the vocabulary if it is not known.
//
// With WITH_THREADS the common case (word already known) only takes the
// shared lock. The shared lock is released before the exclusive lock is
// acquired, so another thread may add the same word in between; the
// insertion below therefore checks again under the exclusive lock instead
// of assigning a second id to the same word.
WORD_ID Vocabulary::StoreIfNew( const WORD& word )
{

  {
    // read=lock scope
#ifdef WITH_THREADS
    boost::shared_lock<boost::shared_mutex> read_lock(m_accessLock);
#endif
    map<WORD, WORD_ID>::iterator i = lookup.find( word );

    if( i != lookup.end() )
      return i->second;
  }

#ifdef WITH_THREADS
  boost::unique_lock<boost::shared_mutex> lock(m_accessLock);
#endif
  // insert() leaves an existing entry untouched and tells us whether the
  // word was actually new
  const WORD_ID candidate = vocab.size();
  pair< map<WORD, WORD_ID>::iterator, bool > slot =
    lookup.insert( map<WORD, WORD_ID>::value_type( word, candidate ) );
  if( slot.second )
    vocab.push_back( word );
  return slot.first->second;
}
|
| 57 |
+
|
| 58 |
+
// Looks up the id of `word` without modifying the vocabulary.
// Returns 0 when the word is unknown.
WORD_ID Vocabulary::GetWordID( const WORD &word )
{
#ifdef WITH_THREADS
  boost::shared_lock<boost::shared_mutex> read_lock(m_accessLock);
#endif
  map<WORD, WORD_ID>::iterator entry = lookup.find( word );
  if( entry != lookup.end() ) {
    return (WORD_ID) entry->second;
  }
  return 0;
}
|
| 69 |
+
|
| 70 |
+
}
|
| 71 |
+
|
mosesdecoder/moses/TranslationModel/fuzzy-match/Vocabulary.h
ADDED
|
@@ -0,0 +1,46 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
// $Id: tables-core.h 1470 2007-10-02 21:43:54Z redpony $
|
| 2 |
+
|
| 3 |
+
#pragma once
|
| 4 |
+
|
| 5 |
+
#include <iostream>
|
| 6 |
+
#include <fstream>
|
| 7 |
+
#include <cassert>
|
| 8 |
+
#include <cstdlib>
|
| 9 |
+
#include <string>
|
| 10 |
+
#include <queue>
|
| 11 |
+
#include <map>
|
| 12 |
+
#include <cmath>
|
| 13 |
+
|
| 14 |
+
#ifdef WITH_THREADS
|
| 15 |
+
#include <boost/thread/shared_mutex.hpp>
|
| 16 |
+
#endif
|
| 17 |
+
|
| 18 |
+
namespace tmmt
|
| 19 |
+
{
|
| 20 |
+
typedef std::string WORD;
|
| 21 |
+
typedef unsigned int WORD_ID;
|
| 22 |
+
|
| 23 |
+
class Vocabulary
|
| 24 |
+
{
|
| 25 |
+
public:
|
| 26 |
+
std::map<WORD, WORD_ID> lookup;
|
| 27 |
+
std::vector< WORD > vocab;
|
| 28 |
+
WORD_ID StoreIfNew( const WORD& );
|
| 29 |
+
WORD_ID GetWordID( const WORD& );
|
| 30 |
+
std::vector<WORD_ID> Tokenize( const char[] );
|
| 31 |
+
inline WORD &GetWord( WORD_ID id ) const {
|
| 32 |
+
WORD &i = (WORD&) vocab[ id ];
|
| 33 |
+
return i;
|
| 34 |
+
}
|
| 35 |
+
|
| 36 |
+
protected:
|
| 37 |
+
#ifdef WITH_THREADS
|
| 38 |
+
//reader-writer lock
|
| 39 |
+
mutable boost::shared_mutex m_accessLock;
|
| 40 |
+
#endif
|
| 41 |
+
|
| 42 |
+
|
| 43 |
+
};
|
| 44 |
+
|
| 45 |
+
}
|
| 46 |
+
|
mosesdecoder/moses/TranslationModel/fuzzy-match/create_xml.cpp
ADDED
|
@@ -0,0 +1,387 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
#include <iostream>
|
| 3 |
+
#include <fstream>
|
| 4 |
+
#include <cassert>
|
| 5 |
+
#include <vector>
|
| 6 |
+
#include <string>
|
| 7 |
+
#include "moses/Util.h"
|
| 8 |
+
#include "Alignments.h"
|
| 9 |
+
|
| 10 |
+
using namespace std;
|
| 11 |
+
using namespace Moses;
|
| 12 |
+
|
| 13 |
+
// Returns a copy of `str` with every leading and trailing character that
// occurs in `dropChars` removed. A string made up only of such characters
// yields the empty string.
inline const std::string TrimInternal(const std::string& str, const std::string dropChars = " \t\n\r")
{
  const std::string::size_type first = str.find_first_not_of(dropChars);
  if (first == std::string::npos) {
    return std::string();
  }
  const std::string::size_type last = str.find_last_not_of(dropChars);
  return str.substr(first, last - first + 1);
}
|
| 19 |
+
|
| 20 |
+
class CreateXMLRetValues
|
| 21 |
+
{
|
| 22 |
+
public:
|
| 23 |
+
string frame, ruleS, ruleT, ruleAlignment, ruleAlignmentInv;
|
| 24 |
+
};
|
| 25 |
+
|
| 26 |
+
CreateXMLRetValues createXML(int ruleCount, const string &source, const string &input, const string &target, const string &align, const string &path );
|
| 27 |
+
|
| 28 |
+
void create_xml(const string &inPath)
|
| 29 |
+
{
|
| 30 |
+
ifstream inStrme(inPath.c_str());
|
| 31 |
+
ofstream rule((inPath + ".extract").c_str());
|
| 32 |
+
ofstream ruleInv((inPath + ".extract.inv").c_str());
|
| 33 |
+
|
| 34 |
+
// int setenceId;
|
| 35 |
+
// float score;
|
| 36 |
+
string source, target, align, path;
|
| 37 |
+
string *input = NULL;
|
| 38 |
+
int count;
|
| 39 |
+
|
| 40 |
+
int lineCount = 1;
|
| 41 |
+
int ruleCount = 1;
|
| 42 |
+
string inLine;
|
| 43 |
+
|
| 44 |
+
int step = 0;
|
| 45 |
+
while (!inStrme.eof()) {
|
| 46 |
+
getline(inStrme, inLine);
|
| 47 |
+
//cout << inLine << endl;
|
| 48 |
+
switch (step) {
|
| 49 |
+
case 0:
|
| 50 |
+
/*setenceId = */
|
| 51 |
+
Scan<int>(inLine);
|
| 52 |
+
++step;
|
| 53 |
+
break;
|
| 54 |
+
case 1:
|
| 55 |
+
/*score = */
|
| 56 |
+
Scan<float>(inLine);
|
| 57 |
+
++step;
|
| 58 |
+
break;
|
| 59 |
+
case 2:
|
| 60 |
+
source = inLine;
|
| 61 |
+
++step;
|
| 62 |
+
break;
|
| 63 |
+
case 3:
|
| 64 |
+
if (input == NULL) {
|
| 65 |
+
input = new string(inLine);
|
| 66 |
+
} else {
|
| 67 |
+
assert(inLine == *input);
|
| 68 |
+
}
|
| 69 |
+
++step;
|
| 70 |
+
break;
|
| 71 |
+
case 4:
|
| 72 |
+
target = inLine;
|
| 73 |
+
++step;
|
| 74 |
+
break;
|
| 75 |
+
case 5:
|
| 76 |
+
align = inLine;
|
| 77 |
+
++step;
|
| 78 |
+
break;
|
| 79 |
+
case 6:
|
| 80 |
+
path = inLine + "X";
|
| 81 |
+
++step;
|
| 82 |
+
break;
|
| 83 |
+
case 7:
|
| 84 |
+
count = Scan<int>(inLine);
|
| 85 |
+
CreateXMLRetValues ret = createXML(ruleCount, source, *input, target, align, path);
|
| 86 |
+
|
| 87 |
+
//print STDOUT $frame."\n";
|
| 88 |
+
rule << ret.ruleS << " [X] ||| " << ret.ruleT << " [X] ||| " << ret.ruleAlignment
|
| 89 |
+
<< " ||| " << count << endl;
|
| 90 |
+
ruleInv << ret.ruleT << " [X] ||| " << ret.ruleS << " [X] ||| " << ret.ruleAlignmentInv
|
| 91 |
+
<< " ||| " << count << endl;
|
| 92 |
+
|
| 93 |
+
//print STDOUT "$sentenceInd ||| $score ||| $count\n";
|
| 94 |
+
++ruleCount;
|
| 95 |
+
step = 0;
|
| 96 |
+
break;
|
| 97 |
+
}
|
| 98 |
+
|
| 99 |
+
++lineCount;
|
| 100 |
+
}
|
| 101 |
+
|
| 102 |
+
delete input;
|
| 103 |
+
ruleInv.close();
|
| 104 |
+
rule.close();
|
| 105 |
+
inStrme.close();
|
| 106 |
+
|
| 107 |
+
}
|
| 108 |
+
|
| 109 |
+
|
| 110 |
+
// Builds a hierarchical rule and an XML frame from one fuzzy match.
//   ruleCount - running rule number (not used in the computation)
//   source    - source side of the matched translation-memory entry
//   input     - sentence being translated
//   target    - target side of the matched entry
//   align     - word alignment between source and target
//   path      - edit path between source and input, one character per step:
//               'M' match, 'S' substitution, 'I' insertion (input only),
//               'D' deletion (source only), 'X' end marker
// Returns the rule sides, the rule alignment in both directions and the
// frame. The frame is also written to stderr.
CreateXMLRetValues createXML(int ruleCount, const string &source, const string &input, const string &target, const string &align, const string &path)
{
  CreateXMLRetValues ret;
  vector<string> sourceToks   = Tokenize(source, " ")
                 ,inputToks    = Tokenize(input, " ")
                 ,targetsToks  = Tokenize(target, " ");
  Alignments alignments(align, sourceToks.size(), targetsToks.size());
  map<int, string> frameInput;          // target position -> input words inserted there
  map<int, int> alignI2S;               // input position -> source position
  vector< map<string, int> > nonTerms;  // one entry per gap that becomes a non-terminal
  vector<bool> targetBitmap(targetsToks.size(), true);  // target words kept in the rule
  vector<bool> inputBitmap;                              // input words kept in the rule

  // marker for "no aligned target word found yet"
  const int NO_TARGET = 1000;

  // STEP 1: FIND MISMATCHES
  int s = 0, i = 0;
  bool currently_matching = false;
  int start_s = 0, start_i = 0;

  for ( int p = 0 ; p < int(path.length()) ; p++ ) {
    const char action = path[p];
    const bool matchOrEnd = (action == 'M' || action == 'X');

    // beginning of a mismatch
    if ( currently_matching && !matchOrEnd ) {
      start_i = i;
      start_s = s;
      currently_matching = false;
    }
    // end of a mismatch
    else if ( !currently_matching && matchOrEnd ) {

      // remove use of affected target words
      for ( int ss = start_s ; ss < s ; ss++ ) {
        const std::map<int, int> &targets = alignments.m_alignS2T[ss];
        for (std::map<int, int>::const_iterator iter = targets.begin(); iter != targets.end(); ++iter) {
          targetBitmap[iter->first] = false;
        }
      }

      // are there input words that need to be inserted ?
      if (start_i < i ) {

        // take note of input words to be inserted
        string insertion = "";
        for (int ii = start_i ; ii < i ; ii++ ) {
          insertion += inputToks[ii] + " ";
        }

        // find position for inserted input words

        // find first removed target word
        int start_t = NO_TARGET;
        for ( int ss = start_s ; ss < s ; ss++ ) {
          const std::map<int, int> &targets = alignments.m_alignS2T[ss];
          for (std::map<int, int>::const_iterator iter = targets.begin(); iter != targets.end(); ++iter) {
            const int tt = iter->first;
            if (tt < start_t) {
              start_t = tt;
            }
          }
        }

        // end of sentence? add to end
        if ( start_t == NO_TARGET && i > int(inputToks.size()) - 1 ) {
          start_t = targetsToks.size() - 1;
        }

        // backtrack to previous words if unaligned
        if ( start_t == NO_TARGET ) {
          start_t = -1;
          for ( int ss = s - 1 ; start_t == -1 && ss >= 0 ; ss-- ) {
            const std::map<int, int> &targets = alignments.m_alignS2T[ss];
            for (std::map<int, int>::const_iterator iter = targets.begin(); iter != targets.end(); ++iter) {
              const int tt = iter->first;
              if (tt > start_t) {
                start_t = tt;
              }
            }
          }
        }

        frameInput[start_t] += insertion;
        map<string, int> nt;
        nt["start_t"] = start_t;
        nt["start_i"] = start_i;
        nonTerms.push_back(nt);

      } // if (start_i < i )

      currently_matching = true;
    }

    // advance the source / input cursors according to the edit operation
    if (action != 'I')
      s++;
    if (action != 'D') {
      i++;
      alignI2S[i] = s;
    }

    if (action == 'M') {
      inputBitmap.push_back(true);
    } else if (action == 'I' || action == 'S') {
      inputBitmap.push_back(false);
    }

  } // for ( int p = 0

  // STEP 2: BUILD RULE AND FRAME

  // hierarchical rule, source side
  int rule_pos_s = 0;
  map<int, int> ruleAlignS;

  for (int inPos = 0 ; inPos < int(inputBitmap.size()) ; ++inPos ) {
    if ( inputBitmap[inPos] ) {
      ret.ruleS += inputToks[inPos] + " ";
      ruleAlignS[ alignI2S[inPos] ] = rule_pos_s++;
    }

    for (size_t j = 0; j < nonTerms.size(); ++j) {
      map<string, int> &nt = nonTerms[j];
      if (inPos == nt["start_i"]) {
        ret.ruleS += "[X][X] ";
        nt["rule_pos_s"] = rule_pos_s++;
      }
    }
  }

  // hierarchical rule, target side (t == -1 covers gaps before the first word)
  int rule_pos_t = 0;
  map<int, int> ruleAlignT;

  for (int t = -1 ; t < (int) targetBitmap.size(); t++ ) {
    if (t >= 0 && targetBitmap[t]) {
      ret.ruleT += targetsToks[t] + " ";
      ruleAlignT[t] = rule_pos_t++;
    }

    for (size_t j = 0; j < nonTerms.size(); ++j) {
      map<string, int> &nt = nonTerms[j];

      if (t == nt["start_t"]) {
        ret.ruleT += "[X][X] ";
        nt["rule_pos_t"] = rule_pos_t++;
      }
    }
  }

  // word alignment between the terminals kept on both sides
  ret.ruleAlignment = "";

  for (map<int, int>::const_iterator sIter = ruleAlignS.begin(); sIter != ruleAlignS.end(); ++sIter) {
    const int srcPos = sIter->first;

    if (srcPos < int(alignments.m_alignS2T.size())) {
      const std::map<int, int> &targets = alignments.m_alignS2T[srcPos];

      for (std::map<int, int>::const_iterator tIter = targets.begin(); tIter != targets.end(); ++tIter) {
        const int trgPos = tIter->first;
        map<int, int>::const_iterator kept = ruleAlignT.find(trgPos);
        if (kept == ruleAlignT.end())
          continue;
        ret.ruleAlignment += SPrint(sIter->second) + "-" + SPrint(kept->second) + " ";
      }
    }
  }

  // alignment between the paired non-terminals
  for (size_t j = 0; j < nonTerms.size(); ++j) {
    map<string, int> &nt = nonTerms[j];
    ret.ruleAlignment += SPrint(nt["rule_pos_s"]) + "-" + SPrint(nt["rule_pos_t"]) + " ";
  }

  ret.ruleS = TrimInternal(ret.ruleS);
  ret.ruleT = TrimInternal(ret.ruleT);
  ret.ruleAlignment = TrimInternal(ret.ruleAlignment);

  // inverse alignment: swap each "a-b" point into "b-a"; the points must stay
  // separated by a space, the trailing one is trimmed afterwards
  vector<string> ruleAlignmentToks = Tokenize(ret.ruleAlignment);
  for (size_t j = 0; j < ruleAlignmentToks.size(); ++j) {
    const string &alignPoint = ruleAlignmentToks[j];
    vector<string> toks = Tokenize(alignPoint, "-");
    assert(toks.size() == 2);
    ret.ruleAlignmentInv += toks[1] + "-" + toks[0] + " ";
  }
  ret.ruleAlignmentInv = TrimInternal(ret.ruleAlignmentInv);

  // frame: starts with the input words inserted before the first target
  // word, if there are any
  map<int, string>::const_iterator leading = frameInput.find(-1);
  if (leading != frameInput.end())
    ret.frame = leading->second;

  bool currently_included = false;
  int start_t = -1;
  targetBitmap.push_back(false); // indicate end

  for (size_t t = 0 ; t <= targetsToks.size() ; t++ ) {
    const bool hasInsertion = frameInput.find(t) != frameInput.end();

    // beginning of tm target inclusion
    if ( !currently_included && targetBitmap[t] ) {
      start_t = t;
      currently_included = true;
    }
    // end of tm target inclusion (not included word or inserted input)
    else if (currently_included && ( !targetBitmap[t] || hasInsertion ) ) {
      // add xml (unless change is at the beginning of the sentence
      if ( start_t >= 0 ) {
        string target = "";
        // word t itself still belongs to the span when it is an included word
        for (size_t tt = start_t ; tt < t + targetBitmap[t] ; tt++ ) {
          target += targetsToks[tt] + " ";
        }
        // target = Trim(target); TODO
        ret.frame += "<xml translation=\"" + target + "\"> x </xml> ";
      }
      currently_included = false;
    }

    if (hasInsertion)
      ret.frame += frameInput[t];

  } //for (size_t t = 0

  cerr << ret.frame << "\n-------------------------------------\n";
  return ret;

}
|
| 385 |
+
|
| 386 |
+
|
| 387 |
+
|
mosesdecoder/moses/TranslationModel/fuzzy-match/create_xml.h
ADDED
|
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#pragma once
|
| 2 |
+
|
| 3 |
+
#include <string>
|
| 4 |
+
|
| 5 |
+
void create_xml(const std::string &inPath);
|
mosesdecoder/moses/server/Hypothesis_4server.cpp
ADDED
|
@@ -0,0 +1,37 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
// -*- mode: c++; indent-tabs-mode: nil; tab-width:2 -*-
|
| 2 |
+
#include "moses/Hypothesis.h"
|
| 3 |
+
#include "moses/Manager.h"
|
| 4 |
+
#include <boost/foreach.hpp>
|
| 5 |
+
namespace Moses {
|
| 6 |
+
void
|
| 7 |
+
Hypothesis::
|
| 8 |
+
OutputLocalWordAlignment(std::vector<xmlrpc_c::value>& dest) const
|
| 9 |
+
{
|
| 10 |
+
using namespace std;
|
| 11 |
+
Range const& src = this->GetCurrSourceWordsRange();
|
| 12 |
+
Range const& trg = this->GetCurrTargetWordsRange();
|
| 13 |
+
|
| 14 |
+
WordAlignmentSort waso = m_manager.options()->output.WA_SortOrder;
|
| 15 |
+
vector<pair<size_t,size_t> const* > a
|
| 16 |
+
= this->GetCurrTargetPhrase().GetAlignTerm().GetSortedAlignments(waso);
|
| 17 |
+
typedef pair<size_t,size_t> item;
|
| 18 |
+
BOOST_FOREACH(item const* p, a) {
|
| 19 |
+
map<string, xmlrpc_c::value> M;
|
| 20 |
+
M["source-word"] = xmlrpc_c::value_int(src.GetStartPos() + p->first);
|
| 21 |
+
M["target-word"] = xmlrpc_c::value_int(trg.GetStartPos() + p->second);
|
| 22 |
+
dest.push_back(xmlrpc_c::value_struct(M));
|
| 23 |
+
}
|
| 24 |
+
}
|
| 25 |
+
|
| 26 |
+
void
|
| 27 |
+
Hypothesis::
|
| 28 |
+
OutputWordAlignment(std::vector<xmlrpc_c::value>& out) const
|
| 29 |
+
{
|
| 30 |
+
std::vector<Hypothesis const*> tmp;
|
| 31 |
+
for (Hypothesis const* h = this; h; h = h->GetPrevHypo())
|
| 32 |
+
tmp.push_back(h);
|
| 33 |
+
for (size_t i = tmp.size(); i-- > 0;)
|
| 34 |
+
tmp[i]->OutputLocalWordAlignment(out);
|
| 35 |
+
}
|
| 36 |
+
|
| 37 |
+
}
|
mosesdecoder/moses/server/Optimizer.h
ADDED
|
@@ -0,0 +1,17 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
// -*- c++ -*-
// Guard against multiple inclusion, as the other server headers do.
#pragma once

#include <xmlrpc-c/base.hpp>
#include <xmlrpc-c/registry.hpp>
#include <xmlrpc-c/server_abyss.hpp>

namespace MosesServer
{
// XML-RPC method object; the behaviour of execute() is defined in the
// corresponding implementation file.
class
  Optimizer : public xmlrpc_c::method
{
public:
  Optimizer();
  // paramList - parameters received with the XML-RPC call
  // retvalP   - receives the value returned to the caller
  void execute(xmlrpc_c::paramList const& paramList,
               xmlrpc_c::value * const retvalP);
};
}
|
mosesdecoder/moses/server/PackScores.cpp
ADDED
|
@@ -0,0 +1,45 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
// -*- mode: c++; indent-tabs-mode: nil; tab-width:2 -*-
|
| 2 |
+
#include "PackScores.h"
|
| 3 |
+
#include "moses/FF/StatefulFeatureFunction.h"
|
| 4 |
+
#include "moses/FF/StatelessFeatureFunction.h"
|
| 5 |
+
#include <boost/foreach.hpp>
|
| 6 |
+
namespace Moses {
|
| 7 |
+
|
| 8 |
+
void
|
| 9 |
+
PackScores(FeatureFunction const& ff, FVector const& S,
|
| 10 |
+
std::map<std::string, xmlrpc_c::value>& M)
|
| 11 |
+
{
|
| 12 |
+
std::vector<xmlrpc_c::value> v;
|
| 13 |
+
size_t N = ff.GetNumScoreComponents();
|
| 14 |
+
|
| 15 |
+
std::vector<xmlrpc_c::value> dense;
|
| 16 |
+
dense.reserve(N);
|
| 17 |
+
size_t o = ff.GetIndex();
|
| 18 |
+
for (size_t i = 0; i < N; ++i)
|
| 19 |
+
if (ff.IsTuneableComponent(i))
|
| 20 |
+
dense.push_back(xmlrpc_c::value_double(S[o+i]));
|
| 21 |
+
v.push_back(xmlrpc_c::value_array(dense));
|
| 22 |
+
|
| 23 |
+
std::map<std::string,xmlrpc_c::value> sparse;
|
| 24 |
+
typedef FVector::FNVmap::const_iterator iter;
|
| 25 |
+
for(iter m = S.cbegin(); m != S.cend(); ++m)
|
| 26 |
+
sparse[m->first.name()] = xmlrpc_c::value_double(m->second);
|
| 27 |
+
v.push_back(xmlrpc_c::value_struct(sparse));
|
| 28 |
+
M[ff.GetScoreProducerDescription()] = xmlrpc_c::value_array(v);
|
| 29 |
+
}
|
| 30 |
+
|
| 31 |
+
// Packs the scores of every tuneable stateful and stateless feature function
// into one XML-RPC struct keyed by the feature's description.
xmlrpc_c::value
PackScores(ScoreComponentCollection const& S)
{
  typedef StatefulFeatureFunction  Stateful;
  typedef StatelessFeatureFunction Stateless;

  std::map<std::string, xmlrpc_c::value> packed;

  BOOST_FOREACH(Stateful const* ff, Stateful::GetStatefulFeatureFunctions()) {
    if (!ff->IsTuneable()) continue;
    PackScores(*ff, S.GetScoresVector(), packed);
  }

  BOOST_FOREACH(Stateless const* ff, Stateless::GetStatelessFeatureFunctions()) {
    if (!ff->IsTuneable()) continue;
    PackScores(*ff, S.GetScoresVector(), packed);
  }

  return xmlrpc_c::value_struct(packed);
}
|
| 45 |
+
}
|
mosesdecoder/moses/server/PackScores.h
ADDED
|
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
// -*- mode: c++; indent-tabs-mode: nil; tab-width:2 -*-
|
| 2 |
+
#pragma once
|
| 3 |
+
#include <xmlrpc-c/base.hpp>
|
| 4 |
+
#include "moses/FF/FeatureFunction.h"
|
| 5 |
+
#include "moses/ScoreComponentCollection.h"
|
| 6 |
+
|
| 7 |
+
namespace Moses {
|
| 8 |
+
xmlrpc_c::value
|
| 9 |
+
PackScores(ScoreComponentCollection const& S);
|
| 10 |
+
}
|
mosesdecoder/moses/server/Server.h
ADDED
|
@@ -0,0 +1,46 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
// -*- mode: c++; indent-tabs-mode: nil; tab-width: 2 -*-
|
| 2 |
+
#pragma once
|
| 3 |
+
#include "moses/TypeDef.h"
|
| 4 |
+
|
| 5 |
+
#ifdef WITH_THREADS
|
| 6 |
+
#include <boost/thread.hpp>
|
| 7 |
+
#include "moses/ThreadPool.h"
|
| 8 |
+
#endif
|
| 9 |
+
|
| 10 |
+
#include <xmlrpc-c/base.hpp>
|
| 11 |
+
#include <xmlrpc-c/registry.hpp>
|
| 12 |
+
#include <xmlrpc-c/server_abyss.hpp>
|
| 13 |
+
#include "Translator.h"
|
| 14 |
+
#include "Optimizer.h"
|
| 15 |
+
#include "Updater.h"
|
| 16 |
+
#include "CloseSession.h"
|
| 17 |
+
#include "Session.h"
|
| 18 |
+
#include "moses/parameters/ServerOptions.h"
|
| 19 |
+
#include <string>
|
| 20 |
+
|
| 21 |
+
namespace MosesServer
|
| 22 |
+
{
|
| 23 |
+
class Server
|
| 24 |
+
{
|
| 25 |
+
Moses::ServerOptions m_server_options;
|
| 26 |
+
SessionCache m_session_cache;
|
| 27 |
+
xmlrpc_c::registry m_registry;
|
| 28 |
+
xmlrpc_c::methodPtr const m_updater;
|
| 29 |
+
xmlrpc_c::methodPtr const m_optimizer;
|
| 30 |
+
xmlrpc_c::methodPtr const m_translator;
|
| 31 |
+
xmlrpc_c::methodPtr const m_close_session;
|
| 32 |
+
std::string m_pidfile;
|
| 33 |
+
public:
|
| 34 |
+
Server(Moses::Parameter& params);
|
| 35 |
+
~Server();
|
| 36 |
+
int run();
|
| 37 |
+
void delete_session(uint64_t const session_id);
|
| 38 |
+
|
| 39 |
+
Moses::ServerOptions const&
|
| 40 |
+
options() const;
|
| 41 |
+
|
| 42 |
+
Session const&
|
| 43 |
+
get_session(uint64_t session_id);
|
| 44 |
+
|
| 45 |
+
};
|
| 46 |
+
}
|
mosesdecoder/moses/server/Session.h
ADDED
|
@@ -0,0 +1,75 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
// -*- mode: c++; indent-tabs-mode: nil; tab-width: 2 -*-
|
| 2 |
+
#pragma once
|
| 3 |
+
#include "moses/Util.h"
|
| 4 |
+
#include "moses/ContextScope.h"
|
| 5 |
+
#include "moses/parameters/AllOptions.h"
|
| 6 |
+
#include <sys/time.h>
|
| 7 |
+
#include <boost/unordered_map.hpp>
|
| 8 |
+
|
| 9 |
+
#ifdef WITH_THREADS
|
| 10 |
+
#include <boost/thread/shared_mutex.hpp>
|
| 11 |
+
#include <boost/thread/locks.hpp>
|
| 12 |
+
#endif
|
| 13 |
+
namespace MosesServer{
|
| 14 |
+
|
| 15 |
+
struct Session
|
| 16 |
+
{
|
| 17 |
+
uint64_t const id;
|
| 18 |
+
time_t start_time;
|
| 19 |
+
time_t last_access;
|
| 20 |
+
boost::shared_ptr<Moses::ContextScope> const scope; // stores local info
|
| 21 |
+
SPTR<std::map<std::string,float> > m_context_weights;
|
| 22 |
+
|
| 23 |
+
|
| 24 |
+
Session(uint64_t const session_id)
|
| 25 |
+
: id(session_id)
|
| 26 |
+
, scope(new Moses::ContextScope)
|
| 27 |
+
{
|
| 28 |
+
last_access = start_time = time(NULL);
|
| 29 |
+
}
|
| 30 |
+
|
| 31 |
+
bool is_new() const { return last_access == start_time; }
|
| 32 |
+
|
| 33 |
+
void setup(std::map<std::string, xmlrpc_c::value> const& params);
|
| 34 |
+
};
|
| 35 |
+
|
| 36 |
+
class SessionCache
|
| 37 |
+
{
|
| 38 |
+
mutable boost::shared_mutex m_lock;
|
| 39 |
+
uint64_t m_session_counter;
|
| 40 |
+
boost::unordered_map<uint64_t,Session> m_cache;
|
| 41 |
+
public:
|
| 42 |
+
|
| 43 |
+
SessionCache() : m_session_counter(1) {}
|
| 44 |
+
|
| 45 |
+
Session const&
|
| 46 |
+
operator[](uint32_t id)
|
| 47 |
+
{
|
| 48 |
+
boost::upgrade_lock<boost::shared_mutex> lock(m_lock);
|
| 49 |
+
if (id > 1)
|
| 50 |
+
{
|
| 51 |
+
boost::unordered_map<uint64_t, Session>::iterator m = m_cache.find(id);
|
| 52 |
+
if (m != m_cache.end())
|
| 53 |
+
{
|
| 54 |
+
m->second.last_access = time(NULL);
|
| 55 |
+
return m->second;
|
| 56 |
+
}
|
| 57 |
+
}
|
| 58 |
+
boost::upgrade_to_unique_lock<boost::shared_mutex> xlock(lock);
|
| 59 |
+
id = ++m_session_counter;
|
| 60 |
+
std::pair<uint64_t, Session> foo(id, Session(id));
|
| 61 |
+
return m_cache.insert(foo).first->second;
|
| 62 |
+
}
|
| 63 |
+
|
| 64 |
+
void
|
| 65 |
+
erase(uint32_t const id)
|
| 66 |
+
{
|
| 67 |
+
boost::unique_lock<boost::shared_mutex> lock(m_lock);
|
| 68 |
+
m_cache.erase(id);
|
| 69 |
+
}
|
| 70 |
+
|
| 71 |
+
|
| 72 |
+
};
|
| 73 |
+
|
| 74 |
+
|
| 75 |
+
}
|
mosesdecoder/moses/server/TranslationRequest.cpp
ADDED
|
@@ -0,0 +1,524 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#include "TranslationRequest.h"
|
| 2 |
+
#include "PackScores.h"
|
| 3 |
+
#include "moses/ContextScope.h"
|
| 4 |
+
#include <boost/foreach.hpp>
|
| 5 |
+
#include "moses/Util.h"
|
| 6 |
+
#include "moses/Hypothesis.h"
|
| 7 |
+
|
| 8 |
+
namespace MosesServer
|
| 9 |
+
{
|
| 10 |
+
using namespace std;
|
| 11 |
+
using Moses::Hypothesis;
|
| 12 |
+
using Moses::StaticData;
|
| 13 |
+
using Moses::Range;
|
| 14 |
+
using Moses::ChartHypothesis;
|
| 15 |
+
using Moses::Phrase;
|
| 16 |
+
using Moses::Manager;
|
| 17 |
+
using Moses::SearchGraphNode;
|
| 18 |
+
using Moses::TrellisPathList;
|
| 19 |
+
using Moses::TranslationOptionCollection;
|
| 20 |
+
using Moses::TranslationOptionList;
|
| 21 |
+
using Moses::TranslationOption;
|
| 22 |
+
using Moses::TargetPhrase;
|
| 23 |
+
using Moses::FValue;
|
| 24 |
+
using Moses::PhraseDictionaryMultiModel;
|
| 25 |
+
using Moses::FindPhraseDictionary;
|
| 26 |
+
using Moses::Sentence;
|
| 27 |
+
using Moses::TokenizeMultiCharSeparator;
|
| 28 |
+
using Moses::FeatureFunction;
|
| 29 |
+
using Moses::Scan;
|
| 30 |
+
|
| 31 |
+
/// Factory: build a TranslationRequest for the given parameter list and
/// hand the new object a handle to itself (m_self) and to its translator.
boost::shared_ptr<TranslationRequest>
TranslationRequest::
create(Translator* translator, xmlrpc_c::paramList const& paramList,
       boost::condition_variable& cond, boost::mutex& mut)
{
  boost::shared_ptr<TranslationRequest> request(new TranslationRequest(paramList, cond, mut));
  request->m_translator = translator;
  request->m_self = request;
  return request;
}
|
| 42 |
+
|
| 43 |
+
void
|
| 44 |
+
SetContextWeights(Moses::ContextScope& s, xmlrpc_c::value const& w)
|
| 45 |
+
{
|
| 46 |
+
SPTR<std::map<std::string,float> > M(new std::map<std::string, float>);
|
| 47 |
+
typedef std::map<std::string,xmlrpc_c::value> tmap;
|
| 48 |
+
tmap const tmp = static_cast<tmap>(xmlrpc_c::value_struct(w));
|
| 49 |
+
for(tmap::const_iterator m = tmp.begin(); m != tmp.end(); ++m)
|
| 50 |
+
(*M)[m->first] = xmlrpc_c::value_double(m->second);
|
| 51 |
+
s.SetContextWeights(M);
|
| 52 |
+
}
|
| 53 |
+
|
| 54 |
+
void
|
| 55 |
+
TranslationRequest::
|
| 56 |
+
Run()
|
| 57 |
+
{
|
| 58 |
+
typedef std::map<std::string,xmlrpc_c::value> param_t;
|
| 59 |
+
param_t const& params = m_paramList.getStruct(0);
|
| 60 |
+
parse_request(params);
|
| 61 |
+
// cerr << "SESSION ID" << ret->m_session_id << endl;
|
| 62 |
+
|
| 63 |
+
|
| 64 |
+
// settings within the session scope
|
| 65 |
+
param_t::const_iterator si = params.find("context-weights");
|
| 66 |
+
if (si != params.end()) SetContextWeights(*m_scope, si->second);
|
| 67 |
+
|
| 68 |
+
Moses::StaticData const& SD = Moses::StaticData::Instance();
|
| 69 |
+
|
| 70 |
+
if (is_syntax(m_options->search.algo))
|
| 71 |
+
run_chart_decoder();
|
| 72 |
+
else
|
| 73 |
+
run_phrase_decoder();
|
| 74 |
+
|
| 75 |
+
{
|
| 76 |
+
boost::lock_guard<boost::mutex> lock(m_mutex);
|
| 77 |
+
m_done = true;
|
| 78 |
+
}
|
| 79 |
+
m_cond.notify_one();
|
| 80 |
+
|
| 81 |
+
}
|
| 82 |
+
|
| 83 |
+
/// add phrase alignment information from a Hypothesis
|
| 84 |
+
void
|
| 85 |
+
TranslationRequest::
|
| 86 |
+
add_phrase_aln_info(Hypothesis const& h, vector<xmlrpc_c::value>& aInfo) const
|
| 87 |
+
{
|
| 88 |
+
if (!m_withAlignInfo) return;
|
| 89 |
+
// if (!options()->output.ReportSegmentation) return;
|
| 90 |
+
Range const& trg = h.GetCurrTargetWordsRange();
|
| 91 |
+
Range const& src = h.GetCurrSourceWordsRange();
|
| 92 |
+
|
| 93 |
+
std::map<std::string, xmlrpc_c::value> pAlnInfo;
|
| 94 |
+
pAlnInfo["tgt-start"] = xmlrpc_c::value_int(trg.GetStartPos());
|
| 95 |
+
pAlnInfo["tgt-end"] = xmlrpc_c::value_int(trg.GetEndPos());
|
| 96 |
+
pAlnInfo["src-start"] = xmlrpc_c::value_int(src.GetStartPos());
|
| 97 |
+
pAlnInfo["src-end"] = xmlrpc_c::value_int(src.GetEndPos());
|
| 98 |
+
aInfo.push_back(xmlrpc_c::value_struct(pAlnInfo));
|
| 99 |
+
}
|
| 100 |
+
|
| 101 |
+
void
|
| 102 |
+
TranslationRequest::
|
| 103 |
+
outputChartHypo(ostream& out, const ChartHypothesis* hypo)
|
| 104 |
+
{
|
| 105 |
+
Phrase outPhrase(20);
|
| 106 |
+
hypo->GetOutputPhrase(outPhrase);
|
| 107 |
+
|
| 108 |
+
// delete 1st & last
|
| 109 |
+
assert(outPhrase.GetSize() >= 2);
|
| 110 |
+
outPhrase.RemoveWord(0);
|
| 111 |
+
outPhrase.RemoveWord(outPhrase.GetSize() - 1);
|
| 112 |
+
for (size_t pos = 0 ; pos < outPhrase.GetSize() ; pos++)
|
| 113 |
+
out << *outPhrase.GetFactor(pos, 0) << " ";
|
| 114 |
+
}
|
| 115 |
+
|
| 116 |
+
bool
|
| 117 |
+
TranslationRequest::
|
| 118 |
+
compareSearchGraphNode(const Moses::SearchGraphNode& a,
|
| 119 |
+
const Moses::SearchGraphNode& b)
|
| 120 |
+
{
|
| 121 |
+
return a.hypo->GetId() < b.hypo->GetId();
|
| 122 |
+
}
|
| 123 |
+
|
| 124 |
+
void
|
| 125 |
+
TranslationRequest::
|
| 126 |
+
insertGraphInfo(Manager& manager, map<string, xmlrpc_c::value>& retData)
|
| 127 |
+
{
|
| 128 |
+
using xmlrpc_c::value_int;
|
| 129 |
+
using xmlrpc_c::value_double;
|
| 130 |
+
using xmlrpc_c::value_struct;
|
| 131 |
+
using xmlrpc_c::value_string;
|
| 132 |
+
vector<xmlrpc_c::value> searchGraphXml;
|
| 133 |
+
vector<SearchGraphNode> searchGraph;
|
| 134 |
+
manager.GetSearchGraph(searchGraph);
|
| 135 |
+
std::sort(searchGraph.begin(), searchGraph.end());
|
| 136 |
+
BOOST_FOREACH(Moses::SearchGraphNode const& n, searchGraph) {
|
| 137 |
+
map<string, xmlrpc_c::value> x; // search graph xml node
|
| 138 |
+
x["forward"] = value_double(n.forward);
|
| 139 |
+
x["fscore"] = value_double(n.fscore);
|
| 140 |
+
const Hypothesis* hypo = n.hypo;
|
| 141 |
+
x["hyp"] = value_int(hypo->GetId());
|
| 142 |
+
x["stack"] = value_int(hypo->GetWordsBitmap().GetNumWordsCovered());
|
| 143 |
+
if (hypo->GetId() != 0) {
|
| 144 |
+
const Hypothesis *prevHypo = hypo->GetPrevHypo();
|
| 145 |
+
x["back"] = value_int(prevHypo->GetId());
|
| 146 |
+
x["score"] = value_double(hypo->GetScore());
|
| 147 |
+
x["transition"] = value_double(hypo->GetScore() - prevHypo->GetScore());
|
| 148 |
+
if (n.recombinationHypo)
|
| 149 |
+
x["recombined"] = value_int(n.recombinationHypo->GetId());
|
| 150 |
+
x["cover-start"] = value_int(hypo->GetCurrSourceWordsRange().GetStartPos());
|
| 151 |
+
x["cover-end"] = value_int(hypo->GetCurrSourceWordsRange().GetEndPos());
|
| 152 |
+
x["out"] = value_string(hypo->GetCurrTargetPhrase().GetStringRep(options()->output.factor_order));
|
| 153 |
+
}
|
| 154 |
+
searchGraphXml.push_back(value_struct(x));
|
| 155 |
+
}
|
| 156 |
+
retData["sg"] = xmlrpc_c::value_array(searchGraphXml);
|
| 157 |
+
}
|
| 158 |
+
|
| 159 |
+
void
|
| 160 |
+
TranslationRequest::
|
| 161 |
+
outputNBest(const Manager& manager, map<string, xmlrpc_c::value>& retData)
|
| 162 |
+
{
|
| 163 |
+
TrellisPathList nBestList;
|
| 164 |
+
vector<xmlrpc_c::value> nBestXml;
|
| 165 |
+
|
| 166 |
+
Moses::NBestOptions const& nbo = m_options->nbest;
|
| 167 |
+
manager.CalcNBest(nbo.nbest_size, nBestList, nbo.only_distinct);
|
| 168 |
+
manager.OutputNBest(cout, nBestList);
|
| 169 |
+
|
| 170 |
+
BOOST_FOREACH(Moses::TrellisPath const* path, nBestList) {
|
| 171 |
+
vector<const Hypothesis *> const& E = path->GetEdges();
|
| 172 |
+
if (!E.size()) continue;
|
| 173 |
+
std::map<std::string, xmlrpc_c::value> nBestXmlItem;
|
| 174 |
+
pack_hypothesis(manager, E, "hyp", nBestXmlItem);
|
| 175 |
+
if (m_withScoreBreakdown) {
|
| 176 |
+
// should the score breakdown be reported in a more structured manner?
|
| 177 |
+
ostringstream buf;
|
| 178 |
+
bool with_labels = nbo.include_feature_labels;
|
| 179 |
+
path->GetScoreBreakdown()->OutputAllFeatureScores(buf, with_labels);
|
| 180 |
+
nBestXmlItem["fvals"] = xmlrpc_c::value_string(buf.str());
|
| 181 |
+
nBestXmlItem["scores"] = PackScores(*path->GetScoreBreakdown());
|
| 182 |
+
}
|
| 183 |
+
|
| 184 |
+
// weighted score
|
| 185 |
+
nBestXmlItem["totalScore"] = xmlrpc_c::value_double(path->GetFutureScore());
|
| 186 |
+
nBestXml.push_back(xmlrpc_c::value_struct(nBestXmlItem));
|
| 187 |
+
}
|
| 188 |
+
retData["nbest"] = xmlrpc_c::value_array(nBestXml);
|
| 189 |
+
}
|
| 190 |
+
|
| 191 |
+
void
|
| 192 |
+
TranslationRequest::
|
| 193 |
+
insertTranslationOptions(Moses::Manager& manager,
|
| 194 |
+
std::map<std::string, xmlrpc_c::value>& retData)
|
| 195 |
+
{
|
| 196 |
+
std::vector<Moses::FactorType> const& ofactor_order = options()->output.factor_order;
|
| 197 |
+
|
| 198 |
+
const TranslationOptionCollection* toptsColl = manager.getSntTranslationOptions();
|
| 199 |
+
vector<xmlrpc_c::value> toptsXml;
|
| 200 |
+
size_t const stop = toptsColl->GetSource().GetSize();
|
| 201 |
+
TranslationOptionList const* tol;
|
| 202 |
+
for (size_t s = 0 ; s < stop ; ++s) {
|
| 203 |
+
for (size_t e=s;(tol=toptsColl->GetTranslationOptionList(s,e))!=NULL;++e) {
|
| 204 |
+
BOOST_FOREACH(TranslationOption const* topt, *tol) {
|
| 205 |
+
std::map<std::string, xmlrpc_c::value> toptXml;
|
| 206 |
+
TargetPhrase const& tp = topt->GetTargetPhrase();
|
| 207 |
+
std::string tphrase = tp.GetStringRep(ofactor_order);
|
| 208 |
+
toptXml["phrase"] = xmlrpc_c::value_string(tphrase);
|
| 209 |
+
toptXml["fscore"] = xmlrpc_c::value_double(topt->GetFutureScore());
|
| 210 |
+
toptXml["start"] = xmlrpc_c::value_int(s);
|
| 211 |
+
toptXml["end"] = xmlrpc_c::value_int(e);
|
| 212 |
+
vector<xmlrpc_c::value> scoresXml;
|
| 213 |
+
const std::valarray<FValue> &scores
|
| 214 |
+
= topt->GetScoreBreakdown().getCoreFeatures();
|
| 215 |
+
for (size_t j = 0; j < scores.size(); ++j)
|
| 216 |
+
scoresXml.push_back(xmlrpc_c::value_double(scores[j]));
|
| 217 |
+
toptXml["scores"] = xmlrpc_c::value_array(scoresXml);
|
| 218 |
+
ostringstream buf;
|
| 219 |
+
topt->GetScoreBreakdown().OutputAllFeatureScores(buf, true);
|
| 220 |
+
toptXml["labelledScores"] = PackScores(topt->GetScoreBreakdown());
|
| 221 |
+
toptsXml.push_back(xmlrpc_c::value_struct(toptXml));
|
| 222 |
+
}
|
| 223 |
+
}
|
| 224 |
+
}
|
| 225 |
+
retData["topt"] = xmlrpc_c::value_array(toptsXml);
|
| 226 |
+
}
|
| 227 |
+
|
| 228 |
+
/// Remember the parameter list and the condition variable / mutex used to
/// signal completion; the request starts out not done and with session
/// id 0. The members not listed here are assigned later in parse_request().
TranslationRequest::
TranslationRequest(xmlrpc_c::paramList const& paramList,
                   boost::condition_variable& cond, boost::mutex& mut)
  : m_cond(cond)
  , m_mutex(mut)
  , m_done(false)
  , m_paramList(paramList)
  , m_session_id(0)
{ }
|
| 236 |
+
|
| 237 |
+
bool
|
| 238 |
+
check(std::map<std::string, xmlrpc_c::value> const& param,
|
| 239 |
+
std::string const key)
|
| 240 |
+
{
|
| 241 |
+
std::map<std::string, xmlrpc_c::value>::const_iterator m = param.find(key);
|
| 242 |
+
if(m == param.end()) return false;
|
| 243 |
+
|
| 244 |
+
if (m->second.type() == xmlrpc_c::value::TYPE_BOOLEAN)
|
| 245 |
+
return xmlrpc_c::value_boolean(m->second);
|
| 246 |
+
|
| 247 |
+
std::string val = string(xmlrpc_c::value_string(m->second));
|
| 248 |
+
if(val == "true" || val == "True" || val == "TRUE" || val == "1") return true;
|
| 249 |
+
return false;
|
| 250 |
+
}
|
| 251 |
+
|
| 252 |
+
void
|
| 253 |
+
TranslationRequest::
|
| 254 |
+
parse_request(std::map<std::string, xmlrpc_c::value> const& params)
|
| 255 |
+
{
|
| 256 |
+
// parse XMLRPC request
|
| 257 |
+
m_paramList.verifyEnd(1); // ??? UG
|
| 258 |
+
|
| 259 |
+
typedef std::map<std::string, xmlrpc_c::value> params_t;
|
| 260 |
+
params_t::const_iterator si;
|
| 261 |
+
|
| 262 |
+
si = params.find("session-id");
|
| 263 |
+
if (si != params.end())
|
| 264 |
+
{
|
| 265 |
+
m_session_id = xmlrpc_c::value_int(si->second);
|
| 266 |
+
Session const& S = m_translator->get_session(m_session_id);
|
| 267 |
+
m_scope = S.scope;
|
| 268 |
+
m_session_id = S.id;
|
| 269 |
+
}
|
| 270 |
+
else
|
| 271 |
+
{
|
| 272 |
+
m_session_id = 0;
|
| 273 |
+
m_scope.reset(new Moses::ContextScope);
|
| 274 |
+
}
|
| 275 |
+
|
| 276 |
+
boost::shared_ptr<Moses::AllOptions> opts(new Moses::AllOptions(*StaticData::Instance().options()));
|
| 277 |
+
opts->update(params);
|
| 278 |
+
|
| 279 |
+
m_withGraphInfo = check(params, "sg");
|
| 280 |
+
if (m_withGraphInfo || opts->nbest.nbest_size > 0) {
|
| 281 |
+
opts->output.SearchGraph = "true";
|
| 282 |
+
opts->nbest.enabled = true;
|
| 283 |
+
}
|
| 284 |
+
|
| 285 |
+
m_options = opts;
|
| 286 |
+
|
| 287 |
+
// source text must be given, or we don't know what to translate
|
| 288 |
+
si = params.find("text");
|
| 289 |
+
if (si == params.end())
|
| 290 |
+
throw xmlrpc_c::fault("Missing source text", xmlrpc_c::fault::CODE_PARSE);
|
| 291 |
+
m_source_string = xmlrpc_c::value_string(si->second);
|
| 292 |
+
XVERBOSE(1,"Input: " << m_source_string << endl);
|
| 293 |
+
|
| 294 |
+
m_withTopts = check(params, "topt");
|
| 295 |
+
m_withScoreBreakdown = check(params, "add-score-breakdown");
|
| 296 |
+
si = params.find("lambda");
|
| 297 |
+
if (si != params.end())
|
| 298 |
+
{
|
| 299 |
+
// muMo = multiModel
|
| 300 |
+
xmlrpc_c::value_array muMoArray = xmlrpc_c::value_array(si->second);
|
| 301 |
+
vector<xmlrpc_c::value> muMoValVec(muMoArray.vectorValueValue());
|
| 302 |
+
vector<float> w(muMoValVec.size());
|
| 303 |
+
for (size_t i = 0; i < muMoValVec.size(); ++i)
|
| 304 |
+
w[i] = xmlrpc_c::value_double(muMoValVec[i]);
|
| 305 |
+
if (w.size() && (si = params.find("model_name")) != params.end())
|
| 306 |
+
{
|
| 307 |
+
string const model_name = xmlrpc_c::value_string(si->second);
|
| 308 |
+
PhraseDictionaryMultiModel* pdmm
|
| 309 |
+
= (PhraseDictionaryMultiModel*) FindPhraseDictionary(model_name);
|
| 310 |
+
pdmm->SetTemporaryMultiModelWeightsVector(w);
|
| 311 |
+
}
|
| 312 |
+
}
|
| 313 |
+
|
| 314 |
+
si = params.find("context");
|
| 315 |
+
if (si != params.end())
|
| 316 |
+
{
|
| 317 |
+
string context = xmlrpc_c::value_string(si->second);
|
| 318 |
+
VERBOSE(1,"CONTEXT " << context);
|
| 319 |
+
m_context.reset(new std::vector<std::string>(1,context));
|
| 320 |
+
}
|
| 321 |
+
|
| 322 |
+
si = params.find("context-scope");
|
| 323 |
+
if (si != params.end())
|
| 324 |
+
{
|
| 325 |
+
|
| 326 |
+
string context = xmlrpc_c::value_string(si->second);
|
| 327 |
+
|
| 328 |
+
string groupSeparator("Moses::ContextScope::GroupSeparator");
|
| 329 |
+
string recordSeparator("Moses::ContextScope::RecordSeparator");
|
| 330 |
+
|
| 331 |
+
// Here, we assume that any XML-RPC value
|
| 332 |
+
// associated with the key "context-scope"
|
| 333 |
+
// has the following format:
|
| 334 |
+
//
|
| 335 |
+
// FeatureFunctionName followed by recordSeparator
|
| 336 |
+
// followed by the value of interest
|
| 337 |
+
// followed by groupSeparator
|
| 338 |
+
//
|
| 339 |
+
// In the following code, the value of interest will be stored
|
| 340 |
+
// in contextScope under the key FeatureFunctionName,
|
| 341 |
+
// where FeatureFunctionName is the actual name of the feature function
|
| 342 |
+
|
| 343 |
+
boost::shared_ptr<Moses::ContextScope> contextScope = GetScope();
|
| 344 |
+
|
| 345 |
+
BOOST_FOREACH(string group, TokenizeMultiCharSeparator(context, groupSeparator)) {
|
| 346 |
+
|
| 347 |
+
vector<string> record = TokenizeMultiCharSeparator(group, recordSeparator);
|
| 348 |
+
|
| 349 |
+
// Use the feature function whose name is record[0] as a key
|
| 350 |
+
FeatureFunction& ff = Moses::FeatureFunction::FindFeatureFunction(record[0]);
|
| 351 |
+
void const* key = static_cast<void const*>(&ff);
|
| 352 |
+
|
| 353 |
+
// Store (in the context scope) record[1] as the value associated with that key
|
| 354 |
+
boost::shared_ptr<string> value = contextScope->get<string>(key,true);
|
| 355 |
+
value->replace(value->begin(), value->end(), record[1]);
|
| 356 |
+
|
| 357 |
+
}
|
| 358 |
+
}
|
| 359 |
+
|
| 360 |
+
// Report alignment info if Moses config says to or if XML request says to
|
| 361 |
+
m_withAlignInfo = options()->output.ReportSegmentation || check(params, "align");
|
| 362 |
+
|
| 363 |
+
// Report word alignment info if Moses config says to or if XML request says to
|
| 364 |
+
m_withWordAlignInfo = options()->output.PrintAlignmentInfo || check(params, "word-align");
|
| 365 |
+
|
| 366 |
+
si = params.find("weights");
|
| 367 |
+
if (si != params.end())
|
| 368 |
+
{
|
| 369 |
+
|
| 370 |
+
boost::unordered_map<string, FeatureFunction*> map;
|
| 371 |
+
{
|
| 372 |
+
const vector<FeatureFunction*> &ffs = FeatureFunction::GetFeatureFunctions();
|
| 373 |
+
BOOST_FOREACH(FeatureFunction* const& ff, ffs) {
|
| 374 |
+
map[ff->GetScoreProducerDescription()] = ff;
|
| 375 |
+
}
|
| 376 |
+
}
|
| 377 |
+
|
| 378 |
+
string allValues = xmlrpc_c::value_string(si->second);
|
| 379 |
+
|
| 380 |
+
BOOST_FOREACH(string values, TokenizeMultiCharSeparator(allValues, "\t")) {
|
| 381 |
+
|
| 382 |
+
vector<string> record = TokenizeMultiCharSeparator(values, "=");
|
| 383 |
+
|
| 384 |
+
if (record.size() == 2) {
|
| 385 |
+
string featureName = record[0];
|
| 386 |
+
string featureWeights = record[1];
|
| 387 |
+
|
| 388 |
+
boost::unordered_map<string, FeatureFunction*>::iterator ffi = map.find(featureName);
|
| 389 |
+
|
| 390 |
+
if (ffi != map.end()) {
|
| 391 |
+
FeatureFunction* ff = ffi->second;
|
| 392 |
+
|
| 393 |
+
size_t prevNumWeights = ff->GetNumScoreComponents();
|
| 394 |
+
|
| 395 |
+
vector<float> ffWeights;
|
| 396 |
+
BOOST_FOREACH(string weight, TokenizeMultiCharSeparator(featureWeights, " ")) {
|
| 397 |
+
ffWeights.push_back(Scan<float>(weight));
|
| 398 |
+
}
|
| 399 |
+
|
| 400 |
+
if (ffWeights.size() == ff->GetNumScoreComponents()) {
|
| 401 |
+
|
| 402 |
+
// XXX: This is NOT thread-safe
|
| 403 |
+
Moses::StaticData::InstanceNonConst().SetWeights(ff, ffWeights);
|
| 404 |
+
VERBOSE(1, "WARNING: THIS IS NOT THREAD-SAFE!\tUpdating weights for " << featureName << " to " << featureWeights << "\n");
|
| 405 |
+
|
| 406 |
+
} else {
|
| 407 |
+
TRACE_ERR("ERROR: Unable to update weights for " << featureName << " because " << ff->GetNumScoreComponents() << " weights are required but only " << ffWeights.size() << " were provided\n");
|
| 408 |
+
}
|
| 409 |
+
|
| 410 |
+
} else {
|
| 411 |
+
TRACE_ERR("ERROR: No FeatureFunction with name " << featureName << ", no weight update\n");
|
| 412 |
+
}
|
| 413 |
+
|
| 414 |
+
} else {
|
| 415 |
+
TRACE_ERR("WARNING: XML-RPC weights update was improperly formatted:\t" << values << "\n");
|
| 416 |
+
}
|
| 417 |
+
|
| 418 |
+
}
|
| 419 |
+
|
| 420 |
+
}
|
| 421 |
+
|
| 422 |
+
|
| 423 |
+
// // biased sampling for suffix-array-based sampling phrase table?
|
| 424 |
+
// if ((si = params.find("bias")) != params.end())
|
| 425 |
+
// {
|
| 426 |
+
// std::vector<xmlrpc_c::value> tmp
|
| 427 |
+
// = xmlrpc_c::value_array(si->second).cvalue();
|
| 428 |
+
// for (size_t i = 1; i < tmp.size(); i += 2)
|
| 429 |
+
// m_bias[xmlrpc_c::value_int(tmp[i-1])] = xmlrpc_c::value_double(tmp[i]);
|
| 430 |
+
// }
|
| 431 |
+
if (is_syntax(m_options->search.algo)) {
|
| 432 |
+
m_source.reset(new Sentence(m_options,0,m_source_string));
|
| 433 |
+
} else {
|
| 434 |
+
m_source.reset(new Sentence(m_options,0,m_source_string));
|
| 435 |
+
}
|
| 436 |
+
interpret_dlt();
|
| 437 |
+
} // end of Translationtask::parse_request()
|
| 438 |
+
|
| 439 |
+
|
| 440 |
+
void
|
| 441 |
+
TranslationRequest::
|
| 442 |
+
run_chart_decoder()
|
| 443 |
+
{
|
| 444 |
+
Moses::ChartManager manager(this->self());
|
| 445 |
+
manager.Decode();
|
| 446 |
+
|
| 447 |
+
const Moses::ChartHypothesis *hypo = manager.GetBestHypothesis();
|
| 448 |
+
ostringstream out;
|
| 449 |
+
if (hypo) outputChartHypo(out,hypo);
|
| 450 |
+
|
| 451 |
+
m_target_string = out.str();
|
| 452 |
+
m_retData["text"] = xmlrpc_c::value_string(m_target_string);
|
| 453 |
+
|
| 454 |
+
if (m_withGraphInfo) {
|
| 455 |
+
std::ostringstream sgstream;
|
| 456 |
+
manager.OutputSearchGraphMoses(sgstream);
|
| 457 |
+
m_retData["sg"] = xmlrpc_c::value_string(sgstream.str());
|
| 458 |
+
}
|
| 459 |
+
} // end of TranslationRequest::run_chart_decoder()
|
| 460 |
+
|
| 461 |
+
void
|
| 462 |
+
TranslationRequest::
|
| 463 |
+
pack_hypothesis(const Moses::Manager& manager,
|
| 464 |
+
vector<Hypothesis const* > const& edges, string const& key,
|
| 465 |
+
map<string, xmlrpc_c::value> & dest) const
|
| 466 |
+
{
|
| 467 |
+
// target string
|
| 468 |
+
ostringstream target;
|
| 469 |
+
BOOST_REVERSE_FOREACH(Hypothesis const* e, edges) {
|
| 470 |
+
manager.OutputSurface(target, *e);
|
| 471 |
+
}
|
| 472 |
+
XVERBOSE(1, "BEST TRANSLATION: " << *(manager.GetBestHypothesis())
|
| 473 |
+
<< std::endl);
|
| 474 |
+
dest[key] = xmlrpc_c::value_string(target.str());
|
| 475 |
+
|
| 476 |
+
if (m_withAlignInfo) {
|
| 477 |
+
// if (options()->output.ReportSegmentation) {
|
| 478 |
+
// phrase alignment, if requested
|
| 479 |
+
|
| 480 |
+
vector<xmlrpc_c::value> p_aln;
|
| 481 |
+
BOOST_REVERSE_FOREACH(Hypothesis const* e, edges)
|
| 482 |
+
add_phrase_aln_info(*e, p_aln);
|
| 483 |
+
dest["align"] = xmlrpc_c::value_array(p_aln);
|
| 484 |
+
}
|
| 485 |
+
|
| 486 |
+
if (m_withWordAlignInfo) {
|
| 487 |
+
//if (options()->output.PrintAlignmentInfo) {
|
| 488 |
+
// word alignment, if requested
|
| 489 |
+
vector<xmlrpc_c::value> w_aln;
|
| 490 |
+
BOOST_REVERSE_FOREACH(Hypothesis const* e, edges)
|
| 491 |
+
e->OutputLocalWordAlignment(w_aln);
|
| 492 |
+
dest["word-align"] = xmlrpc_c::value_array(w_aln);
|
| 493 |
+
}
|
| 494 |
+
}
|
| 495 |
+
|
| 496 |
+
void
|
| 497 |
+
TranslationRequest::
|
| 498 |
+
pack_hypothesis(const Moses::Manager& manager, Hypothesis const* h, string const& key,
|
| 499 |
+
map<string, xmlrpc_c::value>& dest) const
|
| 500 |
+
{
|
| 501 |
+
using namespace std;
|
| 502 |
+
vector<Hypothesis const*> edges;
|
| 503 |
+
for (; h; h = h->GetPrevHypo())
|
| 504 |
+
edges.push_back(h);
|
| 505 |
+
pack_hypothesis(manager, edges, key, dest);
|
| 506 |
+
}
|
| 507 |
+
|
| 508 |
+
|
| 509 |
+
void
|
| 510 |
+
TranslationRequest::
|
| 511 |
+
run_phrase_decoder()
|
| 512 |
+
{
|
| 513 |
+
Manager manager(this->self());
|
| 514 |
+
manager.Decode();
|
| 515 |
+
pack_hypothesis(manager, manager.GetBestHypothesis(), "text", m_retData);
|
| 516 |
+
if (m_session_id)
|
| 517 |
+
m_retData["session-id"] = xmlrpc_c::value_int(m_session_id);
|
| 518 |
+
|
| 519 |
+
if (m_withGraphInfo) insertGraphInfo(manager,m_retData);
|
| 520 |
+
if (m_withTopts) insertTranslationOptions(manager,m_retData);
|
| 521 |
+
if (m_options->nbest.nbest_size) outputNBest(manager, m_retData);
|
| 522 |
+
|
| 523 |
+
}
|
| 524 |
+
}
|
mosesdecoder/moses/server/Updater.cpp
ADDED
|
@@ -0,0 +1,58 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
// -*- mode: c++; indent-tabs-mode: nil; tab-width:2 -*-
|
| 2 |
+
#include "Updater.h"
|
| 3 |
+
|
| 4 |
+
namespace MosesServer
|
| 5 |
+
{
|
| 6 |
+
using namespace Moses;
|
| 7 |
+
using namespace std;
|
| 8 |
+
|
| 9 |
+
/// Set the documentation strings of the method. Clients can query them
/// with the system.methodSignature and system.methodHelp RPCs.
Updater::
Updater()
{
  this->_help = "Updates stuff";
  this->_signature = "S:S";
}
|
| 18 |
+
|
| 19 |
+
void
|
| 20 |
+
Updater::
|
| 21 |
+
execute(xmlrpc_c::paramList const& paramList,
|
| 22 |
+
xmlrpc_c::value * const retvalP)
|
| 23 |
+
{
|
| 24 |
+
#if PT_UG
|
| 25 |
+
const params_t params = paramList.getStruct(0);
|
| 26 |
+
breakOutParams(params);
|
| 27 |
+
Mmsapt* pdsa = reinterpret_cast<Mmsapt*>(PhraseDictionary::GetColl()[0]);
|
| 28 |
+
pdsa->add(m_src, m_trg, m_aln);
|
| 29 |
+
XVERBOSE(1,"Done inserting\n");
|
| 30 |
+
*retvalP = xmlrpc_c::value_string("Phrase table updated");
|
| 31 |
+
#endif
|
| 32 |
+
};
|
| 33 |
+
|
| 34 |
+
void
|
| 35 |
+
Updater::
|
| 36 |
+
breakOutParams(const params_t& params)
|
| 37 |
+
{
|
| 38 |
+
params_t::const_iterator si = params.find("source");
|
| 39 |
+
if(si == params.end())
|
| 40 |
+
throw xmlrpc_c::fault("Missing source sentence",
|
| 41 |
+
xmlrpc_c::fault::CODE_PARSE);
|
| 42 |
+
m_src = xmlrpc_c::value_string(si->second);
|
| 43 |
+
XVERBOSE(1,"source = " << m_src << endl);
|
| 44 |
+
si = params.find("target");
|
| 45 |
+
if(si == params.end())
|
| 46 |
+
throw xmlrpc_c::fault("Missing target sentence",
|
| 47 |
+
xmlrpc_c::fault::CODE_PARSE);
|
| 48 |
+
m_trg = xmlrpc_c::value_string(si->second);
|
| 49 |
+
XVERBOSE(1,"target = " << m_trg << endl);
|
| 50 |
+
if((si = params.find("alignment")) == params.end())
|
| 51 |
+
throw xmlrpc_c::fault("Missing alignment", xmlrpc_c::fault::CODE_PARSE);
|
| 52 |
+
m_aln = xmlrpc_c::value_string(si->second);
|
| 53 |
+
XVERBOSE(1,"alignment = " << m_aln << endl);
|
| 54 |
+
m_bounded = ((si = params.find("bounded")) != params.end());
|
| 55 |
+
m_add2ORLM = ((si = params.find("updateORLM")) != params.end());
|
| 56 |
+
};
|
| 57 |
+
|
| 58 |
+
}
|
mosesdecoder/moses/server/Updater.h
ADDED
|
@@ -0,0 +1,44 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
// -*- c++ -*-
|
| 2 |
+
#pragma once
|
| 3 |
+
|
| 4 |
+
#include "moses/Util.h"
|
| 5 |
+
#include "moses/ChartManager.h"
|
| 6 |
+
#include "moses/Hypothesis.h"
|
| 7 |
+
#include "moses/Manager.h"
|
| 8 |
+
#include "moses/StaticData.h"
|
| 9 |
+
#include "moses/ThreadPool.h"
|
| 10 |
+
|
| 11 |
+
#if PT_UG
|
| 12 |
+
#include "moses/TranslationModel/UG/mmsapt.h"
|
| 13 |
+
#endif
|
| 14 |
+
|
| 15 |
+
#include <xmlrpc-c/base.hpp>
|
| 16 |
+
#include <xmlrpc-c/registry.hpp>
|
| 17 |
+
#include <xmlrpc-c/server_abyss.hpp>
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
namespace MosesServer
|
| 21 |
+
{
|
| 22 |
+
class
|
| 23 |
+
Updater: public xmlrpc_c::method
|
| 24 |
+
{
|
| 25 |
+
|
| 26 |
+
typedef std::map<std::string, xmlrpc_c::value> params_t;
|
| 27 |
+
|
| 28 |
+
|
| 29 |
+
std::string m_src, m_trg, m_aln;
|
| 30 |
+
bool m_bounded, m_add2ORLM;
|
| 31 |
+
|
| 32 |
+
public:
|
| 33 |
+
Updater();
|
| 34 |
+
|
| 35 |
+
void
|
| 36 |
+
execute(xmlrpc_c::paramList const& paramList,
|
| 37 |
+
xmlrpc_c::value * const retvalP);
|
| 38 |
+
|
| 39 |
+
void
|
| 40 |
+
breakOutParams(const params_t& params);
|
| 41 |
+
|
| 42 |
+
};
|
| 43 |
+
|
| 44 |
+
}
|
mosesdecoder/util/bit_packing_test.cc
ADDED
|
@@ -0,0 +1,59 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#include "util/bit_packing.hh"
|
| 2 |
+
|
| 3 |
+
#define BOOST_TEST_MODULE BitPackingTest
|
| 4 |
+
#include <boost/test/unit_test.hpp>
|
| 5 |
+
|
| 6 |
+
#include <cstring>
|
| 7 |
+
|
| 8 |
+
namespace util {
|
| 9 |
+
namespace {
|
| 10 |
+
|
| 11 |
+
const uint64_t test57 = 0x123456789abcdefULL;
|
| 12 |
+
const uint32_t test25 = 0x1234567;
|
| 13 |
+
|
| 14 |
+
// Write test57 at bit offset 0 of a zeroed buffer and read it back.
BOOST_AUTO_TEST_CASE(ZeroBit57) {
  char mem[16] = {0};
  WriteInt57(mem, 0, 57, test57);
  BOOST_CHECK_EQUAL(test57, ReadInt57(mem, 0, 57, (1ULL << 57) - 1));
}
|
| 20 |
+
|
| 21 |
+
// Round-trip test57 at each starting bit offset 0..7, clearing the buffer
// before every attempt.
BOOST_AUTO_TEST_CASE(EachBit57) {
  char mem[16];
  uint8_t b = 0;
  while (b < 8) {
    memset(mem, 0, sizeof(mem));
    WriteInt57(mem, b, 57, test57);
    BOOST_CHECK_EQUAL(test57, ReadInt57(mem, b, 57, (1ULL << 57) - 1));
    ++b;
  }
}
|
| 29 |
+
|
| 30 |
+
// Pack eight 57-bit values back to back, checking each right after it is
// written, then re-read all of them once every slot has been filled.
BOOST_AUTO_TEST_CASE(Consecutive57) {
  char mem[57+8] = {0};
  for (uint64_t slot = 0; slot != 8; ++slot) {
    const uint64_t b = slot * 57;
    WriteInt57(mem, b, 57, test57);
    BOOST_CHECK_EQUAL(test57, ReadInt57(mem, b, 57, (1ULL << 57) - 1));
  }
  for (uint64_t slot = 0; slot != 8; ++slot) {
    const uint64_t b = slot * 57;
    BOOST_CHECK_EQUAL(test57, ReadInt57(mem, b, 57, (1ULL << 57) - 1));
  }
}
|
| 41 |
+
|
| 42 |
+
// Pack eight 25-bit values back to back, checking each right after it is
// written, then re-read all of them once every slot has been filled.
BOOST_AUTO_TEST_CASE(Consecutive25) {
  char mem[25+8] = {0};
  for (uint64_t slot = 0; slot != 8; ++slot) {
    const uint64_t b = slot * 25;
    WriteInt25(mem, b, 25, test25);
    BOOST_CHECK_EQUAL(test25, ReadInt25(mem, b, 25, (1ULL << 25) - 1));
  }
  for (uint64_t slot = 0; slot != 8; ++slot) {
    const uint64_t b = slot * 25;
    BOOST_CHECK_EQUAL(test25, ReadInt25(mem, b, 25, (1ULL << 25) - 1));
  }
}
|
| 53 |
+
|
| 54 |
+
// Run the library's own self-check; no assertions are made here directly.
BOOST_AUTO_TEST_CASE(Sanity) { BitPackingSanity(); }
|
| 57 |
+
|
| 58 |
+
} // namespace
|
| 59 |
+
} // namespace util
|
mosesdecoder/util/ersatz_progress.hh
ADDED
|
@@ -0,0 +1,57 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#ifndef UTIL_ERSATZ_PROGRESS_H
|
| 2 |
+
#define UTIL_ERSATZ_PROGRESS_H
|
| 3 |
+
|
| 4 |
+
#include <iostream>
|
| 5 |
+
#include <string>
|
| 6 |
+
#include <stdint.h>
|
| 7 |
+
|
| 8 |
+
// Ersatz version of boost::progress so core language model doesn't depend on
|
| 9 |
+
// boost. Also adds option to print nothing.
|
| 10 |
+
|
| 11 |
+
namespace util {
|
| 12 |
+
|
| 13 |
+
extern const char kProgressBanner[];
|
| 14 |
+
|
| 15 |
+
// Progress counter.  Callers advance it with ++, += or Set(); whenever the
// running count reaches next_ the out-of-line Milestone() hook is invoked.
class ErsatzProgress {
  public:
    // Construct a counter that produces no output.
    ErsatzProgress();

    // A null `to` means no output; that makes it convenient to forward an
    // ostream pointer received from another caller unchanged.
    explicit ErsatzProgress(uint64_t complete, std::ostream *to = &std::cerr, const std::string &message = "");

    ~ErsatzProgress();

    // Advance by one unit.
    ErsatzProgress &operator++() {
      ++current_;
      if (current_ >= next_) Milestone();
      return *this;
    }

    // Advance by `amount` units.
    ErsatzProgress &operator+=(uint64_t amount) {
      current_ += amount;
      if (current_ >= next_) Milestone();
      return *this;
    }

    // Jump directly to an absolute count.
    void Set(uint64_t to) {
      current_ = to;
      if (current_ >= next_) Milestone();
    }

    // Jump to the completion count.
    void Finished() { Set(complete_); }

  private:
    void Milestone();

    uint64_t current_;
    uint64_t next_;
    uint64_t complete_;
    unsigned char stones_written_;
    std::ostream *out_;

    // noncopyable: declared but intentionally left without a definition here.
    ErsatzProgress(const ErsatzProgress &other);
    ErsatzProgress &operator=(const ErsatzProgress &other);
};
|
| 54 |
+
|
| 55 |
+
} // namespace util
|
| 56 |
+
|
| 57 |
+
#endif // UTIL_ERSATZ_PROGRESS_H
|
mosesdecoder/util/exception.hh
ADDED
|
@@ -0,0 +1,165 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#ifndef UTIL_EXCEPTION_H
|
| 2 |
+
#define UTIL_EXCEPTION_H
|
| 3 |
+
|
| 4 |
+
#include "util/string_stream.hh"
|
| 5 |
+
|
| 6 |
+
#include <exception>
|
| 7 |
+
#include <limits>
|
| 8 |
+
#include <string>
|
| 9 |
+
#include <stdint.h>
|
| 10 |
+
|
| 11 |
+
// TODO(hieu): delete this
|
| 12 |
+
#include <sstream>
|
| 13 |
+
|
| 14 |
+
namespace util {
|
| 15 |
+
|
| 16 |
+
template <class Except, class Data> typename Except::template ExceptionTag<Except&>::Identity operator<<(Except &e, const Data &data);
|
| 17 |
+
|
| 18 |
+
class Exception : public std::exception {
|
| 19 |
+
public:
|
| 20 |
+
Exception() throw();
|
| 21 |
+
virtual ~Exception() throw();
|
| 22 |
+
|
| 23 |
+
const char *what() const throw() { return what_.str().c_str(); }
|
| 24 |
+
|
| 25 |
+
// For use by the UTIL_THROW macros.
|
| 26 |
+
void SetLocation(
|
| 27 |
+
const char *file,
|
| 28 |
+
unsigned int line,
|
| 29 |
+
const char *func,
|
| 30 |
+
const char *child_name,
|
| 31 |
+
const char *condition);
|
| 32 |
+
|
| 33 |
+
private:
|
| 34 |
+
template <class Except, class Data> friend typename Except::template ExceptionTag<Except&>::Identity operator<<(Except &e, const Data &data);
|
| 35 |
+
|
| 36 |
+
// This helps restrict operator<< defined below.
|
| 37 |
+
template <class T> struct ExceptionTag {
|
| 38 |
+
typedef T Identity;
|
| 39 |
+
};
|
| 40 |
+
|
| 41 |
+
StringStream what_;
|
| 42 |
+
};
|
| 43 |
+
|
| 44 |
+
/* This implements the normal operator<< for Exception and all its children.
|
| 45 |
+
* SFINAE means it only applies to Exception. Think of this as an ersatz
|
| 46 |
+
* boost::enable_if.
|
| 47 |
+
*/
|
| 48 |
+
// Append `data` to the exception's message and hand the same exception back
// so that insertions can be chained.
template <class Except, class Data> typename Except::template ExceptionTag<Except&>::Identity operator<<(Except &e, const Data &data) {
  // TODO(hieu): delete this.
  // The value is rendered through a standard stream first and only the
  // resulting text is forwarded to the exception's own buffer.
  std::stringstream rendered;
  rendered << data;
  e.what_ << rendered.str();
  return e;
}
|
| 55 |
+
|
| 56 |
+
#ifdef __GNUC__
|
| 57 |
+
#define UTIL_FUNC_NAME __PRETTY_FUNCTION__
|
| 58 |
+
#else
|
| 59 |
+
#ifdef _WIN32
|
| 60 |
+
#define UTIL_FUNC_NAME __FUNCTION__
|
| 61 |
+
#else
|
| 62 |
+
#define UTIL_FUNC_NAME NULL
|
| 63 |
+
#endif
|
| 64 |
+
#endif
|
| 65 |
+
|
| 66 |
+
/* Create an instance of Exception, add the message Modify, and throw it.
|
| 67 |
+
* Modify is appended to the what() message and can contain << for ostream
|
| 68 |
+
* operations.
|
| 69 |
+
*
|
| 70 |
+
* do .. while kludge to swallow trailing ; character
|
| 71 |
+
* http://gcc.gnu.org/onlinedocs/cpp/Swallowing-the-Semicolon.html .
|
| 72 |
+
* Arg can be a constructor argument to the exception.
|
| 73 |
+
*/
|
| 74 |
+
#define UTIL_THROW_BACKEND(Condition, Exception, Arg, Modify) do { \
|
| 75 |
+
Exception UTIL_e Arg; \
|
| 76 |
+
UTIL_e.SetLocation(__FILE__, __LINE__, UTIL_FUNC_NAME, #Exception, Condition); \
|
| 77 |
+
UTIL_e << Modify; \
|
| 78 |
+
throw UTIL_e; \
|
| 79 |
+
} while (0)
|
| 80 |
+
|
| 81 |
+
#define UTIL_THROW_ARG(Exception, Arg, Modify) \
|
| 82 |
+
UTIL_THROW_BACKEND(NULL, Exception, Arg, Modify)
|
| 83 |
+
|
| 84 |
+
// Throw Exception (default-constructed) with Modify appended to its message.
// NOTE(review): unlike UTIL_THROW_ARG, this expansion already ends in ';', so
// "UTIL_THROW(...);" yields an extra empty statement and cannot be placed
// directly between an unbraced "if" and its "else". Left untouched in case
// existing callers rely on omitting the semicolon.
#define UTIL_THROW(Exception, Modify) \
|
| 85 |
+
UTIL_THROW_BACKEND(NULL, Exception, , Modify);
|
| 86 |
+
|
| 87 |
+
// Same as UTIL_THROW but always throws util::Exception; the trailing-';'
// caveat described for UTIL_THROW applies here too.
#define UTIL_THROW2(Modify) \
|
| 88 |
+
UTIL_THROW_BACKEND(NULL, util::Exception, , Modify);
|
| 89 |
+
|
| 90 |
+
#if __GNUC__ >= 3
|
| 91 |
+
#define UTIL_UNLIKELY(x) __builtin_expect (!!(x), 0)
|
| 92 |
+
#else
|
| 93 |
+
#define UTIL_UNLIKELY(x) (x)
|
| 94 |
+
#endif
|
| 95 |
+
|
| 96 |
+
#if __GNUC__ >= 3
|
| 97 |
+
#define UTIL_LIKELY(x) __builtin_expect (!!(x), 1)
|
| 98 |
+
#else
|
| 99 |
+
#define UTIL_LIKELY(x) (x)
|
| 100 |
+
#endif
|
| 101 |
+
|
| 102 |
+
#define UTIL_THROW_IF_ARG(Condition, Exception, Arg, Modify) do { \
|
| 103 |
+
if (UTIL_UNLIKELY(Condition)) { \
|
| 104 |
+
UTIL_THROW_BACKEND(#Condition, Exception, Arg, Modify); \
|
| 105 |
+
} \
|
| 106 |
+
} while (0)
|
| 107 |
+
|
| 108 |
+
#define UTIL_THROW_IF(Condition, Exception, Modify) \
|
| 109 |
+
UTIL_THROW_IF_ARG(Condition, Exception, , Modify)
|
| 110 |
+
|
| 111 |
+
#define UTIL_THROW_IF2(Condition, Modify) \
|
| 112 |
+
UTIL_THROW_IF_ARG(Condition, util::Exception, , Modify)
|
| 113 |
+
|
| 114 |
+
// Exception that records errno and adds it to the message.
|
| 115 |
+
// Exception carrying an error number alongside the message.
class ErrnoException : public Exception {
  public:
    ErrnoException() throw();

    virtual ~ErrnoException() throw();

    // The error number stored in this exception.
    int Error() const throw() {
      return errno_;
    }

  private:
    int errno_;
};
|
| 126 |
+
|
| 127 |
+
// file wasn't there, or couldn't be open for some reason
|
| 128 |
+
// Signals that a file was missing or could not be opened for some reason.
class FileOpenException : public Exception {
  public:
    FileOpenException() throw() {
    }

    ~FileOpenException() throw() {
    }
};
|
| 133 |
+
|
| 134 |
+
// Utilities for overflow checking.
|
| 135 |
+
// Raised by the overflow-checking helpers when a value does not fit.
class OverflowException : public Exception {
  public:
    OverflowException() throw();

    ~OverflowException() throw();
};
|
| 140 |
+
|
| 141 |
+
// Generic case: verify that a 64-bit quantity is representable as
// std::size_t before narrowing it.  Throws OverflowException otherwise.
// `len` only selects between this template and its specialization.
template <unsigned len> inline std::size_t CheckOverflowInternal(uint64_t value) {
  UTIL_THROW_IF(value > static_cast<uint64_t>(std::numeric_limits<std::size_t>::max()), OverflowException, "Integer overflow detected. This model is too big for 32-bit code.");
  return static_cast<std::size_t>(value);
}
|
| 145 |
+
|
| 146 |
+
// Specialization selected when len is 8: the value is passed through with
// no range check.
template <> inline std::size_t CheckOverflowInternal<8>(uint64_t value) {
  return static_cast<std::size_t>(value);
}
|
| 149 |
+
|
| 150 |
+
inline std::size_t CheckOverflow(uint64_t value) {
|
| 151 |
+
return CheckOverflowInternal<sizeof(std::size_t)>(value);
|
| 152 |
+
}
|
| 153 |
+
|
| 154 |
+
#if defined(_WIN32) || defined(_WIN64)
|
| 155 |
+
/* Thrown for Windows specific operations. */
|
| 156 |
+
// Exception type reserved for failures of Windows-specific operations.
class WindowsException : public Exception {
  public:
    WindowsException() throw();

    ~WindowsException() throw();
};
|
| 161 |
+
#endif
|
| 162 |
+
|
| 163 |
+
} // namespace util
|
| 164 |
+
|
| 165 |
+
#endif // UTIL_EXCEPTION_H
|
mosesdecoder/util/fake_ostream.hh
ADDED
|
@@ -0,0 +1,111 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#ifndef UTIL_FAKE_OSTREAM_H
|
| 2 |
+
#define UTIL_FAKE_OSTREAM_H
|
| 3 |
+
|
| 4 |
+
#include "util/float_to_string.hh"
|
| 5 |
+
#include "util/integer_to_string.hh"
|
| 6 |
+
#include "util/string_piece.hh"
|
| 7 |
+
|
| 8 |
+
#include <cassert>
|
| 9 |
+
#include <limits>
|
| 10 |
+
|
| 11 |
+
#include <stdint.h>
|
| 12 |
+
|
| 13 |
+
namespace util {
|
| 14 |
+
|
| 15 |
+
/* Like std::ostream but without being incredibly slow.
|
| 16 |
+
* Supports most of the built-in types except for long double.
|
| 17 |
+
*
|
| 18 |
+
* The FakeOStream class is intended to be inherited from. The inheriting class
|
| 19 |
+
* should provide:
|
| 20 |
+
* public:
|
| 21 |
+
* Derived &flush();
|
| 22 |
+
* Derived &write(const void *data, std::size_t length);
|
| 23 |
+
*
|
| 24 |
+
* private: or protected:
|
| 25 |
+
* friend class FakeOStream;
|
| 26 |
+
* char *Ensure(std::size_t amount);
|
| 27 |
+
* void AdvanceTo(char *to);
|
| 28 |
+
*
|
| 29 |
+
* The Ensure function makes enough space for an in-place write and returns
|
| 30 |
+
* where to write. The AdvanceTo function happens after the write, saying how
|
| 31 |
+
* much was actually written.
|
| 32 |
+
*
|
| 33 |
+
* Precondition:
|
| 34 |
+
* amount <= kToStringMaxBytes for in-place writes.
|
| 35 |
+
*/
|
| 36 |
+
template <class Derived> class FakeOStream {
|
| 37 |
+
public:
|
| 38 |
+
FakeOStream() {}
|
| 39 |
+
|
| 40 |
+
// This also covers std::string and char*
|
| 41 |
+
Derived &operator<<(StringPiece str) {
|
| 42 |
+
return C().write(str.data(), str.size());
|
| 43 |
+
}
|
| 44 |
+
|
| 45 |
+
// Handle integers by size and signedness.
|
| 46 |
+
private:
|
| 47 |
+
template <class Arg> struct EnableIfKludge {
|
| 48 |
+
typedef Derived type;
|
| 49 |
+
};
|
| 50 |
+
template <class From, unsigned Length = sizeof(From), bool Signed = std::numeric_limits<From>::is_signed, bool IsInteger = std::numeric_limits<From>::is_integer> struct Coerce {};
|
| 51 |
+
|
| 52 |
+
template <class From> struct Coerce<From, 2, false, true> { typedef uint16_t To; };
|
| 53 |
+
template <class From> struct Coerce<From, 4, false, true> { typedef uint32_t To; };
|
| 54 |
+
template <class From> struct Coerce<From, 8, false, true> { typedef uint64_t To; };
|
| 55 |
+
|
| 56 |
+
template <class From> struct Coerce<From, 2, true, true> { typedef int16_t To; };
|
| 57 |
+
template <class From> struct Coerce<From, 4, true, true> { typedef int32_t To; };
|
| 58 |
+
template <class From> struct Coerce<From, 8, true, true> { typedef int64_t To; };
|
| 59 |
+
public:
|
| 60 |
+
template <class From> typename EnableIfKludge<typename Coerce<From>::To>::type &operator<<(const From value) {
|
| 61 |
+
return CallToString(static_cast<typename Coerce<From>::To>(value));
|
| 62 |
+
}
|
| 63 |
+
|
| 64 |
+
// Character types that get copied as bytes instead of displayed as integers.
|
| 65 |
+
Derived &operator<<(char val) { return put(val); }
|
| 66 |
+
Derived &operator<<(signed char val) { return put(static_cast<char>(val)); }
|
| 67 |
+
Derived &operator<<(unsigned char val) { return put(static_cast<char>(val)); }
|
| 68 |
+
|
| 69 |
+
Derived &operator<<(bool val) { return put(val + '0'); }
|
| 70 |
+
// enums will fall back to int but are not caught by the template.
|
| 71 |
+
Derived &operator<<(int val) { return CallToString(static_cast<typename Coerce<int>::To>(val)); }
|
| 72 |
+
|
| 73 |
+
Derived &operator<<(float val) { return CallToString(val); }
|
| 74 |
+
Derived &operator<<(double val) { return CallToString(val); }
|
| 75 |
+
|
| 76 |
+
// This is here to catch all the other pointer types.
|
| 77 |
+
Derived &operator<<(const void *value) { return CallToString(value); }
|
| 78 |
+
// This is here because the above line also catches const char*.
|
| 79 |
+
Derived &operator<<(const char *value) { return *this << StringPiece(value); }
|
| 80 |
+
Derived &operator<<(char *value) { return *this << StringPiece(value); }
|
| 81 |
+
|
| 82 |
+
Derived &put(char val) {
|
| 83 |
+
char *c = C().Ensure(1);
|
| 84 |
+
*c = val;
|
| 85 |
+
C().AdvanceTo(++c);
|
| 86 |
+
return C();
|
| 87 |
+
}
|
| 88 |
+
|
| 89 |
+
char widen(char val) const { return val; }
|
| 90 |
+
|
| 91 |
+
private:
|
| 92 |
+
// References to derived class for convenience.
|
| 93 |
+
Derived &C() {
|
| 94 |
+
return *static_cast<Derived*>(this);
|
| 95 |
+
}
|
| 96 |
+
|
| 97 |
+
const Derived &C() const {
|
| 98 |
+
return *static_cast<const Derived*>(this);
|
| 99 |
+
}
|
| 100 |
+
|
| 101 |
+
// This is separate to prevent an infinite loop if the compiler considers
|
| 102 |
+
// types the same (i.e. gcc std::size_t and uint64_t or uint32_t).
|
| 103 |
+
template <class T> Derived &CallToString(const T value) {
|
| 104 |
+
C().AdvanceTo(ToString(value, C().Ensure(ToStringBuf<T>::kBytes)));
|
| 105 |
+
return C();
|
| 106 |
+
}
|
| 107 |
+
};
|
| 108 |
+
|
| 109 |
+
} // namespace
|
| 110 |
+
|
| 111 |
+
#endif // UTIL_FAKE_OSTREAM_H
|
mosesdecoder/util/file_piece.hh
ADDED
|
@@ -0,0 +1,175 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#ifndef UTIL_FILE_PIECE_H
|
| 2 |
+
#define UTIL_FILE_PIECE_H
|
| 3 |
+
|
| 4 |
+
#include "util/ersatz_progress.hh"
|
| 5 |
+
#include "util/exception.hh"
|
| 6 |
+
#include "util/file.hh"
|
| 7 |
+
#include "util/mmap.hh"
|
| 8 |
+
#include "util/read_compressed.hh"
|
| 9 |
+
#include "util/string_piece.hh"
|
| 10 |
+
|
| 11 |
+
#include <cstddef>
|
| 12 |
+
#include <iosfwd>
|
| 13 |
+
#include <string>
|
| 14 |
+
#include <cassert>
|
| 15 |
+
#include <stdint.h>
|
| 16 |
+
|
| 17 |
+
namespace util {
|
| 18 |
+
|
| 19 |
+
class ParseNumberException : public Exception {
|
| 20 |
+
public:
|
| 21 |
+
explicit ParseNumberException(StringPiece value) throw();
|
| 22 |
+
~ParseNumberException() throw() {}
|
| 23 |
+
};
|
| 24 |
+
|
| 25 |
+
extern const bool kSpaces[256];
|
| 26 |
+
|
| 27 |
+
// Memory backing the returned StringPiece may vanish on the next call.
|
| 28 |
+
class FilePiece {
|
| 29 |
+
public:
|
| 30 |
+
// 1 MB default.
|
| 31 |
+
explicit FilePiece(const char *file, std::ostream *show_progress = NULL, std::size_t min_buffer = 1048576);
|
| 32 |
+
// Takes ownership of fd. name is used for messages.
|
| 33 |
+
explicit FilePiece(int fd, const char *name = NULL, std::ostream *show_progress = NULL, std::size_t min_buffer = 1048576);
|
| 34 |
+
|
| 35 |
+
/* Read from an istream. Don't use this if you can avoid it. Raw fd IO is
|
| 36 |
+
* much faster. But sometimes you just have an istream like Boost's HTTP
|
| 37 |
+
* server and want to parse it the same way.
|
| 38 |
+
* name is just used for messages and FileName().
|
| 39 |
+
*/
|
| 40 |
+
explicit FilePiece(std::istream &stream, const char *name = NULL, std::size_t min_buffer = 1048576);
|
| 41 |
+
|
| 42 |
+
~FilePiece();
|
| 43 |
+
|
| 44 |
+
char get() {
|
| 45 |
+
if (position_ == position_end_) {
|
| 46 |
+
Shift();
|
| 47 |
+
if (at_end_) throw EndOfFileException();
|
| 48 |
+
}
|
| 49 |
+
return *(position_++);
|
| 50 |
+
}
|
| 51 |
+
|
| 52 |
+
// Leaves the delimiter, if any, to be returned by get(). Delimiters defined by isspace().
|
| 53 |
+
StringPiece ReadDelimited(const bool *delim = kSpaces) {
|
| 54 |
+
SkipSpaces(delim);
|
| 55 |
+
return Consume(FindDelimiterOrEOF(delim));
|
| 56 |
+
}
|
| 57 |
+
|
| 58 |
+
/// Read word until the line or file ends.
|
| 59 |
+
bool ReadWordSameLine(StringPiece &to, const bool *delim = kSpaces) {
|
| 60 |
+
assert(delim[static_cast<unsigned char>('\n')]);
|
| 61 |
+
// Skip non-enter spaces.
|
| 62 |
+
for (; ; ++position_) {
|
| 63 |
+
if (position_ == position_end_) {
|
| 64 |
+
try {
|
| 65 |
+
Shift();
|
| 66 |
+
} catch (const util::EndOfFileException &e) { return false; }
|
| 67 |
+
// And break out at end of file.
|
| 68 |
+
if (position_ == position_end_) return false;
|
| 69 |
+
}
|
| 70 |
+
if (!delim[static_cast<unsigned char>(*position_)]) break;
|
| 71 |
+
if (*position_ == '\n') return false;
|
| 72 |
+
}
|
| 73 |
+
// We can't be at the end of file because there's at least one character open.
|
| 74 |
+
to = Consume(FindDelimiterOrEOF(delim));
|
| 75 |
+
return true;
|
| 76 |
+
}
|
| 77 |
+
|
| 78 |
+
/** Read a line of text from the file.
|
| 79 |
+
*
|
| 80 |
+
* Unlike ReadDelimited, this includes leading spaces and consumes the
|
| 81 |
+
* delimiter. It is similar to getline in that way.
|
| 82 |
+
*
|
| 83 |
+
* If strip_cr is true, any trailing carriate return (as would be found on
|
| 84 |
+
* a file written on Windows) will be left out of the returned line.
|
| 85 |
+
*
|
| 86 |
+
* Throws EndOfFileException if the end of the file is encountered. If the
|
| 87 |
+
* file does not end in a newline, this could mean that the last line is
|
| 88 |
+
* never read.
|
| 89 |
+
*/
|
| 90 |
+
StringPiece ReadLine(char delim = '\n', bool strip_cr = true);
|
| 91 |
+
|
| 92 |
+
/** Read a line of text from the file, or return false on EOF.
|
| 93 |
+
*
|
| 94 |
+
* This is like ReadLine, except it returns false where ReadLine throws
|
| 95 |
+
* EndOfFileException. Like ReadLine it may not read the last line in the
|
| 96 |
+
* file if the file does not end in a newline.
|
| 97 |
+
*
|
| 98 |
+
* If strip_cr is true, any trailing carriate return (as would be found on
|
| 99 |
+
* a file written on Windows) will be left out of the returned line.
|
| 100 |
+
*/
|
| 101 |
+
bool ReadLineOrEOF(StringPiece &to, char delim = '\n', bool strip_cr = true);
|
| 102 |
+
|
| 103 |
+
float ReadFloat();
|
| 104 |
+
double ReadDouble();
|
| 105 |
+
long int ReadLong();
|
| 106 |
+
unsigned long int ReadULong();
|
| 107 |
+
|
| 108 |
+
// Skip spaces defined by isspace.
|
| 109 |
+
void SkipSpaces(const bool *delim = kSpaces) {
|
| 110 |
+
assert(position_ <= position_end_);
|
| 111 |
+
for (; ; ++position_) {
|
| 112 |
+
if (position_ == position_end_) {
|
| 113 |
+
Shift();
|
| 114 |
+
// And break out at end of file.
|
| 115 |
+
if (position_ == position_end_) return;
|
| 116 |
+
}
|
| 117 |
+
assert(position_ < position_end_);
|
| 118 |
+
if (!delim[static_cast<unsigned char>(*position_)]) return;
|
| 119 |
+
}
|
| 120 |
+
}
|
| 121 |
+
|
| 122 |
+
uint64_t Offset() const {
|
| 123 |
+
return position_ - data_.begin() + mapped_offset_;
|
| 124 |
+
}
|
| 125 |
+
|
| 126 |
+
const std::string &FileName() const { return file_name_; }
|
| 127 |
+
|
| 128 |
+
private:
|
| 129 |
+
void InitializeNoRead(const char *name, std::size_t min_buffer);
|
| 130 |
+
// Calls InitializeNoRead, so don't call both.
|
| 131 |
+
void Initialize(const char *name, std::ostream *show_progress, std::size_t min_buffer);
|
| 132 |
+
|
| 133 |
+
template <class T> T ReadNumber();
|
| 134 |
+
|
| 135 |
+
StringPiece Consume(const char *to) {
|
| 136 |
+
assert(to >= position_);
|
| 137 |
+
StringPiece ret(position_, to - position_);
|
| 138 |
+
position_ = to;
|
| 139 |
+
return ret;
|
| 140 |
+
}
|
| 141 |
+
|
| 142 |
+
const char *FindDelimiterOrEOF(const bool *delim = kSpaces);
|
| 143 |
+
|
| 144 |
+
void Shift();
|
| 145 |
+
// Backends to Shift().
|
| 146 |
+
void MMapShift(uint64_t desired_begin);
|
| 147 |
+
|
| 148 |
+
void TransitionToRead();
|
| 149 |
+
void ReadShift();
|
| 150 |
+
|
| 151 |
+
const char *position_, *last_space_, *position_end_;
|
| 152 |
+
|
| 153 |
+
scoped_fd file_;
|
| 154 |
+
const uint64_t total_size_;
|
| 155 |
+
const uint64_t page_;
|
| 156 |
+
|
| 157 |
+
std::size_t default_map_size_;
|
| 158 |
+
uint64_t mapped_offset_;
|
| 159 |
+
|
| 160 |
+
// Order matters: file_ should always be destroyed after this.
|
| 161 |
+
scoped_memory data_;
|
| 162 |
+
|
| 163 |
+
bool at_end_;
|
| 164 |
+
bool fallback_to_read_;
|
| 165 |
+
|
| 166 |
+
ErsatzProgress progress_;
|
| 167 |
+
|
| 168 |
+
std::string file_name_;
|
| 169 |
+
|
| 170 |
+
ReadCompressed fell_back_;
|
| 171 |
+
};
|
| 172 |
+
|
| 173 |
+
} // namespace util
|
| 174 |
+
|
| 175 |
+
#endif // UTIL_FILE_PIECE_H
|
mosesdecoder/util/file_piece_test.cc
ADDED
|
@@ -0,0 +1,154 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
// Tests might fail if you have creative characters in your path. Sue me.
|
| 2 |
+
#include "util/file_piece.hh"
|
| 3 |
+
|
| 4 |
+
#include "util/file_stream.hh"
|
| 5 |
+
#include "util/file.hh"
|
| 6 |
+
#include "util/scoped.hh"
|
| 7 |
+
|
| 8 |
+
#define BOOST_TEST_MODULE FilePieceTest
|
| 9 |
+
#include <boost/test/unit_test.hpp>
|
| 10 |
+
#include <fstream>
|
| 11 |
+
#include <iostream>
|
| 12 |
+
#include <cstdio>
|
| 13 |
+
#include <sys/types.h>
|
| 14 |
+
#include <sys/stat.h>
|
| 15 |
+
|
| 16 |
+
namespace util {
|
| 17 |
+
namespace {
|
| 18 |
+
|
| 19 |
+
std::string FileLocation() {
|
| 20 |
+
if (boost::unit_test::framework::master_test_suite().argc < 2) {
|
| 21 |
+
return "file_piece.cc";
|
| 22 |
+
}
|
| 23 |
+
std::string ret(boost::unit_test::framework::master_test_suite().argv[1]);
|
| 24 |
+
return ret;
|
| 25 |
+
}
|
| 26 |
+
|
| 27 |
+
/* istream */
|
| 28 |
+
// Compare FilePiece reading from an istream against std::getline on the
// same file, line by line, then check that get() keeps throwing at the end.
BOOST_AUTO_TEST_CASE(IStream) {
  const std::string path(FileLocation());
  std::fstream ref(path.c_str(), std::ios::in);
  std::fstream backing(path.c_str(), std::ios::in);
  FilePiece test(backing);
  for (std::string ref_line; getline(ref, ref_line); ) {
    StringPiece test_line(test.ReadLine());
    BOOST_CHECK_EQUAL(ref_line, test_line);
  }
  // Asked twice on purpose: the second call must throw as well.
  BOOST_CHECK_THROW(test.get(), EndOfFileException);
  BOOST_CHECK_THROW(test.get(), EndOfFileException);
}
|
| 40 |
+
|
| 41 |
+
/* mmap implementation */
|
| 42 |
+
// Compare FilePiece opened by file name (minimum buffer of 1) against
// std::getline on the same file.
BOOST_AUTO_TEST_CASE(MMapReadLine) {
  const std::string path(FileLocation());
  std::fstream ref(path.c_str(), std::ios::in);
  FilePiece test(path.c_str(), NULL, 1);
  for (std::string ref_line; getline(ref, ref_line); ) {
    StringPiece test_line(test.ReadLine());
    // I submitted a bug report to ICU: http://bugs.icu-project.org/trac/ticket/7924
    // Pairs where both lines are empty are therefore not compared.
    if (test_line.empty() && ref_line.empty()) continue;
    BOOST_CHECK_EQUAL(ref_line, test_line);
  }
  BOOST_CHECK_THROW(test.get(), EndOfFileException);
}
|
| 55 |
+
|
| 56 |
+
#if !defined(_WIN32) && !defined(_WIN64) && !defined(__APPLE__)
|
| 57 |
+
/* Apple isn't happy with the popen, fileno, dup. And I don't want to
|
| 58 |
+
* reimplement popen. This is an issue with the test.
|
| 59 |
+
*/
|
| 60 |
+
/* read() implementation */
|
| 61 |
+
// Feed FilePiece from the output of `cat` via a duplicated pipe descriptor
// and compare it against std::getline on the same file.
BOOST_AUTO_TEST_CASE(StreamReadLine) {
  const std::string path(FileLocation());
  std::fstream ref(path.c_str(), std::ios::in);

  // cat "<path>"
  const std::string popen_args = "cat \"" + path + '"';

  FILE *catter = popen(popen_args.c_str(), "r");
  BOOST_REQUIRE(catter);

  // FilePiece gets its own duplicate of the descriptor; the FILE* is closed
  // separately with pclose below.
  FilePiece test(dup(fileno(catter)), "file_piece.cc", NULL, 1);
  for (std::string ref_line; getline(ref, ref_line); ) {
    StringPiece test_line(test.ReadLine());
    // I submitted a bug report to ICU: http://bugs.icu-project.org/trac/ticket/7924
    // Pairs where both lines are empty are therefore not compared.
    if (test_line.empty() && ref_line.empty()) continue;
    BOOST_CHECK_EQUAL(ref_line, test_line);
  }
  BOOST_CHECK_THROW(test.get(), EndOfFileException);
  BOOST_REQUIRE(!pclose(catter));
}
|
| 83 |
+
#endif
|
| 84 |
+
|
| 85 |
+
#ifdef HAVE_ZLIB
|
| 86 |
+
|
| 87 |
+
// gzip file
|
| 88 |
+
// Compresses the reference file to "<location>.gz" with the gzip tool, reads
// the compressed copy through FilePiece and compares it line by line with the
// uncompressed original.
BOOST_AUTO_TEST_CASE(PlainZipReadLine) {
  std::string location(FileLocation());
  std::fstream ref(location.c_str(), std::ios::in);

  const std::string zipped_path = location + ".gz";
  const std::string command =
      "gzip <\"" + location + "\" >\"" + location + "\".gz";

  BOOST_REQUIRE_EQUAL(0, system(command.c_str()));
  FilePiece test(zipped_path.c_str(), NULL, 1);
  // The temporary archive is removed as soon as FilePiece has been constructed on it.
  unlink(zipped_path.c_str());
  for (std::string ref_line; getline(ref, ref_line); ) {
    StringPiece test_line(test.ReadLine());
    // I submitted a bug report to ICU: http://bugs.icu-project.org/trac/ticket/7924
    if (test_line.empty() && ref_line.empty()) continue;
    BOOST_CHECK_EQUAL(ref_line, test_line);
  }
  BOOST_CHECK_THROW(test.get(), EndOfFileException);
}
|
| 108 |
+
|
| 109 |
+
// gzip stream. Apple doesn't like popen, fileno, dup. This is an issue with
|
| 110 |
+
// the test.
|
| 111 |
+
#if !defined __APPLE__ && !defined __MINGW32__
|
| 112 |
+
// Pipes gzip-compressed data for the reference file into FilePiece and checks
// that the lines read back equal the uncompressed original.
BOOST_AUTO_TEST_CASE(StreamZipReadLine) {
  std::fstream ref(FileLocation().c_str(), std::ios::in);

  // gzip reads the file on stdin and writes the compressed stream to the pipe.
  const std::string command = "gzip <\"" + (FileLocation() + "\"");

  FILE * catter = popen(command.c_str(), "r");
  BOOST_REQUIRE(catter);

  // FilePiece is handed a duplicate descriptor; catter itself is closed by
  // pclose() at the end of the test.
  FilePiece test(dup(fileno(catter)), "file_piece.cc.gz", NULL, 1);
  for (std::string ref_line; getline(ref, ref_line); ) {
    StringPiece test_line(test.ReadLine());
    // I submitted a bug report to ICU: http://bugs.icu-project.org/trac/ticket/7924
    if (test_line.empty() && ref_line.empty()) continue;
    BOOST_CHECK_EQUAL(ref_line, test_line);
  }
  BOOST_CHECK_THROW(test.get(), EndOfFileException);
  // pclose() returns the exit status of gzip; zero means success.
  BOOST_REQUIRE(!pclose(catter));
}
|
| 133 |
+
#endif // __APPLE__
|
| 134 |
+
|
| 135 |
+
#endif // HAVE_ZLIB
|
| 136 |
+
|
| 137 |
+
// Writes "<too-large integer> <float> 5" to a temporary file and checks the
// numeric readers of FilePiece on it.
BOOST_AUTO_TEST_CASE(Numbers) {
  scoped_fd temp_fd(MakeTemp(FileLocation()));
  const float floating = 3.2;
  {
    // Inner scope: the stream is destroyed before the file is rewound.
    util::FileStream out(temp_fd.get());
    out << "94389483984398493890287 " << floating << " 5";
  }
  SeekOrThrow(temp_fd.get(), 0);
  util::FilePiece f(temp_fd.release());
  // The first token does not fit in an unsigned long, so parsing must fail ...
  BOOST_CHECK_THROW(f.ReadULong(), ParseNumberException);
  // ... and the token can then still be read as plain text.
  BOOST_CHECK_EQUAL("94389483984398493890287", f.ReadDelimited());
  // Yes, exactly equal.  Isn't double-conversion wonderful?
  BOOST_CHECK_EQUAL(floating, f.ReadFloat());
  BOOST_CHECK_EQUAL(5, f.ReadULong());
}
|
| 152 |
+
|
| 153 |
+
} // namespace
|
| 154 |
+
} // namespace util
|
mosesdecoder/util/generator.hh
ADDED
|
@@ -0,0 +1,34 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#pragma once
|
| 2 |
+
|
| 3 |
+
// generator/continuation for C++
|
| 4 |
+
// author: Andrew Fedoniouk @ terrainformatica.com
|
| 5 |
+
// idea borrowed from: "coroutines in C" Simon Tatham,
|
| 6 |
+
// http://www.chiark.greenend.org.uk/~sgtatham/coroutines.html
|
| 7 |
+
// BSD license
|
| 8 |
+
|
| 9 |
+
// CRTP base holding the resumable state of a generator.  T is the concrete
// generator type (declared with $generator) and must be default-constructible
// and copy-assignable, because whole generator objects are saved and restored.
//
// NOTE: the implicitly generated copy operations copy _stack shallowly; _push
// and _pop depend on that.  Copying a generator that still has saved frames
// therefore makes two objects refer to the same frames.
template<typename T>
struct _generator
{
  T* _stack;  // saved frames, most recent first, chained through their own _stack
  int _line;  // resume point: -1 before the first call, otherwise 0 or the
              // __LINE__ recorded by the last $yield / $restart

  _generator():_stack(0), _line(-1) {}

  // Saves a copy of the complete current state as a new top frame.
  void _push()
  {
    T* frame = new T;
    try {
      *frame = *static_cast<T*>(this);
    } catch (...) {
      // The assignment may already have copied our _stack pointer into the
      // frame; detach it so that deleting the frame does not pop our frames.
      frame->_stack = 0;
      delete frame;
      throw;
    }
    _stack = frame;
  }

  // Restores the most recently saved state, if any, and frees its frame.
  // Returns false when no frame is saved.
  bool _pop()
  {
    if(!_stack) return false;
    T* top = _stack;
    *static_cast<T*>(this) = *top;  // also restores the previous _stack link
    top->_stack = 0;                // keep ~_generator of the frame from walking the chain
    delete top;
    return true;
  }

  // Releases every frame that is still saved.
  ~_generator() { while(_pop()); }
};

// Declares a generator type NAME; follow with a { ... } body and a semicolon.
#define $generator(NAME) struct NAME : public _generator<NAME>

// Opens the generator body: defines operator()(T&), which stores the next
// value in its argument and returns true, or returns false when finished.
// Execution resumes at the case label recorded in _line.
#define $emit(T) bool operator()(T& _rv) { \
  if(_line < 0) _line=0; \
  $START: switch(_line) { case 0:;

// Closes the generator body.  When a state saved by $restart is pending it is
// restored and execution continues from it; otherwise the generator is done.
#define $stop } _line = 0; if(_pop()) goto $START; return false; }

// Saves the current state so that it resumes right after this statement, then
// executes WITH and runs the body again from the top.
#define $restart(WITH) { _push(); _stack->_line = __LINE__; _line=0; WITH; goto $START; case __LINE__:; }

// Emits V to the caller and records this source line as the resume point.
// At most one $yield / $restart may appear per source line, since the line
// number is used as the case label.
#define $yield(V) \
  do {\
    _line=__LINE__;\
    _rv = (V); return true; case __LINE__:;\
  } while (0)
|
mosesdecoder/util/getopt.c
ADDED
|
@@ -0,0 +1,78 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/*
|
| 2 |
+
POSIX getopt for Windows
|
| 3 |
+
|
| 4 |
+
AT&T Public License
|
| 5 |
+
|
| 6 |
+
Code given out at the 1985 UNIFORUM conference in Dallas.
|
| 7 |
+
*/
|
| 8 |
+
|
| 9 |
+
#ifndef __GNUC__
|
| 10 |
+
|
| 11 |
+
#include "getopt.hh"
|
| 12 |
+
#include <stdio.h>
|
| 13 |
+
#include <string.h>
|
| 14 |
+
|
| 15 |
+
/* <stdio.h> normally provides both macros; only define them when it did not. */
#ifndef NULL
#define NULL 0
#endif
#ifndef EOF
#define EOF (-1)
#endif

/*
 * Prints "<argv[0]><s><c>" followed by a newline on stderr, unless the caller
 * silenced diagnostics by setting opterr to 0.  Requires `argv` in scope.
 */
#define ERR(s, c) do { \
    if (opterr) { \
      fputs(argv[0], stderr); \
      fputs((s), stderr); \
      fputc((c), stderr); \
      fputc('\n', stderr); \
    } \
  } while (0)

int opterr = 1;   /* non-zero: getopt prints its own diagnostics */
int optind = 1;   /* index of the next argv element to examine */
int optopt;       /* option character examined by the last call */
char *optarg;     /* argument of the last option that takes one, else NULL */

/*
 * Minimal POSIX-style getopt.
 *
 * argc, argv: the program's argument vector.
 * opts:       accepted option letters; a letter followed by ':' takes an
 *             argument (either attached, "-ovalue", or the next argv element).
 *
 * Returns the option letter, '?' for an unknown option or a missing argument
 * (optopt then holds the offending letter), or EOF when the next argument is
 * not an option.  A "--" argument is consumed and ends option parsing.
 */
int
getopt(int argc, char **argv, char *opts)
{
  static int sp = 1;   /* position inside the current "-abc" cluster */
  int c;
  char *cp;

  if (sp == 1) {
    /* Starting a new argv element: decide whether it is an option at all. */
    if (optind >= argc ||
        argv[optind][0] != '-' || argv[optind][1] == '\0')
      return(EOF);
    if (strcmp(argv[optind], "--") == 0) {
      optind++;
      return(EOF);
    }
  }

  optopt = c = argv[optind][sp];
  if (c == ':' || (cp = strchr(opts, c)) == NULL) {
    ERR(": illegal option -- ", c);
    /* Skip the bad letter; move on to the next element if the cluster ended. */
    if (argv[optind][++sp] == '\0') {
      optind++;
      sp = 1;
    }
    return('?');
  }

  if (*++cp == ':') {
    /* Option takes an argument. */
    if (argv[optind][sp+1] != '\0') {
      optarg = &argv[optind++][sp+1];      /* attached: -ovalue */
    } else if (++optind >= argc) {
      ERR(": option requires an argument -- ", c);
      sp = 1;
      return('?');
    } else {
      optarg = argv[optind++];             /* separate: -o value */
    }
    sp = 1;
  } else {
    /* Plain flag: advance within the cluster or to the next element. */
    if (argv[optind][++sp] == '\0') {
      sp = 1;
      optind++;
    }
    optarg = NULL;
  }
  return(c);
}
|
| 77 |
+
|
| 78 |
+
#endif /* __GNUC__ */
|
mosesdecoder/util/integer_to_string_test.cc
ADDED
|
@@ -0,0 +1,81 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#define BOOST_LEXICAL_CAST_ASSUME_C_LOCALE
|
| 2 |
+
#include "util/integer_to_string.hh"
|
| 3 |
+
#include "util/string_piece.hh"
|
| 4 |
+
|
| 5 |
+
#define BOOST_TEST_MODULE IntegerToStringTest
|
| 6 |
+
#include <boost/test/unit_test.hpp>
|
| 7 |
+
#include <boost/lexical_cast.hpp>
|
| 8 |
+
|
| 9 |
+
#include <limits>
|
| 10 |
+
|
| 11 |
+
namespace util {
|
| 12 |
+
namespace {
|
| 13 |
+
|
| 14 |
+
template <class T> void TestValue(const T value) {
|
| 15 |
+
char buf[ToStringBuf<T>::kBytes];
|
| 16 |
+
StringPiece result(buf, ToString(value, buf) - buf);
|
| 17 |
+
BOOST_REQUIRE_GE(static_cast<std::size_t>(ToStringBuf<T>::kBytes), result.size());
|
| 18 |
+
if (value) {
|
| 19 |
+
BOOST_CHECK_EQUAL(boost::lexical_cast<std::string>(value), result);
|
| 20 |
+
} else {
|
| 21 |
+
// Platforms can do void * as 0x0 or 0.
|
| 22 |
+
BOOST_CHECK(result == "0x0" || result == "0");
|
| 23 |
+
}
|
| 24 |
+
}
|
| 25 |
+
|
| 26 |
+
template <class T> void TestCorners() {
|
| 27 |
+
TestValue(std::numeric_limits<T>::min());
|
| 28 |
+
TestValue(std::numeric_limits<T>::max());
|
| 29 |
+
TestValue((T)0);
|
| 30 |
+
TestValue((T)-1);
|
| 31 |
+
TestValue((T)1);
|
| 32 |
+
}
|
| 33 |
+
|
| 34 |
+
BOOST_AUTO_TEST_CASE(Corners) {
|
| 35 |
+
TestCorners<uint16_t>();
|
| 36 |
+
TestCorners<uint32_t>();
|
| 37 |
+
TestCorners<uint64_t>();
|
| 38 |
+
TestCorners<int16_t>();
|
| 39 |
+
TestCorners<int32_t>();
|
| 40 |
+
TestCorners<int64_t>();
|
| 41 |
+
TestCorners<const void*>();
|
| 42 |
+
}
|
| 43 |
+
|
| 44 |
+
template <class T> void TestAll() {
|
| 45 |
+
for (T i = std::numeric_limits<T>::min(); i < std::numeric_limits<T>::max(); ++i) {
|
| 46 |
+
TestValue(i);
|
| 47 |
+
}
|
| 48 |
+
TestValue(std::numeric_limits<T>::max());
|
| 49 |
+
}
|
| 50 |
+
|
| 51 |
+
BOOST_AUTO_TEST_CASE(Short) {
|
| 52 |
+
TestAll<uint16_t>();
|
| 53 |
+
TestAll<int16_t>();
|
| 54 |
+
}
|
| 55 |
+
|
| 56 |
+
template <class T> void Test10s() {
|
| 57 |
+
for (T i = 1; i < std::numeric_limits<T>::max() / 10; i *= 10) {
|
| 58 |
+
TestValue(i);
|
| 59 |
+
TestValue(i - 1);
|
| 60 |
+
TestValue(i + 1);
|
| 61 |
+
}
|
| 62 |
+
}
|
| 63 |
+
|
| 64 |
+
BOOST_AUTO_TEST_CASE(Tens) {
|
| 65 |
+
Test10s<uint64_t>();
|
| 66 |
+
Test10s<int64_t>();
|
| 67 |
+
Test10s<uint32_t>();
|
| 68 |
+
Test10s<int32_t>();
|
| 69 |
+
}
|
| 70 |
+
|
| 71 |
+
// Pointer formatting: powers of ten, then the 256 smallest addresses and the
// same range offset by 0xf00.
BOOST_AUTO_TEST_CASE(Pointers) {
  const uintptr_t limit = std::numeric_limits<uintptr_t>::max() / 10;
  uintptr_t i = 1;
  while (i < limit) {
    TestValue(reinterpret_cast<const void*>(i));
    i *= 10;
  }
  for (uintptr_t low = 0; low != 256; ++low) {
    TestValue(reinterpret_cast<const void*>(low));
    TestValue(reinterpret_cast<const void*>(low + 0xf00));
  }
}
|
| 80 |
+
|
| 81 |
+
}} // namespaces
|