Add files using upload-large-folder tool
Browse filesThis view is limited to 50 files because it contains too many changes.
See raw diff
- .gitattributes +1 -0
- mosesdecoder/defer/Joint.h +139 -0
- mosesdecoder/defer/PhraseDictionaryInterpolated.cpp +186 -0
- mosesdecoder/defer/PhraseLengthFeatureTest.cpp +104 -0
- mosesdecoder/lm/builder/corpus_count.hh +53 -0
- mosesdecoder/lm/builder/dump_counts_main.cc +36 -0
- mosesdecoder/lm/builder/lmplz_main.cc +220 -0
- mosesdecoder/lm/common/CMakeLists.txt +40 -0
- mosesdecoder/lm/common/Jamfile +2 -0
- mosesdecoder/lm/common/joint_order.hh +71 -0
- mosesdecoder/lm/common/ngram.hh +77 -0
- mosesdecoder/lm/common/print.cc +62 -0
- mosesdecoder/lm/common/renumber.cc +17 -0
- mosesdecoder/lm/common/renumber.hh +30 -0
- mosesdecoder/mert/ReferenceTest.cpp +123 -0
- mosesdecoder/mert/ScoreArray.cpp +169 -0
- mosesdecoder/mert/ScoreArray.h +113 -0
- mosesdecoder/mert/Util.h +149 -0
- mosesdecoder/moses/TranslationModel/UG/util/ibm1-align +3 -0
- mosesdecoder/scripts/Jamfile +23 -0
- mosesdecoder/scripts/README +15 -0
- mosesdecoder/scripts/generic/binarize4moses2.perl +88 -0
- mosesdecoder/scripts/generic/bsbleu.py +179 -0
- mosesdecoder/scripts/generic/compound-splitter.perl +295 -0
- mosesdecoder/scripts/generic/extract-factors.pl +24 -0
- mosesdecoder/scripts/generic/extract-parallel.perl +385 -0
- mosesdecoder/scripts/generic/fsa-sample.fsa +10 -0
- mosesdecoder/scripts/generic/fsa2fsal.pl +53 -0
- mosesdecoder/scripts/generic/fsa2plf.pl +182 -0
- mosesdecoder/scripts/generic/generic-parallel.perl +119 -0
- mosesdecoder/scripts/generic/giza-parallel.perl +134 -0
- mosesdecoder/scripts/generic/lopar2pos.pl +20 -0
- mosesdecoder/scripts/generic/moses_sim_pe.py +452 -0
- mosesdecoder/scripts/generic/mteval-v11b.pl +761 -0
- mosesdecoder/scripts/generic/mteval-v12.pl +784 -0
- mosesdecoder/scripts/generic/mteval-v13a.pl +1170 -0
- mosesdecoder/scripts/generic/mteval-v14.pl +1179 -0
- mosesdecoder/scripts/generic/multi-bleu-detok.perl +214 -0
- mosesdecoder/scripts/generic/multi-bleu.perl +177 -0
- mosesdecoder/scripts/generic/multi_moses.py +332 -0
- mosesdecoder/scripts/generic/ph_numbers.perl +106 -0
- mosesdecoder/scripts/generic/reverse-alignment.perl +24 -0
- mosesdecoder/scripts/generic/score-parallel.perl +428 -0
- mosesdecoder/scripts/generic/score_parallel.py +776 -0
- mosesdecoder/scripts/generic/strip-xml.perl +48 -0
- mosesdecoder/scripts/generic/trainlm-irst2.perl +72 -0
- mosesdecoder/scripts/share/nonbreaking_prefixes/README.txt +8 -0
- mosesdecoder/scripts/share/nonbreaking_prefixes/nonbreaking_prefix.as +65 -0
- mosesdecoder/scripts/share/nonbreaking_prefixes/nonbreaking_prefix.ca +75 -0
- mosesdecoder/scripts/share/nonbreaking_prefixes/nonbreaking_prefix.cs +390 -0
.gitattributes
CHANGED
|
@@ -37,3 +37,4 @@ fairseq-0.10.2/fairseq/libbleu.cpython-310-x86_64-linux-gnu.so filter=lfs diff=l
|
|
| 37 |
fairseq-0.10.2/fairseq/data/data_utils_fast.cpython-310-x86_64-linux-gnu.so filter=lfs diff=lfs merge=lfs -text
|
| 38 |
fairseq-0.10.2/fairseq/data/token_block_utils_fast.cpython-310-x86_64-linux-gnu.so filter=lfs diff=lfs merge=lfs -text
|
| 39 |
fairseq-0.10.2/docs/fairseq.gif filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
| 37 |
fairseq-0.10.2/fairseq/data/data_utils_fast.cpython-310-x86_64-linux-gnu.so filter=lfs diff=lfs merge=lfs -text
|
| 38 |
fairseq-0.10.2/fairseq/data/token_block_utils_fast.cpython-310-x86_64-linux-gnu.so filter=lfs diff=lfs merge=lfs -text
|
| 39 |
fairseq-0.10.2/docs/fairseq.gif filter=lfs diff=lfs merge=lfs -text
|
| 40 |
+
mosesdecoder/moses/TranslationModel/UG/util/ibm1-align filter=lfs diff=lfs merge=lfs -text
|
mosesdecoder/defer/Joint.h
ADDED
|
@@ -0,0 +1,139 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
// $Id$
|
| 2 |
+
|
| 3 |
+
/***********************************************************************
|
| 4 |
+
Moses - factored phrase-based language decoder
|
| 5 |
+
Copyright (C) 2006 University of Edinburgh
|
| 6 |
+
|
| 7 |
+
This library is free software; you can redistribute it and/or
|
| 8 |
+
modify it under the terms of the GNU Lesser General Public
|
| 9 |
+
License as published by the Free Software Foundation; either
|
| 10 |
+
version 2.1 of the License, or (at your option) any later version.
|
| 11 |
+
|
| 12 |
+
This library is distributed in the hope that it will be useful,
|
| 13 |
+
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
| 14 |
+
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
| 15 |
+
Lesser General Public License for more details.
|
| 16 |
+
|
| 17 |
+
You should have received a copy of the GNU Lesser General Public
|
| 18 |
+
License along with this library; if not, write to the Free Software
|
| 19 |
+
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
| 20 |
+
***********************************************************************/
|
| 21 |
+
|
| 22 |
+
#ifndef moses_LanguageModelJoint_h
|
| 23 |
+
#define moses_LanguageModelJoint_h
|
| 24 |
+
|
| 25 |
+
#include <vector>
|
| 26 |
+
#include <string>
|
| 27 |
+
#include <sstream>
|
| 28 |
+
#include "SingleFactor.h"
|
| 29 |
+
#include "MultiFactor.h"
|
| 30 |
+
#include "moses/Word.h"
|
| 31 |
+
#include "moses/FactorTypeSet.h"
|
| 32 |
+
#include "moses/FactorCollection.h"
|
| 33 |
+
|
| 34 |
+
namespace Moses
|
| 35 |
+
{
|
| 36 |
+
|
| 37 |
+
class Phrase;
|
| 38 |
+
class FactorCollection;
|
| 39 |
+
|
| 40 |
+
/** LM of multiple factors. A simple extension of single factor LM - factors backoff together.
|
| 41 |
+
* Rather slow as this uses string concatenation/split.
|
| 42 |
+
* Not used for a long time
|
| 43 |
+
*/
|
| 44 |
+
class LanguageModelJoint : public LanguageModelMultiFactor
|
| 45 |
+
{
|
| 46 |
+
protected:
|
| 47 |
+
LanguageModelSingleFactor *m_lmImpl;
|
| 48 |
+
std::vector<FactorType> m_factorTypesOrdered;
|
| 49 |
+
|
| 50 |
+
size_t m_implFactor;
|
| 51 |
+
public:
|
| 52 |
+
LanguageModelJoint(const std::string &line, LanguageModelSingleFactor *lmImpl)
|
| 53 |
+
:LanguageModelMultiFactor(line) {
|
| 54 |
+
m_lmImpl = lmImpl;
|
| 55 |
+
}
|
| 56 |
+
|
| 57 |
+
~LanguageModelJoint() {
|
| 58 |
+
delete m_lmImpl;
|
| 59 |
+
}
|
| 60 |
+
|
| 61 |
+
bool Load(AllOptions const& opts, const std::string &filePath
|
| 62 |
+
, const std::vector<FactorType> &factorTypes
|
| 63 |
+
, size_t nGramOrder) {
|
| 64 |
+
m_factorTypes = FactorMask(factorTypes);
|
| 65 |
+
m_filePath = filePath;
|
| 66 |
+
m_nGramOrder = nGramOrder;
|
| 67 |
+
|
| 68 |
+
m_factorTypesOrdered= factorTypes;
|
| 69 |
+
m_implFactor = 0;
|
| 70 |
+
|
| 71 |
+
FactorCollection &factorCollection = FactorCollection::Instance();
|
| 72 |
+
|
| 73 |
+
// sentence markers
|
| 74 |
+
for (size_t index = 0 ; index < factorTypes.size() ; ++index) {
|
| 75 |
+
FactorType factorType = factorTypes[index];
|
| 76 |
+
m_sentenceStartWord[factorType] = factorCollection.AddFactor(Output, factorType, BOS_);
|
| 77 |
+
m_sentenceEndWord[factorType] = factorCollection.AddFactor(Output, factorType, EOS_);
|
| 78 |
+
}
|
| 79 |
+
|
| 80 |
+
m_lmImpl->Load(AllOptions const& opts);
|
| 81 |
+
}
|
| 82 |
+
|
| 83 |
+
LMResult GetValueForgotState(const std::vector<const Word*> &contextFactor, FFState &outState) const {
|
| 84 |
+
if (contextFactor.size() == 0) {
|
| 85 |
+
LMResult ret;
|
| 86 |
+
ret.score = 0.0;
|
| 87 |
+
ret.unknown = false;
|
| 88 |
+
return ret;
|
| 89 |
+
}
|
| 90 |
+
|
| 91 |
+
// joint context for internal LM
|
| 92 |
+
std::vector<const Word*> jointContext;
|
| 93 |
+
|
| 94 |
+
for (size_t currPos = 0 ; currPos < m_nGramOrder ; ++currPos ) {
|
| 95 |
+
const Word &word = *contextFactor[currPos];
|
| 96 |
+
|
| 97 |
+
// add word to chunked context
|
| 98 |
+
std::stringstream stream("");
|
| 99 |
+
|
| 100 |
+
const Factor *factor = word[ m_factorTypesOrdered[0] ];
|
| 101 |
+
stream << factor->GetString();
|
| 102 |
+
|
| 103 |
+
for (size_t index = 1 ; index < m_factorTypesOrdered.size() ; ++index) {
|
| 104 |
+
FactorType factorType = m_factorTypesOrdered[index];
|
| 105 |
+
const Factor *factor = word[factorType];
|
| 106 |
+
stream << "|" << factor->GetString();
|
| 107 |
+
}
|
| 108 |
+
|
| 109 |
+
factor = FactorCollection::Instance().AddFactor(Output, m_implFactor, stream.str());
|
| 110 |
+
|
| 111 |
+
Word* jointWord = new Word;
|
| 112 |
+
jointWord->SetFactor(m_implFactor, factor);
|
| 113 |
+
jointContext.push_back(jointWord);
|
| 114 |
+
}
|
| 115 |
+
|
| 116 |
+
// calc score on chunked phrase
|
| 117 |
+
LMResult ret = m_lmImpl->GetValueForgotState(jointContext, outState);
|
| 118 |
+
|
| 119 |
+
RemoveAllInColl(jointContext);
|
| 120 |
+
|
| 121 |
+
return ret;
|
| 122 |
+
}
|
| 123 |
+
|
| 124 |
+
const FFState *GetNullContextState() const {
|
| 125 |
+
return m_lmImpl->GetNullContextState();
|
| 126 |
+
}
|
| 127 |
+
|
| 128 |
+
const FFState *GetBeginSentenceState() const {
|
| 129 |
+
return m_lmImpl->GetBeginSentenceState();
|
| 130 |
+
}
|
| 131 |
+
|
| 132 |
+
FFState *NewState(const FFState *from) const {
|
| 133 |
+
return m_lmImpl->NewState(from);
|
| 134 |
+
}
|
| 135 |
+
|
| 136 |
+
};
|
| 137 |
+
|
| 138 |
+
}
|
| 139 |
+
#endif
|
mosesdecoder/defer/PhraseDictionaryInterpolated.cpp
ADDED
|
@@ -0,0 +1,186 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/***********************************************************************
|
| 2 |
+
Moses - factored phrase-based language decoder
|
| 3 |
+
Copyright (C) 2013- University of Edinburgh
|
| 4 |
+
|
| 5 |
+
This library is free software; you can redistribute it and/or
|
| 6 |
+
modify it under the terms of the GNU Lesser General Public
|
| 7 |
+
License as published by the Free Software Foundation; either
|
| 8 |
+
version 2.1 of the License, or (at your option) any later version.
|
| 9 |
+
|
| 10 |
+
This library is distributed in the hope that it will be useful,
|
| 11 |
+
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
| 12 |
+
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
| 13 |
+
Lesser General Public License for more details.
|
| 14 |
+
|
| 15 |
+
You should have received a copy of the GNU Lesser General Public
|
| 16 |
+
License along with this library; if not, write to the Free Software
|
| 17 |
+
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
| 18 |
+
***********************************************************************/
|
| 19 |
+
|
| 20 |
+
#include <boost/lexical_cast.hpp>
|
| 21 |
+
#include <boost/unordered_set.hpp>
|
| 22 |
+
|
| 23 |
+
#include "util/exception.hh"
|
| 24 |
+
#include "util/tokenize_piece.hh"
|
| 25 |
+
#include "moses/TranslationModel/PhraseDictionaryInterpolated.h"
|
| 26 |
+
|
| 27 |
+
using namespace std;
|
| 28 |
+
|
| 29 |
+
namespace Moses
|
| 30 |
+
{
|
| 31 |
+
|
| 32 |
+
PhraseDictionaryInterpolated::PhraseDictionaryInterpolated
|
| 33 |
+
(size_t numScoreComponent,size_t numInputScores,const PhraseDictionaryFeature* feature):
|
| 34 |
+
PhraseDictionary(numScoreComponent,feature),
|
| 35 |
+
m_targetPhrases(NULL),
|
| 36 |
+
m_languageModels(NULL) {}
|
| 37 |
+
|
| 38 |
+
bool PhraseDictionaryInterpolated::Load(
|
| 39 |
+
const std::vector<FactorType> &input
|
| 40 |
+
, const std::vector<FactorType> &output
|
| 41 |
+
, const std::vector<std::string>& config
|
| 42 |
+
, const std::vector<float> &weightT
|
| 43 |
+
, size_t tableLimit
|
| 44 |
+
, const LMList &languageModels
|
| 45 |
+
, float weightWP)
|
| 46 |
+
{
|
| 47 |
+
|
| 48 |
+
m_languageModels = &languageModels;
|
| 49 |
+
m_weightT = weightT;
|
| 50 |
+
m_tableLimit = tableLimit;
|
| 51 |
+
m_weightWP = weightWP;
|
| 52 |
+
|
| 53 |
+
//The config should be as follows:
|
| 54 |
+
//0-3: type factor factor num-components (as usual)
|
| 55 |
+
//4: combination mode (e.g. naive)
|
| 56 |
+
//5-(length-2): List of phrase-table files
|
| 57 |
+
//length-1: Weight string, in the same format as used for tmcombine
|
| 58 |
+
|
| 59 |
+
UTIL_THROW_IF(config.size() < 7, util::Exception, "Missing fields from phrase table configuration: expected at least 7");
|
| 60 |
+
UTIL_THROW_IF(config[4] != "naive", util::Exception, "Unsupported combination mode: '" << config[4] << "'");
|
| 61 |
+
|
| 62 |
+
// Create the dictionaries
|
| 63 |
+
for (size_t i = 5; i < config.size()-1; ++i) {
|
| 64 |
+
m_dictionaries.push_back(DictionaryHandle(new PhraseDictionaryTreeAdaptor(
|
| 65 |
+
GetFeature()->GetNumScoreComponents(),
|
| 66 |
+
GetFeature()->GetNumInputScores(),
|
| 67 |
+
GetFeature())));
|
| 68 |
+
bool ret = m_dictionaries.back()->Load(
|
| 69 |
+
input,
|
| 70 |
+
output,
|
| 71 |
+
config[i],
|
| 72 |
+
weightT,
|
| 73 |
+
0,
|
| 74 |
+
languageModels,
|
| 75 |
+
weightWP);
|
| 76 |
+
if (!ret) return ret;
|
| 77 |
+
}
|
| 78 |
+
|
| 79 |
+
//Parse the weight strings
|
| 80 |
+
for (util::TokenIter<util::SingleCharacter, false> featureWeights(config.back(), util::SingleCharacter(';')); featureWeights; ++featureWeights) {
|
| 81 |
+
m_weights.push_back(vector<float>());
|
| 82 |
+
float sum = 0;
|
| 83 |
+
for (util::TokenIter<util::SingleCharacter, false> tableWeights(*featureWeights, util::SingleCharacter(',')); tableWeights; ++tableWeights) {
|
| 84 |
+
const float weight = boost::lexical_cast<float>(*tableWeights);
|
| 85 |
+
m_weights.back().push_back(weight);
|
| 86 |
+
sum += weight;
|
| 87 |
+
}
|
| 88 |
+
UTIL_THROW_IF(m_weights.back().size() != m_dictionaries.size(), util::Exception,
|
| 89 |
+
"Number of weights (" << m_weights.back().size() <<
|
| 90 |
+
") does not match number of dictionaries to combine (" << m_dictionaries.size() << ")");
|
| 91 |
+
UTIL_THROW_IF(abs(sum - 1) > 0.01, util::Exception, "Weights not normalised");
|
| 92 |
+
|
| 93 |
+
}
|
| 94 |
+
|
| 95 |
+
//check number of weight sets. Make sure there is a weight for every score component
|
| 96 |
+
//except for the last - which is assumed to be the phrase penalty.
|
| 97 |
+
UTIL_THROW_IF(m_weights.size() != 1 && m_weights.size() != GetFeature()->GetNumScoreComponents()-1, util::Exception, "Unexpected number of weight sets");
|
| 98 |
+
//if 1 weight set, then repeat
|
| 99 |
+
if (m_weights.size() == 1) {
|
| 100 |
+
while(m_weights.size() < GetFeature()->GetNumScoreComponents()-1) {
|
| 101 |
+
m_weights.push_back(m_weights[0]);
|
| 102 |
+
}
|
| 103 |
+
}
|
| 104 |
+
|
| 105 |
+
return true;
|
| 106 |
+
}
|
| 107 |
+
|
| 108 |
+
void PhraseDictionaryInterpolated::InitializeForInput(ttasksptr const& ttask)
|
| 109 |
+
{
|
| 110 |
+
for (size_t i = 0; i < m_dictionaries.size(); ++i) {
|
| 111 |
+
m_dictionaries[i]->InitializeForInput(ttask);
|
| 112 |
+
}
|
| 113 |
+
}
|
| 114 |
+
|
| 115 |
+
typedef
|
| 116 |
+
boost::unordered_set<TargetPhrase*,PhrasePtrHasher,PhrasePtrComparator> PhraseSet;
|
| 117 |
+
|
| 118 |
+
|
| 119 |
+
TargetPhraseCollection::shared_ptr
|
| 120 |
+
PhraseDictionaryInterpolated::GetTargetPhraseCollection(const Phrase& src) const
|
| 121 |
+
{
|
| 122 |
+
|
| 123 |
+
delete m_targetPhrases;
|
| 124 |
+
m_targetPhrases = new TargetPhraseCollection();
|
| 125 |
+
PhraseSet allPhrases;
|
| 126 |
+
vector<PhraseSet> phrasesByTable(m_dictionaries.size());
|
| 127 |
+
for (size_t i = 0; i < m_dictionaries.size(); ++i) {
|
| 128 |
+
TargetPhraseCollection::shared_ptr phrases = m_dictionaries[i]->GetTargetPhraseCollection(src);
|
| 129 |
+
if (phrases) {
|
| 130 |
+
for (TargetPhraseCollection::const_iterator j = phrases->begin();
|
| 131 |
+
j != phrases->end(); ++j) {
|
| 132 |
+
allPhrases.insert(*j);
|
| 133 |
+
phrasesByTable[i].insert(*j);
|
| 134 |
+
}
|
| 135 |
+
}
|
| 136 |
+
}
|
| 137 |
+
ScoreComponentCollection sparseVector;
|
| 138 |
+
for (PhraseSet::const_iterator i = allPhrases.begin(); i != allPhrases.end(); ++i) {
|
| 139 |
+
TargetPhrase* combinedPhrase = new TargetPhrase((Phrase)**i);
|
| 140 |
+
//combinedPhrase->ResetScore();
|
| 141 |
+
//cerr << *combinedPhrase << " " << combinedPhrase->GetScoreBreakdown() << endl;
|
| 142 |
+
combinedPhrase->SetSourcePhrase((*i)->GetSourcePhrase());
|
| 143 |
+
combinedPhrase->SetAlignTerm(&((*i)->GetAlignTerm()));
|
| 144 |
+
combinedPhrase->SetAlignNonTerm(&((*i)->GetAlignTerm()));
|
| 145 |
+
Scores combinedScores(GetFeature()->GetNumScoreComponents());
|
| 146 |
+
for (size_t j = 0; j < phrasesByTable.size(); ++j) {
|
| 147 |
+
PhraseSet::const_iterator tablePhrase = phrasesByTable[j].find(combinedPhrase);
|
| 148 |
+
if (tablePhrase != phrasesByTable[j].end()) {
|
| 149 |
+
Scores tableScores = (*tablePhrase)->GetScoreBreakdown()
|
| 150 |
+
.GetScoresForProducer(GetFeature());
|
| 151 |
+
//cerr << "Scores from " << j << " table: ";
|
| 152 |
+
for (size_t k = 0; k < tableScores.size()-1; ++k) {
|
| 153 |
+
//cerr << tableScores[k] << "(" << exp(tableScores[k]) << ") ";
|
| 154 |
+
combinedScores[k] += m_weights[k][j] * exp(tableScores[k]);
|
| 155 |
+
//cerr << m_weights[k][j] * exp(tableScores[k]) << " ";
|
| 156 |
+
}
|
| 157 |
+
//cerr << endl;
|
| 158 |
+
}
|
| 159 |
+
}
|
| 160 |
+
//map back to log space
|
| 161 |
+
//cerr << "Combined ";
|
| 162 |
+
for (size_t k = 0; k < combinedScores.size()-1; ++k) {
|
| 163 |
+
//cerr << combinedScores[k] << " ";
|
| 164 |
+
combinedScores[k] = log(combinedScores[k]);
|
| 165 |
+
//cerr << combinedScores[k] << " ";
|
| 166 |
+
}
|
| 167 |
+
//cerr << endl;
|
| 168 |
+
combinedScores.back() = 1; //assume last is penalty
|
| 169 |
+
combinedPhrase->SetScore(
|
| 170 |
+
GetFeature(),
|
| 171 |
+
combinedScores,
|
| 172 |
+
sparseVector,
|
| 173 |
+
m_weightT,
|
| 174 |
+
m_weightWP,
|
| 175 |
+
*m_languageModels);
|
| 176 |
+
//cerr << *combinedPhrase << " " << combinedPhrase->GetScoreBreakdown() << endl;
|
| 177 |
+
m_targetPhrases->Add(combinedPhrase);
|
| 178 |
+
}
|
| 179 |
+
|
| 180 |
+
m_targetPhrases->Prune(true,m_tableLimit);
|
| 181 |
+
|
| 182 |
+
|
| 183 |
+
return m_targetPhrases;
|
| 184 |
+
}
|
| 185 |
+
|
| 186 |
+
}
|
mosesdecoder/defer/PhraseLengthFeatureTest.cpp
ADDED
|
@@ -0,0 +1,104 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/***********************************************************************
|
| 2 |
+
Moses - factored phrase-based language decoder
|
| 3 |
+
Copyright (C) 2010 University of Edinburgh
|
| 4 |
+
|
| 5 |
+
This library is free software; you can redistribute it and/or
|
| 6 |
+
modify it under the terms of the GNU Lesser General Public
|
| 7 |
+
License as published by the Free Software Foundation; either
|
| 8 |
+
version 2.1 of the License, or (at your option) any later version.
|
| 9 |
+
|
| 10 |
+
This library is distributed in the hope that it will be useful,
|
| 11 |
+
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
| 12 |
+
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
| 13 |
+
Lesser General Public License for more details.
|
| 14 |
+
|
| 15 |
+
You should have received a copy of the GNU Lesser General Public
|
| 16 |
+
License along with this library; if not, write to the Free Software
|
| 17 |
+
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
| 18 |
+
***********************************************************************/
|
| 19 |
+
#include <boost/test/unit_test.hpp>
|
| 20 |
+
|
| 21 |
+
#include "moses/FF/PhraseLengthFeature.h"
|
| 22 |
+
#include "moses/FactorCollection.h"
|
| 23 |
+
#include "moses/Sentence.h"
|
| 24 |
+
#include "moses/TargetPhrase.h"
|
| 25 |
+
#include "moses/TranslationOption.h"
|
| 26 |
+
|
| 27 |
+
using namespace Moses;
|
| 28 |
+
using namespace std;
|
| 29 |
+
|
| 30 |
+
BOOST_AUTO_TEST_SUITE(phrase_length_feature)
|
| 31 |
+
|
| 32 |
+
//TODO: Factor out setup code so that it can be reused
|
| 33 |
+
|
| 34 |
+
static Word MakeWord(string text)
|
| 35 |
+
{
|
| 36 |
+
FactorCollection &factorCollection = FactorCollection::Instance();
|
| 37 |
+
const Factor* f = factorCollection.AddFactor(Input,0,text);
|
| 38 |
+
Word w;
|
| 39 |
+
w.SetFactor(0,f);
|
| 40 |
+
return w;
|
| 41 |
+
}
|
| 42 |
+
|
| 43 |
+
|
| 44 |
+
BOOST_AUTO_TEST_CASE(evaluate)
|
| 45 |
+
{
|
| 46 |
+
Word w1 = MakeWord("w1");
|
| 47 |
+
Word w2 = MakeWord("y2");
|
| 48 |
+
Word w3 = MakeWord("x3");
|
| 49 |
+
Word w4 = MakeWord("w4");
|
| 50 |
+
|
| 51 |
+
Phrase p1;
|
| 52 |
+
p1.AddWord(w1);
|
| 53 |
+
p1.AddWord(w3);
|
| 54 |
+
p1.AddWord(w4);
|
| 55 |
+
|
| 56 |
+
Phrase p2;
|
| 57 |
+
p2.AddWord(w1);
|
| 58 |
+
p2.AddWord(w2);
|
| 59 |
+
|
| 60 |
+
Phrase p3;
|
| 61 |
+
p3.AddWord(w2);
|
| 62 |
+
p3.AddWord(w1);
|
| 63 |
+
p3.AddWord(w4);
|
| 64 |
+
p3.AddWord(w4);
|
| 65 |
+
|
| 66 |
+
TargetPhrase tp1(p1);
|
| 67 |
+
TargetPhrase tp2(p2);
|
| 68 |
+
TargetPhrase tp3(p3);
|
| 69 |
+
|
| 70 |
+
Sentence sentence;
|
| 71 |
+
vector<FactorType> order;
|
| 72 |
+
order.push_back(0);
|
| 73 |
+
stringstream in("the input sentence has 6 words");
|
| 74 |
+
sentence.Read(in, order);
|
| 75 |
+
|
| 76 |
+
TranslationOption topt1(WordsRange(0,0), tp1);
|
| 77 |
+
TranslationOption topt2(WordsRange(1,3), tp2);
|
| 78 |
+
TranslationOption topt3(WordsRange(2,3), tp3);
|
| 79 |
+
|
| 80 |
+
PhraseBasedFeatureContext context1(topt1,sentence);
|
| 81 |
+
PhraseBasedFeatureContext context2(topt2,sentence);
|
| 82 |
+
PhraseBasedFeatureContext context3(topt3,sentence);
|
| 83 |
+
|
| 84 |
+
PhraseLengthFeature plf;
|
| 85 |
+
|
| 86 |
+
ScoreComponentCollection acc1,acc2,acc3;
|
| 87 |
+
|
| 88 |
+
plf.Evaluate(context1, &acc1);
|
| 89 |
+
BOOST_CHECK_EQUAL(acc1.GetScoreForProducer(&plf, "s1"),1);
|
| 90 |
+
BOOST_CHECK_EQUAL(acc1.GetScoreForProducer(&plf, "t3"),1);
|
| 91 |
+
BOOST_CHECK_EQUAL(acc1.GetScoreForProducer(&plf, "1,3"),1);
|
| 92 |
+
|
| 93 |
+
plf.Evaluate(context2, &acc2);
|
| 94 |
+
BOOST_CHECK_EQUAL(acc2.GetScoreForProducer(&plf, "s3"),1);
|
| 95 |
+
BOOST_CHECK_EQUAL(acc2.GetScoreForProducer(&plf, "t2"),1);
|
| 96 |
+
BOOST_CHECK_EQUAL(acc2.GetScoreForProducer(&plf, "3,2"),1);
|
| 97 |
+
|
| 98 |
+
plf.Evaluate(context3, &acc3);
|
| 99 |
+
BOOST_CHECK_EQUAL(acc3.GetScoreForProducer(&plf, "s2"),1);
|
| 100 |
+
BOOST_CHECK_EQUAL(acc3.GetScoreForProducer(&plf, "t4"),1);
|
| 101 |
+
BOOST_CHECK_EQUAL(acc3.GetScoreForProducer(&plf, "2,4"),1);
|
| 102 |
+
}
|
| 103 |
+
|
| 104 |
+
BOOST_AUTO_TEST_SUITE_END()
|
mosesdecoder/lm/builder/corpus_count.hh
ADDED
|
@@ -0,0 +1,53 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#ifndef LM_BUILDER_CORPUS_COUNT_H
|
| 2 |
+
#define LM_BUILDER_CORPUS_COUNT_H
|
| 3 |
+
|
| 4 |
+
#include "lm/lm_exception.hh"
|
| 5 |
+
#include "lm/word_index.hh"
|
| 6 |
+
#include "util/scoped.hh"
|
| 7 |
+
|
| 8 |
+
#include <cstddef>
|
| 9 |
+
#include <string>
|
| 10 |
+
#include <stdint.h>
|
| 11 |
+
#include <vector>
|
| 12 |
+
|
| 13 |
+
namespace util {
|
| 14 |
+
class FilePiece;
|
| 15 |
+
namespace stream {
|
| 16 |
+
class ChainPosition;
|
| 17 |
+
} // namespace stream
|
| 18 |
+
} // namespace util
|
| 19 |
+
|
| 20 |
+
namespace lm {
|
| 21 |
+
namespace builder {
|
| 22 |
+
|
| 23 |
+
class CorpusCount {
|
| 24 |
+
public:
|
| 25 |
+
// Memory usage will be DedupeMultipler(order) * block_size + total_chain_size + unknown vocab_hash_size
|
| 26 |
+
static float DedupeMultiplier(std::size_t order);
|
| 27 |
+
|
| 28 |
+
// How much memory vocabulary will use based on estimated size of the vocab.
|
| 29 |
+
static std::size_t VocabUsage(std::size_t vocab_estimate);
|
| 30 |
+
|
| 31 |
+
// token_count: out.
|
| 32 |
+
// type_count aka vocabulary size. Initialize to an estimate. It is set to the exact value.
|
| 33 |
+
CorpusCount(util::FilePiece &from, int vocab_write, uint64_t &token_count, WordIndex &type_count, std::vector<bool> &prune_words, const std::string& prune_vocab_filename, std::size_t entries_per_block, WarningAction disallowed_symbol);
|
| 34 |
+
|
| 35 |
+
void Run(const util::stream::ChainPosition &position);
|
| 36 |
+
|
| 37 |
+
private:
|
| 38 |
+
util::FilePiece &from_;
|
| 39 |
+
int vocab_write_;
|
| 40 |
+
uint64_t &token_count_;
|
| 41 |
+
WordIndex &type_count_;
|
| 42 |
+
std::vector<bool>& prune_words_;
|
| 43 |
+
const std::string& prune_vocab_filename_;
|
| 44 |
+
|
| 45 |
+
std::size_t dedupe_mem_size_;
|
| 46 |
+
util::scoped_malloc dedupe_mem_;
|
| 47 |
+
|
| 48 |
+
WarningAction disallowed_symbol_action_;
|
| 49 |
+
};
|
| 50 |
+
|
| 51 |
+
} // namespace builder
|
| 52 |
+
} // namespace lm
|
| 53 |
+
#endif // LM_BUILDER_CORPUS_COUNT_H
|
mosesdecoder/lm/builder/dump_counts_main.cc
ADDED
|
@@ -0,0 +1,36 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#include "lm/common/print.hh"
|
| 2 |
+
#include "lm/word_index.hh"
|
| 3 |
+
#include "util/file.hh"
|
| 4 |
+
#include "util/read_compressed.hh"
|
| 5 |
+
|
| 6 |
+
#include <boost/lexical_cast.hpp>
|
| 7 |
+
|
| 8 |
+
#include <iostream>
|
| 9 |
+
#include <vector>
|
| 10 |
+
|
| 11 |
+
int main(int argc, char *argv[]) {
|
| 12 |
+
if (argc != 4) {
|
| 13 |
+
std::cerr << "Usage: " << argv[0] << " counts vocabulary order\n"
|
| 14 |
+
"The counts file contains records with 4-byte vocabulary ids followed by 8-byte\n"
|
| 15 |
+
"counts. Each record has order many vocabulary ids.\n"
|
| 16 |
+
"The vocabulary file contains the words delimited by NULL in order of id.\n"
|
| 17 |
+
"The vocabulary file may not be compressed because it is mmapped but the counts\n"
|
| 18 |
+
"file can be compressed.\n";
|
| 19 |
+
return 1;
|
| 20 |
+
}
|
| 21 |
+
util::ReadCompressed counts(util::OpenReadOrThrow(argv[1]));
|
| 22 |
+
util::scoped_fd vocab_file(util::OpenReadOrThrow(argv[2]));
|
| 23 |
+
lm::VocabReconstitute vocab(vocab_file.get());
|
| 24 |
+
unsigned int order = boost::lexical_cast<unsigned int>(argv[3]);
|
| 25 |
+
std::vector<char> record(sizeof(uint32_t) * order + sizeof(uint64_t));
|
| 26 |
+
while (std::size_t got = counts.ReadOrEOF(&*record.begin(), record.size())) {
|
| 27 |
+
UTIL_THROW_IF(got != record.size(), util::Exception, "Read " << got << " bytes at the end of file, which is not a complete record of length " << record.size());
|
| 28 |
+
const lm::WordIndex *words = reinterpret_cast<const lm::WordIndex*>(&*record.begin());
|
| 29 |
+
for (const lm::WordIndex *i = words; i != words + order; ++i) {
|
| 30 |
+
UTIL_THROW_IF(*i >= vocab.Size(), util::Exception, "Vocab ID " << *i << " is larger than the vocab file's maximum of " << vocab.Size() << ". Are you sure you have the right order and vocab file for these counts?");
|
| 31 |
+
std::cout << vocab.Lookup(*i) << ' ';
|
| 32 |
+
}
|
| 33 |
+
// TODO don't use std::cout because it is slow. Add fast uint64_t printing support to FileStream.
|
| 34 |
+
std::cout << *reinterpret_cast<const uint64_t*>(words + order) << '\n';
|
| 35 |
+
}
|
| 36 |
+
}
|
mosesdecoder/lm/builder/lmplz_main.cc
ADDED
|
@@ -0,0 +1,220 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#include "lm/builder/output.hh"
|
| 2 |
+
#include "lm/builder/pipeline.hh"
|
| 3 |
+
#include "lm/common/size_option.hh"
|
| 4 |
+
#include "lm/lm_exception.hh"
|
| 5 |
+
#include "util/file.hh"
|
| 6 |
+
#include "util/file_piece.hh"
|
| 7 |
+
#include "util/usage.hh"
|
| 8 |
+
|
| 9 |
+
#include <iostream>
|
| 10 |
+
|
| 11 |
+
#include <boost/program_options.hpp>
|
| 12 |
+
#include <boost/version.hpp>
|
| 13 |
+
#include <vector>
|
| 14 |
+
|
| 15 |
+
namespace {
|
| 16 |
+
|
| 17 |
+
// Parse and validate pruning thresholds then return vector of threshold counts
|
| 18 |
+
// for each n-grams order.
|
| 19 |
+
std::vector<uint64_t> ParsePruning(const std::vector<std::string> ¶m, std::size_t order) {
|
| 20 |
+
// convert to vector of integers
|
| 21 |
+
std::vector<uint64_t> prune_thresholds;
|
| 22 |
+
prune_thresholds.reserve(order);
|
| 23 |
+
for (std::vector<std::string>::const_iterator it(param.begin()); it != param.end(); ++it) {
|
| 24 |
+
try {
|
| 25 |
+
prune_thresholds.push_back(boost::lexical_cast<uint64_t>(*it));
|
| 26 |
+
} catch(const boost::bad_lexical_cast &) {
|
| 27 |
+
UTIL_THROW(util::Exception, "Bad pruning threshold " << *it);
|
| 28 |
+
}
|
| 29 |
+
}
|
| 30 |
+
|
| 31 |
+
// Fill with zeros by default.
|
| 32 |
+
if (prune_thresholds.empty()) {
|
| 33 |
+
prune_thresholds.resize(order, 0);
|
| 34 |
+
return prune_thresholds;
|
| 35 |
+
}
|
| 36 |
+
|
| 37 |
+
// validate pruning threshold if specified
|
| 38 |
+
// throw if each n-gram order has not threshold specified
|
| 39 |
+
UTIL_THROW_IF(prune_thresholds.size() > order, util::Exception, "You specified pruning thresholds for orders 1 through " << prune_thresholds.size() << " but the model only has order " << order);
|
| 40 |
+
// threshold for unigram can only be 0 (no pruning)
|
| 41 |
+
|
| 42 |
+
// check if threshold are not in decreasing order
|
| 43 |
+
uint64_t lower_threshold = 0;
|
| 44 |
+
for (std::vector<uint64_t>::iterator it = prune_thresholds.begin(); it != prune_thresholds.end(); ++it) {
|
| 45 |
+
UTIL_THROW_IF(lower_threshold > *it, util::Exception, "Pruning thresholds should be in non-decreasing order. Otherwise substrings would be removed, which is bad for query-time data structures.");
|
| 46 |
+
lower_threshold = *it;
|
| 47 |
+
}
|
| 48 |
+
|
| 49 |
+
// Pad to all orders using the last value.
|
| 50 |
+
prune_thresholds.resize(order, prune_thresholds.back());
|
| 51 |
+
return prune_thresholds;
|
| 52 |
+
}
|
| 53 |
+
|
| 54 |
+
// Build a Discount from up to three user-specified fallback values (for
// adjusted counts 1, 2, and 3+).  Missing values repeat the last one given;
// slot 0 is unused and zeroed.
lm::builder::Discount ParseDiscountFallback(const std::vector<std::string> &param) {
  UTIL_THROW_IF(param.size() > 3, util::Exception, "Specify at most three fallback discounts: 1, 2, and 3+");
  UTIL_THROW_IF(param.empty(), util::Exception, "Fallback discounting enabled, but no discount specified");
  lm::builder::Discount result;
  result.amount[0] = 0.0;
  for (unsigned count = 1; count <= 3; ++count) {
    // Reuse the last provided value when fewer than three were given.
    std::size_t which = (count - 1 < param.size()) ? (count - 1) : (param.size() - 1);
    float value = boost::lexical_cast<float>(param[which]);
    // A discount for count c must lie in [0, c].
    UTIL_THROW_IF(value < 0.0 || value > static_cast<float>(count), util::Exception, "The discount for count " << count << " was parsed as " << value << " which is not in the range [0, " << count << "].");
    result.amount[count] = value;
  }
  return result;
}
|
| 66 |
+
|
| 67 |
+
} // namespace
|
| 68 |
+
|
| 69 |
+
int main(int argc, char *argv[]) {
|
| 70 |
+
try {
|
| 71 |
+
namespace po = boost::program_options;
|
| 72 |
+
po::options_description options("Language model building options");
|
| 73 |
+
lm::builder::PipelineConfig pipeline;
|
| 74 |
+
|
| 75 |
+
std::string text, intermediate, arpa;
|
| 76 |
+
std::vector<std::string> pruning;
|
| 77 |
+
std::vector<std::string> discount_fallback;
|
| 78 |
+
std::vector<std::string> discount_fallback_default;
|
| 79 |
+
discount_fallback_default.push_back("0.5");
|
| 80 |
+
discount_fallback_default.push_back("1");
|
| 81 |
+
discount_fallback_default.push_back("1.5");
|
| 82 |
+
bool verbose_header;
|
| 83 |
+
|
| 84 |
+
options.add_options()
|
| 85 |
+
("help,h", po::bool_switch(), "Show this help message")
|
| 86 |
+
("order,o", po::value<std::size_t>(&pipeline.order)
|
| 87 |
+
#if BOOST_VERSION >= 104200
|
| 88 |
+
->required()
|
| 89 |
+
#endif
|
| 90 |
+
, "Order of the model")
|
| 91 |
+
("interpolate_unigrams", po::value<bool>(&pipeline.initial_probs.interpolate_unigrams)->default_value(true)->implicit_value(true), "Interpolate the unigrams (default) as opposed to giving lots of mass to <unk> like SRI. If you want SRI's behavior with a large <unk> and the old lmplz default, use --interpolate_unigrams 0.")
|
| 92 |
+
("skip_symbols", po::bool_switch(), "Treat <s>, </s>, and <unk> as whitespace instead of throwing an exception")
|
| 93 |
+
("temp_prefix,T", po::value<std::string>(&pipeline.sort.temp_prefix)->default_value("/tmp/lm"), "Temporary file prefix")
|
| 94 |
+
("memory,S", lm:: SizeOption(pipeline.sort.total_memory, util::GuessPhysicalMemory() ? "80%" : "1G"), "Sorting memory")
|
| 95 |
+
("minimum_block", lm::SizeOption(pipeline.minimum_block, "8K"), "Minimum block size to allow")
|
| 96 |
+
("sort_block", lm::SizeOption(pipeline.sort.buffer_size, "64M"), "Size of IO operations for sort (determines arity)")
|
| 97 |
+
("block_count", po::value<std::size_t>(&pipeline.block_count)->default_value(2), "Block count (per order)")
|
| 98 |
+
("vocab_estimate", po::value<lm::WordIndex>(&pipeline.vocab_estimate)->default_value(1000000), "Assume this vocabulary size for purposes of calculating memory in step 1 (corpus count) and pre-sizing the hash table")
|
| 99 |
+
("vocab_pad", po::value<uint64_t>(&pipeline.vocab_size_for_unk)->default_value(0), "If the vocabulary is smaller than this value, pad with <unk> to reach this size. Requires --interpolate_unigrams")
|
| 100 |
+
("verbose_header", po::bool_switch(&verbose_header), "Add a verbose header to the ARPA file that includes information such as token count, smoothing type, etc.")
|
| 101 |
+
("text", po::value<std::string>(&text), "Read text from a file instead of stdin")
|
| 102 |
+
("arpa", po::value<std::string>(&arpa), "Write ARPA to a file instead of stdout")
|
| 103 |
+
("intermediate", po::value<std::string>(&intermediate), "Write ngrams to intermediate files. Turns off ARPA output (which can be reactivated by --arpa file). Forces --renumber on.")
|
| 104 |
+
("renumber", po::bool_switch(&pipeline.renumber_vocabulary), "Rrenumber the vocabulary identifiers so that they are monotone with the hash of each string. This is consistent with the ordering used by the trie data structure.")
|
| 105 |
+
("collapse_values", po::bool_switch(&pipeline.output_q), "Collapse probability and backoff into a single value, q that yields the same sentence-level probabilities. See http://kheafield.com/professional/edinburgh/rest_paper.pdf for more details, including a proof.")
|
| 106 |
+
("prune", po::value<std::vector<std::string> >(&pruning)->multitoken(), "Prune n-grams with count less than or equal to the given threshold. Specify one value for each order i.e. 0 0 1 to prune singleton trigrams and above. The sequence of values must be non-decreasing and the last value applies to any remaining orders. Default is to not prune, which is equivalent to --prune 0.")
|
| 107 |
+
("limit_vocab_file", po::value<std::string>(&pipeline.prune_vocab_file)->default_value(""), "Read allowed vocabulary separated by whitespace. N-grams that contain vocabulary items not in this list will be pruned. Can be combined with --prune arg")
|
| 108 |
+
("discount_fallback", po::value<std::vector<std::string> >(&discount_fallback)->multitoken()->implicit_value(discount_fallback_default, "0.5 1 1.5"), "The closed-form estimate for Kneser-Ney discounts does not work without singletons or doubletons. It can also fail if these values are out of range. This option falls back to user-specified discounts when the closed-form estimate fails. Note that this option is generally a bad idea: you should deduplicate your corpus instead. However, class-based models need custom discounts because they lack singleton unigrams. Provide up to three discounts (for adjusted counts 1, 2, and 3+), which will be applied to all orders where the closed-form estimates fail.");
|
| 109 |
+
po::variables_map vm;
|
| 110 |
+
po::store(po::parse_command_line(argc, argv, options), vm);
|
| 111 |
+
|
| 112 |
+
if (argc == 1 || vm["help"].as<bool>()) {
|
| 113 |
+
std::cerr <<
|
| 114 |
+
"Builds unpruned language models with modified Kneser-Ney smoothing.\n\n"
|
| 115 |
+
"Please cite:\n"
|
| 116 |
+
"@inproceedings{Heafield-estimate,\n"
|
| 117 |
+
" author = {Kenneth Heafield and Ivan Pouzyrevsky and Jonathan H. Clark and Philipp Koehn},\n"
|
| 118 |
+
" title = {Scalable Modified {Kneser-Ney} Language Model Estimation},\n"
|
| 119 |
+
" year = {2013},\n"
|
| 120 |
+
" month = {8},\n"
|
| 121 |
+
" booktitle = {Proceedings of the 51st Annual Meeting of the Association for Computational Linguistics},\n"
|
| 122 |
+
" address = {Sofia, Bulgaria},\n"
|
| 123 |
+
" url = {http://kheafield.com/professional/edinburgh/estimate\\_paper.pdf},\n"
|
| 124 |
+
"}\n\n"
|
| 125 |
+
"Provide the corpus on stdin. The ARPA file will be written to stdout. Order of\n"
|
| 126 |
+
"the model (-o) is the only mandatory option. As this is an on-disk program,\n"
|
| 127 |
+
"setting the temporary file location (-T) and sorting memory (-S) is recommended.\n\n"
|
| 128 |
+
"Memory sizes are specified like GNU sort: a number followed by a unit character.\n"
|
| 129 |
+
"Valid units are \% for percentage of memory (supported platforms only) and (in\n"
|
| 130 |
+
"increasing powers of 1024): b, K, M, G, T, P, E, Z, Y. Default is K (*1024).\n";
|
| 131 |
+
uint64_t mem = util::GuessPhysicalMemory();
|
| 132 |
+
if (mem) {
|
| 133 |
+
std::cerr << "This machine has " << mem << " bytes of memory.\n\n";
|
| 134 |
+
} else {
|
| 135 |
+
std::cerr << "Unable to determine the amount of memory on this machine.\n\n";
|
| 136 |
+
}
|
| 137 |
+
std::cerr << options << std::endl;
|
| 138 |
+
return 1;
|
| 139 |
+
}
|
| 140 |
+
|
| 141 |
+
po::notify(vm);
|
| 142 |
+
|
| 143 |
+
// required() appeared in Boost 1.42.0.
|
| 144 |
+
#if BOOST_VERSION < 104200
|
| 145 |
+
if (!vm.count("order")) {
|
| 146 |
+
std::cerr << "the option '--order' is required but missing" << std::endl;
|
| 147 |
+
return 1;
|
| 148 |
+
}
|
| 149 |
+
#endif
|
| 150 |
+
|
| 151 |
+
if (pipeline.vocab_size_for_unk && !pipeline.initial_probs.interpolate_unigrams) {
|
| 152 |
+
std::cerr << "--vocab_pad requires --interpolate_unigrams be on" << std::endl;
|
| 153 |
+
return 1;
|
| 154 |
+
}
|
| 155 |
+
|
| 156 |
+
if (vm["skip_symbols"].as<bool>()) {
|
| 157 |
+
pipeline.disallowed_symbol_action = lm::COMPLAIN;
|
| 158 |
+
} else {
|
| 159 |
+
pipeline.disallowed_symbol_action = lm::THROW_UP;
|
| 160 |
+
}
|
| 161 |
+
|
| 162 |
+
if (vm.count("discount_fallback")) {
|
| 163 |
+
pipeline.discount.fallback = ParseDiscountFallback(discount_fallback);
|
| 164 |
+
pipeline.discount.bad_action = lm::COMPLAIN;
|
| 165 |
+
} else {
|
| 166 |
+
// Unused, just here to prevent the compiler from complaining about uninitialized.
|
| 167 |
+
pipeline.discount.fallback = lm::builder::Discount();
|
| 168 |
+
pipeline.discount.bad_action = lm::THROW_UP;
|
| 169 |
+
}
|
| 170 |
+
|
| 171 |
+
// parse pruning thresholds. These depend on order, so it is not done as a notifier.
|
| 172 |
+
pipeline.prune_thresholds = ParsePruning(pruning, pipeline.order);
|
| 173 |
+
|
| 174 |
+
if (!vm["limit_vocab_file"].as<std::string>().empty()) {
|
| 175 |
+
pipeline.prune_vocab = true;
|
| 176 |
+
}
|
| 177 |
+
else {
|
| 178 |
+
pipeline.prune_vocab = false;
|
| 179 |
+
}
|
| 180 |
+
|
| 181 |
+
util::NormalizeTempPrefix(pipeline.sort.temp_prefix);
|
| 182 |
+
|
| 183 |
+
lm::builder::InitialProbabilitiesConfig &initial = pipeline.initial_probs;
|
| 184 |
+
// TODO: evaluate options for these.
|
| 185 |
+
initial.adder_in.total_memory = 32768;
|
| 186 |
+
initial.adder_in.block_count = 2;
|
| 187 |
+
initial.adder_out.total_memory = 32768;
|
| 188 |
+
initial.adder_out.block_count = 2;
|
| 189 |
+
pipeline.read_backoffs = initial.adder_out;
|
| 190 |
+
|
| 191 |
+
// Read from stdin, write to stdout by default
|
| 192 |
+
util::scoped_fd in(0), out(1);
|
| 193 |
+
if (vm.count("text")) {
|
| 194 |
+
in.reset(util::OpenReadOrThrow(text.c_str()));
|
| 195 |
+
}
|
| 196 |
+
if (vm.count("arpa")) {
|
| 197 |
+
out.reset(util::CreateOrThrow(arpa.c_str()));
|
| 198 |
+
}
|
| 199 |
+
|
| 200 |
+
try {
|
| 201 |
+
bool writing_intermediate = vm.count("intermediate");
|
| 202 |
+
if (writing_intermediate) {
|
| 203 |
+
pipeline.renumber_vocabulary = true;
|
| 204 |
+
}
|
| 205 |
+
lm::builder::Output output(writing_intermediate ? intermediate : pipeline.sort.temp_prefix, writing_intermediate, pipeline.output_q);
|
| 206 |
+
if (!writing_intermediate || vm.count("arpa")) {
|
| 207 |
+
output.Add(new lm::builder::PrintHook(out.release(), verbose_header));
|
| 208 |
+
}
|
| 209 |
+
lm::builder::Pipeline(pipeline, in.release(), output);
|
| 210 |
+
} catch (const util::MallocException &e) {
|
| 211 |
+
std::cerr << e.what() << std::endl;
|
| 212 |
+
std::cerr << "Try rerunning with a more conservative -S setting than " << vm["memory"].as<std::string>() << std::endl;
|
| 213 |
+
return 1;
|
| 214 |
+
}
|
| 215 |
+
util::PrintUsage(std::cerr);
|
| 216 |
+
} catch (const std::exception &e) {
|
| 217 |
+
std::cerr << e.what() << std::endl;
|
| 218 |
+
return 1;
|
| 219 |
+
}
|
| 220 |
+
}
|
mosesdecoder/lm/common/CMakeLists.txt
ADDED
|
@@ -0,0 +1,40 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
cmake_minimum_required(VERSION 2.8.8)
#
# The KenLM cmake files make use of add_library(... OBJECTS ...)
#
# This syntax allows grouping of source files when compiling
# (effectively creating "fake" libraries based on source subdirs).
#
# This syntax was only added in cmake version 2.8.8
#
# see http://www.cmake.org/Wiki/CMake/Tutorials/Object_Library


# This CMake file was created by Lane Schwartz <dowobeha@gmail.com>

# Explicitly list the source files for this subdirectory
#
# If you add any source files to this subdirectory
# that should be included in the kenlm library,
# (this excludes any unit test files)
# you should add them to the following list:
#
# In order to set correct paths to these files
# in case this variable is referenced by CMake files in the parent directory,
# we prefix all files with ${CMAKE_CURRENT_SOURCE_DIR}.
#
set(KENLM_COMMON_SOURCE
        ${CMAKE_CURRENT_SOURCE_DIR}/model_buffer.cc
        ${CMAKE_CURRENT_SOURCE_DIR}/print.cc
        ${CMAKE_CURRENT_SOURCE_DIR}/renumber.cc
        ${CMAKE_CURRENT_SOURCE_DIR}/size_option.cc
    )


# Group these objects together for later use.
#
# Given add_library(foo OBJECT ${my_foo_sources}),
# refer to these objects as $<TARGET_OBJECTS:foo>
#
# kenlm_common is an object library: it produces no archive/shared library,
# only compiled objects that parent CMake files link into the final targets.
add_library(kenlm_common OBJECT ${KENLM_COMMON_SOURCE})
|
| 40 |
+
|
mosesdecoder/lm/common/Jamfile
ADDED
|
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Build all lm/common sources (excluding unit tests and main programs) as a
# fake library, linked against util, the stream library, double-conversion,
# kenlm, and Boost program_options.
fakelib common : [ glob *.cc : *test.cc *main.cc ]
  ../../util//kenutil ../../util/stream//stream ../../util/double-conversion//double-conversion ..//kenlm /top//boost_program_options ;
|
mosesdecoder/lm/common/joint_order.hh
ADDED
|
@@ -0,0 +1,71 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#ifndef LM_COMMON_JOINT_ORDER_H
#define LM_COMMON_JOINT_ORDER_H

#include "lm/common/ngram_stream.hh"
#include "lm/lm_exception.hh"

#ifdef DEBUG
#include "util/fixed_array.hh"
#include <iostream>
#endif

#include <cstring>

namespace lm {

// Jointly iterate over n-gram streams of orders 1 .. positions.size().
// For each n-gram, callback.Enter(order_minus_1, gram) is invoked when the
// gram's context words match the current lower-order gram, and
// callback.Exit(order_minus_1, gram) when iteration moves past it.
// Compare supplies the streams' sort order and kMatchOffset, the word offset
// at which context matching begins (NOTE(review): exact kMatchOffset semantics
// come from the Compare class declared elsewhere -- confirm against it).
template <class Callback, class Compare> void JointOrder(const util::stream::ChainPositions &positions, Callback &callback) {
  // Allow matching to reference streams[-1].
  util::FixedArray<ProxyStream<NGramHeader> > streams_with_dummy(positions.size() + 1);
  // A bogus stream for [-1].
  streams_with_dummy.push_back();
  for (std::size_t i = 0; i < positions.size(); ++i) {
    // Stream i carries (i+1)-grams.
    streams_with_dummy.push_back(positions[i], NGramHeader(NULL, i + 1));
  }
  ProxyStream<NGramHeader> *streams = streams_with_dummy.begin() + 1;

  // Effective number of orders: stop at the first already-exhausted stream.
  std::size_t order;
  for (order = 0; order < positions.size() && streams[order]; ++order) {}
  assert(order); // should always have <unk>.

  // Debugging only: call comparison function to sanity check order.
#ifdef DEBUG
  util::FixedArray<Compare> less_compare(order);
  for (unsigned i = 0; i < order; ++i)
    less_compare.push_back(i + 1);
#endif // DEBUG

  std::size_t current = 0;
  while (true) {
    // Does the context match the lower one?  (Compares `current` words; when
    // current == 0 this is a zero-byte memcmp, which always matches.)
    if (!memcmp(streams[static_cast<int>(current) - 1]->begin(), streams[current]->begin() + Compare::kMatchOffset, sizeof(WordIndex) * current)) {
      callback.Enter(current, streams[current].Get());
      // Transition to looking for extensions.
      if (++current < order) continue;
    }
#ifdef DEBUG
    // match_check[current - 1] matches current-grams
    // The lower-order stream (which skips fewer current-grams) should always be <= the higher order-stream (which can skip current-grams).
    else if (!less_compare[current - 1](streams[static_cast<int>(current) - 1]->begin(), streams[current]->begin() + Compare::kMatchOffset)) {
      std::cerr << "Stream out of order detected" << std::endl;
      abort();
    }
#endif // DEBUG
    // No extension left.
    while(true) {
      assert(current > 0);
      --current;
      callback.Exit(current, streams[current].Get());

      // Advance the stream just exited; if it still has grams, resume the
      // outer loop at this order.
      if (++streams[current]) break;

      // This stream is exhausted; all higher orders must already be done,
      // otherwise some n-gram lacks its suffix in the lower-order stream.
      UTIL_THROW_IF(order != current + 1, FormatLoadException, "Detected n-gram without matching suffix");

      order = current;
      if (!order) return;
    }
  }
}

} // namespace lm

#endif // LM_COMMON_JOINT_ORDER_H
|
mosesdecoder/lm/common/ngram.hh
ADDED
|
@@ -0,0 +1,77 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#ifndef LM_COMMON_NGRAM_H
#define LM_COMMON_NGRAM_H

#include "lm/weights.hh"
#include "lm/word_index.hh"

#include <cstddef>
#include <cassert>
#include <stdint.h>
#include <cstring>

namespace lm {

// Non-owning view of an n-gram record: a contiguous run of WordIndex values
// in external memory, represented as a [begin_, end_) pointer pair.
class NGramHeader {
  public:
    NGramHeader(void *begin, std::size_t order)
      : begin_(static_cast<WordIndex*>(begin)), end_(begin_ + order) {}

    NGramHeader() : begin_(NULL), end_(NULL) {}

    // Raw byte access to the underlying record.
    const uint8_t *Base() const { return reinterpret_cast<const uint8_t*>(begin_); }
    uint8_t *Base() { return reinterpret_cast<uint8_t*>(begin_); }

    // Repoint the view at a different memory location, keeping the order.
    void ReBase(void *to) {
      std::size_t difference = end_ - begin_;
      begin_ = reinterpret_cast<WordIndex*>(to);
      end_ = begin_ + difference;
    }

    // These are for the vocab index.
    // Lower-case in deference to STL.
    const WordIndex *begin() const { return begin_; }
    WordIndex *begin() { return begin_; }
    const WordIndex *end() const { return end_; }
    WordIndex *end() { return end_; }

    std::size_t size() const { return end_ - begin_; }
    std::size_t Order() const { return end_ - begin_; }

  private:
    WordIndex *begin_, *end_;
};

// View of an n-gram record whose words are immediately followed in memory by
// a Payload value (e.g. a count or probability):
//   [w0 ... w_{order-1}][Payload]
template <class PayloadT> class NGram : public NGramHeader {
  public:
    typedef PayloadT Payload;

    NGram() : NGramHeader(NULL, 0) {}

    NGram(void *begin, std::size_t order) : NGramHeader(begin, order) {}

    // Would do operator++ but that can get confusing for a stream.
    // Advance the view to the next record, which starts directly after this
    // record's payload.
    void NextInMemory() {
      ReBase(&Value() + 1);
    }

    // Bytes occupied by one record of the given order: words plus payload.
    static std::size_t TotalSize(std::size_t order) {
      return order * sizeof(WordIndex) + sizeof(Payload);
    }
    std::size_t TotalSize() const {
      // Compiler should optimize this.
      return TotalSize(Order());
    }

    // Inverse of TotalSize: recover the order from a record's byte size.
    static std::size_t OrderFromSize(std::size_t size) {
      std::size_t ret = (size - sizeof(Payload)) / sizeof(WordIndex);
      assert(size == TotalSize(ret));
      return ret;
    }

    // The payload stored immediately after the words (at end()).
    const Payload &Value() const { return *reinterpret_cast<const Payload *>(end()); }
    Payload &Value() { return *reinterpret_cast<Payload *>(end()); }
};

} // namespace lm

#endif // LM_COMMON_NGRAM_H
|
mosesdecoder/lm/common/print.cc
ADDED
|
@@ -0,0 +1,62 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#include "lm/common/print.hh"
|
| 2 |
+
|
| 3 |
+
#include "lm/common/ngram_stream.hh"
|
| 4 |
+
#include "util/file_stream.hh"
|
| 5 |
+
#include "util/file.hh"
|
| 6 |
+
#include "util/mmap.hh"
|
| 7 |
+
#include "util/scoped.hh"
|
| 8 |
+
|
| 9 |
+
#include <sstream>
|
| 10 |
+
#include <cstring>
|
| 11 |
+
|
| 12 |
+
namespace lm {
|
| 13 |
+
|
| 14 |
+
VocabReconstitute::VocabReconstitute(int fd) {
|
| 15 |
+
uint64_t size = util::SizeOrThrow(fd);
|
| 16 |
+
util::MapRead(util::POPULATE_OR_READ, fd, 0, size, memory_);
|
| 17 |
+
const char *const start = static_cast<const char*>(memory_.get());
|
| 18 |
+
const char *i;
|
| 19 |
+
for (i = start; i != start + size; i += strlen(i) + 1) {
|
| 20 |
+
map_.push_back(i);
|
| 21 |
+
}
|
| 22 |
+
// Last one for LookupPiece.
|
| 23 |
+
map_.push_back(i);
|
| 24 |
+
}
|
| 25 |
+
|
| 26 |
+
namespace {
|
| 27 |
+
template <class Payload> void PrintLead(const VocabReconstitute &vocab, ProxyStream<Payload> &stream, util::FileStream &out) {
|
| 28 |
+
out << stream->Value().prob << '\t' << vocab.Lookup(*stream->begin());
|
| 29 |
+
for (const WordIndex *i = stream->begin() + 1; i != stream->end(); ++i) {
|
| 30 |
+
out << ' ' << vocab.Lookup(*i);
|
| 31 |
+
}
|
| 32 |
+
}
|
| 33 |
+
} // namespace
|
| 34 |
+
|
| 35 |
+
void PrintARPA::Run(const util::stream::ChainPositions &positions) {
|
| 36 |
+
VocabReconstitute vocab(vocab_fd_);
|
| 37 |
+
util::FileStream out(out_fd_);
|
| 38 |
+
out << "\\data\\\n";
|
| 39 |
+
for (size_t i = 0; i < positions.size(); ++i) {
|
| 40 |
+
out << "ngram " << (i+1) << '=' << counts_[i] << '\n';
|
| 41 |
+
}
|
| 42 |
+
out << '\n';
|
| 43 |
+
|
| 44 |
+
for (unsigned order = 1; order < positions.size(); ++order) {
|
| 45 |
+
out << "\\" << order << "-grams:" << '\n';
|
| 46 |
+
for (ProxyStream<NGram<ProbBackoff> > stream(positions[order - 1], NGram<ProbBackoff>(NULL, order)); stream; ++stream) {
|
| 47 |
+
PrintLead(vocab, stream, out);
|
| 48 |
+
out << '\t' << stream->Value().backoff << '\n';
|
| 49 |
+
}
|
| 50 |
+
out << '\n';
|
| 51 |
+
}
|
| 52 |
+
|
| 53 |
+
out << "\\" << positions.size() << "-grams:" << '\n';
|
| 54 |
+
for (ProxyStream<NGram<Prob> > stream(positions.back(), NGram<Prob>(NULL, positions.size())); stream; ++stream) {
|
| 55 |
+
PrintLead(vocab, stream, out);
|
| 56 |
+
out << '\n';
|
| 57 |
+
}
|
| 58 |
+
out << '\n';
|
| 59 |
+
out << "\\end\\\n";
|
| 60 |
+
}
|
| 61 |
+
|
| 62 |
+
} // namespace lm
|
mosesdecoder/lm/common/renumber.cc
ADDED
|
@@ -0,0 +1,17 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#include "lm/common/renumber.hh"
|
| 2 |
+
#include "lm/common/ngram.hh"
|
| 3 |
+
|
| 4 |
+
#include "util/stream/stream.hh"
|
| 5 |
+
|
| 6 |
+
namespace lm {
|
| 7 |
+
|
| 8 |
+
// Stream worker: rewrite every vocabulary id of every fixed-order n-gram
// passing through this chain position using the new_numbers_ mapping table.
void Renumber::Run(const util::stream::ChainPosition &position) {
  util::stream::Stream input(position);
  while (input) {
    NGramHeader ngram(input.Get(), order_);
    WordIndex *word = ngram.begin();
    const WordIndex *const last = ngram.end();
    for (; word != last; ++word) {
      *word = new_numbers_[*word];
    }
    ++input;
  }
}
|
| 16 |
+
|
| 17 |
+
} // namespace lm
|
mosesdecoder/lm/common/renumber.hh
ADDED
|
@@ -0,0 +1,30 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/* Map vocab ids. This is useful to merge independently collected counts or
 * change the vocab ids to the order used by the trie.
 */
#ifndef LM_COMMON_RENUMBER_H
#define LM_COMMON_RENUMBER_H

#include "lm/word_index.hh"

#include <cstddef>

namespace util { namespace stream { class ChainPosition; }}

namespace lm {

// Stream worker that rewrites each word id of fixed-order n-gram records in
// place, using a caller-provided old-id -> new-id table.
class Renumber {
  public:
    // Assumes the array is large enough to map all words and stays alive while
    // the thread is active.
    Renumber(const WordIndex *new_numbers, std::size_t order)
      : new_numbers_(new_numbers), order_(order) {}

    // Process every n-gram that passes through the chain position.
    void Run(const util::stream::ChainPosition &position);

  private:
    const WordIndex *new_numbers_; // mapping table; not owned
    std::size_t order_;            // words per n-gram record
};

} // namespace lm
#endif // LM_COMMON_RENUMBER_H
|
mosesdecoder/mert/ReferenceTest.cpp
ADDED
|
@@ -0,0 +1,123 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#include "Reference.h"
|
| 2 |
+
|
| 3 |
+
#define BOOST_TEST_MODULE MertReference
|
| 4 |
+
#include <boost/test/unit_test.hpp>
|
| 5 |
+
|
| 6 |
+
using namespace MosesTuning;
|
| 7 |
+
|
| 8 |
+
// Test case names corrected from the misspelled "refernece_*".
// Counts array must exist even for an empty Reference.
BOOST_AUTO_TEST_CASE(reference_count)
{
  Reference ref;
  BOOST_CHECK(ref.get_counts() != NULL);
}

// Iteration over reference lengths preserves insertion order.
BOOST_AUTO_TEST_CASE(reference_length_iterator)
{
  Reference ref;
  ref.push_back(4);
  ref.push_back(2);
  BOOST_REQUIRE(ref.num_references() == 2);

  Reference::iterator it = ref.begin();
  BOOST_CHECK_EQUAL(*it, 4);
  ++it;
  BOOST_CHECK_EQUAL(*it, 2);
  ++it;
  BOOST_CHECK(it == ref.end());
}

// Average reference length.
BOOST_AUTO_TEST_CASE(reference_length_average)
{
  {
    Reference ref;
    ref.push_back(4);
    ref.push_back(1);
    BOOST_CHECK_EQUAL(2, ref.CalcAverage());
  }

  {
    Reference ref;
    ref.push_back(4);
    ref.push_back(3);
    BOOST_CHECK_EQUAL(3, ref.CalcAverage());
  }

  {
    Reference ref;
    ref.push_back(4);
    ref.push_back(3);
    ref.push_back(4);
    ref.push_back(5);
    BOOST_CHECK_EQUAL(4, ref.CalcAverage());
  }
}

// Closest reference length to a given candidate length.
BOOST_AUTO_TEST_CASE(reference_length_closest)
{
  {
    Reference ref;
    ref.push_back(4);
    ref.push_back(1);
    BOOST_REQUIRE(ref.num_references() == 2);

    BOOST_CHECK_EQUAL(1, ref.CalcClosest(2));
    BOOST_CHECK_EQUAL(1, ref.CalcClosest(1));
    BOOST_CHECK_EQUAL(4, ref.CalcClosest(3));
    BOOST_CHECK_EQUAL(4, ref.CalcClosest(4));
    BOOST_CHECK_EQUAL(4, ref.CalcClosest(5));
  }

  {
    Reference ref;
    ref.push_back(4);
    ref.push_back(3);
    BOOST_REQUIRE(ref.num_references() == 2);

    BOOST_CHECK_EQUAL(3, ref.CalcClosest(1));
    BOOST_CHECK_EQUAL(3, ref.CalcClosest(2));
    BOOST_CHECK_EQUAL(3, ref.CalcClosest(3));
    BOOST_CHECK_EQUAL(4, ref.CalcClosest(4));
    BOOST_CHECK_EQUAL(4, ref.CalcClosest(5));
  }

  {
    Reference ref;
    ref.push_back(4);
    ref.push_back(3);
    ref.push_back(4);
    ref.push_back(5);
    BOOST_REQUIRE(ref.num_references() == 4);

    BOOST_CHECK_EQUAL(3, ref.CalcClosest(1));
    BOOST_CHECK_EQUAL(3, ref.CalcClosest(2));
    BOOST_CHECK_EQUAL(3, ref.CalcClosest(3));
    BOOST_CHECK_EQUAL(4, ref.CalcClosest(4));
    BOOST_CHECK_EQUAL(5, ref.CalcClosest(5));
  }
}

// Shortest reference length.
BOOST_AUTO_TEST_CASE(reference_length_shortest)
{
  {
    Reference ref;
    ref.push_back(4);
    ref.push_back(1);
    BOOST_CHECK_EQUAL(1, ref.CalcShortest());
  }

  {
    Reference ref;
    ref.push_back(4);
    ref.push_back(3);
    BOOST_CHECK_EQUAL(3, ref.CalcShortest());
  }

  {
    Reference ref;
    ref.push_back(4);
    ref.push_back(3);
    ref.push_back(4);
    ref.push_back(5);
    BOOST_CHECK_EQUAL(3, ref.CalcShortest());
  }
}
|
mosesdecoder/mert/ScoreArray.cpp
ADDED
|
@@ -0,0 +1,169 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/*
|
| 2 |
+
* ScoreArray.cpp
|
| 3 |
+
* mert - Minimum Error Rate Training
|
| 4 |
+
*
|
| 5 |
+
* Created by Nicola Bertoldi on 13/05/08.
|
| 6 |
+
*
|
| 7 |
+
*/
|
| 8 |
+
|
| 9 |
+
#include "ScoreArray.h"
|
| 10 |
+
#include "Util.h"
|
| 11 |
+
#include "FileStream.h"
|
| 12 |
+
|
| 13 |
+
using namespace std;
|
| 14 |
+
|
| 15 |
+
namespace MosesTuning
|
| 16 |
+
{
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
ScoreArray::ScoreArray()
|
| 20 |
+
: m_num_scores(0), m_index(0) {}
|
| 21 |
+
|
| 22 |
+
void ScoreArray::savetxt(ostream* os, const string& sctype)
|
| 23 |
+
{
|
| 24 |
+
*os << SCORES_TXT_BEGIN << " " << m_index << " " << m_array.size()
|
| 25 |
+
<< " " << m_num_scores << " " << sctype << endl;
|
| 26 |
+
for (scorearray_t::iterator i = m_array.begin(); i !=m_array.end(); i++) {
|
| 27 |
+
i->savetxt(os);
|
| 28 |
+
*os << endl;
|
| 29 |
+
}
|
| 30 |
+
*os << SCORES_TXT_END << endl;
|
| 31 |
+
}
|
| 32 |
+
|
| 33 |
+
void ScoreArray::savebin(ostream* os, const string& score_type)
|
| 34 |
+
{
|
| 35 |
+
*os << SCORES_BIN_BEGIN << " " << m_index << " " << m_array.size()
|
| 36 |
+
<< " " << m_num_scores << " " << score_type << endl;
|
| 37 |
+
for (scorearray_t::iterator i = m_array.begin();
|
| 38 |
+
i != m_array.end(); i++) {
|
| 39 |
+
i->savebin(os);
|
| 40 |
+
}
|
| 41 |
+
*os << SCORES_BIN_END << endl;
|
| 42 |
+
}
|
| 43 |
+
|
| 44 |
+
void ScoreArray::save(ostream* os, const string& score_type, bool bin)
|
| 45 |
+
{
|
| 46 |
+
if (size() <= 0) return;
|
| 47 |
+
if (bin) {
|
| 48 |
+
savebin(os, score_type);
|
| 49 |
+
} else {
|
| 50 |
+
savetxt(os, score_type);
|
| 51 |
+
}
|
| 52 |
+
}
|
| 53 |
+
|
| 54 |
+
void ScoreArray::save(const string &file, const string& score_type, bool bin)
|
| 55 |
+
{
|
| 56 |
+
ofstream ofs(file.c_str(), ios::out);
|
| 57 |
+
if (!ofs) {
|
| 58 |
+
cerr << "Failed to open " << file << endl;
|
| 59 |
+
exit(1);
|
| 60 |
+
}
|
| 61 |
+
ostream* os = &ofs;
|
| 62 |
+
save(os, score_type, bin);
|
| 63 |
+
ofs.close();
|
| 64 |
+
}
|
| 65 |
+
|
| 66 |
+
void ScoreArray::save(const string& score_type, bool bin)
|
| 67 |
+
{
|
| 68 |
+
save(&cout, score_type, bin);
|
| 69 |
+
}
|
| 70 |
+
|
| 71 |
+
void ScoreArray::loadbin(istream* is, size_t n)
|
| 72 |
+
{
|
| 73 |
+
ScoreStats entry(m_num_scores);
|
| 74 |
+
for (size_t i = 0; i < n; i++) {
|
| 75 |
+
entry.loadbin(is);
|
| 76 |
+
add(entry);
|
| 77 |
+
}
|
| 78 |
+
}
|
| 79 |
+
|
| 80 |
+
void ScoreArray::loadtxt(istream* is, size_t n)
|
| 81 |
+
{
|
| 82 |
+
ScoreStats entry(m_num_scores);
|
| 83 |
+
for (size_t i = 0; i < n; i++) {
|
| 84 |
+
entry.loadtxt(is);
|
| 85 |
+
add(entry);
|
| 86 |
+
}
|
| 87 |
+
}
|
| 88 |
+
|
| 89 |
+
void ScoreArray::load(istream* is)
|
| 90 |
+
{
|
| 91 |
+
size_t number_of_entries = 0;
|
| 92 |
+
bool binmode = false;
|
| 93 |
+
|
| 94 |
+
string substring, stringBuf;
|
| 95 |
+
string::size_type loc;
|
| 96 |
+
|
| 97 |
+
getline(*is, stringBuf);
|
| 98 |
+
if (!is->good()) {
|
| 99 |
+
return;
|
| 100 |
+
}
|
| 101 |
+
|
| 102 |
+
if (!stringBuf.empty()) {
|
| 103 |
+
if ((loc = stringBuf.find(SCORES_TXT_BEGIN)) == 0) {
|
| 104 |
+
binmode=false;
|
| 105 |
+
} else if ((loc = stringBuf.find(SCORES_BIN_BEGIN)) == 0) {
|
| 106 |
+
binmode=true;
|
| 107 |
+
} else {
|
| 108 |
+
TRACE_ERR("ERROR: ScoreArray::load(): Wrong header");
|
| 109 |
+
return;
|
| 110 |
+
}
|
| 111 |
+
getNextPound(stringBuf, substring);
|
| 112 |
+
getNextPound(stringBuf, substring);
|
| 113 |
+
m_index = atoi(substring.c_str());
|
| 114 |
+
getNextPound(stringBuf, substring);
|
| 115 |
+
number_of_entries = atoi(substring.c_str());
|
| 116 |
+
getNextPound(stringBuf, substring);
|
| 117 |
+
m_num_scores = atoi(substring.c_str());
|
| 118 |
+
getNextPound(stringBuf, substring);
|
| 119 |
+
m_score_type = substring;
|
| 120 |
+
}
|
| 121 |
+
|
| 122 |
+
if (binmode) {
|
| 123 |
+
loadbin(is, number_of_entries);
|
| 124 |
+
} else {
|
| 125 |
+
loadtxt(is, number_of_entries);
|
| 126 |
+
}
|
| 127 |
+
|
| 128 |
+
getline(*is, stringBuf);
|
| 129 |
+
if (!stringBuf.empty()) {
|
| 130 |
+
if ((loc = stringBuf.find(SCORES_TXT_END)) != 0 &&
|
| 131 |
+
(loc = stringBuf.find(SCORES_BIN_END)) != 0) {
|
| 132 |
+
TRACE_ERR("ERROR: ScoreArray::load(): Wrong footer");
|
| 133 |
+
return;
|
| 134 |
+
}
|
| 135 |
+
}
|
| 136 |
+
}
|
| 137 |
+
|
| 138 |
+
void ScoreArray::load(const string &file)
|
| 139 |
+
{
|
| 140 |
+
TRACE_ERR("loading data from " << file << endl);
|
| 141 |
+
inputfilestream input_stream(file); // matches a stream with a file. Opens the file
|
| 142 |
+
istream* is = &input_stream;
|
| 143 |
+
load(is);
|
| 144 |
+
input_stream.close();
|
| 145 |
+
}
|
| 146 |
+
|
| 147 |
+
|
| 148 |
+
void ScoreArray::merge(ScoreArray& e)
|
| 149 |
+
{
|
| 150 |
+
//dummy implementation
|
| 151 |
+
for (size_t i=0; i<e.size(); i++)
|
| 152 |
+
add(e.get(i));
|
| 153 |
+
}
|
| 154 |
+
|
| 155 |
+
bool ScoreArray::check_consistency() const
|
| 156 |
+
{
|
| 157 |
+
const size_t sz = NumberOfScores();
|
| 158 |
+
if (sz == 0)
|
| 159 |
+
return true;
|
| 160 |
+
|
| 161 |
+
for (scorearray_t::const_iterator i = m_array.begin();
|
| 162 |
+
i != m_array.end(); ++i) {
|
| 163 |
+
if (i->size() != sz)
|
| 164 |
+
return false;
|
| 165 |
+
}
|
| 166 |
+
return true;
|
| 167 |
+
}
|
| 168 |
+
|
| 169 |
+
}
|
mosesdecoder/mert/ScoreArray.h
ADDED
|
@@ -0,0 +1,113 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/*
|
| 2 |
+
* ScoreArray.h
|
| 3 |
+
* mert - Minimum Error Rate Training
|
| 4 |
+
*
|
| 5 |
+
* Created by Nicola Bertoldi on 13/05/08.
|
| 6 |
+
*
|
| 7 |
+
*/
|
| 8 |
+
|
| 9 |
+
#ifndef MERT_SCORE_ARRAY_H_
|
| 10 |
+
#define MERT_SCORE_ARRAY_H_
|
| 11 |
+
|
| 12 |
+
#include <vector>
|
| 13 |
+
#include <iostream>
|
| 14 |
+
#include <string>
|
| 15 |
+
|
| 16 |
+
#include "ScoreStats.h"
|
| 17 |
+
|
| 18 |
+
namespace MosesTuning
|
| 19 |
+
{
|
| 20 |
+
|
| 21 |
+
const char SCORES_TXT_BEGIN[] = "SCORES_TXT_BEGIN_0";
|
| 22 |
+
const char SCORES_TXT_END[] = "SCORES_TXT_END_0";
|
| 23 |
+
const char SCORES_BIN_BEGIN[] = "SCORES_BIN_BEGIN_0";
|
| 24 |
+
const char SCORES_BIN_END[] = "SCORES_BIN_END_0";
|
| 25 |
+
|
| 26 |
+
class ScoreArray
|
| 27 |
+
{
|
| 28 |
+
private:
|
| 29 |
+
scorearray_t m_array;
|
| 30 |
+
std::string m_score_type;
|
| 31 |
+
std::size_t m_num_scores;
|
| 32 |
+
|
| 33 |
+
// indexx to identify the utterance.
|
| 34 |
+
// It can differ from the index inside the vector.
|
| 35 |
+
int m_index;
|
| 36 |
+
|
| 37 |
+
public:
|
| 38 |
+
ScoreArray();
|
| 39 |
+
~ScoreArray() {}
|
| 40 |
+
|
| 41 |
+
void clear() {
|
| 42 |
+
m_array.clear();
|
| 43 |
+
}
|
| 44 |
+
|
| 45 |
+
int getIndex() const {
|
| 46 |
+
return m_index;
|
| 47 |
+
}
|
| 48 |
+
|
| 49 |
+
void setIndex(int value) {
|
| 50 |
+
m_index = value;
|
| 51 |
+
}
|
| 52 |
+
|
| 53 |
+
ScoreStats& get(std::size_t i) {
|
| 54 |
+
return m_array.at(i);
|
| 55 |
+
}
|
| 56 |
+
|
| 57 |
+
const ScoreStats& get(std::size_t i) const {
|
| 58 |
+
return m_array.at(i);
|
| 59 |
+
}
|
| 60 |
+
|
| 61 |
+
void add(const ScoreStats& e) {
|
| 62 |
+
m_array.push_back(e);
|
| 63 |
+
}
|
| 64 |
+
|
| 65 |
+
//ADDED BY TS
|
| 66 |
+
void swap(std::size_t i, std::size_t j) {
|
| 67 |
+
std::swap(m_array[i], m_array[j]);
|
| 68 |
+
}
|
| 69 |
+
|
| 70 |
+
void resize(std::size_t new_size) {
|
| 71 |
+
m_array.resize(std::min(new_size, m_array.size()));
|
| 72 |
+
}
|
| 73 |
+
//END_ADDED
|
| 74 |
+
|
| 75 |
+
void merge(ScoreArray& e);
|
| 76 |
+
|
| 77 |
+
std::string name() const {
|
| 78 |
+
return m_score_type;
|
| 79 |
+
}
|
| 80 |
+
|
| 81 |
+
void name(std::string &score_type) {
|
| 82 |
+
m_score_type = score_type;
|
| 83 |
+
}
|
| 84 |
+
|
| 85 |
+
std::size_t size() const {
|
| 86 |
+
return m_array.size();
|
| 87 |
+
}
|
| 88 |
+
|
| 89 |
+
std::size_t NumberOfScores() const {
|
| 90 |
+
return m_num_scores;
|
| 91 |
+
}
|
| 92 |
+
|
| 93 |
+
void NumberOfScores(std::size_t v) {
|
| 94 |
+
m_num_scores = v;
|
| 95 |
+
}
|
| 96 |
+
|
| 97 |
+
void savetxt(std::ostream* os, const std::string& score_type);
|
| 98 |
+
void savebin(std::ostream* os, const std::string& score_type);
|
| 99 |
+
void save(std::ostream* os, const std::string& score_type, bool bin=false);
|
| 100 |
+
void save(const std::string &file, const std::string& score_type, bool bin=false);
|
| 101 |
+
void save(const std::string& score_type, bool bin=false);
|
| 102 |
+
|
| 103 |
+
void loadtxt(std::istream* is, std::size_t n);
|
| 104 |
+
void loadbin(std::istream* is, std::size_t n);
|
| 105 |
+
void load(std::istream* is);
|
| 106 |
+
void load(const std::string &file);
|
| 107 |
+
|
| 108 |
+
bool check_consistency() const;
|
| 109 |
+
};
|
| 110 |
+
|
| 111 |
+
}
|
| 112 |
+
|
| 113 |
+
#endif // MERT_SCORE_ARRAY_H_
|
mosesdecoder/mert/Util.h
ADDED
|
@@ -0,0 +1,149 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/*
|
| 2 |
+
* Util.h
|
| 3 |
+
* mert - Minimum Error Rate Training
|
| 4 |
+
*
|
| 5 |
+
* Created by Nicola Bertoldi on 13/05/08.
|
| 6 |
+
*
|
| 7 |
+
*/
|
| 8 |
+
|
| 9 |
+
#ifndef MERT_UTIL_H_
|
| 10 |
+
#define MERT_UTIL_H_
|
| 11 |
+
|
| 12 |
+
#include <cmath>
|
| 13 |
+
#include <cstdlib>
|
| 14 |
+
#include <stdexcept>
|
| 15 |
+
#include <limits>
|
| 16 |
+
#include <vector>
|
| 17 |
+
#include <map>
|
| 18 |
+
#include <iostream>
|
| 19 |
+
#include <sstream>
|
| 20 |
+
#include <string>
|
| 21 |
+
#include <cstring>
|
| 22 |
+
|
| 23 |
+
#include "Types.h"
|
| 24 |
+
|
| 25 |
+
namespace MosesTuning
|
| 26 |
+
{
|
| 27 |
+
|
| 28 |
+
#ifdef TRACE_ENABLE
|
| 29 |
+
#define TRACE_ERR(str) { std::cerr << str; }
|
| 30 |
+
#else
|
| 31 |
+
#define TRACE_ERR(str) { }
|
| 32 |
+
#endif
|
| 33 |
+
|
| 34 |
+
#if __GNUC__ == 4 && __GNUC_MINOR__ == 8 && (__GNUC_PATCHLEVEL__ == 1 || __GNUC_PATCHLEVEL__ == 2)
|
| 35 |
+
// gcc nth_element() bug
|
| 36 |
+
#define NTH_ELEMENT3(begin, middle, end) std::sort(begin, end)
|
| 37 |
+
#define NTH_ELEMENT4(begin, middle, end, orderer) std::sort(begin, end, orderer)
|
| 38 |
+
#else
|
| 39 |
+
#define NTH_ELEMENT3(begin, middle, end) std::nth_element(begin, middle, end)
|
| 40 |
+
#define NTH_ELEMENT4(begin, middle, end, orderer) std::nth_element(begin, middle, end, orderer)
|
| 41 |
+
#endif
|
| 42 |
+
|
| 43 |
+
const char kDefaultDelimiterSymbol[] = " ";
|
| 44 |
+
|
| 45 |
+
int verboselevel();
|
| 46 |
+
int setverboselevel(int v);
|
| 47 |
+
|
| 48 |
+
|
| 49 |
+
const float kEPS = 0.0001f;
|
| 50 |
+
|
| 51 |
+
template <typename T>
|
| 52 |
+
bool IsAlmostEqual(T expected, T actual, float round=kEPS)
|
| 53 |
+
{
|
| 54 |
+
if (std::abs(expected - actual) < round) {
|
| 55 |
+
return true;
|
| 56 |
+
} else {
|
| 57 |
+
std::cerr << "Fail: expected = " << expected
|
| 58 |
+
<< " (actual = " << actual << ")" << std::endl;
|
| 59 |
+
return false;
|
| 60 |
+
}
|
| 61 |
+
}
|
| 62 |
+
|
| 63 |
+
/**
|
| 64 |
+
* Find the specified delimiter for the string 'str', and 'str' is assigned
|
| 65 |
+
* to a substring object that starts at the position of first occurrence of
|
| 66 |
+
* the delimiter in 'str'. 'substr' is copied from 'str' ranging from
|
| 67 |
+
* the start position of 'str' to the position of first occurrence of
|
| 68 |
+
* the delimiter.
|
| 69 |
+
*
|
| 70 |
+
* It returns the position of first occurrence in the queried string.
|
| 71 |
+
* If the content is not found, std::string::npos is returned.
|
| 72 |
+
*/
|
| 73 |
+
size_t getNextPound(std::string &str, std::string &substr,
|
| 74 |
+
const std::string &delimiter = kDefaultDelimiterSymbol);
|
| 75 |
+
|
| 76 |
+
void split(const std::string &s, char delim, std::vector<std::string> &elems);
|
| 77 |
+
|
| 78 |
+
/**
|
| 79 |
+
* Split the string 'str' with specified delimitter 'delim' into tokens.
|
| 80 |
+
* The resulting tokens are set to 'res'.
|
| 81 |
+
*
|
| 82 |
+
* ex. "a,b,c" => {"a", "b", "c"}.
|
| 83 |
+
*/
|
| 84 |
+
void Tokenize(const char *str, const char delim, std::vector<std::string> *res);
|
| 85 |
+
|
| 86 |
+
template<typename T>
|
| 87 |
+
inline T Scan(const std::string &input)
|
| 88 |
+
{
|
| 89 |
+
std::stringstream stream(input);
|
| 90 |
+
T ret;
|
| 91 |
+
stream >> ret;
|
| 92 |
+
return ret;
|
| 93 |
+
}
|
| 94 |
+
|
| 95 |
+
/**
|
| 96 |
+
* Returns true iff "str" ends with "suffix".
|
| 97 |
+
* e.g., Given str = "abc:" and suffix = ":", this function returns true.
|
| 98 |
+
*/
|
| 99 |
+
inline bool EndsWith(const std::string& str, const char* suffix)
|
| 100 |
+
{
|
| 101 |
+
return str.find_last_of(suffix) == str.size() - 1;
|
| 102 |
+
}
|
| 103 |
+
|
| 104 |
+
template<typename T>
|
| 105 |
+
inline std::string stringify(T x)
|
| 106 |
+
{
|
| 107 |
+
std::ostringstream o;
|
| 108 |
+
if (!(o << x))
|
| 109 |
+
throw std::runtime_error("stringify(template<typename T>)");
|
| 110 |
+
return o.str();
|
| 111 |
+
}
|
| 112 |
+
|
| 113 |
+
inline ScoreStatsType ConvertCharToScoreStatsType(const char *str)
|
| 114 |
+
{
|
| 115 |
+
return std::atoi(str);
|
| 116 |
+
}
|
| 117 |
+
|
| 118 |
+
inline ScoreStatsType ConvertStringToScoreStatsType(const std::string& str)
|
| 119 |
+
{
|
| 120 |
+
return ConvertCharToScoreStatsType(str.c_str());
|
| 121 |
+
}
|
| 122 |
+
|
| 123 |
+
inline FeatureStatsType ConvertCharToFeatureStatsType(const char *str)
|
| 124 |
+
{
|
| 125 |
+
return static_cast<FeatureStatsType>(std::atof(str));
|
| 126 |
+
}
|
| 127 |
+
|
| 128 |
+
inline FeatureStatsType ConvertStringToFeatureStatsType(const std::string &str)
|
| 129 |
+
{
|
| 130 |
+
return ConvertCharToFeatureStatsType(str.c_str());
|
| 131 |
+
}
|
| 132 |
+
|
| 133 |
+
inline std::string trimStr(const std::string& Src, const std::string& c = " \r\n")
|
| 134 |
+
{
|
| 135 |
+
size_t p2 = Src.find_last_not_of(c);
|
| 136 |
+
if (p2 == std::string::npos) return std::string();
|
| 137 |
+
size_t p1 = Src.find_first_not_of(c);
|
| 138 |
+
if (p1 == std::string::npos) p1 = 0;
|
| 139 |
+
return Src.substr(p1, (p2-p1)+1);
|
| 140 |
+
}
|
| 141 |
+
|
| 142 |
+
// Utilities to measure decoding time
|
| 143 |
+
void ResetUserTime();
|
| 144 |
+
void PrintUserTime(const std::string &message);
|
| 145 |
+
double GetUserTime();
|
| 146 |
+
|
| 147 |
+
}
|
| 148 |
+
|
| 149 |
+
#endif // MERT_UTIL_H_
|
mosesdecoder/moses/TranslationModel/UG/util/ibm1-align
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:67f9b51b84f1b18fefcfe58feba9a9879648529fed29fbfb90ec0cec4f42a80e
|
| 3 |
+
size 1062799
|
mosesdecoder/scripts/Jamfile
ADDED
|
@@ -0,0 +1,23 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#See ../Jamroot for options.
|
| 2 |
+
import option path ;
|
| 3 |
+
|
| 4 |
+
build-project training ;
|
| 5 |
+
|
| 6 |
+
prefix = [ option.get "prefix" ] ;
|
| 7 |
+
if $(prefix) {
|
| 8 |
+
prefix = [ path.root $(prefix) [ path.pwd ] ] ;
|
| 9 |
+
location = [ option.get "install-scripts" : : $(prefix)$(GITTAG)/scripts ] ;
|
| 10 |
+
} else {
|
| 11 |
+
location = [ option.get "install-scripts" ] ;
|
| 12 |
+
}
|
| 13 |
+
|
| 14 |
+
if $(location) {
|
| 15 |
+
location = [ path.root $(location) [ path.pwd ] ] ;
|
| 16 |
+
install scripts :
|
| 17 |
+
[ glob-tree README *.js *.pl *.perl *.pm *.py *.sh *.php : tests regression-testing other bin ]
|
| 18 |
+
[ glob share/nonbreaking_prefixes/* ems/example/*.* ems/example/data/* ems/web/* analysis/smtgui/* : ems/web/javascripts ]
|
| 19 |
+
generic/fsa-sample.fsa
|
| 20 |
+
ems/experiment.machines
|
| 21 |
+
ems/experiment.meta
|
| 22 |
+
: <install-source-root>. <location>$(location) ;
|
| 23 |
+
}
|
mosesdecoder/scripts/README
ADDED
|
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
2006-07-29
|
| 2 |
+
|
| 3 |
+
This directory should contain all multi-purpose scripts for:
|
| 4 |
+
|
| 5 |
+
- training ... training moses (including BLEU evaluation needed for MERT)
|
| 6 |
+
- analysis ... analyzing MT output (for human analysis)
|
| 7 |
+
- generic ... script for handling generic issues (parallelization)
|
| 8 |
+
- lib ... perl modules used by various scripts
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
The Jamfile then takes care of proper 'release' from your git directory to
|
| 12 |
+
the shared directories.
|
| 13 |
+
|
| 14 |
+
The released scripts should remain in the *same directory structure*.
|
| 15 |
+
|
mosesdecoder/scripts/generic/binarize4moses2.perl
ADDED
|
@@ -0,0 +1,88 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env perl
|
| 2 |
+
|
| 3 |
+
use strict;
|
| 4 |
+
|
| 5 |
+
use Getopt::Long;
|
| 6 |
+
use File::Basename;
|
| 7 |
+
use FindBin qw($RealBin);
|
| 8 |
+
|
| 9 |
+
sub systemCheck($);
|
| 10 |
+
|
| 11 |
+
my $mosesDir = "$RealBin/../..";
|
| 12 |
+
my $ptPath;
|
| 13 |
+
my $lexRoPath;
|
| 14 |
+
my $outPath;
|
| 15 |
+
my $numScores = 4;
|
| 16 |
+
my $numLexScores;
|
| 17 |
+
my $pruneNum = 100;
|
| 18 |
+
my $scfg = 0;
|
| 19 |
+
|
| 20 |
+
GetOptions("phrase-table=s" => \$ptPath,
|
| 21 |
+
"lex-ro=s" => \$lexRoPath,
|
| 22 |
+
"output-dir=s" => \$outPath,
|
| 23 |
+
"num-scores=s" => \$numScores,
|
| 24 |
+
"num-lex-scores=i" => \$numLexScores,
|
| 25 |
+
"prune=i" => \$pruneNum,
|
| 26 |
+
"scfg" => \$scfg
|
| 27 |
+
) or exit 1;
|
| 28 |
+
|
| 29 |
+
#print STDERR "scfg=$scfg \n";
|
| 30 |
+
die("ERROR: please set --phrase-table") unless defined($ptPath);
|
| 31 |
+
#die("ERROR: please set --lex-ro") unless defined($lexRoPath);
|
| 32 |
+
die("ERROR: please set --output-dir") unless defined($outPath);
|
| 33 |
+
#die("ERROR: please set --num-lex-scores") unless defined($numLexScores);
|
| 34 |
+
die("ERROR: compile contrib/sigtest-filter") if (!-X "$mosesDir/contrib/sigtest-filter/filter-pt");
|
| 35 |
+
die("ERROR: compile with bjam --with-cmph") if (!-X "$mosesDir/bin/processLexicalTableMin");
|
| 36 |
+
die("ERROR: compile with bjam --with-xmlrpc-c") if (!-X "$mosesDir/bin/CreateProbingPT");
|
| 37 |
+
|
| 38 |
+
my $cmd;
|
| 39 |
+
|
| 40 |
+
my $tempPath = dirname($outPath) ."/tmp.$$";
|
| 41 |
+
`mkdir -p $tempPath`;
|
| 42 |
+
|
| 43 |
+
$cmd = "gzip -dc $ptPath | $mosesDir/contrib/sigtest-filter/filter-pt -n $pruneNum | gzip -c > $tempPath/pt.gz";
|
| 44 |
+
systemCheck($cmd);
|
| 45 |
+
|
| 46 |
+
if (defined($lexRoPath)) {
|
| 47 |
+
die("ERROR: please set --num-lex-scores") unless defined($numLexScores);
|
| 48 |
+
|
| 49 |
+
$cmd = "$mosesDir/bin/processLexicalTableMin -in $lexRoPath -out $tempPath/lex-ro -T . -threads all";
|
| 50 |
+
systemCheck($cmd);
|
| 51 |
+
|
| 52 |
+
$cmd = "$mosesDir/bin/addLexROtoPT $tempPath/pt.gz $tempPath/lex-ro.minlexr | gzip -c > $tempPath/pt.withLexRO.gz";
|
| 53 |
+
systemCheck($cmd);
|
| 54 |
+
|
| 55 |
+
$cmd = "ln -s pt.withLexRO.gz $tempPath/pt.txt.gz";
|
| 56 |
+
systemCheck($cmd);
|
| 57 |
+
}
|
| 58 |
+
else {
|
| 59 |
+
$cmd = "ln -s pt.gz $tempPath/pt.txt.gz";
|
| 60 |
+
systemCheck($cmd);
|
| 61 |
+
}
|
| 62 |
+
|
| 63 |
+
$cmd = "$mosesDir/bin/CreateProbingPT2 --num-scores $numScores --log-prob --input-pt $tempPath/pt.txt.gz --output-dir $outPath";
|
| 64 |
+
|
| 65 |
+
if (defined($lexRoPath)) {
|
| 66 |
+
$cmd .= " --num-lex-scores $numLexScores";
|
| 67 |
+
}
|
| 68 |
+
|
| 69 |
+
if ($scfg) {
|
| 70 |
+
$cmd .= " --scfg";
|
| 71 |
+
}
|
| 72 |
+
|
| 73 |
+
systemCheck($cmd);
|
| 74 |
+
|
| 75 |
+
exit(0);
|
| 76 |
+
|
| 77 |
+
#####################################################
|
| 78 |
+
sub systemCheck($)
|
| 79 |
+
{
|
| 80 |
+
my $cmd = shift;
|
| 81 |
+
print STDERR "Executing: $cmd\n";
|
| 82 |
+
|
| 83 |
+
my $retVal = system($cmd);
|
| 84 |
+
if ($retVal != 0)
|
| 85 |
+
{
|
| 86 |
+
exit(1);
|
| 87 |
+
}
|
| 88 |
+
}
|
mosesdecoder/scripts/generic/bsbleu.py
ADDED
|
@@ -0,0 +1,179 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python
|
| 2 |
+
# compute Bleu scores with confidence intervals via boostrap resampling
|
| 3 |
+
# written by Ulrich Germann
|
| 4 |
+
#
|
| 5 |
+
# This file is part of moses. Its use is licensed under the GNU Lesser General
|
| 6 |
+
# Public License version 2.1 or, at your option, any later version.
|
| 7 |
+
|
| 8 |
+
from argparse import ArgumentParser
|
| 9 |
+
import math
|
| 10 |
+
import os
|
| 11 |
+
from random import randint
|
| 12 |
+
import sys, gzip
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
def count_ngrams(snt, max_n):
|
| 16 |
+
"""
|
| 17 |
+
Return a dictionary of ngram counts (up to length /max_n/)
|
| 18 |
+
for sentence (list of words) /snt/.
|
| 19 |
+
"""
|
| 20 |
+
ret = {}
|
| 21 |
+
for i in xrange(len(snt)):
|
| 22 |
+
for k in xrange(i + 1, min(i + max_n + 1, len(snt) + 1)):
|
| 23 |
+
key = tuple(snt[i:k])
|
| 24 |
+
ret[key] = ret.get(key, 0) + 1
|
| 25 |
+
return ret
|
| 26 |
+
|
| 27 |
+
|
| 28 |
+
def max_counts(ng1, ng2):
|
| 29 |
+
"""
|
| 30 |
+
Return a dicitonary of ngram counts such that
|
| 31 |
+
each count is the greater of the two individual counts
|
| 32 |
+
for each ngram in the input ngram count dictionaries
|
| 33 |
+
/ng1/ and /ng2/.
|
| 34 |
+
"""
|
| 35 |
+
ret = ng1.copy()
|
| 36 |
+
for k, v in ng2.items():
|
| 37 |
+
ret[k] = max(ret.get(k, 0), v)
|
| 38 |
+
return ret
|
| 39 |
+
|
| 40 |
+
|
| 41 |
+
def ng_hits(hyp, ref, max_n):
|
| 42 |
+
"""
|
| 43 |
+
Return a list of ngram counts such that each ngram count
|
| 44 |
+
is the minimum of the counts in hyp and ref, up to ngram
|
| 45 |
+
length /max_n/.
|
| 46 |
+
"""
|
| 47 |
+
ret = [0 for i in xrange(max_n)]
|
| 48 |
+
for ng, cnt in hyp.items():
|
| 49 |
+
k = ng
|
| 50 |
+
if len(k) <= max_n:
|
| 51 |
+
ret[len(k) - 1] += min(cnt, ref.get(ng, 0))
|
| 52 |
+
return ret
|
| 53 |
+
|
| 54 |
+
|
| 55 |
+
class BleuScore:
|
| 56 |
+
def __init__(self, hyp, ref, max_n=4, bootstrap=1000):
|
| 57 |
+
# print len(hyp.ngrams), len(ref.ngrams), "X"
|
| 58 |
+
self.hits = [
|
| 59 |
+
ng_hits(hyp.ngrams[i], ref.ngrams[i], max_n)
|
| 60 |
+
for i in xrange(len(hyp.ngrams))]
|
| 61 |
+
self.max_n = max_n
|
| 62 |
+
self.hyp = hyp
|
| 63 |
+
self.ref = ref
|
| 64 |
+
self.lower = None
|
| 65 |
+
self.upper = None
|
| 66 |
+
self.median = None
|
| 67 |
+
self.actual = self.score([i for i in xrange(len(hyp.snt))])
|
| 68 |
+
if bootstrap:
|
| 69 |
+
self.bootstrap = [self.score([randint(0, len(hyp.snt) - 1)
|
| 70 |
+
for s in hyp.snt])
|
| 71 |
+
for i in xrange(bootstrap)]
|
| 72 |
+
self.bootstrap.sort()
|
| 73 |
+
else:
|
| 74 |
+
self.bootstrap = [self.actual]
|
| 75 |
+
pass
|
| 76 |
+
|
| 77 |
+
def score(self, sample):
|
| 78 |
+
hits = [0 for i in xrange(self.max_n)]
|
| 79 |
+
self.hyplen = 0
|
| 80 |
+
self.reflen = 0
|
| 81 |
+
self.total = [0 for i in hits]
|
| 82 |
+
for i in sample:
|
| 83 |
+
self.hyplen += len(self.hyp.snt[i])
|
| 84 |
+
self.reflen += len(self.ref.snt[i])
|
| 85 |
+
for n in xrange(self.max_n):
|
| 86 |
+
hits[n] += self.hits[i][n]
|
| 87 |
+
self.total[n] += max(len(self.hyp.snt[i]) - n, 0)
|
| 88 |
+
pass
|
| 89 |
+
self.prec = [float(hits[n]) / self.total[n]
|
| 90 |
+
for n in xrange(self.max_n)]
|
| 91 |
+
ret = sum([math.log(x) for x in self.prec]) / self.max_n
|
| 92 |
+
self.BP = min(
|
| 93 |
+
1, math.exp(1. - float(self.reflen) / float(self.hyplen)))
|
| 94 |
+
ret += math.log(self.BP)
|
| 95 |
+
return math.exp(ret)
|
| 96 |
+
|
| 97 |
+
|
| 98 |
+
class Document:
|
| 99 |
+
def __init__(self, fname=None):
|
| 100 |
+
self.fname = fname
|
| 101 |
+
if fname:
|
| 102 |
+
if fname[-3:] == ".gz":
|
| 103 |
+
self.snt = [line.strip().split() for line in gzip.open(fname).readlines()]
|
| 104 |
+
else:
|
| 105 |
+
self.snt = [line.strip().split() for line in open(fname)]
|
| 106 |
+
pass
|
| 107 |
+
self.ngrams = [count_ngrams(snt, 4) for snt in self.snt]
|
| 108 |
+
# print self.snt
|
| 109 |
+
else:
|
| 110 |
+
self.snt = None
|
| 111 |
+
self.ngrams = None
|
| 112 |
+
|
| 113 |
+
def merge(self, R):
|
| 114 |
+
self.fname = "multi-ref"
|
| 115 |
+
self.ngrams = [x for x in R[0].ngrams]
|
| 116 |
+
self.snt = [x for x in R[0].snt]
|
| 117 |
+
for i in xrange(len(R[0].ngrams)):
|
| 118 |
+
for k in xrange(1, len(R)):
|
| 119 |
+
self.ngrams[i] = max_counts(self.ngrams[i], R[k].ngrams[i])
|
| 120 |
+
|
| 121 |
+
def update(self, hyp, R):
|
| 122 |
+
for i, hyp_snt in enumerate(hyp.snt):
|
| 123 |
+
clen = len(hyp_snt)
|
| 124 |
+
K = 0
|
| 125 |
+
for k in xrange(1, len(R)):
|
| 126 |
+
k_snt = R[k].snt[i]
|
| 127 |
+
assert len(R[k].snt) == len(hyp.snt), (
|
| 128 |
+
"Mismatch in number of sentences " +
|
| 129 |
+
"between reference and candidate")
|
| 130 |
+
if abs(len(k_snt) - clen) == abs(len(R[K].snt[i]) - clen):
|
| 131 |
+
if len(k_snt) < len(R[K].snt[i]):
|
| 132 |
+
K = k
|
| 133 |
+
elif abs(len(k_snt) - clen) < abs(len(R[K].snt[i]) - clen):
|
| 134 |
+
K = k
|
| 135 |
+
self.snt[i] = R[K].snt[i]
|
| 136 |
+
|
| 137 |
+
|
| 138 |
+
if __name__ == "__main__":
|
| 139 |
+
argparser = ArgumentParser()
|
| 140 |
+
argparser.add_argument(
|
| 141 |
+
"-r", "--ref", nargs='+', help="Reference translation(s).")
|
| 142 |
+
argparser.add_argument(
|
| 143 |
+
"-c", "--cand", nargs='+', help="Candidate translations.")
|
| 144 |
+
argparser.add_argument(
|
| 145 |
+
"-i", "--individual", action='store_true',
|
| 146 |
+
help="Compute BLEU scores for individual references.")
|
| 147 |
+
argparser.add_argument(
|
| 148 |
+
"-b", "--bootstrap", type=int, default=1000,
|
| 149 |
+
help="Sample size for bootstrap resampling.")
|
| 150 |
+
argparser.add_argument(
|
| 151 |
+
"-a", "--alpha", type=float, default=.05,
|
| 152 |
+
help="1-alpha = confidence interval.")
|
| 153 |
+
args = argparser.parse_args(sys.argv[1:])
|
| 154 |
+
R = [Document(fname) for fname in args.ref]
|
| 155 |
+
C = [Document(fname) for fname in args.cand]
|
| 156 |
+
Rx = Document() # for multi-reference BLEU
|
| 157 |
+
Rx.merge(R)
|
| 158 |
+
for c in C:
|
| 159 |
+
# compute multi-reference BLEU
|
| 160 |
+
Rx.update(c, R)
|
| 161 |
+
bleu = BleuScore(c, Rx, bootstrap=args.bootstrap)
|
| 162 |
+
print "%5.2f %s [%5.2f-%5.2f; %5.2f] %s" % (
|
| 163 |
+
100 * bleu.actual,
|
| 164 |
+
os.path.basename(Rx.fname),
|
| 165 |
+
100 * bleu.bootstrap[int((args.alpha / 2) * args.bootstrap)],
|
| 166 |
+
100 * bleu.bootstrap[int((1 - (args.alpha / 2)) * args.bootstrap)],
|
| 167 |
+
100 * bleu.bootstrap[int(.5 * args.bootstrap)],
|
| 168 |
+
c.fname) # os.path.basename(c.fname))
|
| 169 |
+
|
| 170 |
+
if args.individual:
|
| 171 |
+
for r in R:
|
| 172 |
+
bleu = BleuScore(c, r, bootstrap=args.bootstrap)
|
| 173 |
+
print " %5.2f %s" % (
|
| 174 |
+
100 * bleu.actual, os.path.basename(r.fname))
|
| 175 |
+
# print bleu.prec, bleu.hyplen, bleu.reflen, bleu.BP
|
| 176 |
+
|
| 177 |
+
# print [
|
| 178 |
+
# sum([bleu.hits[i][n] for i in xrange(len(bleu.hits))])
|
| 179 |
+
# for n in xrange(4)]
|
mosesdecoder/scripts/generic/compound-splitter.perl
ADDED
|
@@ -0,0 +1,295 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env perl
|
| 2 |
+
#
|
| 3 |
+
# This file is part of moses. Its use is licensed under the GNU Lesser General
|
| 4 |
+
# Public License version 2.1 or, at your option, any later version.
|
| 5 |
+
|
| 6 |
+
use warnings;
|
| 7 |
+
use strict;
|
| 8 |
+
use Getopt::Long "GetOptions";
|
| 9 |
+
|
| 10 |
+
my ($CORPUS,$MODEL,$TRAIN,$HELP,$VERBOSE);
|
| 11 |
+
my $FILLER = ":s:es";
|
| 12 |
+
my $MIN_SIZE = 3;
|
| 13 |
+
my $MIN_COUNT = 5;
|
| 14 |
+
my $MAX_COUNT = 5;
|
| 15 |
+
my $FACTORED = 0;
|
| 16 |
+
my $SYNTAX = 0;
|
| 17 |
+
my $MARK_SPLIT = 0;
|
| 18 |
+
my $BINARIZE = 0;
|
| 19 |
+
$HELP = 1
|
| 20 |
+
unless &GetOptions('corpus=s' => \$CORPUS,
|
| 21 |
+
'model=s' => \$MODEL,
|
| 22 |
+
'filler=s' => \$FILLER,
|
| 23 |
+
'factored' => \$FACTORED,
|
| 24 |
+
'min-size=i' => \$MIN_SIZE,
|
| 25 |
+
'min-count=i' => \$MIN_COUNT,
|
| 26 |
+
'max-count=i' => \$MAX_COUNT,
|
| 27 |
+
'help' => \$HELP,
|
| 28 |
+
'verbose' => \$VERBOSE,
|
| 29 |
+
'syntax' => \$SYNTAX,
|
| 30 |
+
'binarize' => \$BINARIZE,
|
| 31 |
+
'mark-split' => \$MARK_SPLIT,
|
| 32 |
+
'train' => \$TRAIN);
|
| 33 |
+
|
| 34 |
+
if ($HELP ||
|
| 35 |
+
( $TRAIN && !$CORPUS) ||
|
| 36 |
+
(!$TRAIN && !$MODEL)) {
|
| 37 |
+
print "Compound splitter\n";
|
| 38 |
+
print "-----------------\n\n";
|
| 39 |
+
print "train: compound-splitter -train -corpus txt-file -model new-model\n";
|
| 40 |
+
print "apply: compound-splitter -model trained-model < in > out\n";
|
| 41 |
+
print "options: -min-size: minimum word size (default $MIN_SIZE)\n";
|
| 42 |
+
print " -min-count: minimum word count (default $MIN_COUNT)\n";
|
| 43 |
+
print " -filler: filler letters between words (default $FILLER)\n";
|
| 44 |
+
print " -factor: factored data, assuming factor 0 as surface (default $FACTORED)\n";
|
| 45 |
+
print " -syntax: syntactically parsed data (default $SYNTAX)\n";
|
| 46 |
+
print " -mark-split: mark non-terminal label of split words (default $MARK_SPLIT)\n";
|
| 47 |
+
print " -binarize: binarize subtree for split word (default $BINARIZE)\n";
|
| 48 |
+
exit;
|
| 49 |
+
}
|
| 50 |
+
|
| 51 |
+
if ($TRAIN) {
|
| 52 |
+
if ($SYNTAX) { &train_syntax(); }
|
| 53 |
+
elsif ($FACTORED) { &train_factored(); }
|
| 54 |
+
else { &train(); }
|
| 55 |
+
}
|
| 56 |
+
else {
|
| 57 |
+
&apply();
|
| 58 |
+
}
|
| 59 |
+
|
| 60 |
+
sub train {
|
| 61 |
+
my %COUNT;
|
| 62 |
+
open(CORPUS,$CORPUS) || die("ERROR: could not open corpus '$CORPUS'");
|
| 63 |
+
while(<CORPUS>) {
|
| 64 |
+
chop; s/\s+/ /g; s/^ //; s/ $//;
|
| 65 |
+
foreach (split) {
|
| 66 |
+
$COUNT{$_}++;
|
| 67 |
+
}
|
| 68 |
+
}
|
| 69 |
+
close(CORPUS);
|
| 70 |
+
&save_trained_model(\%COUNT);
|
| 71 |
+
}
|
| 72 |
+
|
| 73 |
+
sub save_trained_model {
|
| 74 |
+
my ($COUNT) = @_;
|
| 75 |
+
my $id = 0;
|
| 76 |
+
open(MODEL,">".$MODEL);
|
| 77 |
+
foreach my $word (keys %$COUNT) {
|
| 78 |
+
print MODEL "".(++$id)."\t".$word."\t".$$COUNT{$word}."\n";
|
| 79 |
+
}
|
| 80 |
+
close(MODEL);
|
| 81 |
+
print STDERR "written model file with ".(scalar keys %$COUNT)." words.\n";
|
| 82 |
+
}
|
| 83 |
+
|
| 84 |
+
sub train_factored {
|
| 85 |
+
my (%COUNT,%FACTORED_COUNT);
|
| 86 |
+
# collect counts for interpretations for each surface word
|
| 87 |
+
open(CORPUS,$CORPUS) || die("ERROR: could not open corpus '$CORPUS'");
|
| 88 |
+
while(<CORPUS>) {
|
| 89 |
+
chop; s/\s+/ /g; s/^ //; s/ $//;
|
| 90 |
+
foreach my $factored_word (split) {
|
| 91 |
+
my $word = $factored_word;
|
| 92 |
+
$word =~ s/\|.+//g; # just first factor
|
| 93 |
+
$FACTORED_COUNT{$word}{$factored_word}++;
|
| 94 |
+
}
|
| 95 |
+
}
|
| 96 |
+
close(CORPUS);
|
| 97 |
+
# only preserve most frequent interpretation, assign sum of counts
|
| 98 |
+
foreach my $word (keys %FACTORED_COUNT) {
|
| 99 |
+
my ($max,$best,$total) = (0,"",0);
|
| 100 |
+
foreach my $factored_word (keys %{$FACTORED_COUNT{$word}}) {
|
| 101 |
+
my $count = $FACTORED_COUNT{$word}{$factored_word};
|
| 102 |
+
$total += $count;
|
| 103 |
+
if ($count > $max) {
|
| 104 |
+
$max = $count;
|
| 105 |
+
$best = $factored_word;
|
| 106 |
+
}
|
| 107 |
+
}
|
| 108 |
+
$COUNT{$best} = $total;
|
| 109 |
+
}
|
| 110 |
+
&save_trained_model(\%COUNT);
|
| 111 |
+
}
|
| 112 |
+
|
| 113 |
+
sub train_syntax {
|
| 114 |
+
my (%COUNT,%LABELED_COUNT);
|
| 115 |
+
# collect counts for interpretations for each surface word
|
| 116 |
+
open(CORPUS,$CORPUS) || die("ERROR: could not open corpus '$CORPUS'");
|
| 117 |
+
while(<CORPUS>) {
|
| 118 |
+
chop; s/\s+/ /g; s/^ //; s/ $//;
|
| 119 |
+
my $label;
|
| 120 |
+
foreach (split) {
|
| 121 |
+
if (/^label="([^\"]+)"/) {
|
| 122 |
+
$label = $1;
|
| 123 |
+
}
|
| 124 |
+
elsif (! /^</) {
|
| 125 |
+
$LABELED_COUNT{$_}{$label}++;
|
| 126 |
+
}
|
| 127 |
+
}
|
| 128 |
+
}
|
| 129 |
+
close(CORPUS);
|
| 130 |
+
|
| 131 |
+
# only preserve most frequent label, assign sum of counts
|
| 132 |
+
foreach my $word (keys %LABELED_COUNT) {
|
| 133 |
+
my ($max,$best,$total) = (0,"",0);
|
| 134 |
+
foreach my $label (keys %{$LABELED_COUNT{$word}}) {
|
| 135 |
+
my $count = $LABELED_COUNT{$word}{$label};
|
| 136 |
+
$total += $count;
|
| 137 |
+
if ($count > $max) {
|
| 138 |
+
$max = $count;
|
| 139 |
+
$best = "$word $label";
|
| 140 |
+
}
|
| 141 |
+
}
|
| 142 |
+
$COUNT{$best} = $total;
|
| 143 |
+
}
|
| 144 |
+
&save_trained_model(\%COUNT);
|
| 145 |
+
}
|
| 146 |
+
|
| 147 |
+
sub apply {
|
| 148 |
+
my (%COUNT,%TRUECASE,%LABEL);
|
| 149 |
+
open(MODEL,$MODEL) || die("ERROR: could not open model '$MODEL'");
|
| 150 |
+
while(<MODEL>) {
|
| 151 |
+
chomp;
|
| 152 |
+
my ($id,$factored_word,$count) = split(/\t/);
|
| 153 |
+
my $label;
|
| 154 |
+
($factored_word,$label) = split(/ /,$factored_word);
|
| 155 |
+
my $word = $factored_word;
|
| 156 |
+
$word =~ s/\|.+//g; # just first factor
|
| 157 |
+
my $lc = lc($word);
|
| 158 |
+
# if word exists with multipe casings, only record most frequent
|
| 159 |
+
next if defined($COUNT{$lc}) && $COUNT{$lc} > $count;
|
| 160 |
+
$COUNT{$lc} = $count;
|
| 161 |
+
$TRUECASE{$lc} = $factored_word;
|
| 162 |
+
$LABEL{$lc} = $label if $SYNTAX;
|
| 163 |
+
}
|
| 164 |
+
close(MODEL);
|
| 165 |
+
|
| 166 |
+
while(<STDIN>) {
|
| 167 |
+
my $first = 1;
|
| 168 |
+
chop; s/\s+/ /g; s/^ //; s/ $//;
|
| 169 |
+
my @BUFFER; # for xml tags
|
| 170 |
+
foreach my $factored_word (split) {
|
| 171 |
+
print " " unless $first;
|
| 172 |
+
$first = 0;
|
| 173 |
+
|
| 174 |
+
# syntax: don't split xml
|
| 175 |
+
if ($SYNTAX && ($factored_word =~ /^</ || $factored_word =~ />$/)) {
|
| 176 |
+
push @BUFFER,$factored_word;
|
| 177 |
+
$first = 1;
|
| 178 |
+
next;
|
| 179 |
+
}
|
| 180 |
+
|
| 181 |
+
# get case class
|
| 182 |
+
my $word = $factored_word;
|
| 183 |
+
$word =~ s/\|.+//g; # just first factor
|
| 184 |
+
my $lc = lc($word);
|
| 185 |
+
|
| 186 |
+
print STDERR "considering $word ($lc)...\n" if $VERBOSE;
|
| 187 |
+
# don't split frequent words
|
| 188 |
+
if ((defined($COUNT{$lc}) && $COUNT{$lc}>=$MAX_COUNT) ||
|
| 189 |
+
$lc !~ /[a-zA-Z]/) {; # has to have at least one letter
|
| 190 |
+
print join(" ",@BUFFER)." " if scalar(@BUFFER); @BUFFER = (); # clear buffer
|
| 191 |
+
print $factored_word;
|
| 192 |
+
print STDERR "\tfrequent word ($COUNT{$lc}>=$MAX_COUNT), skipping\n" if $VERBOSE;
|
| 193 |
+
next;
|
| 194 |
+
}
|
| 195 |
+
|
| 196 |
+
# consider possible splits
|
| 197 |
+
my $final = length($word)-1;
|
| 198 |
+
my %REACHABLE;
|
| 199 |
+
for(my $i=0;$i<=$final;$i++) { $REACHABLE{$i} = (); }
|
| 200 |
+
|
| 201 |
+
print STDERR "splitting $word:\n" if $VERBOSE;
|
| 202 |
+
for(my $end=$MIN_SIZE;$end<length($word);$end++) {
|
| 203 |
+
for(my $start=0;$start<=$end-$MIN_SIZE;$start++) {
|
| 204 |
+
next unless $start == 0 || defined($REACHABLE{$start-1});
|
| 205 |
+
foreach my $filler (split(/:/,$FILLER)) {
|
| 206 |
+
next if $start == 0 && $filler ne "";
|
| 207 |
+
next if lc(substr($word,$start,length($filler))) ne $filler;
|
| 208 |
+
my $subword = lc(substr($word,
|
| 209 |
+
$start+length($filler),
|
| 210 |
+
$end-$start+1-length($filler)));
|
| 211 |
+
next unless defined($COUNT{$subword});
|
| 212 |
+
next unless $COUNT{$subword} >= $MIN_COUNT;
|
| 213 |
+
print STDERR "\tmatching word $start .. $end ($filler)$subword $COUNT{$subword}\n" if $VERBOSE;
|
| 214 |
+
push @{$REACHABLE{$end}},"$start $TRUECASE{$subword} $COUNT{$subword}";
|
| 215 |
+
}
|
| 216 |
+
}
|
| 217 |
+
}
|
| 218 |
+
|
| 219 |
+
# no matches at all?
|
| 220 |
+
if (!defined($REACHABLE{$final})) {
|
| 221 |
+
print join(" ",@BUFFER)." " if scalar(@BUFFER); @BUFFER = (); # clear buffer
|
| 222 |
+
print $factored_word;
|
| 223 |
+
next;
|
| 224 |
+
}
|
| 225 |
+
|
| 226 |
+
my ($best_split,$best_score) = ("",0);
|
| 227 |
+
|
| 228 |
+
my %ITERATOR;
|
| 229 |
+
for(my $i=0;$i<=$final;$i++) { $ITERATOR{$i}=0; }
|
| 230 |
+
my $done = 0;
|
| 231 |
+
while(1) {
|
| 232 |
+
# read off word
|
| 233 |
+
my ($pos,$decomp,$score,$num,@INDEX) = ($final,"",1,0);
|
| 234 |
+
while($pos>0) {
|
| 235 |
+
last unless scalar @{$REACHABLE{$pos}} > $ITERATOR{$pos}; # dead end?
|
| 236 |
+
my ($nextpos,$subword,$count)
|
| 237 |
+
= split(/ /,$REACHABLE{$pos}[ $ITERATOR{$pos} ]);
|
| 238 |
+
$decomp = $subword." ".$decomp;
|
| 239 |
+
$score *= $count;
|
| 240 |
+
$num++;
|
| 241 |
+
push @INDEX,$pos;
|
| 242 |
+
# print STDERR "($nextpos-$pos,$decomp,$score,$num)\n";
|
| 243 |
+
$pos = $nextpos-1;
|
| 244 |
+
}
|
| 245 |
+
|
| 246 |
+
chop($decomp);
|
| 247 |
+
print STDERR "\tsplit: $decomp ($score ** 1/$num) = ".($score ** (1/$num))."\n" if $VERBOSE;
|
| 248 |
+
$score **= 1/$num;
|
| 249 |
+
if ($score>$best_score) {
|
| 250 |
+
$best_score = $score;
|
| 251 |
+
$best_split = $decomp;
|
| 252 |
+
}
|
| 253 |
+
|
| 254 |
+
# increase iterator
|
| 255 |
+
my $increase = -1;
|
| 256 |
+
while($increase<$final) {
|
| 257 |
+
$increase = pop @INDEX;
|
| 258 |
+
$ITERATOR{$increase}++;
|
| 259 |
+
last if scalar @{$REACHABLE{$increase}} > $ITERATOR{$increase};
|
| 260 |
+
}
|
| 261 |
+
last unless scalar @{$REACHABLE{$final}} > $ITERATOR{$final};
|
| 262 |
+
for(my $i=0;$i<$increase;$i++) { $ITERATOR{$i}=0; }
|
| 263 |
+
}
|
| 264 |
+
if ($best_split !~ / /) {
|
| 265 |
+
print join(" ",@BUFFER)." " if scalar(@BUFFER); @BUFFER = (); # clear buffer
|
| 266 |
+
print $factored_word; # do not change case for unsplit words
|
| 267 |
+
next;
|
| 268 |
+
}
|
| 269 |
+
if (!$SYNTAX) {
|
| 270 |
+
print $best_split;
|
| 271 |
+
}
|
| 272 |
+
else {
|
| 273 |
+
$BUFFER[$#BUFFER] =~ s/label=\"/label=\"SPLIT-/ if $MARK_SPLIT;
|
| 274 |
+
$BUFFER[$#BUFFER] =~ /label=\"([^\"]+)\"/ || die("ERROR: $BUFFER[$#BUFFER]\n");
|
| 275 |
+
my $pos = $1;
|
| 276 |
+
print join(" ",@BUFFER)." " if scalar(@BUFFER); @BUFFER = (); # clear buffer
|
| 277 |
+
|
| 278 |
+
my @SPLIT = split(/ /,$best_split);
|
| 279 |
+
my @OUT = ();
|
| 280 |
+
if ($BINARIZE) {
|
| 281 |
+
for(my $w=0;$w<scalar(@SPLIT)-2;$w++) {
|
| 282 |
+
push @OUT,"<tree label=\"\@$pos\">";
|
| 283 |
+
}
|
| 284 |
+
}
|
| 285 |
+
for(my $w=0;$w<scalar(@SPLIT);$w++) {
|
| 286 |
+
if ($BINARIZE && $w>=2) { push @OUT, "</tree>"; }
|
| 287 |
+
push @OUT,"<tree label=\"".$LABEL{lc($SPLIT[$w])}."\"> $SPLIT[$w] </tree>";
|
| 288 |
+
}
|
| 289 |
+
print join(" ",@OUT);
|
| 290 |
+
}
|
| 291 |
+
}
|
| 292 |
+
print " ".join(" ",@BUFFER) if scalar(@BUFFER); @BUFFER = (); # clear buffer
|
| 293 |
+
print "\n";
|
| 294 |
+
}
|
| 295 |
+
}
|
mosesdecoder/scripts/generic/extract-factors.pl
ADDED
|
@@ -0,0 +1,24 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env perl
|
| 2 |
+
#
|
| 3 |
+
# This file is part of moses. Its use is licensed under the GNU Lesser General
|
| 4 |
+
# Public License version 2.1 or, at your option, any later version.
|
| 5 |
+
|
| 6 |
+
# $Id$
|
| 7 |
+
#extract-factors.pl: extract only the desired factors from a factored corpus
|
| 8 |
+
#usage: extract-factors corpusfile factor-index factor-index ... > outfile
|
| 9 |
+
#factor indices start at 0
|
| 10 |
+
#factor indices too large ought to be ignored
|
| 11 |
+
|
| 12 |
+
use warnings;
|
| 13 |
+
use strict;
|
| 14 |
+
|
| 15 |
+
my ($filename, @factors) = @ARGV;
|
| 16 |
+
my %indices = map {$_ => 1} @factors;
|
| 17 |
+
|
| 18 |
+
open(INFILE, "<$filename") or die "couldn't open '$filename' for read: $!\n";
|
| 19 |
+
while(my $line = <INFILE>)
|
| 20 |
+
{
|
| 21 |
+
chop $line;
|
| 22 |
+
print join(' ', map {my $i = 0; join('|', grep($indices{$i++}, split(/\|/, $_)))} split(/\s+/, $line)) . "\n";
|
| 23 |
+
}
|
| 24 |
+
close(INFILE);
|
mosesdecoder/scripts/generic/extract-parallel.perl
ADDED
|
@@ -0,0 +1,385 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env perl
|
| 2 |
+
#
|
| 3 |
+
# This file is part of moses. Its use is licensed under the GNU Lesser General
|
| 4 |
+
# Public License version 2.1 or, at your option, any later version.
|
| 5 |
+
|
| 6 |
+
# example
|
| 7 |
+
# ./extract-parallel.perl 8 ./coreutils-8.9/src/split "./coreutils-8.9/src/sort --batch-size=253" ./extract ./corpus.5.en ./corpus.5.ar ./align.ar-en.grow-diag-final-and ./extracted 7 --NoFileLimit orientation --GZOutput
|
| 8 |
+
|
| 9 |
+
use warnings;
|
| 10 |
+
use strict;
|
| 11 |
+
use File::Basename;
|
| 12 |
+
|
| 13 |
+
sub RunFork($);
|
| 14 |
+
sub systemCheck($);
|
| 15 |
+
sub NumStr($);
|
| 16 |
+
sub DigitStr($);
|
| 17 |
+
sub CharStr($);
|
| 18 |
+
sub GetSplitVersion($);
|
| 19 |
+
|
| 20 |
+
my $alph = "abcdefghijklmnopqrstuvwxyz";
|
| 21 |
+
my @alph = (split(//,$alph));
|
| 22 |
+
|
| 23 |
+
print "Started ".localtime() ."\n";
|
| 24 |
+
|
| 25 |
+
my $numParallel= $ARGV[0];
|
| 26 |
+
$numParallel = 1 if $numParallel < 1;
|
| 27 |
+
|
| 28 |
+
my $splitCmd= $ARGV[1];
|
| 29 |
+
my $sortCmd= $ARGV[2];
|
| 30 |
+
my $extractCmd= $ARGV[3];
|
| 31 |
+
|
| 32 |
+
my $target = $ARGV[4]; # 1st arg of extract argument
|
| 33 |
+
my $source = $ARGV[5]; # 2nd arg of extract argument
|
| 34 |
+
my $align = $ARGV[6]; # 3rd arg of extract argument
|
| 35 |
+
my $extract = $ARGV[7]; # 4th arg of extract argument
|
| 36 |
+
|
| 37 |
+
my $makeTTable = 1; # whether to build the ttable extract files
|
| 38 |
+
my $otherExtractArgs= "";
|
| 39 |
+
my $weights = "";
|
| 40 |
+
my $baselineExtract;
|
| 41 |
+
my $glueFile;
|
| 42 |
+
my $phraseOrientation = 0;
|
| 43 |
+
my $phraseOrientationPriorsFile;
|
| 44 |
+
my $splitCmdOption = "";
|
| 45 |
+
|
| 46 |
+
my $GZIP_EXEC;
|
| 47 |
+
if(`which pigz 2> /dev/null`) {
|
| 48 |
+
$GZIP_EXEC = 'pigz';
|
| 49 |
+
}
|
| 50 |
+
else {
|
| 51 |
+
$GZIP_EXEC = 'gzip';
|
| 52 |
+
}
|
| 53 |
+
print STDERR "using $GZIP_EXEC \n";
|
| 54 |
+
|
| 55 |
+
my $isBSDSplit = GetSplitVersion($splitCmd);
|
| 56 |
+
print STDERR "isBSDSplit=$isBSDSplit \n";
|
| 57 |
+
|
| 58 |
+
if ($isBSDSplit == 0) {
|
| 59 |
+
$splitCmdOption .= "-d";
|
| 60 |
+
}
|
| 61 |
+
|
| 62 |
+
my $gzOut = 0;
|
| 63 |
+
|
| 64 |
+
for (my $i = 8; $i < $#ARGV + 1; ++$i)
|
| 65 |
+
{
|
| 66 |
+
$makeTTable = 0 if $ARGV[$i] eq "--NoTTable";
|
| 67 |
+
if ($ARGV[$i] eq '--BaselineExtract') {
|
| 68 |
+
$baselineExtract = $ARGV[++$i];
|
| 69 |
+
next;
|
| 70 |
+
}
|
| 71 |
+
if ($ARGV[$i] eq '--InstanceWeights') {
|
| 72 |
+
$weights = $ARGV[++$i];
|
| 73 |
+
next;
|
| 74 |
+
}
|
| 75 |
+
if ($ARGV[$i] eq '--GlueGrammar') {
|
| 76 |
+
$glueFile = $ARGV[++$i];
|
| 77 |
+
next;
|
| 78 |
+
}
|
| 79 |
+
$phraseOrientation = 1 if $ARGV[$i] eq "--PhraseOrientation";
|
| 80 |
+
if ($ARGV[$i] eq '--PhraseOrientationPriors') {
|
| 81 |
+
$phraseOrientationPriorsFile = $ARGV[++$i];
|
| 82 |
+
next;
|
| 83 |
+
}
|
| 84 |
+
if ($ARGV[$i] eq '--GZOutput') {
|
| 85 |
+
$gzOut = 1;
|
| 86 |
+
}
|
| 87 |
+
|
| 88 |
+
$otherExtractArgs .= $ARGV[$i] ." ";
|
| 89 |
+
}
|
| 90 |
+
|
| 91 |
+
die("Need to specify --GZOutput for parallel extract") if ($gzOut == 0);
|
| 92 |
+
|
| 93 |
+
my $cmd;
|
| 94 |
+
my $TMPDIR=dirname($extract) ."/tmp.$$";
|
| 95 |
+
$cmd = "mkdir -p $TMPDIR; ls -l $TMPDIR";
|
| 96 |
+
print STDERR "Executing: $cmd \n";
|
| 97 |
+
`$cmd`;
|
| 98 |
+
|
| 99 |
+
my $totalLines = int(`cat $align | wc -l`);
|
| 100 |
+
my $linesPerSplit = int($totalLines / $numParallel) + 1;
|
| 101 |
+
|
| 102 |
+
print "total=$totalLines line-per-split=$linesPerSplit \n";
|
| 103 |
+
|
| 104 |
+
my @children;
|
| 105 |
+
my $pid;
|
| 106 |
+
|
| 107 |
+
if ($numParallel > 1)
|
| 108 |
+
{
|
| 109 |
+
$cmd = "$splitCmd $splitCmdOption -l $linesPerSplit -a 7 $target $TMPDIR/target.";
|
| 110 |
+
$pid = RunFork($cmd);
|
| 111 |
+
push(@children, $pid);
|
| 112 |
+
|
| 113 |
+
$cmd = "$splitCmd $splitCmdOption -l $linesPerSplit -a 7 $source $TMPDIR/source.";
|
| 114 |
+
$pid = RunFork($cmd);
|
| 115 |
+
push(@children, $pid);
|
| 116 |
+
|
| 117 |
+
$cmd = "$splitCmd $splitCmdOption -l $linesPerSplit -a 7 $align $TMPDIR/align.";
|
| 118 |
+
$pid = RunFork($cmd);
|
| 119 |
+
push(@children, $pid);
|
| 120 |
+
|
| 121 |
+
if ($weights) {
|
| 122 |
+
$cmd = "$splitCmd $splitCmdOption -l $linesPerSplit -a 7 $weights $TMPDIR/weights.";
|
| 123 |
+
$pid = RunFork($cmd);
|
| 124 |
+
push(@children, $pid);
|
| 125 |
+
}
|
| 126 |
+
|
| 127 |
+
# wait for everything is finished
|
| 128 |
+
foreach (@children) {
|
| 129 |
+
waitpid($_, 0);
|
| 130 |
+
}
|
| 131 |
+
|
| 132 |
+
}
|
| 133 |
+
else
|
| 134 |
+
{
|
| 135 |
+
my $numStr = NumStr(0);
|
| 136 |
+
|
| 137 |
+
$cmd = "ln -s $target $TMPDIR/target.$numStr";
|
| 138 |
+
`$cmd`;
|
| 139 |
+
|
| 140 |
+
$cmd = "ln -s $source $TMPDIR/source.$numStr";
|
| 141 |
+
`$cmd`;
|
| 142 |
+
|
| 143 |
+
$cmd = "ln -s $align $TMPDIR/align.$numStr";
|
| 144 |
+
`$cmd`;
|
| 145 |
+
|
| 146 |
+
if ($weights) {
|
| 147 |
+
$cmd = "ln -s $weights $TMPDIR/weights.$numStr";
|
| 148 |
+
`$cmd`;
|
| 149 |
+
}
|
| 150 |
+
}
|
| 151 |
+
|
| 152 |
+
# run extract
|
| 153 |
+
@children = ();
|
| 154 |
+
for (my $i = 0; $i < $numParallel; ++$i)
|
| 155 |
+
{
|
| 156 |
+
my $pid = fork();
|
| 157 |
+
|
| 158 |
+
if ($pid == 0)
|
| 159 |
+
{ # child
|
| 160 |
+
my $numStr = NumStr($i);
|
| 161 |
+
my $weightsCmd = "";
|
| 162 |
+
if ($weights) {
|
| 163 |
+
$weightsCmd = "--InstanceWeights $TMPDIR/weights.$numStr";
|
| 164 |
+
}
|
| 165 |
+
|
| 166 |
+
my $glueArg = "";
|
| 167 |
+
if (defined($glueFile)) {
|
| 168 |
+
$glueArg = "--GlueGrammar $TMPDIR/glue.$numStr";
|
| 169 |
+
}
|
| 170 |
+
#print STDERR "glueArg=$glueArg \n";
|
| 171 |
+
|
| 172 |
+
my $cmd = "$extractCmd $TMPDIR/target.$numStr $TMPDIR/source.$numStr $TMPDIR/align.$numStr $TMPDIR/extract.$numStr $glueArg $otherExtractArgs $weightsCmd --SentenceOffset ".($i*$linesPerSplit)." 2>> /dev/stderr \n";
|
| 173 |
+
`$cmd`;
|
| 174 |
+
|
| 175 |
+
exit();
|
| 176 |
+
}
|
| 177 |
+
else
|
| 178 |
+
{ # parent
|
| 179 |
+
push(@children, $pid);
|
| 180 |
+
}
|
| 181 |
+
}
|
| 182 |
+
|
| 183 |
+
# wait for everything is finished
|
| 184 |
+
foreach (@children) {
|
| 185 |
+
waitpid($_, 0);
|
| 186 |
+
}
|
| 187 |
+
|
| 188 |
+
# merge
|
| 189 |
+
my $catCmd = "gunzip -c ";
|
| 190 |
+
my $catInvCmd = $catCmd;
|
| 191 |
+
my $catOCmd = $catCmd;
|
| 192 |
+
my $catContextCmd = $catCmd;
|
| 193 |
+
my $catContextInvCmd = $catCmd;
|
| 194 |
+
|
| 195 |
+
for (my $i = 0; $i < $numParallel; ++$i)
|
| 196 |
+
{
|
| 197 |
+
my $numStr = NumStr($i);
|
| 198 |
+
$catCmd .= "$TMPDIR/extract.$numStr.gz ";
|
| 199 |
+
$catInvCmd .= "$TMPDIR/extract.$numStr.inv.gz ";
|
| 200 |
+
$catOCmd .= "$TMPDIR/extract.$numStr.o.gz ";
|
| 201 |
+
$catContextCmd .= "$TMPDIR/extract.$numStr.context ";
|
| 202 |
+
$catContextInvCmd .= "$TMPDIR/extract.$numStr.context.inv ";
|
| 203 |
+
}
|
| 204 |
+
if (defined($baselineExtract)) {
|
| 205 |
+
my $sorted = -e "$baselineExtract.sorted.gz" ? ".sorted" : "";
|
| 206 |
+
$catCmd .= "$baselineExtract$sorted.gz ";
|
| 207 |
+
$catInvCmd .= "$baselineExtract.inv$sorted.gz ";
|
| 208 |
+
$catOCmd .= "$baselineExtract.o$sorted.gz ";
|
| 209 |
+
}
|
| 210 |
+
|
| 211 |
+
$catCmd .= " | LC_ALL=C $sortCmd -T $TMPDIR 2>> /dev/stderr | $GZIP_EXEC -c > $extract.sorted.gz 2>> /dev/stderr \n";
|
| 212 |
+
$catInvCmd .= " | LC_ALL=C $sortCmd -T $TMPDIR 2>> /dev/stderr | $GZIP_EXEC -c > $extract.inv.sorted.gz 2>> /dev/stderr \n";
|
| 213 |
+
$catOCmd .= " | LC_ALL=C $sortCmd -T $TMPDIR 2>> /dev/stderr | $GZIP_EXEC -c > $extract.o.sorted.gz 2>> /dev/stderr \n";
|
| 214 |
+
$catContextCmd .= " | LC_ALL=C $sortCmd -T $TMPDIR 2>> /dev/stderr | uniq | $GZIP_EXEC -c > $extract.context.sorted.gz 2>> /dev/stderr \n";
|
| 215 |
+
$catContextInvCmd .= " | LC_ALL=C $sortCmd -T $TMPDIR 2>> /dev/stderr | uniq | $GZIP_EXEC -c > $extract.context.inv.sorted.gz 2>> /dev/stderr \n";
|
| 216 |
+
|
| 217 |
+
|
| 218 |
+
@children = ();
|
| 219 |
+
if ($makeTTable)
|
| 220 |
+
{
|
| 221 |
+
print STDERR "merging extract / extract.inv\n";
|
| 222 |
+
$pid = RunFork($catCmd);
|
| 223 |
+
push(@children, $pid);
|
| 224 |
+
|
| 225 |
+
$pid = RunFork($catInvCmd);
|
| 226 |
+
push(@children, $pid);
|
| 227 |
+
}
|
| 228 |
+
else {
|
| 229 |
+
print STDERR "skipping extract, doing only extract.o\n";
|
| 230 |
+
}
|
| 231 |
+
|
| 232 |
+
if ($otherExtractArgs =~ /--FlexibilityScore/) {
|
| 233 |
+
$pid = RunFork($catContextCmd);
|
| 234 |
+
push(@children, $pid);
|
| 235 |
+
|
| 236 |
+
$pid = RunFork($catContextInvCmd);
|
| 237 |
+
push(@children, $pid);
|
| 238 |
+
}
|
| 239 |
+
|
| 240 |
+
my $numStr = NumStr(0);
|
| 241 |
+
if (-e "$TMPDIR/extract.$numStr.o.gz")
|
| 242 |
+
{
|
| 243 |
+
$pid = RunFork($catOCmd);
|
| 244 |
+
push(@children, $pid);
|
| 245 |
+
}
|
| 246 |
+
|
| 247 |
+
# wait for all sorting to finish
|
| 248 |
+
foreach (@children) {
|
| 249 |
+
waitpid($_, 0);
|
| 250 |
+
}
|
| 251 |
+
|
| 252 |
+
# merge glue rules
|
| 253 |
+
if (defined($glueFile)) {
|
| 254 |
+
my $cmd = "cat $TMPDIR/glue.* | LC_ALL=C sort | uniq > $glueFile";
|
| 255 |
+
print STDERR "Merging glue rules: $cmd \n";
|
| 256 |
+
print STDERR `$cmd`;
|
| 257 |
+
}
|
| 258 |
+
|
| 259 |
+
# merge phrase orientation priors (GHKM extraction)
|
| 260 |
+
if ($phraseOrientation && defined($phraseOrientationPriorsFile)) {
|
| 261 |
+
print STDERR "Merging phrase orientation priors\n";
|
| 262 |
+
|
| 263 |
+
my @orientationPriorsCountFiles = glob("$TMPDIR/*.phraseOrientationPriors");
|
| 264 |
+
my %priorCounts;
|
| 265 |
+
|
| 266 |
+
foreach my $filenamePhraseOrientationPriors (@orientationPriorsCountFiles) {
|
| 267 |
+
if (-f $filenamePhraseOrientationPriors) {
|
| 268 |
+
open my $infilePhraseOrientationPriors, '<', $filenamePhraseOrientationPriors or die "cannot open $filenamePhraseOrientationPriors: $!";
|
| 269 |
+
while (my $line = <$infilePhraseOrientationPriors>) {
|
| 270 |
+
print $line;
|
| 271 |
+
my ($key, $value) = split / /, $line;
|
| 272 |
+
$priorCounts{$key} += $value;
|
| 273 |
+
}
|
| 274 |
+
close $infilePhraseOrientationPriors;
|
| 275 |
+
}
|
| 276 |
+
}
|
| 277 |
+
|
| 278 |
+
open my $outPhraseOrientationPriors, '>', $phraseOrientationPriorsFile or die "cannot open $phraseOrientationPriorsFile: $!";
|
| 279 |
+
foreach my $key (sort keys %priorCounts) {
|
| 280 |
+
print $outPhraseOrientationPriors $key." ".$priorCounts{$key}."\n";
|
| 281 |
+
}
|
| 282 |
+
close($outPhraseOrientationPriors);
|
| 283 |
+
}
|
| 284 |
+
|
| 285 |
+
# delete temporary files
|
| 286 |
+
$cmd = "rm -rf $TMPDIR \n";
|
| 287 |
+
systemCheck($cmd);
|
| 288 |
+
|
| 289 |
+
print STDERR "Finished ".localtime() ."\n";
|
| 290 |
+
|
| 291 |
+
# -----------------------------------------
|
| 292 |
+
# -----------------------------------------
|
| 293 |
+
|
| 294 |
+
sub RunFork($)
|
| 295 |
+
{
|
| 296 |
+
my $cmd = shift;
|
| 297 |
+
|
| 298 |
+
my $pid = fork();
|
| 299 |
+
|
| 300 |
+
if ($pid == 0)
|
| 301 |
+
{ # child
|
| 302 |
+
print STDERR $cmd;
|
| 303 |
+
systemCheck($cmd);
|
| 304 |
+
exit();
|
| 305 |
+
}
|
| 306 |
+
return $pid;
|
| 307 |
+
}
|
| 308 |
+
|
| 309 |
+
sub systemCheck($)
|
| 310 |
+
{
|
| 311 |
+
my $cmd = shift;
|
| 312 |
+
my $retVal = system($cmd);
|
| 313 |
+
if ($retVal != 0)
|
| 314 |
+
{
|
| 315 |
+
exit(1);
|
| 316 |
+
}
|
| 317 |
+
}
|
| 318 |
+
|
| 319 |
+
sub DigitStr($)
|
| 320 |
+
{
|
| 321 |
+
my $i = shift;
|
| 322 |
+
my $numStr;
|
| 323 |
+
if ($i < 10) {
|
| 324 |
+
$numStr = "000000$i";
|
| 325 |
+
}
|
| 326 |
+
elsif ($i < 100) {
|
| 327 |
+
$numStr = "00000$i";
|
| 328 |
+
}
|
| 329 |
+
elsif ($i < 1000) {
|
| 330 |
+
$numStr = "0000$i";
|
| 331 |
+
}
|
| 332 |
+
elsif ($i < 10000) {
|
| 333 |
+
$numStr = "000$i";
|
| 334 |
+
}
|
| 335 |
+
elsif ($i < 100000) {
|
| 336 |
+
$numStr = "00$i";
|
| 337 |
+
}
|
| 338 |
+
elsif ($i < 1000000) {
|
| 339 |
+
$numStr = "0$i";
|
| 340 |
+
}
|
| 341 |
+
else {
|
| 342 |
+
$numStr = $i;
|
| 343 |
+
}
|
| 344 |
+
return $numStr;
|
| 345 |
+
}
|
| 346 |
+
|
| 347 |
+
sub CharStr($)
|
| 348 |
+
{
|
| 349 |
+
my $i = shift;
|
| 350 |
+
my $charStr;
|
| 351 |
+
my @bit=();
|
| 352 |
+
|
| 353 |
+
while ($i>0){
|
| 354 |
+
push @bit, $i%26;
|
| 355 |
+
$i=int($i/26);
|
| 356 |
+
}
|
| 357 |
+
my $offset=scalar(@bit);
|
| 358 |
+
my $h;
|
| 359 |
+
for ($h=6;$h>=$offset;--$h) { $charStr.="a"; }
|
| 360 |
+
for ($h=$offset-1;$h>=0;--$h) { $charStr.="$alph[$bit[$h]]"; }
|
| 361 |
+
return $charStr;
|
| 362 |
+
}
|
| 363 |
+
|
| 364 |
+
sub NumStr($)
|
| 365 |
+
{
|
| 366 |
+
my $i = shift;
|
| 367 |
+
if ($isBSDSplit){
|
| 368 |
+
return CharStr($i);
|
| 369 |
+
}else{
|
| 370 |
+
return DigitStr($i);
|
| 371 |
+
}
|
| 372 |
+
}
|
| 373 |
+
|
| 374 |
+
sub GetSplitVersion($)
|
| 375 |
+
{
|
| 376 |
+
my $splitCmd = shift;
|
| 377 |
+
my $retVal = system("$splitCmd --help > /dev/null");
|
| 378 |
+
if ($retVal != 0) {
|
| 379 |
+
return 1;
|
| 380 |
+
}
|
| 381 |
+
else {
|
| 382 |
+
return 0;
|
| 383 |
+
}
|
| 384 |
+
}
|
| 385 |
+
|
mosesdecoder/scripts/generic/fsa-sample.fsa
ADDED
|
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
0 1 Prague 0.5
|
| 2 |
+
1 2 Stock 1
|
| 3 |
+
2 6 Market 1
|
| 4 |
+
0 3 New 0.5
|
| 5 |
+
3 4 York 1
|
| 6 |
+
4 5 Stock 1
|
| 7 |
+
5 6 Exchange 1
|
| 8 |
+
6 7 falls 0.5
|
| 9 |
+
6 7 drops 0.5
|
| 10 |
+
7 8 . 1
|
mosesdecoder/scripts/generic/fsa2fsal.pl
ADDED
|
@@ -0,0 +1,53 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env perl
|
| 2 |
+
# A very simple script that converts fsa format (openfst lattices) to the same
|
| 3 |
+
# thing represented one sentence per line. It uses '|||' to delimit columns and
|
| 4 |
+
# ' ' to delimit nodes (i.e. original lines).
|
| 5 |
+
# Some rudimentary sanity checks are done on the fly.
|
| 6 |
+
# Ondrej Bojar, bojar@ufal.mff.cuni.cz
|
| 7 |
+
#
|
| 8 |
+
# This file is part of moses. Its use is licensed under the GNU Lesser General
|
| 9 |
+
# Public License version 2.1 or, at your option, any later version.
|
| 10 |
+
|
| 11 |
+
use warnings;
|
| 12 |
+
use strict;
|
| 13 |
+
|
| 14 |
+
my $errs = 0;
|
| 15 |
+
sub err {
|
| 16 |
+
my $nr = shift;
|
| 17 |
+
my $msg = shift;
|
| 18 |
+
print STDERR "$nr:$msg\n";
|
| 19 |
+
$errs++;
|
| 20 |
+
}
|
| 21 |
+
|
| 22 |
+
my $onr = 0;
|
| 23 |
+
my @lines = ();
|
| 24 |
+
sub flush {
|
| 25 |
+
return if 0 == scalar @lines;
|
| 26 |
+
print join(" ", @lines);
|
| 27 |
+
print "\n";
|
| 28 |
+
$onr++;
|
| 29 |
+
@lines = ();
|
| 30 |
+
}
|
| 31 |
+
|
| 32 |
+
my $nr = 0;
|
| 33 |
+
my $numscores = undef;
|
| 34 |
+
while (<>) {
|
| 35 |
+
chomp;
|
| 36 |
+
if ($_ eq "") {
|
| 37 |
+
flush();
|
| 38 |
+
next;
|
| 39 |
+
}
|
| 40 |
+
my ($a, $b, $label, $scores, $rest) = split /\s+/, $_, 5;
|
| 41 |
+
err($nr, "The delimiter '|||' can't appear in the input!") if /\|\|\|/;
|
| 42 |
+
err($nr, "Node id not numeric: $a") if $a !~ /^\d+$/;
|
| 43 |
+
err($nr, "Node id not numeric: $b") if $b !~ /^\d+$/;
|
| 44 |
+
err($nr, "Unexpected tail: '$rest'") if defined $rest && $rest !~ /^\s*$/;
|
| 45 |
+
my $thisnumscores = ($scores =~ tr/,/,/);
|
| 46 |
+
$numscores = $thisnumscores if !defined $numscores;
|
| 47 |
+
err($nr, "Incompatible number of arc scores, previous lines had ".($numscores+1).", now ".($thisnumscores+1))
|
| 48 |
+
if $numscores != $thisnumscores;
|
| 49 |
+
push @lines, join("|||", ($a,$b,$label,$scores));
|
| 50 |
+
}
|
| 51 |
+
flush();
|
| 52 |
+
|
| 53 |
+
exit 1 if $errs;
|
mosesdecoder/scripts/generic/fsa2plf.pl
ADDED
|
@@ -0,0 +1,182 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env perl
|
| 2 |
+
# Converts AT&T FSA format to 'python lattice format'.
|
| 3 |
+
# Note that the input FSA needs to be epsilon-free and topologically sorted.
|
| 4 |
+
# This script checks for topological sortedness.
|
| 5 |
+
# The start node has to have the index 0.
|
| 6 |
+
# All path ends are assumed to be final nodes, not just the explicitly stated
|
| 7 |
+
# final nodes.
|
| 8 |
+
# Note that the output format may not contain any spaces.
|
| 9 |
+
# Ondrej Bojar, bojar@ufal.mff.cuni.cz
|
| 10 |
+
#
|
| 11 |
+
# This file is part of moses. Its use is licensed under the GNU Lesser General
|
| 12 |
+
# Public License version 2.1 or, at your option, any later version.
|
| 13 |
+
|
| 14 |
+
use warnings;
|
| 15 |
+
use strict;
|
| 16 |
+
use Getopt::Long;
|
| 17 |
+
|
| 18 |
+
binmode(STDIN, ":utf8");
|
| 19 |
+
binmode(STDOUT, ":utf8");
|
| 20 |
+
binmode(STDERR, ":utf8");
|
| 21 |
+
|
| 22 |
+
my $filelist;
|
| 23 |
+
my $ignore_final_state_cost = 0;
|
| 24 |
+
my $mangle_weights = undef;
|
| 25 |
+
GetOptions(
|
| 26 |
+
"ignore-final-state-cost" => \$ignore_final_state_cost,
|
| 27 |
+
# sometimes, final states have a cost (e.g. "45 0.05\n")
|
| 28 |
+
# instead of dying there, ignore the problem
|
| 29 |
+
"filelist|fl=s" => \$filelist,
|
| 30 |
+
"mangle-weights=s" => \$mangle_weights,
|
| 31 |
+
) or exit 1;
|
| 32 |
+
|
| 33 |
+
my @infiles;
|
| 34 |
+
if (defined $filelist) {
|
| 35 |
+
my $fh = my_open($filelist);
|
| 36 |
+
while (<$fh>) {
|
| 37 |
+
chomp;
|
| 38 |
+
push @infiles, $_;
|
| 39 |
+
}
|
| 40 |
+
close $fh;
|
| 41 |
+
}
|
| 42 |
+
push @infiles, @ARGV;
|
| 43 |
+
@ARGV = ();
|
| 44 |
+
if (0 == scalar(@infiles)) {
|
| 45 |
+
print STDERR "Reading input from stdin\n";
|
| 46 |
+
push @infiles, "-";
|
| 47 |
+
}
|
| 48 |
+
|
| 49 |
+
my $err = 0;
|
| 50 |
+
foreach my $inf (@infiles) {
|
| 51 |
+
my $nr = 0;
|
| 52 |
+
NEXTLATTICE:
|
| 53 |
+
my %usedids = (); # collect all used ids for densification
|
| 54 |
+
my %usedtgtids = (); # collect all used ids for densification
|
| 55 |
+
my @outnodes = ();
|
| 56 |
+
my $fh = my_open($inf);
|
| 57 |
+
my %is_final; # remember which nodes were final
|
| 58 |
+
while (<$fh>) {
|
| 59 |
+
chomp;
|
| 60 |
+
$nr++;
|
| 61 |
+
last if $_ eq ""; # assume a blank line delimits lattices
|
| 62 |
+
my ($src, $tgt, $label, $weight) = split /\s+/;
|
| 63 |
+
die "$inf:$nr:Bad src node index: $src" if $src !~ /^[0-9]+$/;
|
| 64 |
+
|
| 65 |
+
if (!defined $label && !defined $weight) {
|
| 66 |
+
# explicit final node, warn at the end if there are any intermed. final
|
| 67 |
+
# nodes
|
| 68 |
+
$is_final{$src};
|
| 69 |
+
# final nodes can have a cost
|
| 70 |
+
die "$inf:$nr:Final state $src has cost $tgt. Unsupported, use --ignore-final-state-cost"
|
| 71 |
+
if defined $tgt && !$ignore_final_state_cost;
|
| 72 |
+
|
| 73 |
+
next;
|
| 74 |
+
}
|
| 75 |
+
$weight = 0 if !defined $weight;
|
| 76 |
+
|
| 77 |
+
$usedids{$src} = 1;
|
| 78 |
+
$usedtgtids{$tgt} = 1;
|
| 79 |
+
|
| 80 |
+
# process the weight
|
| 81 |
+
# when reading RWTH FSA output, the weights are negated natural logarithms
|
| 82 |
+
# we need to negate them back
|
| 83 |
+
if (defined $mangle_weights) {
|
| 84 |
+
if ($mangle_weights eq "expneg") {
|
| 85 |
+
$weight = join(",", map {exp(-$_)} split /,/, $weight);
|
| 86 |
+
} else {
|
| 87 |
+
die "Bad weights mangling: $mangle_weights";
|
| 88 |
+
}
|
| 89 |
+
}
|
| 90 |
+
# remember the node
|
| 91 |
+
my $targetnode = $tgt-$src;
|
| 92 |
+
die "$inf:$nr:Not topologically sorted, got arc from $src to $tgt"
|
| 93 |
+
if $targetnode <= 0;
|
| 94 |
+
push @{$outnodes[$src]}, [ $label, $weight, $tgt ];
|
| 95 |
+
}
|
| 96 |
+
if (eof($fh)) {
|
| 97 |
+
close $fh;
|
| 98 |
+
$fh = undef;
|
| 99 |
+
}
|
| 100 |
+
|
| 101 |
+
# Assign our dense IDs: source node ids are assigned first
|
| 102 |
+
my %denseids = (); # maps node ids from the file to dense ids
|
| 103 |
+
my $nextid = 0;
|
| 104 |
+
foreach my $id (sort {$a<=>$b} keys %usedids) {
|
| 105 |
+
$denseids{$id} = $nextid;
|
| 106 |
+
$nextid++;
|
| 107 |
+
}
|
| 108 |
+
# All unseen target nodes then get the same next id, the final node id
|
| 109 |
+
foreach my $id (keys %usedtgtids) {
|
| 110 |
+
next if defined $denseids{$id};
|
| 111 |
+
$denseids{$id} = $nextid;
|
| 112 |
+
}
|
| 113 |
+
|
| 114 |
+
foreach my $f (keys %is_final) {
|
| 115 |
+
if (defined $outnodes[$f]) {
|
| 116 |
+
print STDERR "$inf:Node $f is final but it has outgoing edges!\n";
|
| 117 |
+
$err = 1;
|
| 118 |
+
}
|
| 119 |
+
}
|
| 120 |
+
# # Verbose: print original to dense IDs mapping
|
| 121 |
+
# foreach my $src (sort {$a<=>$b} keys %denseids) {
|
| 122 |
+
# print STDERR "$src ...> $denseids{$src}\n";
|
| 123 |
+
# }
|
| 124 |
+
|
| 125 |
+
print "(";
|
| 126 |
+
for(my $origsrc = 0; $origsrc < @outnodes; $origsrc++) {
|
| 127 |
+
my $src = $denseids{$origsrc};
|
| 128 |
+
next if !defined $src; # this original node ID is not used at all
|
| 129 |
+
next if $src == $nextid; # this is the ultimate merged final node
|
| 130 |
+
my $outnode = $outnodes[$origsrc];
|
| 131 |
+
print "(";
|
| 132 |
+
foreach my $arc (@$outnode) {
|
| 133 |
+
my $origtgt = $arc->[2];
|
| 134 |
+
my $tgt = $denseids{$origtgt};
|
| 135 |
+
if (!defined $tgt) {
|
| 136 |
+
# this was a final node only
|
| 137 |
+
$tgt = $denseids{$origtgt} = $nextid;
|
| 138 |
+
$nextid++;
|
| 139 |
+
}
|
| 140 |
+
my $step_to_target = $tgt - $src;
|
| 141 |
+
die "$inf:Bug, I damaged top-sortedness (orig $origsrc .. $origtgt; curr $src .. $tgt)." if $step_to_target <= 0;
|
| 142 |
+
print "('".apo($arc->[0])."',$arc->[1],$step_to_target),";
|
| 143 |
+
}
|
| 144 |
+
print "),";
|
| 145 |
+
}
|
| 146 |
+
print ")\n";
|
| 147 |
+
goto NEXTLATTICE if defined $fh && ! eof($fh);
|
| 148 |
+
}
|
| 149 |
+
die "There were errors." if $err;
|
| 150 |
+
|
| 151 |
+
sub apo {
|
| 152 |
+
my $s = shift;
|
| 153 |
+
# protects apostrophy and backslash
|
| 154 |
+
$s =~ s/\\/\\\\/g;
|
| 155 |
+
$s =~ s/(['])/\\$1/g;
|
| 156 |
+
return $s;
|
| 157 |
+
}
|
| 158 |
+
|
| 159 |
+
sub my_open {
|
| 160 |
+
my $f = shift;
|
| 161 |
+
if ($f eq "-") {
|
| 162 |
+
binmode(STDIN, ":utf8");
|
| 163 |
+
return *STDIN;
|
| 164 |
+
}
|
| 165 |
+
|
| 166 |
+
die "Not found: $f" if ! -e $f;
|
| 167 |
+
|
| 168 |
+
my $opn;
|
| 169 |
+
my $hdl;
|
| 170 |
+
my $ft = `file '$f'`;
|
| 171 |
+
# file might not recognize some files!
|
| 172 |
+
if ($f =~ /\.gz$/ || $ft =~ /gzip compressed data/) {
|
| 173 |
+
$opn = "zcat '$f' |";
|
| 174 |
+
} elsif ($f =~ /\.bz2$/ || $ft =~ /bzip2 compressed data/) {
|
| 175 |
+
$opn = "bzcat '$f' |";
|
| 176 |
+
} else {
|
| 177 |
+
$opn = "$f";
|
| 178 |
+
}
|
| 179 |
+
open $hdl, $opn or die "Can't open '$opn': $!";
|
| 180 |
+
binmode $hdl, ":utf8";
|
| 181 |
+
return $hdl;
|
| 182 |
+
}
|
mosesdecoder/scripts/generic/generic-parallel.perl
ADDED
|
@@ -0,0 +1,119 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env perl
|
| 2 |
+
#
|
| 3 |
+
# This file is part of moses. Its use is licensed under the GNU Lesser General
|
| 4 |
+
# Public License version 2.1 or, at your option, any later version.
|
| 5 |
+
|
| 6 |
+
use warnings;
|
| 7 |
+
use strict;
|
| 8 |
+
use utf8;
|
| 9 |
+
|
| 10 |
+
binmode STDIN, ":utf8";
|
| 11 |
+
binmode STDOUT, ":utf8";
|
| 12 |
+
binmode STDERR, ":utf8";
|
| 13 |
+
|
| 14 |
+
sub NumStr($);
|
| 15 |
+
|
| 16 |
+
my $NUM_SPLIT_LINES = $ARGV[0];
|
| 17 |
+
|
| 18 |
+
my $TMPDIR = $ARGV[1];
|
| 19 |
+
$TMPDIR = "$TMPDIR/tmp.$$";
|
| 20 |
+
mkdir $TMPDIR;
|
| 21 |
+
print STDERR "TMPDIR=$TMPDIR \n";
|
| 22 |
+
|
| 23 |
+
my $cmd = "";
|
| 24 |
+
for (my $i = 2; $i < scalar(@ARGV); ++$i)
|
| 25 |
+
{
|
| 26 |
+
$cmd .= $ARGV[$i] ." ";
|
| 27 |
+
}
|
| 28 |
+
|
| 29 |
+
# split input file
|
| 30 |
+
open (INPUT_ALL, "> $TMPDIR/input.all");
|
| 31 |
+
binmode INPUT_ALL, ":utf8";
|
| 32 |
+
while (my $line = <STDIN>)
|
| 33 |
+
{
|
| 34 |
+
chomp($line);
|
| 35 |
+
print INPUT_ALL $line."\n";
|
| 36 |
+
}
|
| 37 |
+
close(INPUT_ALL);
|
| 38 |
+
|
| 39 |
+
my $cmd2 = "split -l $NUM_SPLIT_LINES -a 5 -d $TMPDIR/input.all $TMPDIR/x";
|
| 40 |
+
`$cmd2`;
|
| 41 |
+
|
| 42 |
+
# create exec file
|
| 43 |
+
open (EXEC, "> $TMPDIR/exec");
|
| 44 |
+
binmode EXEC, ":utf8";
|
| 45 |
+
|
| 46 |
+
# execute in parallel
|
| 47 |
+
print STDERR "executing\n";
|
| 48 |
+
|
| 49 |
+
my $i = 0;
|
| 50 |
+
my $filePath = "$TMPDIR/x" .NumStr($i);
|
| 51 |
+
while (-f $filePath)
|
| 52 |
+
{
|
| 53 |
+
print EXEC "$cmd < $filePath > $filePath.out\n";
|
| 54 |
+
|
| 55 |
+
++$i;
|
| 56 |
+
$filePath = "$TMPDIR/x" .NumStr($i);
|
| 57 |
+
}
|
| 58 |
+
close (EXEC);
|
| 59 |
+
|
| 60 |
+
$cmd2 = "parallel < $TMPDIR/exec";
|
| 61 |
+
`$cmd2`;
|
| 62 |
+
|
| 63 |
+
# concatenate
|
| 64 |
+
print STDERR "concatenating\n";
|
| 65 |
+
|
| 66 |
+
$i = 1;
|
| 67 |
+
my $firstPath = "$TMPDIR/x" .NumStr(0) .".out";
|
| 68 |
+
$filePath = "$TMPDIR/x" .NumStr($i) .".out";
|
| 69 |
+
while (-f $filePath)
|
| 70 |
+
{
|
| 71 |
+
$cmd = "cat $filePath >> $firstPath";
|
| 72 |
+
`$cmd`;
|
| 73 |
+
|
| 74 |
+
++$i;
|
| 75 |
+
$filePath = "$TMPDIR/x" .NumStr($i) .".out";
|
| 76 |
+
}
|
| 77 |
+
|
| 78 |
+
# output
|
| 79 |
+
open (OUTPUT_ALL, "$firstPath");
|
| 80 |
+
binmode OUTPUT_ALL, ":utf8";
|
| 81 |
+
while (my $line = <OUTPUT_ALL>)
|
| 82 |
+
{
|
| 83 |
+
chomp($line);
|
| 84 |
+
print "$line\n";
|
| 85 |
+
}
|
| 86 |
+
close(OUTPUT_ALL);
|
| 87 |
+
|
| 88 |
+
$cmd = "rm -rf $TMPDIR/";
|
| 89 |
+
`$cmd`;
|
| 90 |
+
|
| 91 |
+
###########################################
|
| 92 |
+
sub NumStr($)
|
| 93 |
+
{
|
| 94 |
+
my $i = shift;
|
| 95 |
+
my $numStr;
|
| 96 |
+
if ($i < 10) {
|
| 97 |
+
$numStr = "000000$i";
|
| 98 |
+
}
|
| 99 |
+
elsif ($i < 100) {
|
| 100 |
+
$numStr = "00000$i";
|
| 101 |
+
}
|
| 102 |
+
elsif ($i < 1000) {
|
| 103 |
+
$numStr = "0000$i";
|
| 104 |
+
}
|
| 105 |
+
elsif ($i < 10000) {
|
| 106 |
+
$numStr = "000$i";
|
| 107 |
+
}
|
| 108 |
+
elsif ($i < 100000) {
|
| 109 |
+
$numStr = "00$i";
|
| 110 |
+
}
|
| 111 |
+
elsif ($i < 1000000) {
|
| 112 |
+
$numStr = "0$i";
|
| 113 |
+
}
|
| 114 |
+
else {
|
| 115 |
+
$numStr = $i;
|
| 116 |
+
}
|
| 117 |
+
return $numStr;
|
| 118 |
+
}
|
| 119 |
+
|
mosesdecoder/scripts/generic/giza-parallel.perl
ADDED
|
@@ -0,0 +1,134 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env perl
|
| 2 |
+
#
|
| 3 |
+
# This file is part of moses. Its use is licensed under the GNU Lesser General
|
| 4 |
+
# Public License version 2.1 or, at your option, any later version.
|
| 5 |
+
|
| 6 |
+
# example
|
| 7 |
+
# ~/giza-parallel.perl 10 split ~/workspace/sourceforge/trunk/scripts/training/train-model.perl ar en train align
|
| 8 |
+
|
| 9 |
+
use warnings;
|
| 10 |
+
use strict;
|
| 11 |
+
use File::Basename;
|
| 12 |
+
|
| 13 |
+
sub NumStr($);
|
| 14 |
+
|
| 15 |
+
print "Started ".localtime() ."\n";
|
| 16 |
+
|
| 17 |
+
my $numParallel = $ARGV[0];
|
| 18 |
+
my $splitCmd = $ARGV[1];
|
| 19 |
+
my $trainCmd = $ARGV[2];
|
| 20 |
+
my $inputExt = $ARGV[3];
|
| 21 |
+
my $outputExt = $ARGV[4];
|
| 22 |
+
my $corpus = $ARGV[5];
|
| 23 |
+
my $align = $ARGV[6];
|
| 24 |
+
|
| 25 |
+
my $TMPDIR=dirname($align) ."/tmp.$$";
|
| 26 |
+
mkdir $TMPDIR;
|
| 27 |
+
|
| 28 |
+
my $scriptDir=dirname($trainCmd) ."/..";
|
| 29 |
+
|
| 30 |
+
# split corpus file
|
| 31 |
+
my $totalLines = int(`wc -l $corpus.$inputExt`);
|
| 32 |
+
my $linesPerSplit = int($totalLines / $numParallel) + 1;
|
| 33 |
+
|
| 34 |
+
my $cmd = "$splitCmd -d -l $linesPerSplit -a 5 $corpus.$inputExt $TMPDIR/source.";
|
| 35 |
+
`$cmd`;
|
| 36 |
+
|
| 37 |
+
$cmd = "$splitCmd -d -l $linesPerSplit -a 5 $corpus.$outputExt $TMPDIR/target.";
|
| 38 |
+
`$cmd`;
|
| 39 |
+
|
| 40 |
+
for (my $i = 0; $i < $numParallel; ++$i)
|
| 41 |
+
{
|
| 42 |
+
my $numStr = NumStr($i);
|
| 43 |
+
rename("$TMPDIR/source.$numStr", "$TMPDIR/$numStr.source");
|
| 44 |
+
rename("$TMPDIR/target.$numStr", "$TMPDIR/$numStr.target");
|
| 45 |
+
}
|
| 46 |
+
|
| 47 |
+
#fork & run giza & friends
|
| 48 |
+
my $isParent = 1;
|
| 49 |
+
my @childs;
|
| 50 |
+
for (my $i = 0; $i < $numParallel; ++$i)
|
| 51 |
+
{
|
| 52 |
+
my $pid = fork();
|
| 53 |
+
|
| 54 |
+
if ($pid == 0)
|
| 55 |
+
{ # child
|
| 56 |
+
$isParent = 0;
|
| 57 |
+
|
| 58 |
+
my $numStr = NumStr($i);
|
| 59 |
+
my $cmd = "$trainCmd -dont-zip -last-step 1 -scripts-root-dir $scriptDir -f source -e target -alignment grow-diag-final-and -parts 3 -reordering msd-bidirectional-fe -corpus $TMPDIR/$numStr -corpus-dir $TMPDIR/prepared.$numStr \n";
|
| 60 |
+
print $cmd;
|
| 61 |
+
`$cmd`;
|
| 62 |
+
|
| 63 |
+
$cmd = "$trainCmd -dont-zip -first-step 2 -last-step 2 -scripts-root-dir $scriptDir -f source -e target -alignment grow-diag-final-and -parts 3 -reordering msd-bidirectional-fe -corpus-dir $TMPDIR/prepared.$numStr -giza-e2f $TMPDIR/giza.$numStr -direction 2 \n";
|
| 64 |
+
print $cmd;
|
| 65 |
+
`$cmd`;
|
| 66 |
+
|
| 67 |
+
$cmd = "$trainCmd -dont-zip -first-step 2 -last-step 2 -scripts-root-dir $scriptDir -f source -e target -alignment grow-diag-final-and -parts 3 -reordering msd-bidirectional-fe -corpus-dir $TMPDIR/prepared.$numStr -giza-f2e $TMPDIR/giza-inverse.$numStr -direction 1 \n";
|
| 68 |
+
print $cmd;
|
| 69 |
+
`$cmd`;
|
| 70 |
+
|
| 71 |
+
$cmd = "$trainCmd -dont-zip -first-step 3 -last-step 3 -scripts-root-dir $scriptDir -f source -e target -alignment grow-diag-final-and -parts 3 -reordering msd-bidirectional-fe -giza-e2f $TMPDIR/giza.$numStr -giza-f2e $TMPDIR/giza-inverse.$numStr -alignment-file $TMPDIR/aligned.$numStr -alignment grow-diag-final-and \n";
|
| 72 |
+
print $cmd;
|
| 73 |
+
`$cmd`;
|
| 74 |
+
|
| 75 |
+
exit();
|
| 76 |
+
}
|
| 77 |
+
else
|
| 78 |
+
{ # parent
|
| 79 |
+
push(@childs, $pid);
|
| 80 |
+
}
|
| 81 |
+
|
| 82 |
+
}
|
| 83 |
+
|
| 84 |
+
# wait for everything is finished
|
| 85 |
+
if ($isParent)
|
| 86 |
+
{
|
| 87 |
+
foreach (@childs) {
|
| 88 |
+
waitpid($_, 0);
|
| 89 |
+
}
|
| 90 |
+
}
|
| 91 |
+
else
|
| 92 |
+
{
|
| 93 |
+
die "shouldn't be here";
|
| 94 |
+
}
|
| 95 |
+
|
| 96 |
+
# cat all aligned files together. Voila
|
| 97 |
+
my $cmd = "cat ";
|
| 98 |
+
for (my $i = 0; $i < $numParallel; ++$i)
|
| 99 |
+
{
|
| 100 |
+
my $numStr = NumStr($i);
|
| 101 |
+
$cmd .= "$TMPDIR/aligned.$numStr.grow-diag-final-and ";
|
| 102 |
+
}
|
| 103 |
+
$cmd .= " > $align \n";
|
| 104 |
+
print $cmd;
|
| 105 |
+
`$cmd`;
|
| 106 |
+
|
| 107 |
+
sub NumStr($)
|
| 108 |
+
{
|
| 109 |
+
my $i = shift;
|
| 110 |
+
my $numStr;
|
| 111 |
+
if ($i < 10) {
|
| 112 |
+
$numStr = "000000$i";
|
| 113 |
+
}
|
| 114 |
+
elsif ($i < 100) {
|
| 115 |
+
$numStr = "00000$i";
|
| 116 |
+
}
|
| 117 |
+
elsif ($i < 1000) {
|
| 118 |
+
$numStr = "0000$i";
|
| 119 |
+
}
|
| 120 |
+
elsif ($i < 10000) {
|
| 121 |
+
$numStr = "000$i";
|
| 122 |
+
}
|
| 123 |
+
elsif ($i < 100000) {
|
| 124 |
+
$numStr = "00$i";
|
| 125 |
+
}
|
| 126 |
+
elsif ($i < 1000000) {
|
| 127 |
+
$numStr = "0$i";
|
| 128 |
+
}
|
| 129 |
+
else {
|
| 130 |
+
$numStr = $i;
|
| 131 |
+
}
|
| 132 |
+
return $numStr;
|
| 133 |
+
}
|
| 134 |
+
|
mosesdecoder/scripts/generic/lopar2pos.pl
ADDED
|
@@ -0,0 +1,20 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env perl
|
| 2 |
+
#
|
| 3 |
+
# This file is part of moses. Its use is licensed under the GNU Lesser General
|
| 4 |
+
# Public License version 2.1 or, at your option, any later version.
|
| 5 |
+
|
| 6 |
+
# $Id$
|
| 7 |
+
#lopar2pos: extract POSs from LOPAR output
|
| 8 |
+
#usage: lopar2pos.pl CORPUS.lopar > CORPUS.pos
|
| 9 |
+
|
| 10 |
+
use warnings;
|
| 11 |
+
|
| 12 |
+
my $infilename = shift @ARGV;
|
| 13 |
+
open(INFILE, "<$infilename") or die "couldn't open '$infilename' for read: $!\n";
|
| 14 |
+
while(my $line = <INFILE>)
|
| 15 |
+
{
|
| 16 |
+
my @words = split(/\s+/, $line);
|
| 17 |
+
my @tags = map {$_ =~ /^[^_]*_([A-Z]+)/; $1} @words;
|
| 18 |
+
print join(' ', @tags) . "\n";
|
| 19 |
+
}
|
| 20 |
+
close(INFILE);
|
mosesdecoder/scripts/generic/moses_sim_pe.py
ADDED
|
@@ -0,0 +1,452 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python
|
| 2 |
+
|
| 3 |
+
# Written by Michael Denkowski
|
| 4 |
+
#
|
| 5 |
+
# This file is part of moses. Its use is licensed under the GNU Lesser General
|
| 6 |
+
# Public License version 2.1 or, at your option, any later version.
|
| 7 |
+
|
| 8 |
+
"""Parallelize decoding with simulated post-editing via moses XML input.
|
| 9 |
+
|
| 10 |
+
(XML entities need to be escaped in tokenization). Memory mapped
|
| 11 |
+
dynamic phrase tables (Ulrich Germann,
|
| 12 |
+
www.statmt.org/moses/?n=Moses.AdvancedFeatures#ntoc40) and language models
|
| 13 |
+
(Kenneth Heafield,
|
| 14 |
+
http://www.statmt.org/moses/?n=FactoredTraining.BuildingLanguageModel#ntoc19)
|
| 15 |
+
facilitate memory efficient multi process decoding. Input is divided into
|
| 16 |
+
batches, each of which is decoded sequentially. Each batch pre-loads the
|
| 17 |
+
data from previous batches.
|
| 18 |
+
|
| 19 |
+
To use in tuning, run mert-moses.pl with --sim-pe=SYMAL where SYMAL is the
|
| 20 |
+
alignment from input to references. Specify the number of jobs with
|
| 21 |
+
--decoder-flags="-threads N".
|
| 22 |
+
"""
|
| 23 |
+
|
| 24 |
+
import gzip
|
| 25 |
+
import itertools
|
| 26 |
+
import math
|
| 27 |
+
import os
|
| 28 |
+
import shutil
|
| 29 |
+
import subprocess
|
| 30 |
+
import sys
|
| 31 |
+
import tempfile
|
| 32 |
+
import threading
|
| 33 |
+
|
| 34 |
+
HELP = '''Moses with simulated post-editing
|
| 35 |
+
|
| 36 |
+
Usage:
|
| 37 |
+
{} moses-cmd -config moses.ini -input-file text.src -ref text.tgt \
|
| 38 |
+
-symal text.src-tgt.symal [options] [decoder flags]
|
| 39 |
+
|
| 40 |
+
Options:
|
| 41 |
+
-threads N: number of decoders to run in parallel \
|
| 42 |
+
(default read from moses.ini, 1 if not present)
|
| 43 |
+
-n-best-list nbest.out N [distinct]: location and size of N-best list
|
| 44 |
+
-show-weights: for mert-moses.pl, just call moses and exit
|
| 45 |
+
-tmp: location of temp directory (default /tmp)
|
| 46 |
+
|
| 47 |
+
Other options (decoder flags) are passed through to moses-cmd\n'''
|
| 48 |
+
|
| 49 |
+
|
| 50 |
+
class ProgramFailure(Exception):
|
| 51 |
+
"""Known kind of failure, with a known presentation to the user.
|
| 52 |
+
|
| 53 |
+
Error message will be printed, and the program will return an error,
|
| 54 |
+
but no traceback will be shown to the user.
|
| 55 |
+
"""
|
| 56 |
+
|
| 57 |
+
|
| 58 |
+
class Progress:
|
| 59 |
+
"""Provides progress bar."""
|
| 60 |
+
|
| 61 |
+
def __init__(self):
|
| 62 |
+
self.i = 0
|
| 63 |
+
self.lock = threading.Lock()
|
| 64 |
+
|
| 65 |
+
def inc(self):
|
| 66 |
+
self.lock.acquire()
|
| 67 |
+
self.i += 1
|
| 68 |
+
if self.i % 100 == 0:
|
| 69 |
+
sys.stderr.write('.')
|
| 70 |
+
if self.i % 1000 == 0:
|
| 71 |
+
sys.stderr.write(' [{}]\n'.format(self.i))
|
| 72 |
+
sys.stderr.flush()
|
| 73 |
+
self.lock.release()
|
| 74 |
+
|
| 75 |
+
def done(self):
|
| 76 |
+
self.lock.acquire()
|
| 77 |
+
if self.i % 1000 != 0:
|
| 78 |
+
sys.stderr.write('\n')
|
| 79 |
+
self.lock.release()
|
| 80 |
+
|
| 81 |
+
|
| 82 |
+
def atomic_io(cmd, in_file, out_file, err_file, prog=None):
|
| 83 |
+
"""Run with atomic (synchronous) I/O."""
|
| 84 |
+
with open(in_file, 'r') as inp, open(out_file, 'w') as out, open(err_file, 'w') as err:
|
| 85 |
+
p = subprocess.Popen(
|
| 86 |
+
cmd, stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=err)
|
| 87 |
+
while True:
|
| 88 |
+
line = inp.readline()
|
| 89 |
+
if not line:
|
| 90 |
+
break
|
| 91 |
+
p.stdin.write(line)
|
| 92 |
+
out.write(p.stdout.readline())
|
| 93 |
+
out.flush()
|
| 94 |
+
if prog:
|
| 95 |
+
prog.inc()
|
| 96 |
+
p.stdin.close()
|
| 97 |
+
p.wait()
|
| 98 |
+
|
| 99 |
+
|
| 100 |
+
def gzopen(f):
|
| 101 |
+
"""Open plain or gzipped text."""
|
| 102 |
+
return gzip.open(f, 'rb') if f.endswith('.gz') else open(f, 'r')
|
| 103 |
+
|
| 104 |
+
|
| 105 |
+
def wc(f):
|
| 106 |
+
"""Word count."""
|
| 107 |
+
i = 0
|
| 108 |
+
for line in gzopen(f):
|
| 109 |
+
i += 1
|
| 110 |
+
return i
|
| 111 |
+
|
| 112 |
+
|
| 113 |
+
def write_gzfile(lines, f):
|
| 114 |
+
"""Write lines to gzipped file."""
|
| 115 |
+
out = gzip.open(f, 'wb')
|
| 116 |
+
for line in lines:
|
| 117 |
+
out.write('{}\n'.format(line))
|
| 118 |
+
out.close()
|
| 119 |
+
|
| 120 |
+
|
| 121 |
+
def main(argv):
|
| 122 |
+
# Defaults
|
| 123 |
+
moses_ini = None
|
| 124 |
+
moses_ini_lines = None
|
| 125 |
+
text_src = None
|
| 126 |
+
text_tgt = None
|
| 127 |
+
text_symal = None
|
| 128 |
+
text_len = None
|
| 129 |
+
threads_found = False
|
| 130 |
+
threads = 1
|
| 131 |
+
n_best_out = None
|
| 132 |
+
n_best_size = None
|
| 133 |
+
n_best_distinct = False
|
| 134 |
+
hg_ext = None
|
| 135 |
+
hg_dir = None
|
| 136 |
+
tmp_dir = '/tmp'
|
| 137 |
+
xml_found = False
|
| 138 |
+
xml_input = 'exclusive'
|
| 139 |
+
show_weights = False
|
| 140 |
+
mmsapt_dynamic = []
|
| 141 |
+
mmsapt_static = []
|
| 142 |
+
mmsapt_l1 = None
|
| 143 |
+
mmsapt_l2 = None
|
| 144 |
+
|
| 145 |
+
# Decoder command
|
| 146 |
+
cmd = argv[1:]
|
| 147 |
+
|
| 148 |
+
# Parse special options and remove from cmd
|
| 149 |
+
i = 1
|
| 150 |
+
while i < len(cmd):
|
| 151 |
+
if cmd[i] in ('-f', '-config'):
|
| 152 |
+
moses_ini = cmd[i + 1]
|
| 153 |
+
cmd = cmd[:i] + cmd[i + 2:]
|
| 154 |
+
elif cmd[i] in ('-i', '-input-file'):
|
| 155 |
+
text_src = cmd[i + 1]
|
| 156 |
+
cmd = cmd[:i] + cmd[i + 2:]
|
| 157 |
+
elif cmd[i] == '-ref':
|
| 158 |
+
text_tgt = cmd[i + 1]
|
| 159 |
+
cmd = cmd[:i] + cmd[i + 2:]
|
| 160 |
+
elif cmd[i] == '-symal':
|
| 161 |
+
text_symal = cmd[i + 1]
|
| 162 |
+
cmd = cmd[:i] + cmd[i + 2:]
|
| 163 |
+
elif cmd[i] in ('-th', '-threads'):
|
| 164 |
+
threads_found = True
|
| 165 |
+
threads = int(cmd[i + 1])
|
| 166 |
+
cmd = cmd[:i] + cmd[i + 2:]
|
| 167 |
+
elif cmd[i] == '-n-best-list':
|
| 168 |
+
n_best_out = cmd[i + 1]
|
| 169 |
+
n_best_size = cmd[i + 2]
|
| 170 |
+
# Optional "distinct"
|
| 171 |
+
if i + 3 < len(cmd) and cmd[i + 3] == 'distinct':
|
| 172 |
+
n_best_distinct = True
|
| 173 |
+
cmd = cmd[:i] + cmd[i + 4:]
|
| 174 |
+
else:
|
| 175 |
+
cmd = cmd[:i] + cmd[i + 3:]
|
| 176 |
+
elif cmd[i] == '-output-search-graph-hypergraph':
|
| 177 |
+
# cmd[i + 1] == true
|
| 178 |
+
hg_ext = cmd[i + 2]
|
| 179 |
+
if i + 3 < len(cmd) and cmd[i + 3][0] != '-':
|
| 180 |
+
hg_dir = cmd[i + 3]
|
| 181 |
+
cmd = cmd[:i] + cmd[i + 4:]
|
| 182 |
+
else:
|
| 183 |
+
hg_dir = 'hypergraph'
|
| 184 |
+
cmd = cmd[:i] + cmd[i + 3:]
|
| 185 |
+
elif cmd[i] == '-tmp':
|
| 186 |
+
tmp_dir = cmd[i + 1]
|
| 187 |
+
cmd = cmd[:i] + cmd[i + 2:]
|
| 188 |
+
# Handled specially to make sure XML input is turned on somewhere
|
| 189 |
+
elif cmd[i] in ('-xi', '-xml-input'):
|
| 190 |
+
xml_found = True
|
| 191 |
+
xml_input = cmd[i + 1]
|
| 192 |
+
cmd = cmd[:i] + cmd[i + 2:]
|
| 193 |
+
# Handled specially for mert-moses.pl
|
| 194 |
+
elif cmd[i] == '-show-weights':
|
| 195 |
+
show_weights = True
|
| 196 |
+
# Do not remove from cmd
|
| 197 |
+
i += 1
|
| 198 |
+
else:
|
| 199 |
+
i += 1
|
| 200 |
+
|
| 201 |
+
# Read moses.ini
|
| 202 |
+
if moses_ini:
|
| 203 |
+
moses_ini_lines = [line.strip() for line in open(moses_ini, 'r')]
|
| 204 |
+
i = 0
|
| 205 |
+
while i < len(moses_ini_lines):
|
| 206 |
+
# PhraseDictionaryBitextSampling name=TranslationModel0
|
| 207 |
+
# output-factor=0 num-features=7 path=corpus. L1=src L2=tgt
|
| 208 |
+
# pfwd=g pbwd=g smooth=0 sample=1000 workers=1
|
| 209 |
+
if moses_ini_lines[i].startswith('PhraseDictionaryBitextSampling'):
|
| 210 |
+
for (k, v) in (pair.split('=') for pair in moses_ini_lines[i].split()[1:]):
|
| 211 |
+
if k == 'name':
|
| 212 |
+
# Dynamic means update this model
|
| 213 |
+
if v.startswith('Dynamic'):
|
| 214 |
+
mmsapt_dynamic.append(v)
|
| 215 |
+
moses_ini_lines[i] += '{mmsapt_extra}'
|
| 216 |
+
else:
|
| 217 |
+
mmsapt_static.append(v)
|
| 218 |
+
elif k == 'L1':
|
| 219 |
+
if mmsapt_l1 and v != mmsapt_l1:
|
| 220 |
+
raise ProgramFailure(
|
| 221 |
+
'Error: All PhraseDictionaryBitextSampling '
|
| 222 |
+
'entries should have same L1: '
|
| 223 |
+
'{} != {}\n'.format(v, mmsapt_l1))
|
| 224 |
+
mmsapt_l1 = v
|
| 225 |
+
elif k == 'L2':
|
| 226 |
+
if mmsapt_l2 and v != mmsapt_l2:
|
| 227 |
+
raise ProgramFailure(
|
| 228 |
+
'Error: All PhraseDictionaryBitextSampling '
|
| 229 |
+
'entries should have same L2: '
|
| 230 |
+
'{} != {}\n'.format(v, mmsapt_l2))
|
| 231 |
+
mmsapt_l2 = v
|
| 232 |
+
# [threads]
|
| 233 |
+
# 8
|
| 234 |
+
elif moses_ini_lines[i] == '[threads]':
|
| 235 |
+
# Prefer command line over moses.ini
|
| 236 |
+
if not threads_found:
|
| 237 |
+
threads = int(moses_ini_lines[i + 1])
|
| 238 |
+
i += 1
|
| 239 |
+
# [xml-input]
|
| 240 |
+
# exclusive
|
| 241 |
+
elif moses_ini_lines[i] == '[xml-input]':
|
| 242 |
+
# Prefer command line over moses.ini
|
| 243 |
+
if not xml_found:
|
| 244 |
+
xml_found = True
|
| 245 |
+
xml_input = moses_ini_lines[i + 1]
|
| 246 |
+
i += 1
|
| 247 |
+
i += 1
|
| 248 |
+
|
| 249 |
+
# If mert-moses.pl passes -show-weights, just call moses
|
| 250 |
+
if show_weights:
|
| 251 |
+
# re-append original moses.ini
|
| 252 |
+
cmd.append('-config')
|
| 253 |
+
cmd.append(moses_ini)
|
| 254 |
+
sys.stdout.write(subprocess.check_output(cmd))
|
| 255 |
+
sys.stdout.flush()
|
| 256 |
+
sys.exit(0)
|
| 257 |
+
|
| 258 |
+
# Input length
|
| 259 |
+
if text_src:
|
| 260 |
+
text_len = wc(text_src)
|
| 261 |
+
|
| 262 |
+
# Check inputs
|
| 263 |
+
if not (len(cmd) > 0 and all((moses_ini, text_src, text_tgt, text_symal))):
|
| 264 |
+
sys.stderr.write(HELP.format(argv[0]))
|
| 265 |
+
sys.exit(2)
|
| 266 |
+
if not (os.path.isfile(cmd[0]) and os.access(cmd[0], os.X_OK)):
|
| 267 |
+
raise ProgramFailure(
|
| 268 |
+
'Error: moses-cmd "{}" is not executable\n'.format(cmd[0]))
|
| 269 |
+
if not mmsapt_dynamic:
|
| 270 |
+
raise ProgramFailure((
|
| 271 |
+
'Error: no PhraseDictionaryBitextSampling entries named '
|
| 272 |
+
'"Dynamic..." found in {}. See '
|
| 273 |
+
'http://www.statmt.org/moses/?n=Moses.AdvancedFeatures#ntoc40\n'
|
| 274 |
+
).format(moses_ini))
|
| 275 |
+
if wc(text_tgt) != text_len or wc(text_symal) != text_len:
|
| 276 |
+
raise ProgramFailure(
|
| 277 |
+
'Error: length mismatch between "{}", "{}", and "{}"\n'.format(
|
| 278 |
+
text_src, text_tgt, text_symal))
|
| 279 |
+
|
| 280 |
+
# Setup
|
| 281 |
+
work_dir = tempfile.mkdtemp(prefix='moses.', dir=os.path.abspath(tmp_dir))
|
| 282 |
+
threads = min(threads, text_len)
|
| 283 |
+
batch_size = int(math.ceil(float(text_len) / threads))
|
| 284 |
+
|
| 285 |
+
# Report settings
|
| 286 |
+
sys.stderr.write(
|
| 287 |
+
'Moses flags: {}\n'.format(
|
| 288 |
+
' '.join('\'{}\''.format(s) if ' ' in s else s for s in cmd[1:])))
|
| 289 |
+
for (i, n) in enumerate(mmsapt_dynamic):
|
| 290 |
+
sys.stderr.write(
|
| 291 |
+
'Dynamic mmsapt {}: {} {} {}\n'.format(
|
| 292 |
+
i, n, mmsapt_l1, mmsapt_l2))
|
| 293 |
+
for (i, n) in enumerate(mmsapt_static):
|
| 294 |
+
sys.stderr.write(
|
| 295 |
+
'Static mmsapt {}: {} {} {}\n'.format(i, n, mmsapt_l1, mmsapt_l2))
|
| 296 |
+
sys.stderr.write('XML mode: {}\n'.format(xml_input))
|
| 297 |
+
sys.stderr.write(
|
| 298 |
+
'Inputs: {} {} {} ({})\n'.format(
|
| 299 |
+
text_src, text_tgt, text_symal, text_len))
|
| 300 |
+
sys.stderr.write('Jobs: {}\n'.format(threads))
|
| 301 |
+
sys.stderr.write('Batch size: {}\n'.format(batch_size))
|
| 302 |
+
if n_best_out:
|
| 303 |
+
sys.stderr.write(
|
| 304 |
+
'N-best list: {} ({}{})\n'.format(
|
| 305 |
+
n_best_out, n_best_size,
|
| 306 |
+
', distinct' if n_best_distinct else ''))
|
| 307 |
+
if hg_dir:
|
| 308 |
+
sys.stderr.write('Hypergraph dir: {} ({})\n'.format(hg_dir, hg_ext))
|
| 309 |
+
sys.stderr.write('Temp dir: {}\n'.format(work_dir))
|
| 310 |
+
|
| 311 |
+
# Accumulate seen lines
|
| 312 |
+
src_lines = []
|
| 313 |
+
tgt_lines = []
|
| 314 |
+
symal_lines = []
|
| 315 |
+
|
| 316 |
+
# Current XML source file
|
| 317 |
+
xml_out = None
|
| 318 |
+
|
| 319 |
+
# Split into batches. Each batch after 0 gets extra files with data from
|
| 320 |
+
# previous batches.
|
| 321 |
+
# Data from previous lines in the current batch is added using XML input.
|
| 322 |
+
job = -1
|
| 323 |
+
lc = -1
|
| 324 |
+
lines = itertools.izip(
|
| 325 |
+
gzopen(text_src), gzopen(text_tgt), gzopen(text_symal))
|
| 326 |
+
for (src, tgt, symal) in lines:
|
| 327 |
+
(src, tgt, symal) = (src.strip(), tgt.strip(), symal.strip())
|
| 328 |
+
lc += 1
|
| 329 |
+
if lc % batch_size == 0:
|
| 330 |
+
job += 1
|
| 331 |
+
xml_file = os.path.join(work_dir, 'input.{}.xml'.format(job))
|
| 332 |
+
extra_src_file = os.path.join(
|
| 333 |
+
work_dir, 'extra.{}.{}.txt.gz'.format(job, mmsapt_l1))
|
| 334 |
+
extra_tgt_file = os.path.join(
|
| 335 |
+
work_dir, 'extra.{}.{}.txt.gz'.format(job, mmsapt_l2))
|
| 336 |
+
extra_symal_file = os.path.join(
|
| 337 |
+
work_dir, 'extra.{}.{}-{}.symal.gz'.format(
|
| 338 |
+
job, mmsapt_l1, mmsapt_l2))
|
| 339 |
+
if job > 0:
|
| 340 |
+
xml_out.close()
|
| 341 |
+
write_gzfile(src_lines, extra_src_file)
|
| 342 |
+
write_gzfile(tgt_lines, extra_tgt_file)
|
| 343 |
+
write_gzfile(symal_lines, extra_symal_file)
|
| 344 |
+
xml_out = open(xml_file, 'w')
|
| 345 |
+
ini_file = os.path.join(work_dir, 'moses.{}.ini'.format(job))
|
| 346 |
+
with open(ini_file, 'w') as moses_ini_out:
|
| 347 |
+
if job == 0:
|
| 348 |
+
extra = ''
|
| 349 |
+
else:
|
| 350 |
+
extra = ' extra={}'.format(
|
| 351 |
+
os.path.join(work_dir, 'extra.{}.'.format(job)))
|
| 352 |
+
moses_ini_out.write(
|
| 353 |
+
'{}\n'.format(
|
| 354 |
+
'\n'.join(moses_ini_lines).format(mmsapt_extra=extra)))
|
| 355 |
+
src_lines.append(src)
|
| 356 |
+
tgt_lines.append(tgt)
|
| 357 |
+
symal_lines.append(symal)
|
| 358 |
+
# Lines after first start with update tag including previous
|
| 359 |
+
# translation.
|
| 360 |
+
# Translation of last line of each batch is included in extra for
|
| 361 |
+
# next batch.
|
| 362 |
+
xml_tags = []
|
| 363 |
+
if lc % batch_size != 0:
|
| 364 |
+
tag_template = (
|
| 365 |
+
'<update '
|
| 366 |
+
'name="{}" source="{}" target="{}" alignment="{}" /> ')
|
| 367 |
+
for n in mmsapt_dynamic:
|
| 368 |
+
# Note: space after tag.
|
| 369 |
+
xml_tags.append(
|
| 370 |
+
tag_template.format(
|
| 371 |
+
n, src_lines[-2], tgt_lines[-2], symal_lines[-2]))
|
| 372 |
+
xml_out.write('{}{}\n'.format(''.join(xml_tags), src))
|
| 373 |
+
xml_out.close()
|
| 374 |
+
|
| 375 |
+
# Run decoders in parallel
|
| 376 |
+
workers = []
|
| 377 |
+
prog = Progress()
|
| 378 |
+
for i in range(threads):
|
| 379 |
+
work_cmd = cmd[:]
|
| 380 |
+
work_cmd.append('-config')
|
| 381 |
+
work_cmd.append(os.path.join(work_dir, 'moses.{}.ini'.format(i)))
|
| 382 |
+
# Workers use 1 CPU each
|
| 383 |
+
work_cmd.append('-threads')
|
| 384 |
+
work_cmd.append('1')
|
| 385 |
+
if not xml_found:
|
| 386 |
+
work_cmd.append('-xml-input')
|
| 387 |
+
work_cmd.append(xml_input)
|
| 388 |
+
if n_best_out:
|
| 389 |
+
work_cmd.append('-n-best-list')
|
| 390 |
+
work_cmd.append(os.path.join(work_dir, 'nbest.{}'.format(i)))
|
| 391 |
+
work_cmd.append(str(n_best_size))
|
| 392 |
+
if n_best_distinct:
|
| 393 |
+
work_cmd.append('distinct')
|
| 394 |
+
if hg_dir:
|
| 395 |
+
work_cmd.append('-output-search-graph-hypergraph')
|
| 396 |
+
work_cmd.append('true')
|
| 397 |
+
work_cmd.append(hg_ext)
|
| 398 |
+
work_cmd.append(os.path.join(work_dir, 'hg.{}'.format(i)))
|
| 399 |
+
in_file = os.path.join(work_dir, 'input.{}.xml'.format(i))
|
| 400 |
+
out_file = os.path.join(work_dir, 'out.{}'.format(i))
|
| 401 |
+
err_file = os.path.join(work_dir, 'err.{}'.format(i))
|
| 402 |
+
t = threading.Thread(
|
| 403 |
+
target=atomic_io,
|
| 404 |
+
args=(work_cmd, in_file, out_file, err_file, prog))
|
| 405 |
+
workers.append(t)
|
| 406 |
+
t.start()
|
| 407 |
+
# Wait for all to finish
|
| 408 |
+
for t in workers:
|
| 409 |
+
t.join()
|
| 410 |
+
prog.done()
|
| 411 |
+
|
| 412 |
+
# Gather N-best lists
|
| 413 |
+
if n_best_out:
|
| 414 |
+
with open(n_best_out, 'w') as out:
|
| 415 |
+
for i in range(threads):
|
| 416 |
+
path = os.path.join(work_dir, 'nbest.{}'.format(i))
|
| 417 |
+
for line in open(path, 'r'):
|
| 418 |
+
entry = line.partition(' ')
|
| 419 |
+
out.write(
|
| 420 |
+
'{} {}'.format(
|
| 421 |
+
int(entry[0]) + (i * batch_size), entry[2]))
|
| 422 |
+
|
| 423 |
+
# Gather hypergraphs
|
| 424 |
+
if hg_dir:
|
| 425 |
+
if not os.path.exists(hg_dir):
|
| 426 |
+
os.mkdir(hg_dir)
|
| 427 |
+
shutil.copy(
|
| 428 |
+
os.path.join(work_dir, 'hg.0', 'weights'),
|
| 429 |
+
os.path.join(hg_dir, 'weights'))
|
| 430 |
+
for i in range(threads):
|
| 431 |
+
for j in range(batch_size):
|
| 432 |
+
shutil.copy(
|
| 433 |
+
os.path.join(
|
| 434 |
+
work_dir, 'hg.{}'.format(i),
|
| 435 |
+
'{}.{}'.format(j, hg_ext)),
|
| 436 |
+
os.path.join(
|
| 437 |
+
hg_dir, '{}.{}'.format((i * batch_size) + j, hg_ext)))
|
| 438 |
+
|
| 439 |
+
# Gather stdout
|
| 440 |
+
for i in range(threads):
|
| 441 |
+
for line in open(os.path.join(work_dir, 'out.{}'.format(i)), 'r'):
|
| 442 |
+
sys.stdout.write(line)
|
| 443 |
+
|
| 444 |
+
# Cleanup
|
| 445 |
+
shutil.rmtree(work_dir)
|
| 446 |
+
|
| 447 |
+
if __name__ == '__main__':
|
| 448 |
+
try:
|
| 449 |
+
main(sys.argv)
|
| 450 |
+
except ProgramFailure as error:
|
| 451 |
+
sys.stderr.write("%s\n" % error)
|
| 452 |
+
sys.exit(1)
|
mosesdecoder/scripts/generic/mteval-v11b.pl
ADDED
|
@@ -0,0 +1,761 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/perl -w
|
| 2 |
+
|
| 3 |
+
use strict;
|
| 4 |
+
|
| 5 |
+
#################################
|
| 6 |
+
# History:
|
| 7 |
+
#
|
| 8 |
+
# version 11b -- text normalization modified:
|
| 9 |
+
# * take out the join digit line because it joins digits
|
| 10 |
+
# when it shouldn't have
|
| 11 |
+
# $norm_text =~ s/(\d)\s+(?=\d)/$1/g; #join digits
|
| 12 |
+
#
|
| 13 |
+
# version 11a -- corrected output of individual n-gram precision values
|
| 14 |
+
#
|
| 15 |
+
# version 11 -- bug fixes:
|
| 16 |
+
# * make filehandle operate in binary mode to prevent Perl from operating
|
| 17 |
+
# (by default in Red Hat 9) in UTF-8
|
| 18 |
+
# * fix failure on joining digits
|
| 19 |
+
# version 10 -- updated output to include more details of n-gram scoring.
|
| 20 |
+
# Defaults to generate both NIST and BLEU scores. Use -b for BLEU
|
| 21 |
+
# only, use -n for NIST only
|
| 22 |
+
#
|
| 23 |
+
# version 09d -- bug fix (for BLEU scoring, ngrams were fixed at 4
|
| 24 |
+
# being the max, regardless what was entered on the command line.)
|
| 25 |
+
#
|
| 26 |
+
# version 09c -- bug fix (During the calculation of ngram information,
|
| 27 |
+
# each ngram was being counted only once for each segment. This has
|
| 28 |
+
# been fixed so that each ngram is counted correctly in each segment.)
|
| 29 |
+
#
|
| 30 |
+
# version 09b -- text normalization modified:
|
| 31 |
+
# * option flag added to preserve upper case
|
| 32 |
+
# * non-ASCII characters left in place.
|
| 33 |
+
#
|
| 34 |
+
# version 09a -- text normalization modified:
|
| 35 |
+
# * " and & converted to "" and &, respectively
|
| 36 |
+
# * non-ASCII characters kept together (bug fix)
|
| 37 |
+
#
|
| 38 |
+
# version 09 -- modified to accommodate sgml tag and attribute
|
| 39 |
+
# names revised to conform to default SGML conventions.
|
| 40 |
+
#
|
| 41 |
+
# version 08 -- modifies the NIST metric in accordance with the
|
| 42 |
+
# findings on the 2001 Chinese-English dry run corpus. Also
|
| 43 |
+
# incorporates the BLEU metric as an option and supports the
|
| 44 |
+
# output of ngram detail.
|
| 45 |
+
#
|
| 46 |
+
# version 07 -- in response to the MT meeting on 28 Jan 2002 at ISI
|
| 47 |
+
# Keep strings of non-ASCII characters together as one word
|
| 48 |
+
# (rather than splitting them into one-character words).
|
| 49 |
+
# Change length penalty so that translations that are longer than
|
| 50 |
+
# the average reference translation are not penalized.
|
| 51 |
+
#
|
| 52 |
+
# version 06
|
| 53 |
+
# Prevent divide-by-zero when a segment has no evaluation N-grams.
|
| 54 |
+
# Correct segment index for level 3 debug output.
|
| 55 |
+
#
|
| 56 |
+
# version 05
|
| 57 |
+
# improve diagnostic error messages
|
| 58 |
+
#
|
| 59 |
+
# version 04
|
| 60 |
+
# tag segments
|
| 61 |
+
#
|
| 62 |
+
# version 03
|
| 63 |
+
# add detailed output option (intermediate document and segment scores)
|
| 64 |
+
#
|
| 65 |
+
# version 02
|
| 66 |
+
# accommodation of modified sgml tags and attributes
|
| 67 |
+
#
|
| 68 |
+
# version 01
|
| 69 |
+
# same as bleu version 15, but modified to provide formal score output.
|
| 70 |
+
#
|
| 71 |
+
# original IBM version
|
| 72 |
+
# Author: Kishore Papineni
|
| 73 |
+
# Date: 06/10/2001
|
| 74 |
+
#################################
|
| 75 |
+
|
| 76 |
+
######
|
| 77 |
+
# Intro
|
| 78 |
+
my ($date, $time) = date_time_stamp();
|
| 79 |
+
print "MT evaluation scorer began on $date at $time\n";
|
| 80 |
+
print "command line: ", $0, " ", join(" ", @ARGV), "\n";
|
| 81 |
+
my $usage = "\n\nUsage: $0 [-h] -r <ref_file> -s src_file -t <tst_file>\n\n".
|
| 82 |
+
"Description: This Perl script evaluates MT system performance.\n".
|
| 83 |
+
"\n".
|
| 84 |
+
"Required arguments:\n".
|
| 85 |
+
" -r <ref_file> is a file containing the reference translations for\n".
|
| 86 |
+
" the documents to be evaluated.\n".
|
| 87 |
+
" -s <src_file> is a file containing the source documents for which\n".
|
| 88 |
+
" translations are to be evaluated\n".
|
| 89 |
+
" -t <tst_file> is a file containing the translations to be evaluated\n".
|
| 90 |
+
"\n".
|
| 91 |
+
"Optional arguments:\n".
|
| 92 |
+
" -c preserves upper-case alphabetic characters\n".
|
| 93 |
+
" -b generate BLEU scores only\n".
|
| 94 |
+
" -n generate NIST scores only\n".
|
| 95 |
+
" -d detailed output flag used in conjunction with \"-b\" or \"-n\" flags:\n".
|
| 96 |
+
" 0 (default) for system-level score only\n".
|
| 97 |
+
" 1 to include document-level scores\n".
|
| 98 |
+
" 2 to include segment-level scores\n".
|
| 99 |
+
" 3 to include ngram-level scores\n".
|
| 100 |
+
" -h prints this help message to STDOUT\n".
|
| 101 |
+
"\n";
|
| 102 |
+
|
| 103 |
+
use vars qw ($opt_r $opt_s $opt_t $opt_d $opt_h $opt_b $opt_n $opt_c $opt_x);
|
| 104 |
+
use Getopt::Std;
|
| 105 |
+
getopts ('r:s:t:d:hbncx:');
|
| 106 |
+
die $usage if defined($opt_h);
|
| 107 |
+
die "Error in command line: ref_file not defined$usage" unless defined $opt_r;
|
| 108 |
+
die "Error in command line: src_file not defined$usage" unless defined $opt_s;
|
| 109 |
+
die "Error in command line: tst_file not defined$usage" unless defined $opt_t;
|
| 110 |
+
my $max_Ngram = 9;
|
| 111 |
+
my $detail = defined $opt_d ? $opt_d : 0;
|
| 112 |
+
my $preserve_case = defined $opt_c ? 1 : 0;
|
| 113 |
+
|
| 114 |
+
my $METHOD = "BOTH";
|
| 115 |
+
if (defined $opt_b) { $METHOD = "BLEU"; }
|
| 116 |
+
if (defined $opt_n) { $METHOD = "NIST"; }
|
| 117 |
+
my $method;
|
| 118 |
+
|
| 119 |
+
my ($ref_file) = $opt_r;
|
| 120 |
+
my ($src_file) = $opt_s;
|
| 121 |
+
my ($tst_file) = $opt_t;
|
| 122 |
+
|
| 123 |
+
######
|
| 124 |
+
# Global variables
|
| 125 |
+
my ($src_lang, $tgt_lang, @tst_sys, @ref_sys); # evaluation parameters
|
| 126 |
+
my (%tst_data, %ref_data); # the data -- with structure: {system}{document}[segments]
|
| 127 |
+
my ($src_id, $ref_id, $tst_id); # unique identifiers for ref and tst translation sets
|
| 128 |
+
my %eval_docs; # document information for the evaluation data set
|
| 129 |
+
my %ngram_info; # the information obtained from (the last word in) the ngram
|
| 130 |
+
|
| 131 |
+
######
|
| 132 |
+
# Get source document ID's
|
| 133 |
+
($src_id) = get_source_info ($src_file);
|
| 134 |
+
|
| 135 |
+
######
|
| 136 |
+
# Get reference translations
|
| 137 |
+
($ref_id) = get_MT_data (\%ref_data, "RefSet", $ref_file);
|
| 138 |
+
|
| 139 |
+
compute_ngram_info ();
|
| 140 |
+
|
| 141 |
+
######
|
| 142 |
+
# Get translations to evaluate
|
| 143 |
+
($tst_id) = get_MT_data (\%tst_data, "TstSet", $tst_file);
|
| 144 |
+
|
| 145 |
+
######
|
| 146 |
+
# Check data for completeness and correctness
|
| 147 |
+
check_MT_data ();
|
| 148 |
+
|
| 149 |
+
######
|
| 150 |
+
#
|
| 151 |
+
my %NISTmt = ();
|
| 152 |
+
my %BLEUmt = ();
|
| 153 |
+
|
| 154 |
+
######
|
| 155 |
+
# Evaluate
|
| 156 |
+
print " Evaluation of $src_lang-to-$tgt_lang translation using:\n";
|
| 157 |
+
my $cum_seg = 0;
|
| 158 |
+
foreach my $doc (sort keys %eval_docs) {
|
| 159 |
+
$cum_seg += @{$eval_docs{$doc}{SEGS}};
|
| 160 |
+
}
|
| 161 |
+
print " src set \"$src_id\" (", scalar keys %eval_docs, " docs, $cum_seg segs)\n";
|
| 162 |
+
print " ref set \"$ref_id\" (", scalar keys %ref_data, " refs)\n";
|
| 163 |
+
print " tst set \"$tst_id\" (", scalar keys %tst_data, " systems)\n\n";
|
| 164 |
+
|
| 165 |
+
foreach my $sys (sort @tst_sys) {
|
| 166 |
+
for (my $n=1; $n<=$max_Ngram; $n++) {
|
| 167 |
+
$NISTmt{$n}{$sys}{cum} = 0;
|
| 168 |
+
$NISTmt{$n}{$sys}{ind} = 0;
|
| 169 |
+
$BLEUmt{$n}{$sys}{cum} = 0;
|
| 170 |
+
$BLEUmt{$n}{$sys}{ind} = 0;
|
| 171 |
+
}
|
| 172 |
+
|
| 173 |
+
if (($METHOD eq "BOTH") || ($METHOD eq "NIST")) {
|
| 174 |
+
$method="NIST";
|
| 175 |
+
score_system ($sys, %NISTmt);
|
| 176 |
+
}
|
| 177 |
+
if (($METHOD eq "BOTH") || ($METHOD eq "BLEU")) {
|
| 178 |
+
$method="BLEU";
|
| 179 |
+
score_system ($sys, %BLEUmt);
|
| 180 |
+
}
|
| 181 |
+
}
|
| 182 |
+
|
| 183 |
+
######
|
| 184 |
+
printout_report ();
|
| 185 |
+
|
| 186 |
+
($date, $time) = date_time_stamp();
|
| 187 |
+
print "MT evaluation scorer ended on $date at $time\n";
|
| 188 |
+
|
| 189 |
+
exit 0;
|
| 190 |
+
|
| 191 |
+
#################################
|
| 192 |
+
|
| 193 |
+
sub get_source_info {
|
| 194 |
+
|
| 195 |
+
my ($file) = @_;
|
| 196 |
+
my ($name, $id, $src, $doc);
|
| 197 |
+
my ($data, $tag, $span);
|
| 198 |
+
|
| 199 |
+
|
| 200 |
+
#read data from file
|
| 201 |
+
open (FILE, $file) or die "\nUnable to open translation data file '$file'", $usage;
|
| 202 |
+
binmode FILE;
|
| 203 |
+
$data .= $_ while <FILE>;
|
| 204 |
+
close (FILE);
|
| 205 |
+
|
| 206 |
+
#get source set info
|
| 207 |
+
die "\n\nFATAL INPUT ERROR: no 'src_set' tag in src_file '$file'\n\n"
|
| 208 |
+
unless ($tag, $span, $data) = extract_sgml_tag_and_span ("SrcSet", $data);
|
| 209 |
+
|
| 210 |
+
die "\n\nFATAL INPUT ERROR: no tag attribute '$name' in file '$file'\n\n"
|
| 211 |
+
unless ($id) = extract_sgml_tag_attribute ($name="SetID", $tag);
|
| 212 |
+
|
| 213 |
+
die "\n\nFATAL INPUT ERROR: no tag attribute '$name' in file '$file'\n\n"
|
| 214 |
+
unless ($src) = extract_sgml_tag_attribute ($name="SrcLang", $tag);
|
| 215 |
+
die "\n\nFATAL INPUT ERROR: $name ('$src') in file '$file' inconsistent\n"
|
| 216 |
+
." with $name in previous input data ('$src_lang')\n\n"
|
| 217 |
+
unless (not defined $src_lang or $src eq $src_lang);
|
| 218 |
+
$src_lang = $src;
|
| 219 |
+
|
| 220 |
+
#get doc info -- ID and # of segs
|
| 221 |
+
$data = $span;
|
| 222 |
+
while (($tag, $span, $data) = extract_sgml_tag_and_span ("Doc", $data)) {
|
| 223 |
+
die "\n\nFATAL INPUT ERROR: no tag attribute '$name' in file '$file'\n\n"
|
| 224 |
+
unless ($doc) = extract_sgml_tag_attribute ($name="DocID", $tag);
|
| 225 |
+
die "\n\nFATAL INPUT ERROR: duplicate '$name' in file '$file'\n\n"
|
| 226 |
+
if defined $eval_docs{$doc};
|
| 227 |
+
$span =~ s/[\s\n\r]+/ /g; # concatenate records
|
| 228 |
+
my $jseg=0, my $seg_data = $span;
|
| 229 |
+
while (($tag, $span, $seg_data) = extract_sgml_tag_and_span ("Seg", $seg_data)) {
|
| 230 |
+
($eval_docs{$doc}{SEGS}[$jseg++]) = NormalizeText ($span);
|
| 231 |
+
}
|
| 232 |
+
die "\n\nFATAL INPUT ERROR: no segments in document '$doc' in file '$file'\n\n"
|
| 233 |
+
if $jseg == 0;
|
| 234 |
+
}
|
| 235 |
+
die "\n\nFATAL INPUT ERROR: no documents in file '$file'\n\n"
|
| 236 |
+
unless keys %eval_docs > 0;
|
| 237 |
+
return $id;
|
| 238 |
+
}
|
| 239 |
+
|
| 240 |
+
#################################
|
| 241 |
+
|
| 242 |
+
sub get_MT_data {
|
| 243 |
+
|
| 244 |
+
my ($docs, $set_tag, $file) = @_;
|
| 245 |
+
my ($name, $id, $src, $tgt, $sys, $doc);
|
| 246 |
+
my ($tag, $span, $data);
|
| 247 |
+
|
| 248 |
+
#read data from file
|
| 249 |
+
open (FILE, $file) or die "\nUnable to open translation data file '$file'", $usage;
|
| 250 |
+
binmode FILE;
|
| 251 |
+
$data .= $_ while <FILE>;
|
| 252 |
+
close (FILE);
|
| 253 |
+
|
| 254 |
+
#get tag info
|
| 255 |
+
while (($tag, $span, $data) = extract_sgml_tag_and_span ($set_tag, $data)) {
|
| 256 |
+
die "\n\nFATAL INPUT ERROR: no tag attribute '$name' in file '$file'\n\n" unless
|
| 257 |
+
($id) = extract_sgml_tag_attribute ($name="SetID", $tag);
|
| 258 |
+
|
| 259 |
+
die "\n\nFATAL INPUT ERROR: no tag attribute '$name' in file '$file'\n\n" unless
|
| 260 |
+
($src) = extract_sgml_tag_attribute ($name="SrcLang", $tag);
|
| 261 |
+
die "\n\nFATAL INPUT ERROR: $name ('$src') in file '$file' inconsistent\n"
|
| 262 |
+
." with $name of source ('$src_lang')\n\n"
|
| 263 |
+
unless $src eq $src_lang;
|
| 264 |
+
|
| 265 |
+
die "\n\nFATAL INPUT ERROR: no tag attribute '$name' in file '$file'\n\n" unless
|
| 266 |
+
($tgt) = extract_sgml_tag_attribute ($name="TrgLang", $tag);
|
| 267 |
+
die "\n\nFATAL INPUT ERROR: $name ('$tgt') in file '$file' inconsistent\n"
|
| 268 |
+
." with $name of the evaluation ('$tgt_lang')\n\n"
|
| 269 |
+
unless (not defined $tgt_lang or $tgt eq $tgt_lang);
|
| 270 |
+
$tgt_lang = $tgt;
|
| 271 |
+
|
| 272 |
+
my $mtdata = $span;
|
| 273 |
+
while (($tag, $span, $mtdata) = extract_sgml_tag_and_span ("Doc", $mtdata)) {
|
| 274 |
+
die "\n\nFATAL INPUT ERROR: no tag attribute '$name' in file '$file'\n\n" unless
|
| 275 |
+
(my $sys) = extract_sgml_tag_attribute ($name="SysID", $tag);
|
| 276 |
+
|
| 277 |
+
die "\n\nFATAL INPUT ERROR: no tag attribute '$name' in file '$file'\n\n" unless
|
| 278 |
+
$doc = extract_sgml_tag_attribute ($name="DocID", $tag);
|
| 279 |
+
|
| 280 |
+
die "\n\nFATAL INPUT ERROR: document '$doc' for system '$sys' in file '$file'\n"
|
| 281 |
+
." previously loaded from file '$docs->{$sys}{$doc}{FILE}'\n\n"
|
| 282 |
+
unless (not defined $docs->{$sys}{$doc});
|
| 283 |
+
|
| 284 |
+
$span =~ s/[\s\n\r]+/ /g; # concatenate records
|
| 285 |
+
my $jseg=0, my $seg_data = $span;
|
| 286 |
+
while (($tag, $span, $seg_data) = extract_sgml_tag_and_span ("Seg", $seg_data)) {
|
| 287 |
+
($docs->{$sys}{$doc}{SEGS}[$jseg++]) = NormalizeText ($span);
|
| 288 |
+
}
|
| 289 |
+
die "\n\nFATAL INPUT ERROR: no segments in document '$doc' in file '$file'\n\n"
|
| 290 |
+
if $jseg == 0;
|
| 291 |
+
$docs->{$sys}{$doc}{FILE} = $file;
|
| 292 |
+
}
|
| 293 |
+
}
|
| 294 |
+
return $id;
|
| 295 |
+
}
|
| 296 |
+
|
| 297 |
+
#################################
|
| 298 |
+
|
| 299 |
+
sub check_MT_data {
|
| 300 |
+
|
| 301 |
+
@tst_sys = sort keys %tst_data;
|
| 302 |
+
@ref_sys = sort keys %ref_data;
|
| 303 |
+
|
| 304 |
+
#every evaluation document must be represented for every system and every reference
|
| 305 |
+
foreach my $doc (sort keys %eval_docs) {
|
| 306 |
+
my $nseg_source = @{$eval_docs{$doc}{SEGS}};
|
| 307 |
+
foreach my $sys (@tst_sys) {
|
| 308 |
+
die "\n\nFATAL ERROR: no document '$doc' for system '$sys'\n\n"
|
| 309 |
+
unless defined $tst_data{$sys}{$doc};
|
| 310 |
+
my $nseg = @{$tst_data{$sys}{$doc}{SEGS}};
|
| 311 |
+
die "\n\nFATAL ERROR: translated documents must contain the same # of segments as the source, but\n"
|
| 312 |
+
." document '$doc' for system '$sys' contains $nseg segments, while\n"
|
| 313 |
+
." the source document contains $nseg_source segments.\n\n"
|
| 314 |
+
unless $nseg == $nseg_source;
|
| 315 |
+
}
|
| 316 |
+
|
| 317 |
+
foreach my $sys (@ref_sys) {
|
| 318 |
+
die "\n\nFATAL ERROR: no document '$doc' for reference '$sys'\n\n"
|
| 319 |
+
unless defined $ref_data{$sys}{$doc};
|
| 320 |
+
my $nseg = @{$ref_data{$sys}{$doc}{SEGS}};
|
| 321 |
+
die "\n\nFATAL ERROR: translated documents must contain the same # of segments as the source, but\n"
|
| 322 |
+
." document '$doc' for system '$sys' contains $nseg segments, while\n"
|
| 323 |
+
." the source document contains $nseg_source segments.\n\n"
|
| 324 |
+
unless $nseg == $nseg_source;
|
| 325 |
+
}
|
| 326 |
+
}
|
| 327 |
+
}
|
| 328 |
+
|
| 329 |
+
#################################
|
| 330 |
+
|
| 331 |
+
sub compute_ngram_info {
|
| 332 |
+
|
| 333 |
+
my ($ref, $doc, $seg);
|
| 334 |
+
my (@wrds, $tot_wrds, %ngrams, $ngram, $mgram);
|
| 335 |
+
my (%ngram_count, @tot_ngrams);
|
| 336 |
+
|
| 337 |
+
foreach $ref (keys %ref_data) {
|
| 338 |
+
foreach $doc (keys %{$ref_data{$ref}}) {
|
| 339 |
+
foreach $seg (@{$ref_data{$ref}{$doc}{SEGS}}) {
|
| 340 |
+
@wrds = split /\s+/, $seg;
|
| 341 |
+
$tot_wrds += @wrds;
|
| 342 |
+
%ngrams = %{Words2Ngrams (@wrds)};
|
| 343 |
+
foreach $ngram (keys %ngrams) {
|
| 344 |
+
$ngram_count{$ngram} += $ngrams{$ngram};
|
| 345 |
+
}
|
| 346 |
+
}
|
| 347 |
+
}
|
| 348 |
+
}
|
| 349 |
+
|
| 350 |
+
foreach $ngram (keys %ngram_count) {
|
| 351 |
+
@wrds = split / /, $ngram;
|
| 352 |
+
pop @wrds, $mgram = join " ", @wrds;
|
| 353 |
+
$ngram_info{$ngram} = - log
|
| 354 |
+
($mgram ? $ngram_count{$ngram}/$ngram_count{$mgram}
|
| 355 |
+
: $ngram_count{$ngram}/$tot_wrds) / log 2;
|
| 356 |
+
if (defined $opt_x and $opt_x eq "ngram info") {
|
| 357 |
+
@wrds = split / /, $ngram;
|
| 358 |
+
printf "ngram info:%9.4f%6d%6d%8d%3d %s\n", $ngram_info{$ngram}, $ngram_count{$ngram},
|
| 359 |
+
$mgram ? $ngram_count{$mgram} : $tot_wrds, $tot_wrds, scalar @wrds, $ngram;
|
| 360 |
+
}
|
| 361 |
+
}
|
| 362 |
+
}
|
| 363 |
+
|
| 364 |
+
#################################
|
| 365 |
+
|
| 366 |
+
sub score_system {
|
| 367 |
+
|
| 368 |
+
my ($sys, $ref, $doc, %SCOREmt);
|
| 369 |
+
($sys, %SCOREmt) = @_;
|
| 370 |
+
my ($shortest_ref_length, $match_cnt, $tst_cnt, $ref_cnt, $tst_info, $ref_info);
|
| 371 |
+
my ($cum_ref_length, @cum_match, @cum_tst_cnt, @cum_ref_cnt, @cum_tst_info, @cum_ref_info);
|
| 372 |
+
|
| 373 |
+
$cum_ref_length = 0;
|
| 374 |
+
for (my $j=1; $j<=$max_Ngram; $j++) {
|
| 375 |
+
$cum_match[$j] = $cum_tst_cnt[$j] = $cum_ref_cnt[$j] = $cum_tst_info[$j] = $cum_ref_info[$j] = 0;
|
| 376 |
+
}
|
| 377 |
+
|
| 378 |
+
foreach $doc (sort keys %eval_docs) {
|
| 379 |
+
($shortest_ref_length, $match_cnt, $tst_cnt, $ref_cnt, $tst_info, $ref_info) = score_document ($sys, $doc);
|
| 380 |
+
|
| 381 |
+
#output document summary score
|
| 382 |
+
if (($detail >= 1 ) && ($METHOD eq "NIST")) {
|
| 383 |
+
my %DOCmt = ();
|
| 384 |
+
printf "$method score using 5-grams = %.4f for system \"$sys\" on document \"$doc\" (%d segments, %d words)\n",
|
| 385 |
+
nist_score (scalar @ref_sys, $match_cnt, $tst_cnt, $ref_cnt, $tst_info, $ref_info, $sys, %DOCmt),
|
| 386 |
+
scalar @{$tst_data{$sys}{$doc}{SEGS}}, $tst_cnt->[1];
|
| 387 |
+
}
|
| 388 |
+
if (($detail >= 1 ) && ($METHOD eq "BLEU")) {
|
| 389 |
+
my %DOCmt = ();
|
| 390 |
+
printf "$method score using 4-grams = %.4f for system \"$sys\" on document \"$doc\" (%d segments, %d words)\n",
|
| 391 |
+
bleu_score($shortest_ref_length, $match_cnt, $tst_cnt, $sys, %DOCmt),
|
| 392 |
+
scalar @{$tst_data{$sys}{$doc}{SEGS}}, $tst_cnt->[1];
|
| 393 |
+
}
|
| 394 |
+
|
| 395 |
+
$cum_ref_length += $shortest_ref_length;
|
| 396 |
+
for (my $j=1; $j<=$max_Ngram; $j++) {
|
| 397 |
+
$cum_match[$j] += $match_cnt->[$j];
|
| 398 |
+
$cum_tst_cnt[$j] += $tst_cnt->[$j];
|
| 399 |
+
$cum_ref_cnt[$j] += $ref_cnt->[$j];
|
| 400 |
+
$cum_tst_info[$j] += $tst_info->[$j];
|
| 401 |
+
$cum_ref_info[$j] += $ref_info->[$j];
|
| 402 |
+
printf "document info: $sys $doc %d-gram %d %d %d %9.4f %9.4f\n", $j, $match_cnt->[$j],
|
| 403 |
+
$tst_cnt->[$j], $ref_cnt->[$j], $tst_info->[$j], $ref_info->[$j]
|
| 404 |
+
if (defined $opt_x and $opt_x eq "document info");
|
| 405 |
+
}
|
| 406 |
+
}
|
| 407 |
+
|
| 408 |
+
#x #output system summary score
|
| 409 |
+
#x printf "$method score = %.4f for system \"$sys\"\n",
|
| 410 |
+
#x $method eq "BLEU" ? bleu_score($cum_ref_length, \@cum_match, \@cum_tst_cnt) :
|
| 411 |
+
#x nist_score (scalar @ref_sys, \@cum_match, \@cum_tst_cnt, \@cum_ref_cnt, \@cum_tst_info, \@cum_ref_info, $sys, %SCOREmt);
|
| 412 |
+
if ($method eq "BLEU") {
|
| 413 |
+
bleu_score($cum_ref_length, \@cum_match, \@cum_tst_cnt, $sys, %SCOREmt);
|
| 414 |
+
}
|
| 415 |
+
if ($method eq "NIST") {
|
| 416 |
+
nist_score (scalar @ref_sys, \@cum_match, \@cum_tst_cnt, \@cum_ref_cnt, \@cum_tst_info, \@cum_ref_info, $sys, %SCOREmt);
|
| 417 |
+
}
|
| 418 |
+
}
|
| 419 |
+
|
| 420 |
+
#################################
|
| 421 |
+
|
| 422 |
+
sub score_document {
|
| 423 |
+
|
| 424 |
+
my ($sys, $ref, $doc);
|
| 425 |
+
($sys, $doc) = @_;
|
| 426 |
+
my ($shortest_ref_length, $match_cnt, $tst_cnt, $ref_cnt, $tst_info, $ref_info);
|
| 427 |
+
my ($cum_ref_length, @cum_match, @cum_tst_cnt, @cum_ref_cnt, @cum_tst_info, @cum_ref_info);
|
| 428 |
+
|
| 429 |
+
$cum_ref_length = 0;
|
| 430 |
+
for (my $j=1; $j<=$max_Ngram; $j++) {
|
| 431 |
+
$cum_match[$j] = $cum_tst_cnt[$j] = $cum_ref_cnt[$j] = $cum_tst_info[$j] = $cum_ref_info[$j] = 0;
|
| 432 |
+
}
|
| 433 |
+
|
| 434 |
+
#score each segment
|
| 435 |
+
for (my $jseg=0; $jseg<@{$tst_data{$sys}{$doc}{SEGS}}; $jseg++) {
|
| 436 |
+
my @ref_segments = ();
|
| 437 |
+
foreach $ref (@ref_sys) {
|
| 438 |
+
push @ref_segments, $ref_data{$ref}{$doc}{SEGS}[$jseg];
|
| 439 |
+
printf "ref '$ref', seg %d: %s\n", $jseg+1, $ref_data{$ref}{$doc}{SEGS}[$jseg]
|
| 440 |
+
if $detail >= 3;
|
| 441 |
+
}
|
| 442 |
+
printf "sys '$sys', seg %d: %s\n", $jseg+1, $tst_data{$sys}{$doc}{SEGS}[$jseg]
|
| 443 |
+
if $detail >= 3;
|
| 444 |
+
($shortest_ref_length, $match_cnt, $tst_cnt, $ref_cnt, $tst_info, $ref_info) =
|
| 445 |
+
score_segment ($tst_data{$sys}{$doc}{SEGS}[$jseg], @ref_segments);
|
| 446 |
+
|
| 447 |
+
#output segment summary score
|
| 448 |
+
#x printf "$method score = %.4f for system \"$sys\" on segment %d of document \"$doc\" (%d words)\n",
|
| 449 |
+
#x $method eq "BLEU" ? bleu_score($shortest_ref_length, $match_cnt, $tst_cnt) :
|
| 450 |
+
#x nist_score (scalar @ref_sys, $match_cnt, $tst_cnt, $ref_cnt, $tst_info, $ref_info),
|
| 451 |
+
#x $jseg+1, $tst_cnt->[1]
|
| 452 |
+
#x if $detail >= 2;
|
| 453 |
+
if (($detail >=2) && ($METHOD eq "BLEU")) {
|
| 454 |
+
my %DOCmt = ();
|
| 455 |
+
printf " $method score using 4-grams = %.4f for system \"$sys\" on segment %d of document \"$doc\" (%d words)\n",
|
| 456 |
+
bleu_score($shortest_ref_length, $match_cnt, $tst_cnt, $sys, %DOCmt), $jseg+1, $tst_cnt->[1];
|
| 457 |
+
}
|
| 458 |
+
if (($detail >=2) && ($METHOD eq "NIST")) {
|
| 459 |
+
my %DOCmt = ();
|
| 460 |
+
printf " $method score using 5-grams = %.4f for system \"$sys\" on segment %d of document \"$doc\" (%d words)\n",
|
| 461 |
+
nist_score (scalar @ref_sys, $match_cnt, $tst_cnt, $ref_cnt, $tst_info, $ref_info, $sys, %DOCmt), $jseg+1, $tst_cnt->[1];
|
| 462 |
+
}
|
| 463 |
+
|
| 464 |
+
|
| 465 |
+
$cum_ref_length += $shortest_ref_length;
|
| 466 |
+
for (my $j=1; $j<=$max_Ngram; $j++) {
|
| 467 |
+
$cum_match[$j] += $match_cnt->[$j];
|
| 468 |
+
$cum_tst_cnt[$j] += $tst_cnt->[$j];
|
| 469 |
+
$cum_ref_cnt[$j] += $ref_cnt->[$j];
|
| 470 |
+
$cum_tst_info[$j] += $tst_info->[$j];
|
| 471 |
+
$cum_ref_info[$j] += $ref_info->[$j];
|
| 472 |
+
}
|
| 473 |
+
}
|
| 474 |
+
return ($cum_ref_length, [@cum_match], [@cum_tst_cnt], [@cum_ref_cnt], [@cum_tst_info], [@cum_ref_info]);
|
| 475 |
+
}
|
| 476 |
+
|
| 477 |
+
#################################
|
| 478 |
+
|
| 479 |
+
sub score_segment {
|
| 480 |
+
|
| 481 |
+
my ($tst_seg, @ref_segs) = @_;
|
| 482 |
+
my (@tst_wrds, %tst_ngrams, @match_count, @tst_count, @tst_info);
|
| 483 |
+
my (@ref_wrds, $ref_seg, %ref_ngrams, %ref_ngrams_max, @ref_count, @ref_info);
|
| 484 |
+
my ($ngram);
|
| 485 |
+
my (@nwrds_ref);
|
| 486 |
+
my $shortest_ref_length;
|
| 487 |
+
|
| 488 |
+
for (my $j=1; $j<= $max_Ngram; $j++) {
|
| 489 |
+
$match_count[$j] = $tst_count[$j] = $ref_count[$j] = $tst_info[$j] = $ref_info[$j] = 0;
|
| 490 |
+
}
|
| 491 |
+
|
| 492 |
+
# get the ngram counts for the test segment
|
| 493 |
+
@tst_wrds = split /\s+/, $tst_seg;
|
| 494 |
+
%tst_ngrams = %{Words2Ngrams (@tst_wrds)};
|
| 495 |
+
for (my $j=1; $j<=$max_Ngram; $j++) { # compute ngram counts
|
| 496 |
+
$tst_count[$j] = $j<=@tst_wrds ? (@tst_wrds - $j + 1) : 0;
|
| 497 |
+
}
|
| 498 |
+
|
| 499 |
+
# get the ngram counts for the reference segments
|
| 500 |
+
foreach $ref_seg (@ref_segs) {
|
| 501 |
+
@ref_wrds = split /\s+/, $ref_seg;
|
| 502 |
+
%ref_ngrams = %{Words2Ngrams (@ref_wrds)};
|
| 503 |
+
foreach $ngram (keys %ref_ngrams) { # find the maximum # of occurrences
|
| 504 |
+
my @wrds = split / /, $ngram;
|
| 505 |
+
$ref_info[@wrds] += $ngram_info{$ngram};
|
| 506 |
+
$ref_ngrams_max{$ngram} = defined $ref_ngrams_max{$ngram} ?
|
| 507 |
+
max ($ref_ngrams_max{$ngram}, $ref_ngrams{$ngram}) :
|
| 508 |
+
$ref_ngrams{$ngram};
|
| 509 |
+
}
|
| 510 |
+
for (my $j=1; $j<=$max_Ngram; $j++) { # update ngram counts
|
| 511 |
+
$ref_count[$j] += $j<=@ref_wrds ? (@ref_wrds - $j + 1) : 0;
|
| 512 |
+
}
|
| 513 |
+
$shortest_ref_length = scalar @ref_wrds # find the shortest reference segment
|
| 514 |
+
if (not defined $shortest_ref_length) or @ref_wrds < $shortest_ref_length;
|
| 515 |
+
}
|
| 516 |
+
|
| 517 |
+
# accumulate scoring stats for tst_seg ngrams that match ref_seg ngrams
|
| 518 |
+
foreach $ngram (keys %tst_ngrams) {
|
| 519 |
+
next unless defined $ref_ngrams_max{$ngram};
|
| 520 |
+
my @wrds = split / /, $ngram;
|
| 521 |
+
$tst_info[@wrds] += $ngram_info{$ngram} * min($tst_ngrams{$ngram},$ref_ngrams_max{$ngram});
|
| 522 |
+
$match_count[@wrds] += my $count = min($tst_ngrams{$ngram},$ref_ngrams_max{$ngram});
|
| 523 |
+
printf "%.2f info for each of $count %d-grams = '%s'\n", $ngram_info{$ngram}, scalar @wrds, $ngram
|
| 524 |
+
if $detail >= 3;
|
| 525 |
+
}
|
| 526 |
+
|
| 527 |
+
return ($shortest_ref_length, [@match_count], [@tst_count], [@ref_count], [@tst_info], [@ref_info]);
|
| 528 |
+
}
|
| 529 |
+
|
| 530 |
+
#################################
|
| 531 |
+
|
| 532 |
+
sub bleu_score {
|
| 533 |
+
|
| 534 |
+
my ($shortest_ref_length, $matching_ngrams, $tst_ngrams, $sys, %SCOREmt) = @_;
|
| 535 |
+
|
| 536 |
+
my $score = 0;
|
| 537 |
+
my $iscore = 0;
|
| 538 |
+
my $len_score = min (0, 1-$shortest_ref_length/$tst_ngrams->[1]);
|
| 539 |
+
|
| 540 |
+
for (my $j=1; $j<=$max_Ngram; $j++) {
|
| 541 |
+
if ($matching_ngrams->[$j] == 0) {
|
| 542 |
+
$SCOREmt{$j}{$sys}{cum}=0;
|
| 543 |
+
} else {
|
| 544 |
+
# Cumulative N-Gram score
|
| 545 |
+
$score += log ($matching_ngrams->[$j]/$tst_ngrams->[$j]);
|
| 546 |
+
$SCOREmt{$j}{$sys}{cum} = exp($score/$j + $len_score);
|
| 547 |
+
# Individual N-Gram score
|
| 548 |
+
$iscore = log ($matching_ngrams->[$j]/$tst_ngrams->[$j]);
|
| 549 |
+
$SCOREmt{$j}{$sys}{ind} = exp($iscore);
|
| 550 |
+
}
|
| 551 |
+
}
|
| 552 |
+
return $SCOREmt{4}{$sys}{cum};
|
| 553 |
+
}
|
| 554 |
+
|
| 555 |
+
#################################
|
| 556 |
+
|
| 557 |
+
sub nist_score {
|
| 558 |
+
|
| 559 |
+
my ($nsys, $matching_ngrams, $tst_ngrams, $ref_ngrams, $tst_info, $ref_info, $sys, %SCOREmt) = @_;
|
| 560 |
+
|
| 561 |
+
my $score = 0;
|
| 562 |
+
my $iscore = 0;
|
| 563 |
+
|
| 564 |
+
|
| 565 |
+
for (my $n=1; $n<=$max_Ngram; $n++) {
|
| 566 |
+
$score += $tst_info->[$n]/max($tst_ngrams->[$n],1);
|
| 567 |
+
$SCOREmt{$n}{$sys}{cum} = $score * nist_length_penalty($tst_ngrams->[1]/($ref_ngrams->[1]/$nsys));
|
| 568 |
+
|
| 569 |
+
$iscore = $tst_info->[$n]/max($tst_ngrams->[$n],1);
|
| 570 |
+
$SCOREmt{$n}{$sys}{ind} = $iscore * nist_length_penalty($tst_ngrams->[1]/($ref_ngrams->[1]/$nsys));
|
| 571 |
+
}
|
| 572 |
+
return $SCOREmt{5}{$sys}{cum};
|
| 573 |
+
}
|
| 574 |
+
|
| 575 |
+
#################################
|
| 576 |
+
|
| 577 |
+
sub Words2Ngrams { #convert a string of words to an Ngram count hash
|
| 578 |
+
|
| 579 |
+
my %count = ();
|
| 580 |
+
|
| 581 |
+
for (; @_; shift) {
|
| 582 |
+
my ($j, $ngram, $word);
|
| 583 |
+
for ($j=0; $j<$max_Ngram and defined($word=$_[$j]); $j++) {
|
| 584 |
+
$ngram .= defined $ngram ? " $word" : $word;
|
| 585 |
+
$count{$ngram}++;
|
| 586 |
+
}
|
| 587 |
+
}
|
| 588 |
+
return {%count};
|
| 589 |
+
}
|
| 590 |
+
|
| 591 |
+
#################################
|
| 592 |
+
|
| 593 |
+
sub NormalizeText {
|
| 594 |
+
my ($norm_text) = @_;
|
| 595 |
+
|
| 596 |
+
# language-independent part:
|
| 597 |
+
$norm_text =~ s/<skipped>//g; # strip "skipped" tags
|
| 598 |
+
$norm_text =~ s/-\n//g; # strip end-of-line hyphenation and join lines
|
| 599 |
+
$norm_text =~ s/\n/ /g; # join lines
|
| 600 |
+
$norm_text =~ s/"/"/g; # convert SGML tag for quote to "
|
| 601 |
+
$norm_text =~ s/&/&/g; # convert SGML tag for ampersand to &
|
| 602 |
+
$norm_text =~ s/</</g; # convert SGML tag for less-than to >
|
| 603 |
+
$norm_text =~ s/>/>/g; # convert SGML tag for greater-than to <
|
| 604 |
+
|
| 605 |
+
# language-dependent part (assuming Western languages):
|
| 606 |
+
$norm_text = " $norm_text ";
|
| 607 |
+
$norm_text =~ tr/[A-Z]/[a-z]/ unless $preserve_case;
|
| 608 |
+
$norm_text =~ s/([\{-\~\[-\` -\&\(-\+\:-\@\/])/ $1 /g; # tokenize punctuation
|
| 609 |
+
$norm_text =~ s/([^0-9])([\.,])/$1 $2 /g; # tokenize period and comma unless preceded by a digit
|
| 610 |
+
$norm_text =~ s/([\.,])([^0-9])/ $1 $2/g; # tokenize period and comma unless followed by a digit
|
| 611 |
+
$norm_text =~ s/([0-9])(-)/$1 $2 /g; # tokenize dash when preceded by a digit
|
| 612 |
+
$norm_text =~ s/\s+/ /g; # one space only between words
|
| 613 |
+
$norm_text =~ s/^\s+//; # no leading space
|
| 614 |
+
$norm_text =~ s/\s+$//; # no trailing space
|
| 615 |
+
|
| 616 |
+
return $norm_text;
|
| 617 |
+
}
|
| 618 |
+
|
| 619 |
+
#################################
|
| 620 |
+
|
| 621 |
+
sub nist_length_penalty {
|
| 622 |
+
|
| 623 |
+
my ($ratio) = @_;
|
| 624 |
+
return 1 if $ratio >= 1;
|
| 625 |
+
return 0 if $ratio <= 0;
|
| 626 |
+
my $ratio_x = 1.5;
|
| 627 |
+
my $score_x = 0.5;
|
| 628 |
+
my $beta = -log($score_x)/log($ratio_x)/log($ratio_x);
|
| 629 |
+
return exp (-$beta*log($ratio)*log($ratio));
|
| 630 |
+
}
|
| 631 |
+
|
| 632 |
+
#################################
|
| 633 |
+
|
| 634 |
+
sub date_time_stamp {
|
| 635 |
+
|
| 636 |
+
my ($sec, $min, $hour, $mday, $mon, $year, $wday, $yday, $isdst) = localtime();
|
| 637 |
+
my @months = qw(Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec);
|
| 638 |
+
my ($date, $time);
|
| 639 |
+
|
| 640 |
+
$time = sprintf "%2.2d:%2.2d:%2.2d", $hour, $min, $sec;
|
| 641 |
+
$date = sprintf "%4.4s %3.3s %s", 1900+$year, $months[$mon], $mday;
|
| 642 |
+
return ($date, $time);
|
| 643 |
+
}
|
| 644 |
+
|
| 645 |
+
#################################
|
| 646 |
+
|
| 647 |
+
sub extract_sgml_tag_and_span {
|
| 648 |
+
|
| 649 |
+
my ($name, $data) = @_;
|
| 650 |
+
|
| 651 |
+
($data =~ m|<$name\s*([^>]*)>(.*?)</$name\s*>(.*)|si) ? ($1, $2, $3) : ();
|
| 652 |
+
}
|
| 653 |
+
|
| 654 |
+
#################################
|
| 655 |
+
|
| 656 |
+
sub extract_sgml_tag_attribute {
|
| 657 |
+
|
| 658 |
+
my ($name, $data) = @_;
|
| 659 |
+
|
| 660 |
+
($data =~ m|$name\s*=\s*\"([^\"]*)\"|si) ? ($1) : ();
|
| 661 |
+
}
|
| 662 |
+
|
| 663 |
+
#################################
|
| 664 |
+
|
| 665 |
+
sub max {
|
| 666 |
+
|
| 667 |
+
my ($max, $next);
|
| 668 |
+
|
| 669 |
+
return unless defined ($max=pop);
|
| 670 |
+
while (defined ($next=pop)) {
|
| 671 |
+
$max = $next if $next > $max;
|
| 672 |
+
}
|
| 673 |
+
return $max;
|
| 674 |
+
}
|
| 675 |
+
|
| 676 |
+
#################################
|
| 677 |
+
|
| 678 |
+
sub min {
|
| 679 |
+
|
| 680 |
+
my ($min, $next);
|
| 681 |
+
|
| 682 |
+
return unless defined ($min=pop);
|
| 683 |
+
while (defined ($next=pop)) {
|
| 684 |
+
$min = $next if $next < $min;
|
| 685 |
+
}
|
| 686 |
+
return $min;
|
| 687 |
+
}
|
| 688 |
+
|
| 689 |
+
#################################
|
| 690 |
+
|
| 691 |
+
sub printout_report
|
| 692 |
+
{
|
| 693 |
+
|
| 694 |
+
if ( $METHOD eq "BOTH" ) {
|
| 695 |
+
foreach my $sys (sort @tst_sys) {
|
| 696 |
+
printf "NIST score = %2.4f BLEU score = %.4f for system \"$sys\"\n",$NISTmt{5}{$sys}{cum},$BLEUmt{4}{$sys}{cum};
|
| 697 |
+
}
|
| 698 |
+
} elsif ($METHOD eq "NIST" ) {
|
| 699 |
+
foreach my $sys (sort @tst_sys) {
|
| 700 |
+
printf "NIST score = %2.4f for system \"$sys\"\n",$NISTmt{5}{$sys}{cum};
|
| 701 |
+
}
|
| 702 |
+
} elsif ($METHOD eq "BLEU" ) {
|
| 703 |
+
foreach my $sys (sort @tst_sys) {
|
| 704 |
+
printf "\nBLEU score = %.4f for system \"$sys\"\n",$BLEUmt{4}{$sys}{cum};
|
| 705 |
+
}
|
| 706 |
+
}
|
| 707 |
+
|
| 708 |
+
|
| 709 |
+
printf "\n# ------------------------------------------------------------------------\n\n";
|
| 710 |
+
printf "Individual N-gram scoring\n";
|
| 711 |
+
printf " 1-gram 2-gram 3-gram 4-gram 5-gram 6-gram 7-gram 8-gram 9-gram\n";
|
| 712 |
+
printf " ------ ------ ------ ------ ------ ------ ------ ------ ------\n";
|
| 713 |
+
|
| 714 |
+
if (( $METHOD eq "BOTH" ) || ($METHOD eq "NIST")) {
|
| 715 |
+
foreach my $sys (sort @tst_sys) {
|
| 716 |
+
printf " NIST:";
|
| 717 |
+
for (my $i=1; $i<=$max_Ngram; $i++) {
|
| 718 |
+
printf " %2.4f ",$NISTmt{$i}{$sys}{ind}
|
| 719 |
+
}
|
| 720 |
+
printf " \"$sys\"\n";
|
| 721 |
+
}
|
| 722 |
+
printf "\n";
|
| 723 |
+
}
|
| 724 |
+
|
| 725 |
+
if (( $METHOD eq "BOTH" ) || ($METHOD eq "BLEU")) {
|
| 726 |
+
foreach my $sys (sort @tst_sys) {
|
| 727 |
+
printf " BLEU:";
|
| 728 |
+
for (my $i=1; $i<=$max_Ngram; $i++) {
|
| 729 |
+
printf " %2.4f ",$BLEUmt{$i}{$sys}{ind}
|
| 730 |
+
}
|
| 731 |
+
printf " \"$sys\"\n";
|
| 732 |
+
}
|
| 733 |
+
}
|
| 734 |
+
|
| 735 |
+
printf "\n# ------------------------------------------------------------------------\n";
|
| 736 |
+
printf "Cumulative N-gram scoring\n";
|
| 737 |
+
printf " 1-gram 2-gram 3-gram 4-gram 5-gram 6-gram 7-gram 8-gram 9-gram\n";
|
| 738 |
+
printf " ------ ------ ------ ------ ------ ------ ------ ------ ------\n";
|
| 739 |
+
|
| 740 |
+
if (( $METHOD eq "BOTH" ) || ($METHOD eq "NIST")) {
|
| 741 |
+
foreach my $sys (sort @tst_sys) {
|
| 742 |
+
printf " NIST:";
|
| 743 |
+
for (my $i=1; $i<=$max_Ngram; $i++) {
|
| 744 |
+
printf " %2.4f ",$NISTmt{$i}{$sys}{cum}
|
| 745 |
+
}
|
| 746 |
+
printf " \"$sys\"\n";
|
| 747 |
+
}
|
| 748 |
+
}
|
| 749 |
+
printf "\n";
|
| 750 |
+
|
| 751 |
+
|
| 752 |
+
if (( $METHOD eq "BOTH" ) || ($METHOD eq "BLEU")) {
|
| 753 |
+
foreach my $sys (sort @tst_sys) {
|
| 754 |
+
printf " BLEU:";
|
| 755 |
+
for (my $i=1; $i<=$max_Ngram; $i++) {
|
| 756 |
+
printf " %2.4f ",$BLEUmt{$i}{$sys}{cum}
|
| 757 |
+
}
|
| 758 |
+
printf " \"$sys\"\n";
|
| 759 |
+
}
|
| 760 |
+
}
|
| 761 |
+
}
|
mosesdecoder/scripts/generic/mteval-v12.pl
ADDED
|
@@ -0,0 +1,784 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env perl
|
| 2 |
+
|
| 3 |
+
use warnings;
|
| 4 |
+
use strict;
|
| 5 |
+
use utf8;
|
| 6 |
+
use Encode;
|
| 7 |
+
|
| 8 |
+
binmode STDOUT, ":utf8";
|
| 9 |
+
binmode STDERR, ":utf8";
|
| 10 |
+
|
| 11 |
+
#################################
|
| 12 |
+
# History:
|
| 13 |
+
#
|
| 14 |
+
# version 12
|
| 15 |
+
# * Text normalization changes:
|
| 16 |
+
# * convert entity references (only the entities declared in the DTD)
|
| 17 |
+
# * now uses unicode categories
|
| 18 |
+
# * tokenize punctuation unless followed AND preceded by digits
|
| 19 |
+
# * tokenize symbols
|
| 20 |
+
# * UTF-8 handling:
|
| 21 |
+
# * files are now read using utf8 mode
|
| 22 |
+
# * Added the '-e' command-line option to enclose non-ASCII characters between spaces
|
| 23 |
+
#
|
| 24 |
+
# version 11b -- text normalization modified:
|
| 25 |
+
# * take out the join digit line because it joins digits
|
| 26 |
+
# when it shouldn't have
|
| 27 |
+
# $norm_text =~ s/(\d)\s+(?=\d)/$1/g; #join digits
|
| 28 |
+
#
|
| 29 |
+
# version 11a -- corrected output of individual n-gram precision values
|
| 30 |
+
#
|
| 31 |
+
# version 11 -- bug fixes:
|
| 32 |
+
# * make filehandle operate in binary mode to prevent Perl from operating
|
| 33 |
+
# (by default in Red Hat 9) in UTF-8
|
| 34 |
+
# * fix failure on joining digits
|
| 35 |
+
# version 10 -- updated output to include more details of n-gram scoring.
|
| 36 |
+
# Defaults to generate both NIST and BLEU scores. Use -b for BLEU
|
| 37 |
+
# only, use -n for NIST only
|
| 38 |
+
#
|
| 39 |
+
# version 09d -- bug fix (for BLEU scoring, ngrams were fixed at 4
|
| 40 |
+
# being the max, regardless what was entered on the command line.)
|
| 41 |
+
#
|
| 42 |
+
# version 09c -- bug fix (During the calculation of ngram information,
|
| 43 |
+
# each ngram was being counted only once for each segment. This has
|
| 44 |
+
# been fixed so that each ngram is counted correctly in each segment.)
|
| 45 |
+
#
|
| 46 |
+
# version 09b -- text normalization modified:
|
| 47 |
+
# * option flag added to preserve upper case
|
| 48 |
+
# * non-ASCII characters left in place.
|
| 49 |
+
#
|
| 50 |
+
# version 09a -- text normalization modified:
|
| 51 |
+
# * " and & converted to "" and &, respectively
|
| 52 |
+
# * non-ASCII characters kept together (bug fix)
|
| 53 |
+
#
|
| 54 |
+
# version 09 -- modified to accommodate sgml tag and attribute
|
| 55 |
+
# names revised to conform to default SGML conventions.
|
| 56 |
+
#
|
| 57 |
+
# version 08 -- modifies the NIST metric in accordance with the
|
| 58 |
+
# findings on the 2001 Chinese-English dry run corpus. Also
|
| 59 |
+
# incorporates the BLEU metric as an option and supports the
|
| 60 |
+
# output of ngram detail.
|
| 61 |
+
#
|
| 62 |
+
# version 07 -- in response to the MT meeting on 28 Jan 2002 at ISI
|
| 63 |
+
# Keep strings of non-ASCII characters together as one word
|
| 64 |
+
# (rather than splitting them into one-character words).
|
| 65 |
+
# Change length penalty so that translations that are longer than
|
| 66 |
+
# the average reference translation are not penalized.
|
| 67 |
+
#
|
| 68 |
+
# version 06
|
| 69 |
+
# Prevent divide-by-zero when a segment has no evaluation N-grams.
|
| 70 |
+
# Correct segment index for level 3 debug output.
|
| 71 |
+
#
|
| 72 |
+
# version 05
|
| 73 |
+
# improve diagnostic error messages
|
| 74 |
+
#
|
| 75 |
+
# version 04
|
| 76 |
+
# tag segments
|
| 77 |
+
#
|
| 78 |
+
# version 03
|
| 79 |
+
# add detailed output option (intermediate document and segment scores)
|
| 80 |
+
#
|
| 81 |
+
# version 02
|
| 82 |
+
# accommodation of modified sgml tags and attributes
|
| 83 |
+
#
|
| 84 |
+
# version 01
|
| 85 |
+
# same as bleu version 15, but modified to provide formal score output.
|
| 86 |
+
#
|
| 87 |
+
# original IBM version
|
| 88 |
+
# Author: Kishore Papineni
|
| 89 |
+
# Date: 06/10/2001
|
| 90 |
+
#################################
|
| 91 |
+
|
| 92 |
+
######
|
| 93 |
+
# Intro
|
| 94 |
+
my ($date, $time) = date_time_stamp();
|
| 95 |
+
print "MT evaluation scorer began on $date at $time\n";
|
| 96 |
+
print "command line: ", $0, " ", join(" ", @ARGV), "\n";
|
| 97 |
+
my $usage = "\n\nUsage: $0 [-h] -r <ref_file> -s <src_file> -t <tst_file>\n\n".
|
| 98 |
+
"Description: This Perl script evaluates MT system performance.\n".
|
| 99 |
+
"\n".
|
| 100 |
+
"Required arguments:\n".
|
| 101 |
+
" -r <ref_file> is a file containing the reference translations for\n".
|
| 102 |
+
" the documents to be evaluated.\n".
|
| 103 |
+
" -s <src_file> is a file containing the source documents for which\n".
|
| 104 |
+
" translations are to be evaluated\n".
|
| 105 |
+
" -t <tst_file> is a file containing the translations to be evaluated\n".
|
| 106 |
+
"\n".
|
| 107 |
+
"Optional arguments:\n".
|
| 108 |
+
" -c preserves upper-case alphabetic characters\n".
|
| 109 |
+
" -b generate BLEU scores only\n".
|
| 110 |
+
" -n generate NIST scores only\n".
|
| 111 |
+
" -d detailed output flag used in conjunction with \"-b\" or \"-n\" flags:\n".
|
| 112 |
+
" 0 (default) for system-level score only\n".
|
| 113 |
+
" 1 to include document-level scores\n".
|
| 114 |
+
" 2 to include segment-level scores\n".
|
| 115 |
+
" 3 to include ngram-level scores\n".
|
| 116 |
+
" -e enclose non-ASCII characters between spaces\n".
|
| 117 |
+
" -h prints this help message to STDOUT\n".
|
| 118 |
+
"\n";
|
| 119 |
+
|
| 120 |
+
use vars qw ($opt_r $opt_s $opt_t $opt_d $opt_h $opt_b $opt_n $opt_c $opt_x $opt_e);
|
| 121 |
+
use Getopt::Std;
|
| 122 |
+
getopts ('r:s:t:d:hbncx:e');
|
| 123 |
+
die $usage if defined($opt_h);
|
| 124 |
+
die "Error in command line: ref_file not defined$usage" unless defined $opt_r;
|
| 125 |
+
die "Error in command line: src_file not defined$usage" unless defined $opt_s;
|
| 126 |
+
die "Error in command line: tst_file not defined$usage" unless defined $opt_t;
|
| 127 |
+
my $max_Ngram = 9;
|
| 128 |
+
my $detail = defined $opt_d ? $opt_d : 0;
|
| 129 |
+
my $preserve_case = defined $opt_c ? 1 : 0;
|
| 130 |
+
my $split_non_ASCII = defined $opt_e ? 1 : 0;
|
| 131 |
+
|
| 132 |
+
my $METHOD = "BOTH";
|
| 133 |
+
if (defined $opt_b) { $METHOD = "BLEU"; }
|
| 134 |
+
if (defined $opt_n) { $METHOD = "NIST"; }
|
| 135 |
+
my $method;
|
| 136 |
+
|
| 137 |
+
my ($ref_file) = $opt_r;
|
| 138 |
+
my ($src_file) = $opt_s;
|
| 139 |
+
my ($tst_file) = $opt_t;
|
| 140 |
+
|
| 141 |
+
######
|
| 142 |
+
# Global variables
|
| 143 |
+
my ($src_lang, $tgt_lang, @tst_sys, @ref_sys); # evaluation parameters
|
| 144 |
+
my (%tst_data, %ref_data); # the data -- with structure: {system}{document}[segments]
|
| 145 |
+
my ($src_id, $ref_id, $tst_id); # unique identifiers for ref and tst translation sets
|
| 146 |
+
my %eval_docs; # document information for the evaluation data set
|
| 147 |
+
my %ngram_info; # the information obtained from (the last word in) the ngram
|
| 148 |
+
|
| 149 |
+
######
|
| 150 |
+
# Get source document ID's
|
| 151 |
+
($src_id) = get_source_info ($src_file);
|
| 152 |
+
|
| 153 |
+
######
|
| 154 |
+
# Get reference translations
|
| 155 |
+
($ref_id) = get_MT_data (\%ref_data, "RefSet", $ref_file);
|
| 156 |
+
|
| 157 |
+
compute_ngram_info ();
|
| 158 |
+
|
| 159 |
+
######
|
| 160 |
+
# Get translations to evaluate
|
| 161 |
+
($tst_id) = get_MT_data (\%tst_data, "TstSet", $tst_file);
|
| 162 |
+
|
| 163 |
+
######
|
| 164 |
+
# Check data for completeness and correctness
|
| 165 |
+
check_MT_data ();
|
| 166 |
+
|
| 167 |
+
######
|
| 168 |
+
#
|
| 169 |
+
my %NISTmt = ();
|
| 170 |
+
my %BLEUmt = ();
|
| 171 |
+
|
| 172 |
+
######
|
| 173 |
+
# Evaluate
|
| 174 |
+
print " Evaluation of $src_lang-to-$tgt_lang translation using:\n";
|
| 175 |
+
my $cum_seg = 0;
|
| 176 |
+
foreach my $doc (sort keys %eval_docs) {
|
| 177 |
+
$cum_seg += @{$eval_docs{$doc}{SEGS}};
|
| 178 |
+
}
|
| 179 |
+
print " src set \"$src_id\" (", scalar keys %eval_docs, " docs, $cum_seg segs)\n";
|
| 180 |
+
print " ref set \"$ref_id\" (", scalar keys %ref_data, " refs)\n";
|
| 181 |
+
print " tst set \"$tst_id\" (", scalar keys %tst_data, " systems)\n\n";
|
| 182 |
+
|
| 183 |
+
foreach my $sys (sort @tst_sys) {
|
| 184 |
+
for (my $n=1; $n<=$max_Ngram; $n++) {
|
| 185 |
+
$NISTmt{$n}{$sys}{cum} = 0;
|
| 186 |
+
$NISTmt{$n}{$sys}{ind} = 0;
|
| 187 |
+
$BLEUmt{$n}{$sys}{cum} = 0;
|
| 188 |
+
$BLEUmt{$n}{$sys}{ind} = 0;
|
| 189 |
+
}
|
| 190 |
+
|
| 191 |
+
if (($METHOD eq "BOTH") || ($METHOD eq "NIST")) {
|
| 192 |
+
$method="NIST";
|
| 193 |
+
score_system ($sys, %NISTmt);
|
| 194 |
+
}
|
| 195 |
+
if (($METHOD eq "BOTH") || ($METHOD eq "BLEU")) {
|
| 196 |
+
$method="BLEU";
|
| 197 |
+
score_system ($sys, %BLEUmt);
|
| 198 |
+
}
|
| 199 |
+
}
|
| 200 |
+
|
| 201 |
+
######
|
| 202 |
+
printout_report ();
|
| 203 |
+
|
| 204 |
+
($date, $time) = date_time_stamp();
|
| 205 |
+
print "MT evaluation scorer ended on $date at $time\n";
|
| 206 |
+
|
| 207 |
+
exit 0;
|
| 208 |
+
|
| 209 |
+
#################################
|
| 210 |
+
|
| 211 |
+
sub get_source_info {
|
| 212 |
+
|
| 213 |
+
my ($file) = @_;
|
| 214 |
+
my ($name, $id, $src, $doc);
|
| 215 |
+
my ($data, $tag, $span);
|
| 216 |
+
|
| 217 |
+
|
| 218 |
+
#read data from file
|
| 219 |
+
open (FILE, $file) or die "\nUnable to open translation data file '$file'", $usage;
|
| 220 |
+
binmode FILE, ":utf8";
|
| 221 |
+
$data .= $_ while <FILE>;
|
| 222 |
+
close (FILE);
|
| 223 |
+
|
| 224 |
+
#get source set info
|
| 225 |
+
die "\n\nFATAL INPUT ERROR: no 'src_set' tag in src_file '$file'\n\n"
|
| 226 |
+
unless ($tag, $span, $data) = extract_sgml_tag_and_span ("SrcSet", $data);
|
| 227 |
+
|
| 228 |
+
die "\n\nFATAL INPUT ERROR: no tag attribute '$name' in file '$file'\n\n"
|
| 229 |
+
unless ($id) = extract_sgml_tag_attribute ($name="SetID", $tag);
|
| 230 |
+
|
| 231 |
+
die "\n\nFATAL INPUT ERROR: no tag attribute '$name' in file '$file'\n\n"
|
| 232 |
+
unless ($src) = extract_sgml_tag_attribute ($name="SrcLang", $tag);
|
| 233 |
+
die "\n\nFATAL INPUT ERROR: $name ('$src') in file '$file' inconsistent\n"
|
| 234 |
+
." with $name in previous input data ('$src_lang')\n\n"
|
| 235 |
+
unless (not defined $src_lang or $src eq $src_lang);
|
| 236 |
+
$src_lang = $src;
|
| 237 |
+
|
| 238 |
+
#get doc info -- ID and # of segs
|
| 239 |
+
$data = $span;
|
| 240 |
+
while (($tag, $span, $data) = extract_sgml_tag_and_span ("Doc", $data)) {
|
| 241 |
+
die "\n\nFATAL INPUT ERROR: no tag attribute '$name' in file '$file'\n\n"
|
| 242 |
+
unless ($doc) = extract_sgml_tag_attribute ($name="DocID", $tag);
|
| 243 |
+
die "\n\nFATAL INPUT ERROR: duplicate '$name' in file '$file'\n\n"
|
| 244 |
+
if defined $eval_docs{$doc};
|
| 245 |
+
$span =~ s/[\s\n\r]+/ /g; # concatenate records
|
| 246 |
+
my $jseg=0, my $seg_data = $span;
|
| 247 |
+
while (($tag, $span, $seg_data) = extract_sgml_tag_and_span ("Seg", $seg_data)) {
|
| 248 |
+
($eval_docs{$doc}{SEGS}[$jseg++]) = NormalizeText ($span);
|
| 249 |
+
}
|
| 250 |
+
die "\n\nFATAL INPUT ERROR: no segments in document '$doc' in file '$file'\n\n"
|
| 251 |
+
if $jseg == 0;
|
| 252 |
+
}
|
| 253 |
+
die "\n\nFATAL INPUT ERROR: no documents in file '$file'\n\n"
|
| 254 |
+
unless keys %eval_docs > 0;
|
| 255 |
+
return $id;
|
| 256 |
+
}
|
| 257 |
+
|
| 258 |
+
#################################
|
| 259 |
+
|
| 260 |
+
sub get_MT_data {
|
| 261 |
+
|
| 262 |
+
my ($docs, $set_tag, $file) = @_;
|
| 263 |
+
my ($name, $id, $src, $tgt, $sys, $doc);
|
| 264 |
+
my ($tag, $span, $data);
|
| 265 |
+
|
| 266 |
+
#read data from file
|
| 267 |
+
open (FILE, $file) or die "\nUnable to open translation data file '$file'", $usage;
|
| 268 |
+
binmode FILE, ":utf8";
|
| 269 |
+
$data .= $_ while <FILE>;
|
| 270 |
+
close (FILE);
|
| 271 |
+
|
| 272 |
+
#get tag info
|
| 273 |
+
while (($tag, $span, $data) = extract_sgml_tag_and_span ($set_tag, $data)) {
|
| 274 |
+
die "\n\nFATAL INPUT ERROR: no tag attribute '$name' in file '$file'\n\n" unless
|
| 275 |
+
($id) = extract_sgml_tag_attribute ($name="SetID", $tag);
|
| 276 |
+
|
| 277 |
+
die "\n\nFATAL INPUT ERROR: no tag attribute '$name' in file '$file'\n\n" unless
|
| 278 |
+
($src) = extract_sgml_tag_attribute ($name="SrcLang", $tag);
|
| 279 |
+
die "\n\nFATAL INPUT ERROR: $name ('$src') in file '$file' inconsistent\n"
|
| 280 |
+
." with $name of source ('$src_lang')\n\n"
|
| 281 |
+
unless $src eq $src_lang;
|
| 282 |
+
|
| 283 |
+
die "\n\nFATAL INPUT ERROR: no tag attribute '$name' in file '$file'\n\n" unless
|
| 284 |
+
($tgt) = extract_sgml_tag_attribute ($name="TrgLang", $tag);
|
| 285 |
+
die "\n\nFATAL INPUT ERROR: $name ('$tgt') in file '$file' inconsistent\n"
|
| 286 |
+
." with $name of the evaluation ('$tgt_lang')\n\n"
|
| 287 |
+
unless (not defined $tgt_lang or $tgt eq $tgt_lang);
|
| 288 |
+
$tgt_lang = $tgt;
|
| 289 |
+
|
| 290 |
+
my $mtdata = $span;
|
| 291 |
+
while (($tag, $span, $mtdata) = extract_sgml_tag_and_span ("Doc", $mtdata)) {
|
| 292 |
+
die "\n\nFATAL INPUT ERROR: no tag attribute '$name' in file '$file'\n\n" unless
|
| 293 |
+
(my $sys) = extract_sgml_tag_attribute ($name="SysID", $tag);
|
| 294 |
+
|
| 295 |
+
die "\n\nFATAL INPUT ERROR: no tag attribute '$name' in file '$file'\n\n" unless
|
| 296 |
+
$doc = extract_sgml_tag_attribute ($name="DocID", $tag);
|
| 297 |
+
|
| 298 |
+
die "\n\nFATAL INPUT ERROR: document '$doc' for system '$sys' in file '$file'\n"
|
| 299 |
+
." previously loaded from file '$docs->{$sys}{$doc}{FILE}'\n\n"
|
| 300 |
+
unless (not defined $docs->{$sys}{$doc});
|
| 301 |
+
|
| 302 |
+
$span =~ s/[\s\n\r]+/ /g; # concatenate records
|
| 303 |
+
my $jseg=0, my $seg_data = $span;
|
| 304 |
+
while (($tag, $span, $seg_data) = extract_sgml_tag_and_span ("Seg", $seg_data)) {
|
| 305 |
+
($docs->{$sys}{$doc}{SEGS}[$jseg++]) = NormalizeText ($span);
|
| 306 |
+
}
|
| 307 |
+
die "\n\nFATAL INPUT ERROR: no segments in document '$doc' in file '$file'\n\n"
|
| 308 |
+
if $jseg == 0;
|
| 309 |
+
$docs->{$sys}{$doc}{FILE} = $file;
|
| 310 |
+
}
|
| 311 |
+
}
|
| 312 |
+
return $id;
|
| 313 |
+
}
|
| 314 |
+
|
| 315 |
+
#################################
|
| 316 |
+
|
| 317 |
+
sub check_MT_data {
|
| 318 |
+
|
| 319 |
+
@tst_sys = sort keys %tst_data;
|
| 320 |
+
@ref_sys = sort keys %ref_data;
|
| 321 |
+
|
| 322 |
+
#every evaluation document must be represented for every system and every reference
|
| 323 |
+
foreach my $doc (sort keys %eval_docs) {
|
| 324 |
+
my $nseg_source = @{$eval_docs{$doc}{SEGS}};
|
| 325 |
+
foreach my $sys (@tst_sys) {
|
| 326 |
+
die "\n\nFATAL ERROR: no document '$doc' for system '$sys'\n\n"
|
| 327 |
+
unless defined $tst_data{$sys}{$doc};
|
| 328 |
+
my $nseg = @{$tst_data{$sys}{$doc}{SEGS}};
|
| 329 |
+
die "\n\nFATAL ERROR: translated documents must contain the same # of segments as the source, but\n"
|
| 330 |
+
." document '$doc' for system '$sys' contains $nseg segments, while\n"
|
| 331 |
+
." the source document contains $nseg_source segments.\n\n"
|
| 332 |
+
unless $nseg == $nseg_source;
|
| 333 |
+
}
|
| 334 |
+
|
| 335 |
+
foreach my $sys (@ref_sys) {
|
| 336 |
+
die "\n\nFATAL ERROR: no document '$doc' for reference '$sys'\n\n"
|
| 337 |
+
unless defined $ref_data{$sys}{$doc};
|
| 338 |
+
my $nseg = @{$ref_data{$sys}{$doc}{SEGS}};
|
| 339 |
+
die "\n\nFATAL ERROR: translated documents must contain the same # of segments as the source, but\n"
|
| 340 |
+
." document '$doc' for system '$sys' contains $nseg segments, while\n"
|
| 341 |
+
." the source document contains $nseg_source segments.\n\n"
|
| 342 |
+
unless $nseg == $nseg_source;
|
| 343 |
+
}
|
| 344 |
+
}
|
| 345 |
+
}
|
| 346 |
+
|
| 347 |
+
#################################
|
| 348 |
+
|
| 349 |
+
# Estimate the NIST information weight of every n-gram that occurs in the
# reference translations:
#
#   info(w1..wn) = -log2( count(w1..wn) / count(w1..wn-1) )
#
# (unigrams are normalized by the total reference word count instead of an
# (n-1)-gram count).  Reads the global %ref_data and fills the global
# %ngram_info; with "-x 'ngram info'" it also dumps each weight to stdout.
sub compute_ngram_info {

    my ($ref, $doc, $seg);
    my (@wrds, $tot_wrds, %ngrams, $ngram, $mgram);
    my (%ngram_count);

    # Pass 1: accumulate raw n-gram counts (and the total word count)
    # over every segment of every reference translation.
    foreach $ref (keys %ref_data) {
        foreach $doc (keys %{$ref_data{$ref}}) {
            foreach $seg (@{$ref_data{$ref}{$doc}{SEGS}}) {
                @wrds = split /\s+/, $seg;
                $tot_wrds += @wrds;
                %ngrams = %{Words2Ngrams (@wrds)};
                foreach $ngram (keys %ngrams) {
                    $ngram_count{$ngram} += $ngrams{$ngram};
                }
            }
        }
    }

    # Pass 2: turn counts into information weights.
    foreach $ngram (keys %ngram_count) {
        @wrds = split / /, $ngram;
        pop @wrds;                   # drop the final word ...
        $mgram = join " ", @wrds;    # ... leaving the (n-1)-gram context
        $ngram_info{$ngram} = - log
            ($mgram ? $ngram_count{$ngram}/$ngram_count{$mgram}
             : $ngram_count{$ngram}/$tot_wrds) / log 2;
        if (defined $opt_x and $opt_x eq "ngram info") {
            @wrds = split / /, $ngram;
            printf "ngram info:%9.4f%6d%6d%8d%3d %s\n", $ngram_info{$ngram}, $ngram_count{$ngram},
                $mgram ? $ngram_count{$mgram} : $tot_wrds, $tot_wrds, scalar @wrds, $ngram;
        }
    }
}
|
| 381 |
+
|
| 382 |
+
#################################
|
| 383 |
+
|
| 384 |
+
# Score one system over every document in %eval_docs: sums the per-document
# n-gram match/count/info statistics, optionally prints per-document scores
# (when $detail >= 1), then computes the system-level score with the
# currently selected $method (BLEU or NIST), storing it into %SCOREmt.
#
# Arguments: system id, followed by the flattened score hash (%BLEUmt or
# %NISTmt).  NOTE(review): `($sys, %SCOREmt) = @_` rebuilds the hash from a
# flattened list, so top-level keys are a shallow copy while their inner
# hashrefs are shared with the caller; score updates appear to persist only
# through those shared inner hashrefs - confirm against the caller's setup.
sub score_system {

    my ($sys, $ref, $doc, %SCOREmt);
    ($sys, %SCOREmt) = @_;
    my ($shortest_ref_length, $match_cnt, $tst_cnt, $ref_cnt, $tst_info, $ref_info);
    my ($cum_ref_length, @cum_match, @cum_tst_cnt, @cum_ref_cnt, @cum_tst_info, @cum_ref_info);

    # Zero the per-n accumulators (index 0 is unused; n-grams run 1..$max_Ngram).
    $cum_ref_length = 0;
    for (my $j=1; $j<=$max_Ngram; $j++) {
        $cum_match[$j] = $cum_tst_cnt[$j] = $cum_ref_cnt[$j] = $cum_tst_info[$j] = $cum_ref_info[$j] = 0;
    }

    foreach $doc (sort keys %eval_docs) {
        ($shortest_ref_length, $match_cnt, $tst_cnt, $ref_cnt, $tst_info, $ref_info) = score_document ($sys, $doc);

        #output document summary score
        # %DOCmt is a throwaway hash: the call is made only for its return
        # value (the printed document-level score).
        if (($detail >= 1 ) && ($METHOD eq "NIST")) {
            my %DOCmt = ();
            printf "$method score using 5-grams = %.4f for system \"$sys\" on document \"$doc\" (%d segments, %d words)\n",
                nist_score (scalar @ref_sys, $match_cnt, $tst_cnt, $ref_cnt, $tst_info, $ref_info, $sys, %DOCmt),
                scalar @{$tst_data{$sys}{$doc}{SEGS}}, $tst_cnt->[1];
        }
        if (($detail >= 1 ) && ($METHOD eq "BLEU")) {
            my %DOCmt = ();
            printf "$method score using 4-grams = %.4f for system \"$sys\" on document \"$doc\" (%d segments, %d words)\n",
                bleu_score($shortest_ref_length, $match_cnt, $tst_cnt, $sys, %DOCmt),
                scalar @{$tst_data{$sys}{$doc}{SEGS}}, $tst_cnt->[1];
        }

        # Fold this document's statistics into the system-level totals.
        $cum_ref_length += $shortest_ref_length;
        for (my $j=1; $j<=$max_Ngram; $j++) {
            $cum_match[$j] += $match_cnt->[$j];
            $cum_tst_cnt[$j] += $tst_cnt->[$j];
            $cum_ref_cnt[$j] += $ref_cnt->[$j];
            $cum_tst_info[$j] += $tst_info->[$j];
            $cum_ref_info[$j] += $ref_info->[$j];
            printf "document info: $sys $doc %d-gram %d %d %d %9.4f %9.4f\n", $j, $match_cnt->[$j],
                $tst_cnt->[$j], $ref_cnt->[$j], $tst_info->[$j], $ref_info->[$j]
                if (defined $opt_x and $opt_x eq "document info");
        }
    }

#x    #output system summary score
#x    printf "$method score = %.4f for system \"$sys\"\n",
#x       $method eq "BLEU" ? bleu_score($cum_ref_length, \@cum_match, \@cum_tst_cnt) :
#x       nist_score (scalar @ref_sys, \@cum_match, \@cum_tst_cnt, \@cum_ref_cnt, \@cum_tst_info, \@cum_ref_info, $sys, %SCOREmt);
    # System-level score: called for its side effect of populating %SCOREmt.
    if ($method eq "BLEU") {
        bleu_score($cum_ref_length, \@cum_match, \@cum_tst_cnt, $sys, %SCOREmt);
    }
    if ($method eq "NIST") {
        nist_score (scalar @ref_sys, \@cum_match, \@cum_tst_cnt, \@cum_ref_cnt, \@cum_tst_info, \@cum_ref_info, $sys, %SCOREmt);
    }
}
|
| 437 |
+
|
| 438 |
+
#################################
|
| 439 |
+
|
| 440 |
+
# Score one (system, document) pair by scoring every segment and summing the
# per-n-gram statistics.  Optionally prints segment-level scores (when
# $detail >= 2) and raw segment text (when $detail >= 3).
#
# Returns: (shortest-reference length total, and arrayrefs indexed by n for
# matched counts, test counts, reference counts, test info, reference info).
sub score_document {

    my ($sys, $ref, $doc);
    ($sys, $doc) = @_;
    my ($shortest_ref_length, $match_cnt, $tst_cnt, $ref_cnt, $tst_info, $ref_info);
    my ($cum_ref_length, @cum_match, @cum_tst_cnt, @cum_ref_cnt, @cum_tst_info, @cum_ref_info);

    # Zero the per-n accumulators (index 0 unused).
    $cum_ref_length = 0;
    for (my $j=1; $j<=$max_Ngram; $j++) {
        $cum_match[$j] = $cum_tst_cnt[$j] = $cum_ref_cnt[$j] = $cum_tst_info[$j] = $cum_ref_info[$j] = 0;
    }

    #score each segment
    for (my $jseg=0; $jseg<@{$tst_data{$sys}{$doc}{SEGS}}; $jseg++) {
        # Collect the $jseg-th segment of every reference translation.
        my @ref_segments = ();
        foreach $ref (@ref_sys) {
            push @ref_segments, $ref_data{$ref}{$doc}{SEGS}[$jseg];
            printf "ref '$ref', seg %d: %s\n", $jseg+1, $ref_data{$ref}{$doc}{SEGS}[$jseg]
                if $detail >= 3;
        }
        printf "sys '$sys', seg %d: %s\n", $jseg+1, $tst_data{$sys}{$doc}{SEGS}[$jseg]
            if $detail >= 3;
        ($shortest_ref_length, $match_cnt, $tst_cnt, $ref_cnt, $tst_info, $ref_info) =
            score_segment ($tst_data{$sys}{$doc}{SEGS}[$jseg], @ref_segments);

        #output segment summary score
#x    printf "$method score = %.4f for system \"$sys\" on segment %d of document \"$doc\" (%d words)\n",
#x      $method eq "BLEU" ? bleu_score($shortest_ref_length, $match_cnt, $tst_cnt) :
#x      nist_score (scalar @ref_sys, $match_cnt, $tst_cnt, $ref_cnt, $tst_info, $ref_info),
#x          $jseg+1, $tst_cnt->[1]
#x              if $detail >= 2;
        # %DOCmt is a throwaway hash: the call is made only for its printed
        # return value.
        if (($detail >=2) && ($METHOD eq "BLEU")) {
            my %DOCmt = ();
            printf " $method score using 4-grams = %.4f for system \"$sys\" on segment %d of document \"$doc\" (%d words)\n",
                bleu_score($shortest_ref_length, $match_cnt, $tst_cnt, $sys, %DOCmt), $jseg+1, $tst_cnt->[1];
        }
        if (($detail >=2) && ($METHOD eq "NIST")) {
            my %DOCmt = ();
            printf " $method score using 5-grams = %.4f for system \"$sys\" on segment %d of document \"$doc\" (%d words)\n",
                nist_score (scalar @ref_sys, $match_cnt, $tst_cnt, $ref_cnt, $tst_info, $ref_info, $sys, %DOCmt), $jseg+1, $tst_cnt->[1];
        }


        # Fold this segment's statistics into the document totals.
        $cum_ref_length += $shortest_ref_length;
        for (my $j=1; $j<=$max_Ngram; $j++) {
            $cum_match[$j] += $match_cnt->[$j];
            $cum_tst_cnt[$j] += $tst_cnt->[$j];
            $cum_ref_cnt[$j] += $ref_cnt->[$j];
            $cum_tst_info[$j] += $tst_info->[$j];
            $cum_ref_info[$j] += $ref_info->[$j];
        }
    }
    return ($cum_ref_length, [@cum_match], [@cum_tst_cnt], [@cum_ref_cnt], [@cum_tst_info], [@cum_ref_info]);
}
|
| 494 |
+
|
| 495 |
+
#################################
|
| 496 |
+
|
| 497 |
+
# Score one test segment against its reference segments.
#
# Computes, per n (1..$max_Ngram): the test/reference n-gram totals, the
# clipped matching n-gram count (each test n-gram is credited at most the
# maximum number of times it occurs in any single reference), and the NIST
# information mass of test and reference n-grams (from global %ngram_info).
#
# Returns: (shortest reference length, then arrayrefs indexed by n for
# matched counts, test counts, reference counts, test info, reference info).
sub score_segment {

    my ($tst_seg, @ref_segs) = @_;
    my (@tst_wrds, %tst_ngrams, @match_count, @tst_count, @tst_info);
    my (@ref_wrds, $ref_seg, %ref_ngrams, %ref_ngrams_max, @ref_count, @ref_info);
    my ($ngram);
    my (@nwrds_ref);
    my $shortest_ref_length;

    for (my $j=1; $j<= $max_Ngram; $j++) {
        $match_count[$j] = $tst_count[$j] = $ref_count[$j] = $tst_info[$j] = $ref_info[$j] = 0;
    }

    # get the ngram counts for the test segment
    @tst_wrds = split /\s+/, $tst_seg;
    %tst_ngrams = %{Words2Ngrams (@tst_wrds)};
    for (my $j=1; $j<=$max_Ngram; $j++) { # compute ngram counts
        $tst_count[$j] = $j<=@tst_wrds ? (@tst_wrds - $j + 1) : 0;
    }

    # get the ngram counts for the reference segments
    foreach $ref_seg (@ref_segs) {
        @ref_wrds = split /\s+/, $ref_seg;
        %ref_ngrams = %{Words2Ngrams (@ref_wrds)};
        foreach $ngram (keys %ref_ngrams) { # find the maximum # of occurrences
            # @wrds in scalar context is n, so $ref_info[@wrds] buckets by
            # n-gram order.
            my @wrds = split / /, $ngram;
            $ref_info[@wrds] += $ngram_info{$ngram};
            $ref_ngrams_max{$ngram} = defined $ref_ngrams_max{$ngram} ?
                max ($ref_ngrams_max{$ngram}, $ref_ngrams{$ngram}) :
                $ref_ngrams{$ngram};
        }
        for (my $j=1; $j<=$max_Ngram; $j++) { # update ngram counts
            $ref_count[$j] += $j<=@ref_wrds ? (@ref_wrds - $j + 1) : 0;
        }
        $shortest_ref_length = scalar @ref_wrds # find the shortest reference segment
            if (not defined $shortest_ref_length) or @ref_wrds < $shortest_ref_length;
    }

    # accumulate scoring stats for tst_seg ngrams that match ref_seg ngrams
    foreach $ngram (keys %tst_ngrams) {
        next unless defined $ref_ngrams_max{$ngram};
        my @wrds = split / /, $ngram;
        # Clip the credited count to the per-reference maximum.
        $tst_info[@wrds] += $ngram_info{$ngram} * min($tst_ngrams{$ngram},$ref_ngrams_max{$ngram});
        $match_count[@wrds] += my $count = min($tst_ngrams{$ngram},$ref_ngrams_max{$ngram});
        printf "%.2f info for each of $count %d-grams = '%s'\n", $ngram_info{$ngram}, scalar @wrds, $ngram
            if $detail >= 3;
    }

    return ($shortest_ref_length, [@match_count], [@tst_count], [@ref_count], [@tst_info], [@ref_info]);
}
|
| 547 |
+
|
| 548 |
+
#################################
|
| 549 |
+
|
| 550 |
+
# Compute the BLEU score from accumulated n-gram statistics.
#
#   $shortest_ref_length - shortest reference length (brevity penalty base)
#   $matching_ngrams     - arrayref, index n: clipped matching n-gram counts
#   $tst_ngrams          - arrayref, index n: candidate n-gram totals
#   $sys, %SCOREmt       - system id and (flattened) score hash to fill
#
# Fills $SCOREmt{$n}{$sys}{cum} (geometric-mean cumulative score with the
# brevity penalty applied) and {ind} (single-order precision) for each n,
# and returns the cumulative 4-gram score.
#
# NOTE(review): divides by $tst_ngrams->[1]; an empty test segment would
# cause a division by zero - the v13a changelog states this was fixed there.
# NOTE(review): %SCOREmt is rebuilt from a flattened list, so writes reach
# the caller only via shared inner hashrefs - confirm against the callers.
sub bleu_score {

    my ($shortest_ref_length, $matching_ngrams, $tst_ngrams, $sys, %SCOREmt) = @_;

    my $score = 0;
    my $iscore = 0;
    # Log-domain brevity penalty: 0 when the candidate is at least as long
    # as the shortest reference, negative when it is shorter.
    my $len_score = min (0, 1-$shortest_ref_length/$tst_ngrams->[1]);
    print "length ratio: ".($tst_ngrams->[1]/$shortest_ref_length)." ($tst_ngrams->[1]/$shortest_ref_length), penalty (log): $len_score\n";

    for (my $j=1; $j<=$max_Ngram; $j++) {
        if ($matching_ngrams->[$j] == 0) {
            $SCOREmt{$j}{$sys}{cum}=0;
        } else {
            # Cumulative N-Gram score
            $score += log ($matching_ngrams->[$j]/$tst_ngrams->[$j]);
            $SCOREmt{$j}{$sys}{cum} = exp($score/$j + $len_score);
            # Individual N-Gram score
            $iscore = log ($matching_ngrams->[$j]/$tst_ngrams->[$j]);
            $SCOREmt{$j}{$sys}{ind} = exp($iscore);
        }
    }
    return $SCOREmt{4}{$sys}{cum};
}
|
| 573 |
+
|
| 574 |
+
#################################
|
| 575 |
+
|
| 576 |
+
# Compute the NIST score from accumulated n-gram statistics.
#
#   $nsys            - number of reference systems (used to average the
#                      reference length for the length penalty)
#   $matching_ngrams - arrayref, index n: clipped matching n-gram counts
#   $tst_ngrams      - arrayref, index n: candidate n-gram totals
#   $ref_ngrams      - arrayref, index n: reference n-gram totals
#   $tst_info        - arrayref, index n: information mass of matched n-grams
#   $ref_info        - arrayref, index n: information mass of reference n-grams
#   $sys, %SCOREmt   - system id and (flattened) score hash to fill
#
# Fills $SCOREmt{$n}{$sys}{cum|ind} for each n and returns the cumulative
# 5-gram score.  max(...,1) guards the division when a segment has no
# n-grams of order n.
sub nist_score {

    my ($nsys, $matching_ngrams, $tst_ngrams, $ref_ngrams, $tst_info, $ref_info, $sys, %SCOREmt) = @_;

    my $score = 0;
    my $iscore = 0;


    for (my $n=1; $n<=$max_Ngram; $n++) {
        # Cumulative: running sum of per-order info precision, scaled by the
        # NIST length penalty (candidate length over average reference length).
        $score += $tst_info->[$n]/max($tst_ngrams->[$n],1);
        $SCOREmt{$n}{$sys}{cum} = $score * nist_length_penalty($tst_ngrams->[1]/($ref_ngrams->[1]/$nsys));

        # Individual: this order's info precision alone, same penalty.
        $iscore = $tst_info->[$n]/max($tst_ngrams->[$n],1);
        $SCOREmt{$n}{$sys}{ind} = $iscore * nist_length_penalty($tst_ngrams->[1]/($ref_ngrams->[1]/$nsys));
    }
    return $SCOREmt{5}{$sys}{cum};
}
|
| 593 |
+
|
| 594 |
+
#################################
|
| 595 |
+
|
| 596 |
+
sub Words2Ngrams { #convert a string of words to an Ngram count hash

    my %tally = ();

    # For every starting position, grow a window one word at a time (up to
    # $max_Ngram words) and count each prefix as one n-gram occurrence.
    while (@_) {
        my @window;
        for my $idx (0 .. $max_Ngram - 1) {
            last unless defined $_[$idx];
            push @window, $_[$idx];
            $tally{join(" ", @window)}++;
        }
        shift;
    }
    return {%tally};
}
|
| 609 |
+
|
| 610 |
+
#################################
|
| 611 |
+
|
| 612 |
+
# Normalize one segment for scoring: strip markup artifacts, decode the
# DTD-declared SGML/XML entities, optionally lowercase (global
# $preserve_case) and space-separate non-ASCII characters (global
# $split_non_ASCII), then tokenize punctuation/symbols and squeeze spaces.
#
# FIX: the five entity-decoding substitutions had been corrupted into
# no-ops (e.g. `s/"/\"/g`); restored the `&quot; &amp; &lt; &gt; &apos;`
# source patterns that the inline comments describe.
sub NormalizeText {
    my ($norm_text) = @_;

    $norm_text =~ s/<skipped>//g; # strip "skipped" tags
    $norm_text =~ s/\p{Hyphen}\p{Zl}//g; # strip end-of-line hyphenation and join lines
    $norm_text =~ s/\p{Zl}/ /g; # join lines

    # replace entities
    $norm_text =~ s/&quot;/\"/g; # quote to "
    $norm_text =~ s/&amp;/&/g; # ampersand to &
    $norm_text =~ s/&lt;/</g; # less-than to <
    $norm_text =~ s/&gt;/>/g; # greater-than to >
    $norm_text =~ s/&apos;/\'/g; # apostrophe to '

    $norm_text = lc( $norm_text ) unless $preserve_case; # lowercasing if needed
    $norm_text =~ s/([^[:ascii:]])/ $1 /g if ( $split_non_ASCII );

    # punctuation: tokenize any punctuation unless followed AND preceded by a digit
    $norm_text =~ s/(\P{N})(\p{P})/$1 $2 /g;
    $norm_text =~ s/(\p{P})(\P{N})/ $1 $2/g;

    $norm_text =~ s/(\p{S})/ $1 /g; # tokenize symbols

    $norm_text =~ s/\p{Z}+/ /g; # one space only between words
    $norm_text =~ s/^\p{Z}+//; # no leading space
    $norm_text =~ s/\p{Z}+$//; # no trailing space

    return $norm_text;
}
|
| 641 |
+
|
| 642 |
+
#################################
|
| 643 |
+
|
| 644 |
+
sub nist_length_penalty {
    # NIST brevity penalty for a candidate/reference length ratio:
    # 1 at or above the reference length, 0 at a degenerate ratio, and a
    # Gaussian-in-log-space decay in between.
    my ($r) = @_;

    return 1 if $r >= 1;
    return 0 if $r <= 0;

    # Decay constant calibrated so that a ratio of 1.5 corresponds to a
    # score of 0.5 (ratio_x = 1.5, score_x = 0.5 in the original form).
    my $beta = -log(0.5) / (log(1.5) ** 2);
    my $ln_r = log($r);
    return exp(-$beta * $ln_r * $ln_r);
}
|
| 654 |
+
|
| 655 |
+
#################################
|
| 656 |
+
|
| 657 |
+
sub date_time_stamp {
    # Current local timestamp as a ("YYYY Mon D" date, "HH:MM:SS" time) pair.
    my @now = localtime();
    my @month_names = qw(Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec);

    my $clock = sprintf "%2.2d:%2.2d:%2.2d", @now[2, 1, 0];
    my $day = sprintf "%4.4s %3.3s %s", 1900 + $now[5], $month_names[$now[4]], $now[3];
    return ($day, $clock);
}
|
| 667 |
+
|
| 668 |
+
#################################
|
| 669 |
+
|
| 670 |
+
sub extract_sgml_tag_and_span {
    # Locate the first <$tag ...>...</$tag> element in $text (case-insensitive,
    # dot matches newline).  Returns (attribute string, element body,
    # remaining text after the close tag), or the empty list when absent.
    my ($tag, $text) = @_;

    if ($text =~ m|<$tag\s*([^>]*)>(.*?)</$tag\s*>(.*)|si) {
        return ($1, $2, $3);
    }
    return ();
}
|
| 676 |
+
|
| 677 |
+
#################################
|
| 678 |
+
|
| 679 |
+
sub extract_sgml_tag_attribute {
    # Pull the double-quoted value of attribute $attr out of $text
    # (case-insensitive).  Returns the value, or the empty list when absent.
    my ($attr, $text) = @_;

    if ($text =~ m|$attr\s*=\s*\"([^\"]*)\"|si) {
        return ($1);
    }
    return ();
}
|
| 685 |
+
|
| 686 |
+
#################################
|
| 687 |
+
|
| 688 |
+
sub max {
    # Numeric maximum of the argument list; an empty list yields no value.
    # Consumes arguments from the back, stopping at the first undefined one.
    my $best = pop;
    return unless defined $best;
    while (defined (my $candidate = pop)) {
        $best = $candidate if $candidate > $best;
    }
    return $best;
}
|
| 698 |
+
|
| 699 |
+
#################################
|
| 700 |
+
|
| 701 |
+
sub min {
    # Numeric minimum of the argument list; an empty list yields no value.
    # Consumes arguments from the back, stopping at the first undefined one.
    my $best = pop;
    return unless defined $best;
    while (defined (my $candidate = pop)) {
        $best = $candidate if $candidate < $best;
    }
    return $best;
}
|
| 711 |
+
|
| 712 |
+
#################################
|
| 713 |
+
|
| 714 |
+
# Print the final report for every system in @tst_sys: the system-level
# score(s) for the selected $METHOD, then a table of individual (per-order)
# n-gram scores and a table of cumulative n-gram scores.  Reads the global
# score hashes %NISTmt and %BLEUmt filled by nist_score()/bleu_score().
sub printout_report
{

    # System-level summary line(s).
    if ( $METHOD eq "BOTH" ) {
        foreach my $sys (sort @tst_sys) {
            printf "NIST score = %2.4f BLEU score = %.4f for system \"$sys\"\n",$NISTmt{5}{$sys}{cum},$BLEUmt{4}{$sys}{cum};
        }
    } elsif ($METHOD eq "NIST" ) {
        foreach my $sys (sort @tst_sys) {
            printf "NIST score = %2.4f for system \"$sys\"\n",$NISTmt{5}{$sys}{cum};
        }
    } elsif ($METHOD eq "BLEU" ) {
        foreach my $sys (sort @tst_sys) {
            printf "\nBLEU score = %.4f for system \"$sys\"\n",$BLEUmt{4}{$sys}{cum};
        }
    }


    # Individual (single-order) n-gram score table.
    printf "\n# ------------------------------------------------------------------------\n\n";
    printf "Individual N-gram scoring\n";
    printf " 1-gram 2-gram 3-gram 4-gram 5-gram 6-gram 7-gram 8-gram 9-gram\n";
    printf " ------ ------ ------ ------ ------ ------ ------ ------ ------\n";

    if (( $METHOD eq "BOTH" ) || ($METHOD eq "NIST")) {
        foreach my $sys (sort @tst_sys) {
            printf " NIST:";
            for (my $i=1; $i<=$max_Ngram; $i++) {
                printf " %2.4f ",$NISTmt{$i}{$sys}{ind}
            }
            printf " \"$sys\"\n";
        }
        printf "\n";
    }

    if (( $METHOD eq "BOTH" ) || ($METHOD eq "BLEU")) {
        foreach my $sys (sort @tst_sys) {
            printf " BLEU:";
            for (my $i=1; $i<=$max_Ngram; $i++) {
                printf " %2.4f ",$BLEUmt{$i}{$sys}{ind}
            }
            printf " \"$sys\"\n";
        }
    }

    # Cumulative n-gram score table.
    printf "\n# ------------------------------------------------------------------------\n";
    printf "Cumulative N-gram scoring\n";
    printf " 1-gram 2-gram 3-gram 4-gram 5-gram 6-gram 7-gram 8-gram 9-gram\n";
    printf " ------ ------ ------ ------ ------ ------ ------ ------ ------\n";

    if (( $METHOD eq "BOTH" ) || ($METHOD eq "NIST")) {
        foreach my $sys (sort @tst_sys) {
            printf " NIST:";
            for (my $i=1; $i<=$max_Ngram; $i++) {
                printf " %2.4f ",$NISTmt{$i}{$sys}{cum}
            }
            printf " \"$sys\"\n";
        }
    }
    printf "\n";


    if (( $METHOD eq "BOTH" ) || ($METHOD eq "BLEU")) {
        foreach my $sys (sort @tst_sys) {
            printf " BLEU:";
            for (my $i=1; $i<=$max_Ngram; $i++) {
                printf " %2.4f ",$BLEUmt{$i}{$sys}{cum}
            }
            printf " \"$sys\"\n";
        }
    }
}
|
mosesdecoder/scripts/generic/mteval-v13a.pl
ADDED
|
@@ -0,0 +1,1170 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env perl
|
| 2 |
+
|
| 3 |
+
use warnings;
|
| 4 |
+
use strict;
|
| 5 |
+
use utf8;
|
| 6 |
+
use Encode;
|
| 7 |
+
use XML::Twig;
|
| 8 |
+
|
| 9 |
+
binmode STDOUT, ":utf8";
|
| 10 |
+
binmode STDERR, ":utf8";
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
#################################
|
| 14 |
+
# History:
|
| 15 |
+
#
|
| 16 |
+
# version 13a
|
| 17 |
+
# * modified the scoring functions to prevent division-by-zero errors when a system segment is empty
|
| 18 |
+
# * affected methods: 'bleu_score' and 'bleu_score_smoothing'
|
| 19 |
+
# * use \p{Line_Breaks} instead of \p{Hyphen} when stripping end-of-line hyphenation and join lines
|
| 20 |
+
# * because \p{Hyphen} is deprecated since 2016-06-01, see http://www.unicode.org/reports/tr14/#Hyphen
|
| 21 |
+
#
|
| 22 |
+
# version 13
|
| 23 |
+
# * Uses a XML parser to read data (only when extension is .xml)
|
| 24 |
+
# * Smoothing of the segment-level BLEU scores, done by default
|
| 25 |
+
# * smoothing method similar to that of bleu-1.04.pl (IBM)
|
| 26 |
+
# * see comments above the 'bleu_score' method for more details on how the smoothing is computed
|
| 27 |
+
# * added a '--no-smoothing' option to simulate old scripts behavior
|
| 28 |
+
# * Introduction of the 'brevity-penalty' option, taking one of two values:
|
| 29 |
+
# * 'closest' (default) : act as IBM BLEU (taking the closest reference translation length)
|
| 30 |
+
# * in case two reference translations are at the same distance, will take the shortest one
|
| 31 |
+
# * for more details regarding how the BP is computed, see comments of the 'brevity_penalty_closest' function
|
| 32 |
+
# * 'shortest' : act as previous versions of the script (taking shortest reference translation length)
|
| 33 |
+
# * Introduction of the 'international-tokenization' option, boolean, disabled by default
|
| 34 |
+
# by default (when the option is not provided), uses 11b's tokenization function
|
| 35 |
+
# when option specified, uses v12's tokenization function
|
| 36 |
+
# * Introduction of a 'Metrics MATR output' flag (option '--metricsMATR')
|
| 37 |
+
# when used, creates three files for both BLEU score and NIST score:
|
| 38 |
+
# * BLEU-seg.scr and NIST-seg.scr: contain segment-level scores
|
| 39 |
+
# * BLEU-doc.scr and NIST-doc.scr: contain document-level scores
|
| 40 |
+
# * BLEU-sys.scr and NIST-sys.scr: contain system-level scores
|
| 41 |
+
# * SGML parsing
|
| 42 |
+
# * script will halt if source, reference and test files don't share the same setid attribute value (used for metricsMATR output)
|
| 43 |
+
# * correct segment IDs extracted from the files (was previously using an array, and using the index as a segID for output)
|
| 44 |
+
# * detailed output flag (-d) can now be used when running both BLEU and NIST
|
| 45 |
+
#
|
| 46 |
+
# version 12
|
| 47 |
+
# * Text normalization changes:
|
| 48 |
+
# * convert entity references (only the entities declared in the DTD)
|
| 49 |
+
# * now uses unicode categories
|
| 50 |
+
# * tokenize punctuation unless followed AND preceded by digits
|
| 51 |
+
# * tokenize symbols
|
| 52 |
+
# * UTF-8 handling:
|
| 53 |
+
# * files are now read using utf8 mode
|
| 54 |
+
# * Added the '-e' command-line option to enclose non-ASCII characters between spaces
|
| 55 |
+
#
|
| 56 |
+
# version 11b -- text normalization modified:
|
| 57 |
+
# * take out the join digit line because it joins digits
|
| 58 |
+
# when it shouldn't have
|
| 59 |
+
# $norm_text =~ s/(\d)\s+(?=\d)/$1/g; #join digits
|
| 60 |
+
#
|
| 61 |
+
# version 11a -- corrected output of individual n-gram precision values
|
| 62 |
+
#
|
| 63 |
+
# version 11 -- bug fixes:
|
| 64 |
+
# * make filehandle operate in binary mode to prevent Perl from operating
|
| 65 |
+
# (by default in Red Hat 9) in UTF-8
|
| 66 |
+
# * fix failure on joining digits
|
| 67 |
+
# version 10 -- updated output to include more details of n-gram scoring.
|
| 68 |
+
# Defaults to generate both NIST and BLEU scores. Use -b for BLEU
|
| 69 |
+
# only, use -n for NIST only
|
| 70 |
+
#
|
| 71 |
+
# version 09d -- bug fix (for BLEU scoring, ngrams were fixed at 4
|
| 72 |
+
# being the max, regardless what was entered on the command line.)
|
| 73 |
+
#
|
| 74 |
+
# version 09c -- bug fix (During the calculation of ngram information,
|
| 75 |
+
# each ngram was being counted only once for each segment. This has
|
| 76 |
+
# been fixed so that each ngram is counted correctly in each segment.)
|
| 77 |
+
#
|
| 78 |
+
# version 09b -- text normalization modified:
|
| 79 |
+
# * option flag added to preserve upper case
|
| 80 |
+
# * non-ASCII characters left in place.
|
| 81 |
+
#
|
| 82 |
+
# version 09a -- text normalization modified:
|
| 83 |
+
# * " and & converted to "" and &, respectively
|
| 84 |
+
# * non-ASCII characters kept together (bug fix)
|
| 85 |
+
#
|
| 86 |
+
# version 09 -- modified to accommodate sgml tag and attribute
|
| 87 |
+
# names revised to conform to default SGML conventions.
|
| 88 |
+
#
|
| 89 |
+
# version 08 -- modifies the NIST metric in accordance with the
|
| 90 |
+
# findings on the 2001 Chinese-English dry run corpus. Also
|
| 91 |
+
# incorporates the BLEU metric as an option and supports the
|
| 92 |
+
# output of ngram detail.
|
| 93 |
+
#
|
| 94 |
+
# version 07 -- in response to the MT meeting on 28 Jan 2002 at ISI
|
| 95 |
+
# Keep strings of non-ASCII characters together as one word
|
| 96 |
+
# (rather than splitting them into one-character words).
|
| 97 |
+
# Change length penalty so that translations that are longer than
|
| 98 |
+
# the average reference translation are not penalized.
|
| 99 |
+
#
|
| 100 |
+
# version 06
|
| 101 |
+
# Prevent divide-by-zero when a segment has no evaluation N-grams.
|
| 102 |
+
# Correct segment index for level 3 debug output.
|
| 103 |
+
#
|
| 104 |
+
# version 05
|
| 105 |
+
# improve diagnostic error messages
|
| 106 |
+
#
|
| 107 |
+
# version 04
|
| 108 |
+
# tag segments
|
| 109 |
+
#
|
| 110 |
+
# version 03
|
| 111 |
+
# add detailed output option (intermediate document and segment scores)
|
| 112 |
+
#
|
| 113 |
+
# version 02
|
| 114 |
+
# accommodation of modified sgml tags and attributes
|
| 115 |
+
#
|
| 116 |
+
# version 01
|
| 117 |
+
# same as bleu version 15, but modified to provide formal score output.
|
| 118 |
+
#
|
| 119 |
+
# original IBM version
|
| 120 |
+
# Author: Kishore Papineni
|
| 121 |
+
# Date: 06/10/2001
|
| 122 |
+
#################################
|
| 123 |
+
|
| 124 |
+
######
|
| 125 |
+
# Intro
|
| 126 |
+
my ($date, $time) = date_time_stamp();
|
| 127 |
+
print "MT evaluation scorer began on $date at $time\n";
|
| 128 |
+
print "command line: ", $0, " ", join(" ", @ARGV), "\n";
|
| 129 |
+
my $usage = "\n\nUsage: $0 -r <ref_file> -s <src_file> -t <tst_file>\n\n".
|
| 130 |
+
"Description: This Perl script evaluates MT system performance.\n".
|
| 131 |
+
"\n".
|
| 132 |
+
"Required arguments:\n".
|
| 133 |
+
" -r <ref_file> is a file containing the reference translations for\n".
|
| 134 |
+
" the documents to be evaluated.\n".
|
| 135 |
+
" -s <src_file> is a file containing the source documents for which\n".
|
| 136 |
+
" translations are to be evaluated\n".
|
| 137 |
+
" -t <tst_file> is a file containing the translations to be evaluated\n".
|
| 138 |
+
"\n".
|
| 139 |
+
"Optional arguments:\n".
|
| 140 |
+
" -h prints this help message to STDOUT\n".
|
| 141 |
+
" -c preserves upper-case alphabetic characters\n".
|
| 142 |
+
" -b generate BLEU scores only\n".
|
| 143 |
+
" -n generate NIST scores only\n".
|
| 144 |
+
" -d detailed output flag:\n".
|
| 145 |
+
" 0 (default) for system-level score only\n".
|
| 146 |
+
" 1 to include document-level scores\n".
|
| 147 |
+
" 2 to include segment-level scores\n".
|
| 148 |
+
" 3 to include ngram-level scores\n".
|
| 149 |
+
" -e enclose non-ASCII characters between spaces\n".
|
| 150 |
+
" --brevity-penalty ( closest | shortest )\n" .
|
| 151 |
+
" closest (default) : acts as IBM BLEU (takes the closest reference translation length)\n" .
|
| 152 |
+
" shortest : acts as previous versions of the script (takes the shortest reference translation length)\n" .
|
| 153 |
+
" --international-tokenization\n" .
|
| 154 |
+
" when specified, uses Unicode-based (only) tokenization rules\n" .
|
| 155 |
+
" when not specified (default), uses default tokenization (some language-dependant rules)\n" .
|
| 156 |
+
" --metricsMATR : create three files for both BLEU scores and NIST scores:\n" .
|
| 157 |
+
" BLEU-seg.scr and NIST-seg.scr : segment-level scores\n" .
|
| 158 |
+
" BLEU-doc.scr and NIST-doc.scr : document-level scores\n" .
|
| 159 |
+
" BLEU-sys.scr and NIST-sys.scr : system-level scores\n" .
|
| 160 |
+
" --no-smoothing : disable smoothing on BLEU scores\n" .
|
| 161 |
+
"\n";
|
| 162 |
+
|
| 163 |
+
# ---------------------------------------------------------------------------
# Top-level driver: parse command-line options, select the scoring variants
# (brevity penalty, tokenization, smoothing), load the source / reference /
# test data, and score every system with NIST and/or BLEU.
# ---------------------------------------------------------------------------
use vars qw ($opt_r $opt_s $opt_t $opt_d $opt_h $opt_b $opt_n $opt_c $opt_x $opt_e);
use Getopt::Long;
my $ref_file = '';
my $src_file = '';
my $tst_file = '';
my $detail = 0;                    # 0..3: system / document / segment / ngram level output
my $help = '';
my $preserve_case = '';            # -c: skip lowercasing in tokenization
my $split_non_ASCII = '';          # -e: enclose non-ASCII characters between spaces
my $brevity_penalty = 'closest';   # 'closest' (IBM BLEU) or 'shortest' (legacy)
my $international_tokenization;    # use Unicode-only tokenization rules
my $metricsMATR_output = '';       # emit *-seg.scr / *-doc.scr / *-sys.scr files
my $no_smoothing = '';             # disable BLEU smoothing
our $opt_x = '';                   # -x: debug dumps ("ngram info", "document info")
our $opt_b = '';                   # -b: BLEU only
our $opt_n = '';                   # -n: NIST only
GetOptions(
    'r=s' => \$ref_file,
    's=s' => \$src_file,
    't=s' => \$tst_file,
    'd:i' => \$detail,
    'h|help' => \$help,
    'b',
    'n',
    'c' => \$preserve_case,
    'x:s',
    'e' => \$split_non_ASCII,
    'brevity-penalty:s' => \$brevity_penalty,
    'international-tokenization' => \$international_tokenization,
    'metricsMATR-output' => \$metricsMATR_output,
    'no-smoothing' => \$no_smoothing
);
die $usage if $help;

# All three data files are mandatory.
die "Error in command line: ref_file not defined$usage" unless ( $ref_file );
die "Error in command line: src_file not defined$usage" unless ( $src_file );
die "Error in command line: tst_file not defined$usage" unless ( $tst_file );

# Pick the brevity-penalty reference-length strategy as a code ref.
# ("cmp" returns 0 on equality, so "!(a cmp b)" means string equality.)
my $BLEU_BP;
if ( !( $brevity_penalty cmp 'closest' ) )
{
    $BLEU_BP = \&brevity_penalty_closest;
}
elsif ( !( $brevity_penalty cmp 'shortest' ) )
{
    $BLEU_BP = \&brevity_penalty_shortest;
}
else
{
    die "Incorrect value supplied for 'brevity_penalty'$usage";
}

# Tokenizer and BLEU variant are also selected via code refs.
my $TOKENIZATION = \&tokenization;
$TOKENIZATION = \&tokenization_international if ( $international_tokenization );

my $BLEU_SCORE = \&bleu_score;
$BLEU_SCORE = \&bleu_score_nosmoothing if ( $no_smoothing );

my $max_Ngram = 9;   # n-gram statistics are collected for n = 1..9

my $METHOD = "BOTH";
if ( $opt_b ) { $METHOD = "BLEU"; }
if ( $opt_n ) { $METHOD = "NIST"; }
my $method;          # the metric currently being computed ("BLEU" or "NIST")

######
# Global variables
my ($src_lang, $tgt_lang, @tst_sys, @ref_sys); # evaluation parameters
my (%tst_data, %ref_data); # the data -- with structure: {system}{document}{segments}
my ($src_id, $ref_id, $tst_id); # unique identifiers for ref and tst translation sets
my %eval_docs; # document information for the evaluation data set
my %ngram_info; # the information obtained from (the last word in) the ngram

######
# Get source document ID's
($src_id) = get_source_info ($src_file);

######
# Get reference translations
($ref_id) = get_MT_data (\%ref_data, "RefSet", $ref_file);

compute_ngram_info ();

######
# Get translations to evaluate
($tst_id) = get_MT_data (\%tst_data, "TstSet", $tst_file);

######
# Check data for completeness and correctness
check_MT_data ();

######
# Score accumulators: {n}{system}{cum|ind} and per-document breakdowns.
my %NISTmt;
my %NISTOverall;
my %BLEUmt;
my %BLEUOverall;

######
# Evaluate
print "  Evaluation of $src_lang-to-$tgt_lang translation using:\n";
my $cum_seg = 0;
foreach my $doc (sort keys %eval_docs)
{
    $cum_seg += scalar( keys( %{$eval_docs{$doc}{SEGS}} ) );
}
print "    src set \"$src_id\" (", scalar keys %eval_docs, " docs, $cum_seg segs)\n";
print "    ref set \"$ref_id\" (", scalar keys %ref_data, " refs)\n";
print "    tst set \"$tst_id\" (", scalar keys %tst_data, " systems)\n\n";

foreach my $sys (sort @tst_sys)
{
    # Reset per-system accumulators for every n-gram order.
    for (my $n=1; $n<=$max_Ngram; $n++)
    {
        $NISTmt{$n}{$sys}{cum} = 0;
        $NISTmt{$n}{$sys}{ind} = 0;
        $BLEUmt{$n}{$sys}{cum} = 0;
        $BLEUmt{$n}{$sys}{ind} = 0;
    }
    if ( ($METHOD eq "BOTH") || ($METHOD eq "NIST") )
    {
        $method="NIST";
        score_system ($sys, \%NISTmt, \%NISTOverall);
    }
    if ( ($METHOD eq "BOTH") || ($METHOD eq "BLEU") )
    {
        $method="BLEU";
        score_system ($sys, \%BLEUmt, \%BLEUOverall);
    }
}

######
printout_report ();
if ( $metricsMATR_output )
{
    outputMetricsMATR( 'NIST', %NISTOverall ) if ( ( $METHOD eq 'BOTH' ) || ( $METHOD eq 'NIST' ) );
    outputMetricsMATR( 'BLEU', %BLEUOverall ) if ( ( $METHOD eq 'BOTH' ) || ( $METHOD eq 'BLEU' ) );
}

($date, $time) = date_time_stamp();
print "MT evaluation scorer ended on $date at $time\n";

exit 0;
|
| 304 |
+
|
| 305 |
+
#################################
|
| 306 |
+
|
| 307 |
+
# Load the source documents from $file into the global %eval_docs
# (structure: {docid}{SEGS}{segid} = tokenized segment text), record the
# source language in $src_lang, and return the set id.  Supports two input
# formats, chosen by file extension.
sub get_source_info
{
    my ($file) = @_;
    my ($name, $id, $src, $doc, $seg);
    my ($data, $tag, $span);

    # Extension of the file determines the parser used:
    #   .xml      : XML::Twig
    #   otherwise : simple SGML parsing functions
    if ( $file =~ /\.xml$/i )
    {
        my $twig = XML::Twig->new();
        $twig->parsefile( $file );
        my $root = $twig->root;
        my $currentSet = $root->first_child( 'srcset' );
        die "Source XML file '$file' does not contain the 'srcset' element" if ( not $currentSet );
        $id = $currentSet->{ 'att' }->{ 'setid' } or die "No 'setid' attribute value in '$file'";
        $src = $currentSet->{ 'att' }->{ 'srclang' } or die "No srcset 'srclang' attribute value in '$file'";
        # Source language must agree with any previously loaded data.
        die "Not the same srclang attribute values across sets" unless ( not defined $src_lang or $src eq $src_lang );
        $src_lang = $src;
        foreach my $currentDoc ( $currentSet->get_xpath( './/doc' ) )
        {
            my $docID = $currentDoc->{ 'att' }->{ 'docid' } or die "No document 'docid' attribute value in '$file'";
            foreach my $currentSeg ( $currentDoc->get_xpath( './/seg' ) )
            {
                my $segID = $currentSeg->{ 'att' }->{ 'id' } or die "No segment 'id' attribute value in '$file'";
                my $segData = $currentSeg->text;
                # Tokenize through the selected tokenizer code ref.
                ($eval_docs{$docID}{SEGS}{$segID}) = &{ $TOKENIZATION }( $segData );
            }
        }
    }
    else
    {
        #read data from file
        open (FILE, $file) or die "\nUnable to open translation data file '$file'", $usage;
        binmode FILE, ":utf8";
        $data .= $_ while <FILE>;
        close (FILE);

        #get source set info
        die "\n\nFATAL INPUT ERROR: no 'src_set' tag in src_file '$file'\n\n"
            unless ($tag, $span, $data) = extract_sgml_tag_and_span ("SrcSet", $data);
        die "\n\nFATAL INPUT ERROR: no tag attribute '$name' in file '$file'\n\n"
            unless ($id) = extract_sgml_tag_attribute ($name="SetID", $tag);
        die "\n\nFATAL INPUT ERROR: no tag attribute '$name' in file '$file'\n\n"
            unless ($src) = extract_sgml_tag_attribute ($name="SrcLang", $tag);
        die "\n\nFATAL INPUT ERROR: $name ('$src') in file '$file' inconsistent\n"
            ."                   with $name in previous input data ('$src_lang')\n\n"
            unless (not defined $src_lang or $src eq $src_lang);
        $src_lang = $src;

        #get doc info -- ID and # of segs
        $data = $span;
        while (($tag, $span, $data) = extract_sgml_tag_and_span ("Doc", $data))
        {
            die "\n\nFATAL INPUT ERROR: no tag attribute '$name' in file '$file'\n\n"
                unless ($doc) = extract_sgml_tag_attribute ($name="DocID", $tag);
            die "\n\nFATAL INPUT ERROR: duplicate '$name' in file '$file'\n\n"
                if defined $eval_docs{$doc};
            $span =~ s/[\s\n\r]+/ /g; # concatenate records
            # NOTE: comma operator — two statements on one line.
            my $nseg=0, my $seg_data = $span;
            while (($tag, $span, $seg_data) = extract_sgml_tag_and_span ("Seg", $seg_data))
            {
                die "\n\nFATAL INPUT ERROR: no attribute '$name' in file '$file'\n\n"
                    unless ($seg) = extract_sgml_tag_attribute( $name='id', $tag );
                ($eval_docs{$doc}{SEGS}{$seg}) = &{ $TOKENIZATION }( $span );
                $nseg++;
            }
            die "\n\nFATAL INPUT ERROR: no segments in document '$doc' in file '$file'\n\n"
                if $nseg == 0;
        }
        die "\n\nFATAL INPUT ERROR: no documents in file '$file'\n\n"
            unless keys %eval_docs > 0;
    }
    return $id;
}
|
| 383 |
+
|
| 384 |
+
#################################
|
| 385 |
+
|
| 386 |
+
# Load reference or test translations from $file into the hash referenced by
# $docs (structure: {system}{docid}{SEGS}{segid} = tokenized text, plus
# {system}{docid}{FILE} = origin file).  $set_tag is "RefSet" or "TstSet" and
# selects which SGML set tag to look for.  Records the target language in the
# global $tgt_lang and returns the set id.
sub get_MT_data
{
    my ($docs, $set_tag, $file) = @_;
    my ($name, $id, $src, $tgt, $sys, $doc, $seg);
    my ($tag, $span, $data);

    # Extension of the file determines the parser used:
    #   .xml      : XML::Twig
    #   otherwise : simple SGML parsing functions
    if ( $file =~ /\.xml$/i )
    {
        my $twig = XML::Twig->new();
        $twig->parsefile( $file );
        my $root = $twig->root;
        foreach my $currentSet ( $root->get_xpath( 'refset' ), $root->get_xpath( 'tstset' ) )
        {
            $id = $currentSet->{ 'att' }->{ 'setid' } or die "No 'setid' attribute value in '$file'";
            $src = $currentSet->{ 'att' }->{ 'srclang' } or die "No 'srclang' attribute value in '$file'";
            $tgt = $currentSet->{ 'att' }->{ 'trglang' } or die "No 'trglang' attribute value in '$file'";
            die "Not the same 'srclang' attribute value across sets" unless ( $src eq $src_lang );
            # BUGFIX: the original used "( $tgt = $tgt_lang )" (assignment), which
            # silently overwrote $tgt and never compared the languages; use string
            # equality so a mismatching 'trglang' is actually rejected.
            die "Not the same 'trglang' attribute value across sets" unless ( ( not defined $tgt_lang ) or ( $tgt eq $tgt_lang ) );
            $tgt_lang = $tgt;
            # The system identifier lives in a different attribute depending on
            # whether this is a test set or a reference set.
            my $sys;
            if ( $currentSet->name eq 'tstset' )
            {
                $sys = $currentSet->{ 'att' }->{ 'sysid' } or die "No 'sysid' attribute value in '$file'";
            }
            else
            {
                $sys = $currentSet->{ 'att' }->{ 'refid' } or die "No 'refid' attribute value in '$file'";
            }
            foreach my $currentDoc ( $currentSet->get_xpath( './/doc' ) )
            {
                my $docID = $currentDoc->{ 'att' }->{ 'docid' } or die "No document 'docid' attribute value in '$file'";
                $docs->{ $sys }{ $docID }{ FILE } = $file;
                foreach my $currentSeg ( $currentDoc->get_xpath( './/seg' ) )
                {
                    my $segID = $currentSeg->{ 'att' }->{ 'id' } or die "No segment 'id' attribute value in '$file'";
                    my $segData = $currentSeg->text;
                    ($docs->{$sys}{$docID}{SEGS}{$segID}) = &{ $TOKENIZATION }( $segData );
                }
            }
        }
    }
    else
    {
        #read data from file
        open (FILE, $file) or die "\nUnable to open translation data file '$file'", $usage;
        binmode FILE, ":utf8";
        $data .= $_ while <FILE>;
        close (FILE);

        #get tag info
        while (($tag, $span, $data) = extract_sgml_tag_and_span ($set_tag, $data))
        {
            die "\n\nFATAL INPUT ERROR: no tag attribute '$name' in file '$file'\n\n"
                unless ($id) = extract_sgml_tag_attribute ($name="SetID", $tag);
            die "\n\nFATAL INPUT ERROR: no tag attribute '$name' in file '$file'\n\n"
                unless ($src) = extract_sgml_tag_attribute ($name="SrcLang", $tag);
            die "\n\nFATAL INPUT ERROR: $name ('$src') in file '$file' inconsistent\n"
                ."                   with $name of source ('$src_lang')\n\n"
                unless $src eq $src_lang;
            die "\n\nFATAL INPUT ERROR: no tag attribute '$name' in file '$file'\n\n"
                unless ($tgt) = extract_sgml_tag_attribute ($name="TrgLang", $tag);
            die "\n\nFATAL INPUT ERROR: $name ('$tgt') in file '$file' inconsistent\n"
                ."                   with $name of the evaluation ('$tgt_lang')\n\n"
                unless (not defined $tgt_lang or $tgt eq $tgt_lang);
            $tgt_lang = $tgt;

            my $mtdata = $span;
            while (($tag, $span, $mtdata) = extract_sgml_tag_and_span ("Doc", $mtdata))
            {
                die "\n\nFATAL INPUT ERROR: no tag attribute '$name' in file '$file'\n\n"
                    unless (my $sys) = extract_sgml_tag_attribute ($name="SysID", $tag);
                die "\n\nFATAL INPUT ERROR: no tag attribute '$name' in file '$file'\n\n"
                    unless $doc = extract_sgml_tag_attribute ($name="DocID", $tag);
                die "\n\nFATAL INPUT ERROR: document '$doc' for system '$sys' in file '$file'\n"
                    ."                   previously loaded from file '$docs->{$sys}{$doc}{FILE}'\n\n"
                    unless (not defined $docs->{$sys}{$doc});

                $span =~ s/[\s\n\r]+/ /g; # concatenate records
                # NOTE: comma operator — two statements on one line.
                my $nseg=0, my $seg_data = $span;
                while (($tag, $span, $seg_data) = extract_sgml_tag_and_span ("Seg", $seg_data))
                {
                    # BUGFIX: corrected "FATAIL" typo in the error message.
                    die "\n\nFATAL INPUT ERROR: no tag attribute '$name' in file '$file'\n\n"
                        unless $seg = extract_sgml_tag_attribute( $name="id", $tag );
                    ($docs->{$sys}{$doc}{SEGS}{$seg}) = &{ $TOKENIZATION }( $span );
                    $nseg++;
                }
                die "\n\nFATAL INPUT ERROR: no segments in document '$doc' in file '$file'\n\n" if $nseg == 0;
                $docs->{$sys}{$doc}{FILE} = $file;
            }
        }
    }
    return $id;
}
|
| 482 |
+
|
| 483 |
+
#################################
|
| 484 |
+
|
| 485 |
+
# Sanity-check the loaded data (globals %tst_data, %ref_data, %eval_docs):
# all set ids must match, and every source document must be present, with the
# same number of segments, in every system output and every reference.
# Also populates the globals @tst_sys and @ref_sys (sorted system/ref names).
sub check_MT_data
{
    @tst_sys = sort keys %tst_data;
    @ref_sys = sort keys %ref_data;

    die "Not the same 'setid' attribute values across files" unless ( ( $src_id eq $tst_id ) && ( $src_id eq $ref_id ) );

    #every evaluation document must be represented for every system and every reference
    foreach my $doc (sort keys %eval_docs)
    {
        my $nseg_source = scalar( keys( %{$eval_docs{$doc}{SEGS}} ) );
        foreach my $sys (@tst_sys)
        {
            die "\n\nFATAL ERROR: no document '$doc' for system '$sys'\n\n" unless defined $tst_data{$sys}{$doc};
            my $nseg = scalar( keys( %{$tst_data{$sys}{$doc}{SEGS}} ) );
            die "\n\nFATAL ERROR: translated documents must contain the same # of segments as the source, but\n"
                ."             document '$doc' for system '$sys' contains $nseg segments, while\n"
                ."             the source document contains $nseg_source segments.\n\n"
                unless $nseg == $nseg_source;
        }
        # Same completeness check for each reference translation set.
        foreach my $sys (@ref_sys)
        {
            die "\n\nFATAL ERROR: no document '$doc' for reference '$sys'\n\n" unless defined $ref_data{$sys}{$doc};
            my $nseg = scalar( keys( %{$ref_data{$sys}{$doc}{SEGS}} ) );
            die "\n\nFATAL ERROR: translated documents must contain the same # of segments as the source, but\n"
                ."             document '$doc' for system '$sys' contains $nseg segments, while\n"
                ."             the source document contains $nseg_source segments.\n\n"
                unless $nseg == $nseg_source;
        }
    }
}
|
| 516 |
+
|
| 517 |
+
#################################
|
| 518 |
+
|
| 519 |
+
# Compute the NIST information weight for every n-gram occurring in the
# reference data and store it in the global %ngram_info:
#   info(w1..wn) = -log2( count(w1..wn) / count(w1..wn-1) )
# (for unigrams the denominator is the total reference word count).
sub compute_ngram_info
{
    my ($ref, $doc, $seg);
    my (@wrds, $tot_wrds, %ngrams, $ngram, $mgram);
    my (%ngram_count, @tot_ngrams);

    # Accumulate n-gram counts (n = 1..$max_Ngram) over all reference segments.
    foreach $ref (keys %ref_data)
    {
        foreach $doc (keys %{$ref_data{$ref}})
        {
            foreach $seg ( keys %{$ref_data{$ref}{$doc}{SEGS}})
            {
                @wrds = split /\s+/, $ref_data{ $ref }{ $doc }{ SEGS }{ $seg };
                $tot_wrds += @wrds;
                %ngrams = %{Words2Ngrams (@wrds)};
                foreach $ngram (keys %ngrams)
                {
                    $ngram_count{$ngram} += $ngrams{$ngram};
                }
            }
        }
    }

    foreach $ngram (keys %ngram_count)
    {
        @wrds = split / /, $ngram;
        # Comma operator: drop the last word, then build the (n-1)-gram prefix.
        pop @wrds, $mgram = join " ", @wrds;
        # $mgram is "" (false) for unigrams, so the denominator falls back to
        # the total word count.
        $ngram_info{$ngram} = - log ($mgram ? $ngram_count{$ngram}/$ngram_count{$mgram} : $ngram_count{$ngram}/$tot_wrds) / log 2;
        # Optional debug dump, enabled with -x "ngram info".
        if (defined $opt_x and $opt_x eq "ngram info")
        {
            @wrds = split / /, $ngram;
            printf "ngram info:%9.4f%6d%6d%8d%3d %s\n", $ngram_info{$ngram}, $ngram_count{$ngram},
                $mgram ? $ngram_count{$mgram} : $tot_wrds, $tot_wrds, scalar @wrds, $ngram;
        }
    }
}
|
| 555 |
+
|
| 556 |
+
#################################
|
| 557 |
+
|
| 558 |
+
# Score one system: iterate over all evaluation documents, score each one via
# score_document, accumulate the cumulative n-gram statistics, and compute the
# system-level score for the current metric (global $method: "BLEU" or "NIST").
# Results go into $SCOREmt ({n}{sys}{cum|ind}) and $overallScore
# ({sys}{score} and {sys}{documents}{doc}{score}).
sub score_system
{
    my ($sys, $ref, $doc, $SCOREmt, $overallScore);
    ($sys, $SCOREmt, $overallScore) = @_;
    my ($ref_length, $match_cnt, $tst_cnt, $ref_cnt, $tst_info, $ref_info);
    my ($cum_ref_length, @cum_match, @cum_tst_cnt, @cum_ref_cnt, @cum_tst_info, @cum_ref_info);

    # Zero the cumulative counters for every n-gram order.
    $cum_ref_length = 0;
    for (my $j=1; $j<=$max_Ngram; $j++)
    {
        $cum_match[$j] = $cum_tst_cnt[$j] = $cum_ref_cnt[$j] = $cum_tst_info[$j] = $cum_ref_info[$j] = 0;
    }
    foreach $doc (sort keys %eval_docs)
    {
        ($ref_length, $match_cnt, $tst_cnt, $ref_cnt, $tst_info, $ref_info) = score_document ($sys, $doc, $overallScore);
        # Document-level score for the current metric (throwaway %DOCmt holds
        # the per-n intermediates).
        if ( $method eq "NIST" )
        {
            my %DOCmt = ();
            my $docScore = nist_score( scalar( @ref_sys ), $match_cnt, $tst_cnt, $ref_cnt, $tst_info, $ref_info, $sys, \%DOCmt );
            $overallScore->{ $sys }{ 'documents' }{ $doc }{ 'score' } = $docScore;
            if ( $detail >= 1 )
            {
                printf "$method score using 5-grams = %.4f for system \"$sys\" on document \"$doc\" (%d segments, %d words)\n",
                    $docScore, scalar keys %{$tst_data{$sys}{$doc}{SEGS}}, $tst_cnt->[1];
            }
        }

        if ( $method eq "BLEU" )
        {
            my %DOCmt = ();
            my $docScore = &{$BLEU_SCORE}( $ref_length, $match_cnt, $tst_cnt, $sys, \%DOCmt );
            $overallScore->{ $sys }{ 'documents' }{ $doc }{ 'score' } = $docScore;
            if ( $detail >= 1 )
            {
                printf "$method score using 4-grams = %.4f for system \"$sys\" on document \"$doc\" (%d segments, %d words)\n",
                    $docScore, scalar keys %{$tst_data{$sys}{$doc}{SEGS}}, $tst_cnt->[1];
            }
        }

        # Fold the document statistics into the system-level accumulators.
        $cum_ref_length += $ref_length;
        for (my $j=1; $j<=$max_Ngram; $j++)
        {
            $cum_match[$j] += $match_cnt->[$j];
            $cum_tst_cnt[$j] += $tst_cnt->[$j];
            $cum_ref_cnt[$j] += $ref_cnt->[$j];
            $cum_tst_info[$j] += $tst_info->[$j];
            $cum_ref_info[$j] += $ref_info->[$j];
            # Optional debug dump, enabled with -x "document info".
            printf "document info: $sys $doc %d-gram %d %d %d %9.4f %9.4f\n", $j, $match_cnt->[$j],
                $tst_cnt->[$j], $ref_cnt->[$j], $tst_info->[$j], $ref_info->[$j]
                if (defined $opt_x and $opt_x eq "document info");
        }
    }

    # System-level score over the cumulative statistics.
    if ($method eq "BLEU")
    {
        $overallScore->{ $sys }{ 'score' } = &{$BLEU_SCORE}($cum_ref_length, \@cum_match, \@cum_tst_cnt, $sys, $SCOREmt, 1);
    }
    if ($method eq "NIST")
    {
        $overallScore->{ $sys }{ 'score' } = nist_score (scalar @ref_sys, \@cum_match, \@cum_tst_cnt, \@cum_ref_cnt, \@cum_tst_info, \@cum_ref_info, $sys, $SCOREmt);
    }
}
|
| 620 |
+
|
| 621 |
+
#################################
|
| 622 |
+
|
| 623 |
+
# Score one document of one system: score every segment (against all
# references), optionally record/print segment-level scores, and return the
# document-level cumulative statistics:
#   (ref_length, [match], [tst_cnt], [ref_cnt], [tst_info], [ref_info])
sub score_document
{
    my ($sys, $ref, $doc, $overallScore);
    ($sys, $doc, $overallScore) = @_;
    my ($ref_length, $match_cnt, $tst_cnt, $ref_cnt, $tst_info, $ref_info);
    my ($cum_ref_length, @cum_match, @cum_tst_cnt, @cum_ref_cnt, @cum_tst_info, @cum_ref_info);

    # Zero the cumulative counters for every n-gram order.
    $cum_ref_length = 0;
    for (my $j=1; $j<=$max_Ngram; $j++)
    {
        $cum_match[$j] = $cum_tst_cnt[$j] = $cum_ref_cnt[$j] = $cum_tst_info[$j] = $cum_ref_info[$j] = 0;
    }

    #score each segment (numeric order of segment ids)
    foreach my $seg ( sort{ $a <=> $b } keys( %{$tst_data{$sys}{$doc}{SEGS}} ) )
    {
        # Collect this segment's translation from every reference set.
        my @ref_segments = ();
        foreach $ref (@ref_sys)
        {
            push @ref_segments, $ref_data{$ref}{$doc}{SEGS}{$seg};
            if ( $detail >= 3 )
            {
                printf "ref '$ref', seg $seg: %s\n", $ref_data{$ref}{$doc}{SEGS}{$seg};
            }
        }

        printf "sys '$sys', seg $seg: %s\n", $tst_data{$sys}{$doc}{SEGS}{$seg} if ( $detail >= 3 );
        ($ref_length, $match_cnt, $tst_cnt, $ref_cnt, $tst_info, $ref_info) = score_segment ($tst_data{$sys}{$doc}{SEGS}{$seg}, @ref_segments);

        if ( $method eq "BLEU" )
        {
            my %DOCmt = ();
            # BUGFIX: pass a hash *reference* (as score_system does).  The
            # original passed the flattened hash (%DOCmt), so the scorer never
            # received the intended score container.
            my $segScore = &{$BLEU_SCORE}($ref_length, $match_cnt, $tst_cnt, $sys, \%DOCmt);
            $overallScore->{ $sys }{ 'documents' }{ $doc }{ 'segments' }{ $seg }{ 'score' } = $segScore;
            if ( $detail >= 2 )
            {
                printf "  $method score using 4-grams = %.4f for system \"$sys\" on segment $seg of document \"$doc\" (%d words)\n", $segScore, $tst_cnt->[1];
            }
        }
        if ( $method eq "NIST" )
        {
            my %DOCmt = ();
            # BUGFIX: same reference-vs-flattened-hash fix as above.
            my $segScore = nist_score (scalar @ref_sys, $match_cnt, $tst_cnt, $ref_cnt, $tst_info, $ref_info, $sys, \%DOCmt);
            $overallScore->{ $sys }{ 'documents' }{ $doc }{ 'segments' }{ $seg }{ 'score' } = $segScore;
            if ( $detail >= 2 )
            {
                printf "  $method score using 5-grams = %.4f for system \"$sys\" on segment $seg of document \"$doc\" (%d words)\n", $segScore, $tst_cnt->[1];
            }
        }

        # Fold the segment statistics into the document-level accumulators.
        $cum_ref_length += $ref_length;
        for (my $j=1; $j<=$max_Ngram; $j++)
        {
            $cum_match[$j] += $match_cnt->[$j];
            $cum_tst_cnt[$j] += $tst_cnt->[$j];
            $cum_ref_cnt[$j] += $ref_cnt->[$j];
            $cum_tst_info[$j] += $tst_info->[$j];
            $cum_ref_info[$j] += $ref_info->[$j];
        }
    }
    return ($cum_ref_length, [@cum_match], [@cum_tst_cnt], [@cum_ref_cnt], [@cum_tst_info], [@cum_ref_info]);
}
|
| 685 |
+
|
| 686 |
+
###############################################################################################################################
|
| 687 |
+
# function returning the shortest reference length
|
| 688 |
+
# takes as input:
|
| 689 |
+
# - currentLength : the current (shortest) reference length
|
| 690 |
+
# - referenceSentenceLength : the current reference sentence length
|
| 691 |
+
# - candidateSentenceLength : the current candidate sentence length (unused)
|
| 692 |
+
###############################################################################################################################
|
| 693 |
+
# Reference-length selector for the legacy brevity penalty: keep the shortest
# reference length seen so far.
# Arguments:
#   $currentLength           - the shortest reference length so far
#   $referenceSentenceLength - length of the current reference sentence
#   $candidateSentenceLength - length of the candidate sentence (unused here)
sub brevity_penalty_shortest
{
    my ( $currentLength, $referenceSentenceLength, $candidateSentenceLength ) = @_;
    my $shortest = $currentLength;
    $shortest = $referenceSentenceLength if $referenceSentenceLength < $currentLength;
    return $shortest;
}
|
| 698 |
+
|
| 699 |
+
###############################################################################################################################
|
| 700 |
+
# function returning the closest reference length (to the candidate sentence length)
|
| 701 |
+
# takes as input:
|
| 702 |
+
# - currentLength: the current (closest) reference length.
|
| 703 |
+
# - candidateSentenceLength : the current reference sentence length
|
| 704 |
+
# - candidateSentenceLength : the current candidate sentence length
|
| 705 |
+
# when two reference sentences are at the same distance, it will return the shortest reference sentence length
|
| 706 |
+
# example of 4 iterations, given:
|
| 707 |
+
# - one candidate sentence containing 7 tokens
|
| 708 |
+
# - one reference translation containing 11 tokens
|
| 709 |
+
# - one reference translation containing 8 tokens
|
| 710 |
+
# - one reference translation containing 6 tokens
|
| 711 |
+
# - one reference translation containing 7 tokens
|
| 712 |
+
# the multiple invokations will return:
|
| 713 |
+
# - currentLength is set to 11 (outside of this function)
|
| 714 |
+
# - brevity_penalty_closest( 11, 8, 7 ) returns 8, since abs( 8 - 7 ) < abs( 11 - 7 )
|
| 715 |
+
# - brevity_penalty_closest( 8, 6, 7 ) returns 6, since abs( 8 - 7 ) == abs( 6 - 7 ) AND 6 < 8
|
| 716 |
+
# - brevity_penalty_closest( 7, 6, 7 ) returns 7, since abs( 7 - 7 ) < abs( 6 - 7 )
|
| 717 |
+
###############################################################################################################################
|
| 718 |
+
# Reference-length selector for the IBM-style brevity penalty: keep whichever
# reference length is closest to the candidate length, breaking ties in
# favour of the shorter reference.
# Arguments:
#   $currentLength           - the closest reference length so far
#   $referenceSentenceLength - length of the current reference sentence
#   $candidateSentenceLength - length of the candidate sentence
sub brevity_penalty_closest
{
    my ( $currentLength, $referenceSentenceLength, $candidateSentenceLength ) = @_;
    my $newDistance     = abs( $candidateSentenceLength - $referenceSentenceLength );
    my $currentDistance = abs( $candidateSentenceLength - $currentLength );
    # Strictly closer: take the new reference length.
    return $referenceSentenceLength if $newDistance < $currentDistance;
    # Equally close: prefer the shorter of the two lengths.
    return $referenceSentenceLength if $newDistance == $currentDistance
                                       && $referenceSentenceLength < $currentLength;
    return $currentLength;
}
|
| 738 |
+
|
| 739 |
+
#################################
|
| 740 |
+
|
| 741 |
+
# Collect scoring statistics for a single test segment against its reference
# translations.  Returns:
#   ($ref_length, [match_count], [tst_count], [ref_count], [tst_info], [ref_info])
# where the arrays are indexed by n-gram order (1..$max_Ngram), match counts
# are clipped against the maximum reference occurrence count, and the
# information weights come from the global %ngram_info.
sub score_segment
{
    my ($tst_seg, @ref_segs) = @_;
    my (@tst_wrds, %tst_ngrams, @match_count, @tst_count, @tst_info);
    my (@ref_wrds, $ref_seg, %ref_ngrams, %ref_ngrams_max, @ref_count, @ref_info);
    my ($ngram);
    my (@nwrds_ref);
    my $ref_length;

    for (my $j=1; $j<= $max_Ngram; $j++)
    {
        $match_count[$j] = $tst_count[$j] = $ref_count[$j] = $tst_info[$j] = $ref_info[$j] = 0;
    }

    # get the ngram counts for the test segment
    @tst_wrds = split /\s+/, $tst_seg;
    %tst_ngrams = %{Words2Ngrams (@tst_wrds)};
    for (my $j=1; $j<=$max_Ngram; $j++)
    {
        # compute ngram counts: a segment of W words contains W-j+1 j-grams
        $tst_count[$j] = $j<=@tst_wrds ? (@tst_wrds - $j + 1) : 0;
    }

    # get the ngram counts for the reference segments
    foreach $ref_seg (@ref_segs)
    {
        @ref_wrds = split /\s+/, $ref_seg;
        %ref_ngrams = %{Words2Ngrams (@ref_wrds)};
        foreach $ngram (keys %ref_ngrams)
        {
            # find the maximum # of occurrences across references (for clipping)
            my @wrds = split / /, $ngram;
            $ref_info[@wrds] += $ngram_info{$ngram};
            $ref_ngrams_max{$ngram} = defined $ref_ngrams_max{$ngram} ? max ($ref_ngrams_max{$ngram}, $ref_ngrams{$ngram}) : $ref_ngrams{$ngram};
        }
        for (my $j=1; $j<=$max_Ngram; $j++)
        {
            # update ngram counts
            $ref_count[$j] += $j<=@ref_wrds ? (@ref_wrds - $j + 1) : 0;
        }
        # Track the BLEU reference length via the selected strategy
        # (closest-to-candidate or shortest, per --brevity-penalty).
        if ( not defined( $ref_length ) )
        {
            $ref_length = scalar( @ref_wrds );
        }
        else
        {
            $ref_length = &{$BLEU_BP}( $ref_length, scalar( @ref_wrds ), scalar( @tst_wrds ) );
        }
    }

    # accumulate scoring stats for tst_seg ngrams that match ref_seg ngrams
    foreach $ngram (keys %tst_ngrams)
    {
        next unless defined $ref_ngrams_max{$ngram};
        my @wrds = split / /, $ngram;
        # Clipped match count: a test n-gram can match at most as many times
        # as it occurs in the most generous reference.
        $tst_info[@wrds] += $ngram_info{$ngram} * min($tst_ngrams{$ngram},$ref_ngrams_max{$ngram});
        $match_count[@wrds] += my $count = min($tst_ngrams{$ngram},$ref_ngrams_max{$ngram});
        printf "%.2f info for each of $count %d-grams = '%s'\n", $ngram_info{$ngram}, scalar @wrds, $ngram
            if $detail >= 3;
    }

    return ($ref_length, [@match_count], [@tst_count], [@ref_count], [@tst_info], [@ref_info]);
}
|
| 804 |
+
|
| 805 |
+
#################################
|
| 806 |
+
|
| 807 |
+
# BLEU without smoothing (selected via --no-smoothing): classic IBM BLEU where
# any n-gram order with zero matches zeroes the cumulative score.  Fills
# $SCOREmt->{n}{sys}{cum|ind} and returns the cumulative 4-gram BLEU.
sub bleu_score_nosmoothing
{
    my ($ref_length, $matching_ngrams, $tst_ngrams, $sys, $SCOREmt) = @_;
    my $score = 0;    # running sum of log precisions
    my $iscore = 0;   # individual n-gram log precision

    for ( my $j = 1; $j <= $max_Ngram; ++$j )
    {
        if ($matching_ngrams->[ $j ] == 0)
        {
            # No matches at this order: cumulative score is 0 (log would be -inf).
            # NOTE(review): {ind} is left untouched in this branch — presumably
            # intentional (it keeps its initialized value); confirm before changing.
            $SCOREmt->{ $j }{ $sys }{ cum }=0;
        }
        else
        {
            # Brevity penalty exponent: min(0, 1 - ref_len / candidate_len).
            my $len_score = min (0, 1-$ref_length/$tst_ngrams->[1]);
            # Cumulative N-Gram score
            $score += log( $matching_ngrams->[ $j ] / $tst_ngrams->[ $j ] );
            $SCOREmt->{ $j }{ $sys }{ cum } = exp( $score / $j + $len_score );
            # Individual N-Gram score
            $iscore = log( $matching_ngrams->[ $j ] / $tst_ngrams->[ $j ] );
            $SCOREmt->{ $j }{ $sys }{ ind } = exp( $iscore );
        }
    }
    # BLEU conventionally reports the cumulative 4-gram score.
    return $SCOREmt->{ 4 }{ $sys }{ cum };
}
|
| 832 |
+
|
| 833 |
+
###############################################################################################################################
|
| 834 |
+
# Default method used to compute the BLEU score, using smoothing.
|
| 835 |
+
# Note that the method used can be overridden using the '--no-smoothing' command-line argument
|
| 836 |
+
# The smoothing is computed by taking 1 / ( 2^k ), instead of 0, for each precision score whose matching n-gram count is null
|
| 837 |
+
# k is 1 for the first 'n' value for which the n-gram match count is null
|
| 838 |
+
# For example, if the text contains:
|
| 839 |
+
# - one 2-gram match
|
| 840 |
+
# - and (consequently) two 1-gram matches
|
| 841 |
+
# the n-gram count for each individual precision score would be:
|
| 842 |
+
# - n=1 => prec_count = 2 (two unigrams)
|
| 843 |
+
# - n=2 => prec_count = 1 (one bigram)
|
| 844 |
+
# - n=3 => prec_count = 1/2 (no trigram, taking 'smoothed' value of 1 / ( 2^k ), with k=1)
|
| 845 |
+
# - n=4 => prec_count = 1/4 (no fourgram, taking 'smoothed' value of 1 / ( 2^k ), with k=2)
|
| 846 |
+
###############################################################################################################################
|
| 847 |
+
# Default (smoothed) BLEU — see the header comment above for the smoothing
# scheme: each n-gram order with zero matches contributes 1/(2^k * tst_count)
# instead of zero, with k incremented per zero-match order.  Fills
# $SCOREmt->{n}{sys}{cum|ind} and returns the cumulative 4-gram BLEU.
sub bleu_score
{
    my ($ref_length, $matching_ngrams, $tst_ngrams, $sys, $SCOREmt,$report_length) = @_;
    my $score = 0;    # running sum of (possibly smoothed) log precisions
    my $iscore = 0;   # individual n-gram log precision
    # Brevity penalty: exp(min(0, 1 - ref_len / candidate_len)); guarded so an
    # empty candidate keeps the penalty at 0 instead of dividing by zero.
    my $exp_len_score = 0;
    $exp_len_score = exp( min (0, 1 - $ref_length / $tst_ngrams->[ 1 ] ) ) if ( $tst_ngrams->[ 1 ] > 0 );
    # NOTE(review): if $report_length is set while $ref_length or
    # $exp_len_score is 0, this print divides by zero / takes log(0) — assumed
    # unreachable for the system-level call; confirm before relying on it.
    print "length ratio: ".($tst_ngrams->[1]/$ref_length)." ($tst_ngrams->[1]/$ref_length), penalty (log): ".log($exp_len_score)."\n" if $report_length;
    my $smooth = 1;   # doubles on each zero-match order (the 2^k above)
    for ( my $j = 1; $j <= $max_Ngram; ++$j )
    {
        if ( $tst_ngrams->[ $j ] == 0 )
        {
            # Candidate too short to contain any j-grams.
            $iscore = 0;
        }
        elsif ( $matching_ngrams->[ $j ] == 0 )
        {
            # Smoothed precision for a zero-match order.
            $smooth *= 2;
            $iscore = log( 1 / ( $smooth * $tst_ngrams->[ $j ] ) );
        }
        else
        {
            $iscore = log( $matching_ngrams->[ $j ] / $tst_ngrams->[ $j ] );
        }
        $SCOREmt->{ $j }{ $sys }{ ind } = exp( $iscore );
        $score += $iscore;
        $SCOREmt->{ $j }{ $sys }{ cum } = exp( $score / $j ) * $exp_len_score;
    }
    # BLEU conventionally reports the cumulative 4-gram score.
    return $SCOREmt->{ 4 }{ $sys }{ cum };
}
|
| 877 |
+
|
| 878 |
+
#################################
|
| 879 |
+
|
| 880 |
+
# NIST score: cumulative information-weighted n-gram precision scaled by the
# NIST length penalty.  Fills $SCOREmt->{n}{sys}{cum|ind} and returns the
# cumulative 5-gram score (the conventional NIST report value).
sub nist_score
{
    my ($nsys, $matching_ngrams, $tst_ngrams, $ref_ngrams, $tst_info, $ref_info, $sys, $SCOREmt) = @_;

    my $running = 0;
    foreach my $n ( 1 .. $max_Ngram )
    {
        # Information-weighted precision at order n (denominator floored at 1).
        my $precision = $tst_info->[$n] / max( $tst_ngrams->[$n], 1 );
        $running += $precision;
        # Length penalty compares candidate length to the average reference length.
        my $ratio = $tst_ngrams->[1] / ( $ref_ngrams->[1] / $nsys );
        $SCOREmt->{$n}{$sys}{cum} = $running   * nist_length_penalty( $ratio );
        $SCOREmt->{$n}{$sys}{ind} = $precision * nist_length_penalty( $ratio );
    }
    return $SCOREmt->{5}{$sys}{cum};
}
|
| 895 |
+
|
| 896 |
+
#################################
|
| 897 |
+
|
| 898 |
+
# Convert a list of words into a hash mapping every n-gram (n = 1..$max_Ngram,
# words joined with single spaces) to its occurrence count.  Returns a
# reference to a fresh hash.
sub Words2Ngrams
{
    my %count = ();

    # Slide a window over the word list: at each start position, extend the
    # n-gram one word at a time up to $max_Ngram (or the end of the list).
    while (@_)
    {
        my $gram;
        for (my $k = 0; $k < $max_Ngram and defined( my $w = $_[$k] ); $k++)
        {
            $gram = defined $gram ? "$gram $w" : $w;
            $count{$gram}++;
        }
        shift;
    }
    return {%count};
}
|
| 914 |
+
|
| 915 |
+
#################################
|
| 916 |
+
|
| 917 |
+
sub tokenization
|
| 918 |
+
{
|
| 919 |
+
my ($norm_text) = @_;
|
| 920 |
+
|
| 921 |
+
# language-independent part:
|
| 922 |
+
$norm_text =~ s/<skipped>//g; # strip "skipped" tags
|
| 923 |
+
$norm_text =~ s/-\n//g; # strip end-of-line hyphenation and join lines
|
| 924 |
+
$norm_text =~ s/\n/ /g; # join lines
|
| 925 |
+
$norm_text =~ s/"/"/g; # convert SGML tag for quote to "
|
| 926 |
+
$norm_text =~ s/&/&/g; # convert SGML tag for ampersand to &
|
| 927 |
+
$norm_text =~ s/</</g; # convert SGML tag for less-than to >
|
| 928 |
+
$norm_text =~ s/>/>/g; # convert SGML tag for greater-than to <
|
| 929 |
+
|
| 930 |
+
# language-dependent part (assuming Western languages):
|
| 931 |
+
$norm_text = " $norm_text ";
|
| 932 |
+
$norm_text =~ tr/[A-Z]/[a-z]/ unless $preserve_case;
|
| 933 |
+
$norm_text =~ s/([\{-\~\[-\` -\&\(-\+\:-\@\/])/ $1 /g; # tokenize punctuation
|
| 934 |
+
$norm_text =~ s/([^0-9])([\.,])/$1 $2 /g; # tokenize period and comma unless preceded by a digit
|
| 935 |
+
$norm_text =~ s/([\.,])([^0-9])/ $1 $2/g; # tokenize period and comma unless followed by a digit
|
| 936 |
+
$norm_text =~ s/([0-9])(-)/$1 $2 /g; # tokenize dash when preceded by a digit
|
| 937 |
+
$norm_text =~ s/\s+/ /g; # one space only between words
|
| 938 |
+
$norm_text =~ s/^\s+//; # no leading space
|
| 939 |
+
$norm_text =~ s/\s+$//; # no trailing space
|
| 940 |
+
|
| 941 |
+
return $norm_text;
|
| 942 |
+
}
|
| 943 |
+
|
| 944 |
+
|
| 945 |
+
sub tokenization_international
|
| 946 |
+
{
|
| 947 |
+
my ($norm_text) = @_;
|
| 948 |
+
|
| 949 |
+
$norm_text =~ s/<skipped>//g; # strip "skipped" tags
|
| 950 |
+
$norm_text =~ s/\p{Line_Break: Hyphen}\p{Zl}//g; # strip end-of-line hyphenation and join lines
|
| 951 |
+
$norm_text =~ s/\p{Zl}/ /g; # join lines
|
| 952 |
+
|
| 953 |
+
# replace entities
|
| 954 |
+
$norm_text =~ s/"/\"/g; # quote to "
|
| 955 |
+
$norm_text =~ s/&/&/g; # ampersand to &
|
| 956 |
+
$norm_text =~ s/</</g; # less-than to <
|
| 957 |
+
$norm_text =~ s/>/>/g; # greater-than to >
|
| 958 |
+
$norm_text =~ s/'/\'/g; # apostrophe to '
|
| 959 |
+
|
| 960 |
+
$norm_text = lc( $norm_text ) unless $preserve_case; # lowercasing if needed
|
| 961 |
+
$norm_text =~ s/([^[:ascii:]])/ $1 /g if ( $split_non_ASCII );
|
| 962 |
+
|
| 963 |
+
# punctuation: tokenize any punctuation unless followed AND preceded by a digit
|
| 964 |
+
$norm_text =~ s/(\P{N})(\p{P})/$1 $2 /g;
|
| 965 |
+
$norm_text =~ s/(\p{P})(\P{N})/ $1 $2/g;
|
| 966 |
+
|
| 967 |
+
$norm_text =~ s/(\p{S})/ $1 /g; # tokenize symbols
|
| 968 |
+
|
| 969 |
+
$norm_text =~ s/\p{Z}+/ /g; # one space only between words
|
| 970 |
+
$norm_text =~ s/^\p{Z}+//; # no leading space
|
| 971 |
+
$norm_text =~ s/\p{Z}+$//; # no trailing space
|
| 972 |
+
|
| 973 |
+
return $norm_text;
|
| 974 |
+
}
|
| 975 |
+
|
| 976 |
+
#################################
|
| 977 |
+
|
| 978 |
+
sub nist_length_penalty
|
| 979 |
+
{
|
| 980 |
+
my ($ratio) = @_;
|
| 981 |
+
return 1 if $ratio >= 1;
|
| 982 |
+
return 0 if $ratio <= 0;
|
| 983 |
+
my $ratio_x = 1.5;
|
| 984 |
+
my $score_x = 0.5;
|
| 985 |
+
my $beta = -log($score_x)/log($ratio_x)/log($ratio_x);
|
| 986 |
+
return exp (-$beta*log($ratio)*log($ratio));
|
| 987 |
+
}
|
| 988 |
+
|
| 989 |
+
#################################
|
| 990 |
+
|
| 991 |
+
sub date_time_stamp
|
| 992 |
+
{
|
| 993 |
+
my ($sec, $min, $hour, $mday, $mon, $year, $wday, $yday, $isdst) = localtime();
|
| 994 |
+
my @months = qw(Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec);
|
| 995 |
+
my ($date, $time);
|
| 996 |
+
$time = sprintf "%2.2d:%2.2d:%2.2d", $hour, $min, $sec;
|
| 997 |
+
$date = sprintf "%4.4s %3.3s %s", 1900+$year, $months[$mon], $mday;
|
| 998 |
+
return ($date, $time);
|
| 999 |
+
}
|
| 1000 |
+
|
| 1001 |
+
#################################
|
| 1002 |
+
|
| 1003 |
+
sub extract_sgml_tag_and_span
|
| 1004 |
+
{
|
| 1005 |
+
my ($name, $data) = @_;
|
| 1006 |
+
($data =~ m|<$name\s*([^>]*)>(.*?)</$name\s*>(.*)|si) ? ($1, $2, $3) : ();
|
| 1007 |
+
}
|
| 1008 |
+
|
| 1009 |
+
#################################
|
| 1010 |
+
|
| 1011 |
+
sub extract_sgml_tag_attribute
|
| 1012 |
+
{
|
| 1013 |
+
my ($name, $data) = @_;
|
| 1014 |
+
($data =~ m|$name\s*=\s*\"?([^\"]*)\"?|si) ? ($1) : ();
|
| 1015 |
+
}
|
| 1016 |
+
|
| 1017 |
+
#################################
|
| 1018 |
+
|
| 1019 |
+
sub max
|
| 1020 |
+
{
|
| 1021 |
+
my ($max, $next);
|
| 1022 |
+
|
| 1023 |
+
return unless defined ($max=pop);
|
| 1024 |
+
while (defined ($next=pop))
|
| 1025 |
+
{
|
| 1026 |
+
$max = $next if $next > $max;
|
| 1027 |
+
}
|
| 1028 |
+
return $max;
|
| 1029 |
+
}
|
| 1030 |
+
|
| 1031 |
+
#################################
|
| 1032 |
+
|
| 1033 |
+
sub min
|
| 1034 |
+
{
|
| 1035 |
+
my ($min, $next);
|
| 1036 |
+
|
| 1037 |
+
return unless defined ($min=pop);
|
| 1038 |
+
while (defined ($next=pop))
|
| 1039 |
+
{
|
| 1040 |
+
$min = $next if $next < $min;
|
| 1041 |
+
}
|
| 1042 |
+
return $min;
|
| 1043 |
+
}
|
| 1044 |
+
|
| 1045 |
+
#################################
|
| 1046 |
+
|
| 1047 |
+
sub printout_report
|
| 1048 |
+
{
|
| 1049 |
+
if ( $METHOD eq "BOTH" )
|
| 1050 |
+
{
|
| 1051 |
+
foreach my $sys (sort @tst_sys)
|
| 1052 |
+
{
|
| 1053 |
+
printf "NIST score = %2.4f BLEU score = %.4f for system \"$sys\"\n",$NISTmt{5}{$sys}{cum},$BLEUmt{4}{$sys}{cum};
|
| 1054 |
+
}
|
| 1055 |
+
}
|
| 1056 |
+
elsif ($METHOD eq "NIST" )
|
| 1057 |
+
{
|
| 1058 |
+
foreach my $sys (sort @tst_sys)
|
| 1059 |
+
{
|
| 1060 |
+
printf "NIST score = %2.4f for system \"$sys\"\n",$NISTmt{5}{$sys}{cum};
|
| 1061 |
+
}
|
| 1062 |
+
}
|
| 1063 |
+
elsif ($METHOD eq "BLEU" )
|
| 1064 |
+
{
|
| 1065 |
+
foreach my $sys (sort @tst_sys)
|
| 1066 |
+
{
|
| 1067 |
+
printf "\nBLEU score = %.4f for system \"$sys\"\n",$BLEUmt{4}{$sys}{cum};
|
| 1068 |
+
}
|
| 1069 |
+
}
|
| 1070 |
+
printf "\n# ------------------------------------------------------------------------\n\n";
|
| 1071 |
+
printf "Individual N-gram scoring\n";
|
| 1072 |
+
printf " 1-gram 2-gram 3-gram 4-gram 5-gram 6-gram 7-gram 8-gram 9-gram\n";
|
| 1073 |
+
printf " ------ ------ ------ ------ ------ ------ ------ ------ ------\n";
|
| 1074 |
+
|
| 1075 |
+
if ( ( $METHOD eq "BOTH" ) || ($METHOD eq "NIST") )
|
| 1076 |
+
{
|
| 1077 |
+
foreach my $sys (sort @tst_sys)
|
| 1078 |
+
{
|
| 1079 |
+
printf " NIST:";
|
| 1080 |
+
for (my $i=1; $i<=$max_Ngram; $i++)
|
| 1081 |
+
{
|
| 1082 |
+
printf " %2.4f ",$NISTmt{$i}{$sys}{ind}
|
| 1083 |
+
}
|
| 1084 |
+
printf " \"$sys\"\n";
|
| 1085 |
+
}
|
| 1086 |
+
printf "\n";
|
| 1087 |
+
}
|
| 1088 |
+
|
| 1089 |
+
if ( ( $METHOD eq "BOTH" ) || ($METHOD eq "BLEU") )
|
| 1090 |
+
{
|
| 1091 |
+
foreach my $sys (sort @tst_sys)
|
| 1092 |
+
{
|
| 1093 |
+
printf " BLEU:";
|
| 1094 |
+
for (my $i=1; $i<=$max_Ngram; $i++)
|
| 1095 |
+
{
|
| 1096 |
+
printf " %2.4f ",$BLEUmt{$i}{$sys}{ind}
|
| 1097 |
+
}
|
| 1098 |
+
printf " \"$sys\"\n";
|
| 1099 |
+
}
|
| 1100 |
+
}
|
| 1101 |
+
|
| 1102 |
+
printf "\n# ------------------------------------------------------------------------\n";
|
| 1103 |
+
printf "Cumulative N-gram scoring\n";
|
| 1104 |
+
printf " 1-gram 2-gram 3-gram 4-gram 5-gram 6-gram 7-gram 8-gram 9-gram\n";
|
| 1105 |
+
printf " ------ ------ ------ ------ ------ ------ ------ ------ ------\n";
|
| 1106 |
+
|
| 1107 |
+
if (( $METHOD eq "BOTH" ) || ($METHOD eq "NIST"))
|
| 1108 |
+
{
|
| 1109 |
+
foreach my $sys (sort @tst_sys)
|
| 1110 |
+
{
|
| 1111 |
+
printf " NIST:";
|
| 1112 |
+
for (my $i=1; $i<=$max_Ngram; $i++)
|
| 1113 |
+
{
|
| 1114 |
+
printf " %2.4f ",$NISTmt{$i}{$sys}{cum}
|
| 1115 |
+
}
|
| 1116 |
+
printf " \"$sys\"\n";
|
| 1117 |
+
}
|
| 1118 |
+
}
|
| 1119 |
+
printf "\n";
|
| 1120 |
+
if ( ( $METHOD eq "BOTH" ) || ($METHOD eq "BLEU") )
|
| 1121 |
+
{
|
| 1122 |
+
foreach my $sys (sort @tst_sys)
|
| 1123 |
+
{
|
| 1124 |
+
printf " BLEU:";
|
| 1125 |
+
for (my $i=1; $i<=$max_Ngram; $i++)
|
| 1126 |
+
{
|
| 1127 |
+
printf " %2.4f ",$BLEUmt{$i}{$sys}{cum}
|
| 1128 |
+
}
|
| 1129 |
+
printf " \"$sys\"\n";
|
| 1130 |
+
}
|
| 1131 |
+
}
|
| 1132 |
+
}
|
| 1133 |
+
|
| 1134 |
+
###############################################################################################################################
|
| 1135 |
+
# Create three files, by using:
|
| 1136 |
+
# - $prefix : the prefix used for the output file names
|
| 1137 |
+
# - %overall : a hash containing seg/doc/sys-level scores:
|
| 1138 |
+
# - $overall{ $SYSTEM_ID }{ 'score' } => system-level score
|
| 1139 |
+
# - $overall{ $SYSTEM_ID }{ 'documents' }{ $DOCUMENT_ID }{ 'score' } => document-level score
|
| 1140 |
+
# - $overall{ $SYSTEM_ID }{ 'documents' }{ $DOCUMENT_ID }{ 'segments' }{ $SEGMENT_ID } => segment-level score
|
| 1141 |
+
###############################################################################################################################
|
| 1142 |
+
sub outputMetricsMATR
|
| 1143 |
+
{
|
| 1144 |
+
my ( $prefix, %overall ) = @_;
|
| 1145 |
+
my $fileNameSys = $prefix . '-sys.scr';
|
| 1146 |
+
my $fileNameDoc = $prefix . '-doc.scr';
|
| 1147 |
+
my $fileNameSeg = $prefix . '-seg.scr';
|
| 1148 |
+
open FILEOUT_SYS, '>', $fileNameSys or die "Could not open file: ${fileNameSys}";
|
| 1149 |
+
open FILEOUT_DOC, '>', $fileNameDoc or die "Could not open file: ${fileNameDoc}";
|
| 1150 |
+
open FILEOUT_SEG, '>', $fileNameSeg or die "Could not open file: ${fileNameSeg}";
|
| 1151 |
+
foreach my $sys ( sort( keys( %overall ) ) )
|
| 1152 |
+
{
|
| 1153 |
+
my $scoreSys = $overall{ $sys }{ 'score' };
|
| 1154 |
+
print FILEOUT_SYS "${tst_id}\t${sys}\t${scoreSys}\n";
|
| 1155 |
+
foreach my $doc ( sort( keys( %{$overall{ $sys }{ 'documents' }} ) ) )
|
| 1156 |
+
{
|
| 1157 |
+
my $scoreDoc = $overall{ $sys }{ 'documents' }{ $doc }{ 'score' };
|
| 1158 |
+
print FILEOUT_DOC "${tst_id}\t${sys}\t${doc}\t${scoreDoc}\n";
|
| 1159 |
+
foreach my $seg ( sort{ $a <=> $b }( keys( %{$overall{ $sys }{ 'documents' }{ $doc }{ 'segments' }} ) ) )
|
| 1160 |
+
{
|
| 1161 |
+
my $scoreSeg = $overall{ $sys }{ 'documents' }{ $doc }{ 'segments' }{ $seg }{ 'score' };
|
| 1162 |
+
print FILEOUT_SEG "${tst_id}\t${sys}\t${doc}\t${seg}\t${scoreSeg}\n";
|
| 1163 |
+
}
|
| 1164 |
+
}
|
| 1165 |
+
}
|
| 1166 |
+
close FILEOUT_SEG;
|
| 1167 |
+
close FILEOUT_DOC;
|
| 1168 |
+
close FILEOUT_SYS;
|
| 1169 |
+
}
|
| 1170 |
+
|
mosesdecoder/scripts/generic/mteval-v14.pl
ADDED
|
@@ -0,0 +1,1179 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env perl
|
| 2 |
+
|
| 3 |
+
use warnings;
|
| 4 |
+
use strict;
|
| 5 |
+
use utf8;
|
| 6 |
+
use Encode;
|
| 7 |
+
use XML::Twig;
|
| 8 |
+
use Sort::Naturally;
|
| 9 |
+
|
| 10 |
+
binmode STDOUT, ":utf8";
|
| 11 |
+
binmode STDERR, ":utf8";
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
#################################
|
| 15 |
+
# History:
|
| 16 |
+
#
|
| 17 |
+
# version 14
|
| 18 |
+
# (2016-03-29 lukas.diduch@nist.gov)
|
| 19 |
+
# * Fixed warning message in case seg-id is a string, by sorting in correct order using Sort::Naturally.
|
| 20 |
+
#
|
| 21 |
+
# version 13b
|
| 22 |
+
# * Fixed die 'bug' in case seg->id = 0
|
| 23 |
+
#
|
| 24 |
+
# version 13a
|
| 25 |
+
# * modified the scoring functions to prevent division-by-zero errors when a system segment is empty
|
| 26 |
+
# * affected methods: 'bleu_score' and 'bleu_score_smoothing'
|
| 27 |
+
#
|
| 28 |
+
# version 13
|
| 29 |
+
# * Uses a XML parser to read data (only when extension is .xml)
|
| 30 |
+
# * Smoothing of the segment-level BLEU scores, done by default
|
| 31 |
+
# * smoothing method similar to that of bleu-1.04.pl (IBM)
|
| 32 |
+
# * see comments above the 'bleu_score' method for more details on how the smoothing is computed
|
| 33 |
+
# * added a '--no-smoothing' option to simulate old scripts behavior
|
| 34 |
+
# * Introduction of the 'brevity-penalty' option, taking one of two values:
|
| 35 |
+
# * 'closest' (default) : act as IBM BLEU (taking the closest reference translation length)
|
| 36 |
+
# * in case two reference translations are at the same distance, will take the shortest one
|
| 37 |
+
# * for more details regarding how the BP is computed, see comments of the 'brevity_penalty_closest' function
|
| 38 |
+
# * 'shortest' : act as previous versions of the script (taking shortest reference translation length)
|
| 39 |
+
# * Introduction of the 'international-tokenization' option, boolean, disabled by default
|
| 40 |
+
# by default (when the option is not provided), uses 11b's tokenization function
|
| 41 |
+
# when option specified, uses v12's tokenization function
|
| 42 |
+
# * Introduction of a 'Metrics MATR output' flag (option '--metricsMATR')
|
| 43 |
+
# when used, creates three files for both BLEU score and NIST score:
|
| 44 |
+
# * BLEU-seg.scr and NIST-seg.scr: contain segment-level scores
|
| 45 |
+
# * BLEU-doc.scr and NIST-doc.scr: contain document-level scores
|
| 46 |
+
# * BLEU-sys.scr and NIST-sys.scr: contain system-level scores
|
| 47 |
+
# * SGML parsing
|
| 48 |
+
# * script will halt if source, reference and test files don't share the same setid attribute value (used for metricsMATR output)
|
| 49 |
+
# * correct segment IDs extracted from the files (was previously using an array, and using the index as a segID for output)
|
| 50 |
+
# * detailed output flag (-d) can now be used when running both BLEU and NIST
|
| 51 |
+
#
|
| 52 |
+
# version 12
|
| 53 |
+
# * Text normalization changes:
|
| 54 |
+
# * convert entity references (only the entities declared in the DTD)
|
| 55 |
+
# * now uses unicode categories
|
| 56 |
+
# * tokenize punctuation unless followed AND preceded by digits
|
| 57 |
+
# * tokenize symbols
|
| 58 |
+
# * UTF-8 handling:
|
| 59 |
+
# * files are now read using utf8 mode
|
| 60 |
+
# * Added the '-e' command-line option to enclose non-ASCII characters between spaces
|
| 61 |
+
#
|
| 62 |
+
# version 11b -- text normalization modified:
|
| 63 |
+
# * take out the join digit line because it joins digits
|
| 64 |
+
# when it shouldn't have
|
| 65 |
+
# $norm_text =~ s/(\d)\s+(?=\d)/$1/g; #join digits
|
| 66 |
+
#
|
| 67 |
+
# version 11a -- corrected output of individual n-gram precision values
|
| 68 |
+
#
|
| 69 |
+
# version 11 -- bug fixes:
|
| 70 |
+
# * make filehandle operate in binary mode to prevent Perl from operating
|
| 71 |
+
# (by default in Red Hat 9) in UTF-8
|
| 72 |
+
# * fix failure on joining digits
|
| 73 |
+
# version 10 -- updated output to include more details of n-gram scoring.
|
| 74 |
+
# Defaults to generate both NIST and BLEU scores. Use -b for BLEU
|
| 75 |
+
# only, use -n for NIST only
|
| 76 |
+
#
|
| 77 |
+
# version 09d -- bug fix (for BLEU scoring, ngrams were fixed at 4
|
| 78 |
+
# being the max, regardless what was entered on the command line.)
|
| 79 |
+
#
|
| 80 |
+
# version 09c -- bug fix (During the calculation of ngram information,
|
| 81 |
+
# each ngram was being counted only once for each segment. This has
|
| 82 |
+
# been fixed so that each ngram is counted correctly in each segment.)
|
| 83 |
+
#
|
| 84 |
+
# version 09b -- text normalization modified:
|
| 85 |
+
# * option flag added to preserve upper case
|
| 86 |
+
# * non-ASCII characters left in place.
|
| 87 |
+
#
|
| 88 |
+
# version 09a -- text normalization modified:
|
| 89 |
+
# * " and & converted to "" and &, respectively
|
| 90 |
+
# * non-ASCII characters kept together (bug fix)
|
| 91 |
+
#
|
| 92 |
+
# version 09 -- modified to accommodate sgml tag and attribute
|
| 93 |
+
# names revised to conform to default SGML conventions.
|
| 94 |
+
#
|
| 95 |
+
# version 08 -- modifies the NIST metric in accordance with the
|
| 96 |
+
# findings on the 2001 Chinese-English dry run corpus. Also
|
| 97 |
+
# incorporates the BLEU metric as an option and supports the
|
| 98 |
+
# output of ngram detail.
|
| 99 |
+
#
|
| 100 |
+
# version 07 -- in response to the MT meeting on 28 Jan 2002 at ISI
|
| 101 |
+
# Keep strings of non-ASCII characters together as one word
|
| 102 |
+
# (rather than splitting them into one-character words).
|
| 103 |
+
# Change length penalty so that translations that are longer than
|
| 104 |
+
# the average reference translation are not penalized.
|
| 105 |
+
#
|
| 106 |
+
# version 06
|
| 107 |
+
# Prevent divide-by-zero when a segment has no evaluation N-grams.
|
| 108 |
+
# Correct segment index for level 3 debug output.
|
| 109 |
+
#
|
| 110 |
+
# version 05
|
| 111 |
+
# improve diagnostic error messages
|
| 112 |
+
#
|
| 113 |
+
# version 04
|
| 114 |
+
# tag segments
|
| 115 |
+
#
|
| 116 |
+
# version 03
|
| 117 |
+
# add detailed output option (intermediate document and segment scores)
|
| 118 |
+
#
|
| 119 |
+
# version 02
|
| 120 |
+
# accommodation of modified sgml tags and attributes
|
| 121 |
+
#
|
| 122 |
+
# version 01
|
| 123 |
+
# same as bleu version 15, but modified to provide formal score output.
|
| 124 |
+
#
|
| 125 |
+
# original IBM version
|
| 126 |
+
# Author: Kishore Papineni
|
| 127 |
+
# Date: 06/10/2001
|
| 128 |
+
#################################
|
| 129 |
+
|
| 130 |
+
######
|
| 131 |
+
# Intro
|
| 132 |
+
my ($date, $time) = date_time_stamp();
|
| 133 |
+
print "MT evaluation scorer began on $date at $time\n";
|
| 134 |
+
print "\ncommand line: ", $0, " ", join(" ", @ARGV), "\n";
|
| 135 |
+
my $usage = "\n\nUsage: $0 -r <ref_file> -s <src_file> -t <tst_file>\n\n".
|
| 136 |
+
"Description: This Perl script evaluates MT system performance.\n".
|
| 137 |
+
"\n".
|
| 138 |
+
"Required arguments:\n".
|
| 139 |
+
" -r <ref_file> is a file containing the reference translations for\n".
|
| 140 |
+
" the documents to be evaluated.\n".
|
| 141 |
+
" -s <src_file> is a file containing the source documents for which\n".
|
| 142 |
+
" translations are to be evaluated\n".
|
| 143 |
+
" -t <tst_file> is a file containing the translations to be evaluated\n".
|
| 144 |
+
"\n".
|
| 145 |
+
"Optional arguments:\n".
|
| 146 |
+
" -h prints this help message to STDOUT\n".
|
| 147 |
+
" -c preserves upper-case alphabetic characters\n".
|
| 148 |
+
" -b generate BLEU scores only\n".
|
| 149 |
+
" -n generate NIST scores only\n".
|
| 150 |
+
" -d detailed output flag:\n".
|
| 151 |
+
" 0 (default) for system-level score only\n".
|
| 152 |
+
" 1 to include document-level scores\n".
|
| 153 |
+
" 2 to include segment-level scores\n".
|
| 154 |
+
" 3 to include ngram-level scores\n".
|
| 155 |
+
" -e enclose non-ASCII characters between spaces\n".
|
| 156 |
+
" --brevity-penalty ( closest | shortest )\n" .
|
| 157 |
+
" closest (default) : acts as IBM BLEU (takes the closest reference translation length)\n" .
|
| 158 |
+
" shortest : acts as previous versions of the script (takes the shortest reference translation length)\n" .
|
| 159 |
+
" --international-tokenization\n" .
|
| 160 |
+
" when specified, uses Unicode-based (only) tokenization rules\n" .
|
| 161 |
+
" when not specified (default), uses default tokenization (some language-dependant rules)\n" .
|
| 162 |
+
" --metricsMATR : create three files for both BLEU scores and NIST scores:\n" .
|
| 163 |
+
" BLEU-seg.scr and NIST-seg.scr : segment-level scores\n" .
|
| 164 |
+
" BLEU-doc.scr and NIST-doc.scr : document-level scores\n" .
|
| 165 |
+
" BLEU-sys.scr and NIST-sys.scr : system-level scores\n" .
|
| 166 |
+
" --no-smoothing : disable smoothing on BLEU scores\n" .
|
| 167 |
+
"\n";
|
| 168 |
+
|
| 169 |
+
use vars qw ($opt_r $opt_s $opt_t $opt_d $opt_h $opt_b $opt_n $opt_c $opt_x $opt_e);
|
| 170 |
+
use Getopt::Long;
|
| 171 |
+
my $ref_file = '';
|
| 172 |
+
my $src_file = '';
|
| 173 |
+
my $tst_file = '';
|
| 174 |
+
my $detail = 0;
|
| 175 |
+
my $help = '';
|
| 176 |
+
my $preserve_case = '';
|
| 177 |
+
my $split_non_ASCII = '';
|
| 178 |
+
my $brevity_penalty = 'closest';
|
| 179 |
+
my $international_tokenization;
|
| 180 |
+
my $metricsMATR_output = '';
|
| 181 |
+
my $no_smoothing = '';
|
| 182 |
+
our $opt_x = '';
|
| 183 |
+
our $opt_b = '';
|
| 184 |
+
our $opt_n = '';
|
| 185 |
+
GetOptions(
|
| 186 |
+
'r=s' => \$ref_file,
|
| 187 |
+
's=s' => \$src_file,
|
| 188 |
+
't=s' => \$tst_file,
|
| 189 |
+
'd:i' => \$detail,
|
| 190 |
+
'h|help' => \$help,
|
| 191 |
+
'b',
|
| 192 |
+
'n',
|
| 193 |
+
'c' => \$preserve_case,
|
| 194 |
+
'x:s',
|
| 195 |
+
'e' => \$split_non_ASCII,
|
| 196 |
+
'brevity-penalty:s' => \$brevity_penalty,
|
| 197 |
+
'international-tokenization' => \$international_tokenization,
|
| 198 |
+
'metricsMATR-output' => \$metricsMATR_output,
|
| 199 |
+
'no-smoothing' => \$no_smoothing
|
| 200 |
+
);
|
| 201 |
+
die $usage if $help;
|
| 202 |
+
|
| 203 |
+
die "Error in command line: ref_file not defined$usage" unless ( $ref_file );
|
| 204 |
+
die "Error in command line: src_file not defined$usage" unless ( $src_file );
|
| 205 |
+
die "Error in command line: tst_file not defined$usage" unless ( $tst_file );
|
| 206 |
+
my $BLEU_BP;
|
| 207 |
+
if ( !( $brevity_penalty cmp 'closest' ) )
|
| 208 |
+
{
|
| 209 |
+
$BLEU_BP = \&brevity_penalty_closest;
|
| 210 |
+
}
|
| 211 |
+
elsif ( !( $brevity_penalty cmp 'shortest' ) )
|
| 212 |
+
{
|
| 213 |
+
$BLEU_BP = \&brevity_penalty_shortest;
|
| 214 |
+
}
|
| 215 |
+
else
|
| 216 |
+
{
|
| 217 |
+
die "Incorrect value supplied for 'brevity_penalty'$usage";
|
| 218 |
+
}
|
| 219 |
+
my $TOKENIZATION = \&tokenization;
|
| 220 |
+
$TOKENIZATION = \&tokenization_international if ( $international_tokenization );
|
| 221 |
+
|
| 222 |
+
my $BLEU_SCORE = \&bleu_score;
|
| 223 |
+
$BLEU_SCORE = \&bleu_score_nosmoothing if ( $no_smoothing );
|
| 224 |
+
|
| 225 |
+
my $max_Ngram = 9;
|
| 226 |
+
|
| 227 |
+
my $METHOD = "BOTH";
|
| 228 |
+
if ( $opt_b ) { $METHOD = "BLEU"; }
|
| 229 |
+
if ( $opt_n ) { $METHOD = "NIST"; }
|
| 230 |
+
my $method;
|
| 231 |
+
|
| 232 |
+
######
|
| 233 |
+
# Global variables
|
| 234 |
+
my ($src_lang, $tgt_lang, @tst_sys, @ref_sys); # evaluation parameters
|
| 235 |
+
my (%tst_data, %ref_data); # the data -- with structure: {system}{document}{segments}
|
| 236 |
+
my ($src_id, $ref_id, $tst_id); # unique identifiers for ref and tst translation sets
|
| 237 |
+
my %eval_docs; # document information for the evaluation data set
|
| 238 |
+
my %ngram_info; # the information obtained from (the last word in) the ngram
|
| 239 |
+
|
| 240 |
+
######
|
| 241 |
+
# Get source document ID's
|
| 242 |
+
($src_id) = get_source_info ($src_file);
|
| 243 |
+
|
| 244 |
+
######
|
| 245 |
+
# Get reference translations
|
| 246 |
+
($ref_id) = get_MT_data (\%ref_data, "RefSet", $ref_file);
|
| 247 |
+
|
| 248 |
+
compute_ngram_info ();
|
| 249 |
+
|
| 250 |
+
######
|
| 251 |
+
# Get translations to evaluate
|
| 252 |
+
($tst_id) = get_MT_data (\%tst_data, "TstSet", $tst_file);
|
| 253 |
+
|
| 254 |
+
######
|
| 255 |
+
# Check data for completeness and correctness
|
| 256 |
+
check_MT_data ();
|
| 257 |
+
|
| 258 |
+
######
|
| 259 |
+
#
|
| 260 |
+
my %NISTmt;
|
| 261 |
+
my %NISTOverall;
|
| 262 |
+
my %BLEUmt;
|
| 263 |
+
my %BLEUOverall;
|
| 264 |
+
|
| 265 |
+
######
|
| 266 |
+
# Evaluate
|
| 267 |
+
print "\nEvaluation of $src_lang-to-$tgt_lang translation using:\n";
|
| 268 |
+
my $cum_seg = 0;
|
| 269 |
+
foreach my $doc (sort keys %eval_docs)
|
| 270 |
+
{
|
| 271 |
+
$cum_seg += scalar( keys( %{$eval_docs{$doc}{SEGS}} ) );
|
| 272 |
+
}
|
| 273 |
+
print " src set \"$src_id\" (", scalar keys %eval_docs, " docs, $cum_seg segs)\n";
|
| 274 |
+
print " ref set \"$ref_id\" (", scalar keys %ref_data, " refs)\n";
|
| 275 |
+
print " tst set \"$tst_id\" (", scalar keys %tst_data, " systems)\n\n";
|
| 276 |
+
|
| 277 |
+
foreach my $sys (sort @tst_sys)
|
| 278 |
+
{
|
| 279 |
+
for (my $n=1; $n<=$max_Ngram; $n++)
|
| 280 |
+
{
|
| 281 |
+
$NISTmt{$n}{$sys}{cum} = 0;
|
| 282 |
+
$NISTmt{$n}{$sys}{ind} = 0;
|
| 283 |
+
$BLEUmt{$n}{$sys}{cum} = 0;
|
| 284 |
+
$BLEUmt{$n}{$sys}{ind} = 0;
|
| 285 |
+
}
|
| 286 |
+
if ( ($METHOD eq "BOTH") || ($METHOD eq "NIST") )
|
| 287 |
+
{
|
| 288 |
+
$method="NIST";
|
| 289 |
+
score_system ($sys, \%NISTmt, \%NISTOverall);
|
| 290 |
+
}
|
| 291 |
+
if ( ($METHOD eq "BOTH") || ($METHOD eq "BLEU") )
|
| 292 |
+
{
|
| 293 |
+
$method="BLEU";
|
| 294 |
+
score_system ($sys, \%BLEUmt, \%BLEUOverall);
|
| 295 |
+
}
|
| 296 |
+
}
|
| 297 |
+
|
| 298 |
+
######
|
| 299 |
+
printout_report ();
|
| 300 |
+
if ( $metricsMATR_output )
|
| 301 |
+
{
|
| 302 |
+
outputMetricsMATR( 'NIST', %NISTOverall ) if ( ( $METHOD eq 'BOTH' ) || ( $METHOD eq 'NIST' ) );
|
| 303 |
+
outputMetricsMATR( 'BLEU', %BLEUOverall ) if ( ( $METHOD eq 'BOTH' ) || ( $METHOD eq 'BLEU' ) );
|
| 304 |
+
}
|
| 305 |
+
|
| 306 |
+
($date, $time) = date_time_stamp();
|
| 307 |
+
print "\nMT evaluation scorer ended on $date at $time\n";
|
| 308 |
+
|
| 309 |
+
exit 0;
|
| 310 |
+
|
| 311 |
+
#################################
|
| 312 |
+
|
| 313 |
+
# Parse the source-side file and fill the global %eval_docs hash with the
# tokenized segments of every document; records/validates the global
# $src_lang and returns the set id.
sub get_source_info {
    my ($file) = @_;
    my ($name, $id, $src, $doc, $seg);
    my ($data, $tag, $span);

    # Extension of the file determines the parser used:
    #   .xml      : XML::Twig
    #   otherwise : simple SGML parsing functions
    if ( $file =~ /\.xml$/i ) {
        my $twig = XML::Twig->new();
        $twig->parsefile( $file );
        my $setNode = $twig->root->first_child( 'srcset' );
        die "Source XML file '$file' does not contain the 'srcset' element" if ( not $setNode );
        $id  = $setNode->{ 'att' }->{ 'setid' }   or die "No 'setid' attribute value in '$file'";
        $src = $setNode->{ 'att' }->{ 'srclang' } or die "No srcset 'srclang' attribute value in '$file'";
        die "Not the same srclang attribute values across sets" unless ( not defined $src_lang or $src eq $src_lang );
        $src_lang = $src;
        for my $docNode ( $setNode->get_xpath( './/doc' ) ) {
            my $docID = $docNode->{ 'att' }->{ 'docid' } or die "No document 'docid' attribute value in '$file'";
            for my $segNode ( $docNode->get_xpath( './/seg' ) ) {
                my $segID = $segNode->{ 'att' }->{ 'id' };
                die "No segment 'id' attribute value in '$file'" if (! defined $segID);
                # tokenize and store this segment's text
                ($eval_docs{$docID}{SEGS}{$segID}) = &{ $TOKENIZATION }( $segNode->text );
            }
        }
    }
    else {
        # slurp the whole file
        open (FILE, $file) or die "\nUnable to open translation data file '$file'", $usage;
        binmode FILE, ":utf8";
        $data .= $_ while <FILE>;
        close (FILE);

        # locate the source set and validate its attributes
        (($tag, $span, $data) = extract_sgml_tag_and_span ("SrcSet", $data))
            or die "\n\nFATAL INPUT ERROR: no 'src_set' tag in src_file '$file'\n\n";
        (($id) = extract_sgml_tag_attribute ($name="SetID", $tag))
            or die "\n\nFATAL INPUT ERROR: no tag attribute '$name' in file '$file'\n\n";
        (($src) = extract_sgml_tag_attribute ($name="SrcLang", $tag))
            or die "\n\nFATAL INPUT ERROR: no tag attribute '$name' in file '$file'\n\n";
        (not defined $src_lang or $src eq $src_lang)
            or die "\n\nFATAL INPUT ERROR: $name ('$src') in file '$file' inconsistent\n"
                  ." with $name in previous input data ('$src_lang')\n\n";
        $src_lang = $src;

        # walk the documents, then the segments inside each document
        $data = $span;
        while (($tag, $span, $data) = extract_sgml_tag_and_span ("Doc", $data)) {
            (($doc) = extract_sgml_tag_attribute ($name="DocID", $tag))
                or die "\n\nFATAL INPUT ERROR: no tag attribute '$name' in file '$file'\n\n";
            die "\n\nFATAL INPUT ERROR: duplicate '$name' in file '$file'\n\n"
                if defined $eval_docs{$doc};
            $span =~ s/[\s\n\r]+/ /g;    # concatenate records
            my $nseg = 0;
            my $seg_data = $span;
            while (($tag, $span, $seg_data) = extract_sgml_tag_and_span ("Seg", $seg_data)) {
                (($seg) = extract_sgml_tag_attribute( $name='id', $tag ))
                    or die "\n\nFATAL INPUT ERROR: no attribute '$name' in file '$file'\n\n";
                ($eval_docs{$doc}{SEGS}{$seg}) = &{ $TOKENIZATION }( $span );
                $nseg++;
            }
            die "\n\nFATAL INPUT ERROR: no segments in document '$doc' in file '$file'\n\n"
                if $nseg == 0;
        }
        die "\n\nFATAL INPUT ERROR: no documents in file '$file'\n\n"
            unless keys %eval_docs > 0;
    }
    return $id;
}
|
| 391 |
+
|
| 392 |
+
#################################
|
| 393 |
+
|
| 394 |
+
# Load one translation (test) or reference file into $docs->{sysid}{docid},
# tokenizing every segment; validates the set/language attributes against
# the globals $src_lang / $tgt_lang and returns the set id.
#   $docs    : hashref to populate (\%tst_data or \%ref_data)
#   $set_tag : SGML set tag name to look for (e.g. "TstSet" / "RefSet")
#   $file    : path of the file to read
sub get_MT_data
{
  my ($docs, $set_tag, $file) = @_;
  my ($name, $id, $src, $tgt, $sys, $doc, $seg);
  my ($tag, $span, $data);

  # Extension of the file determines the parser used:
  # .xml : XML::Twig
  # otherwise : simple SGML parsing functions
  if ( $file =~ /\.xml$/i )
  {
    my $twig = XML::Twig->new();
    $twig->parsefile( $file );
    my $root = $twig->root;
    foreach my $currentSet ( $root->get_xpath( 'refset' ), $root->get_xpath( 'tstset' ) )
    {
      $id = $currentSet->{ 'att' }->{ 'setid' } or die "No 'setid' attribute value in '$file'";
      $src = $currentSet->{ 'att' }->{ 'srclang' } or die "No 'srclang' attribute value in '$file'";
      $tgt = $currentSet->{ 'att' }->{ 'trglang' } or die "No 'trglang' attribute value in '$file'";
      die "Not the same 'srclang' attribute value across sets" unless ( $src eq $src_lang );
      # BUG FIX: the original used '=' (assignment) instead of 'eq' here, so
      # the consistency check always passed and silently clobbered $tgt.
      die "Not the same 'trglang' attribute value across sets" unless ( ( not defined $tgt_lang ) or ( $tgt eq $tgt_lang ) );
      $tgt_lang = $tgt;
      # a test set carries 'sysid'; a reference set carries 'refid'
      my $sys;
      if ( $currentSet->name eq 'tstset' )
      {
        $sys = $currentSet->{ 'att' }->{ 'sysid' } or die "No 'sysid' attribute value in '$file'";
      }
      else
      {
        $sys = $currentSet->{ 'att' }->{ 'refid' } or die "No 'refid' attribute value in '$file'";
      }
      foreach my $currentDoc ( $currentSet->get_xpath( './/doc' ) )
      {
        my $docID = $currentDoc->{ 'att' }->{ 'docid' } or die "No document 'docid' attribute value in '$file'";
        $docs->{ $sys }{ $docID }{ FILE } = $file;
        foreach my $currentSeg ( $currentDoc->get_xpath( './/seg' ) )
        {
          my $segID = $currentSeg->{ 'att' }->{ 'id' };
          die "No segment 'id' attribute value in '$file'" if (! defined $segID);
          my $segData = $currentSeg->text;
          # tokenize and store this segment's text
          ($docs->{$sys}{$docID}{SEGS}{$segID}) = &{ $TOKENIZATION }( $segData );
        }
      }
    }
  }
  else
  {
    #read data from file
    open (FILE, $file) or die "\nUnable to open translation data file '$file'", $usage;
    binmode FILE, ":utf8";
    $data .= $_ while <FILE>;
    close (FILE);

    #get tag info
    while (($tag, $span, $data) = extract_sgml_tag_and_span ($set_tag, $data))
    {
      die "\n\nFATAL INPUT ERROR: no tag attribute '$name' in file '$file'\n\n"
        unless ($id) = extract_sgml_tag_attribute ($name="SetID", $tag);
      die "\n\nFATAL INPUT ERROR: no tag attribute '$name' in file '$file'\n\n"
        unless ($src) = extract_sgml_tag_attribute ($name="SrcLang", $tag);
      die "\n\nFATAL INPUT ERROR: $name ('$src') in file '$file' inconsistent\n"
        ." with $name of source ('$src_lang')\n\n"
        unless $src eq $src_lang;
      die "\n\nFATAL INPUT ERROR: no tag attribute '$name' in file '$file'\n\n"
        unless ($tgt) = extract_sgml_tag_attribute ($name="TrgLang", $tag);
      die "\n\nFATAL INPUT ERROR: $name ('$tgt') in file '$file' inconsistent\n"
        ." with $name of the evaluation ('$tgt_lang')\n\n"
        unless (not defined $tgt_lang or $tgt eq $tgt_lang);
      $tgt_lang = $tgt;

      # walk the documents, then the segments inside each document
      my $mtdata = $span;
      while (($tag, $span, $mtdata) = extract_sgml_tag_and_span ("Doc", $mtdata))
      {
        die "\n\nFATAL INPUT ERROR: no tag attribute '$name' in file '$file'\n\n"
          unless (my $sys) = extract_sgml_tag_attribute ($name="SysID", $tag);
        die "\n\nFATAL INPUT ERROR: no tag attribute '$name' in file '$file'\n\n"
          unless $doc = extract_sgml_tag_attribute ($name="DocID", $tag);
        die "\n\nFATAL INPUT ERROR: document '$doc' for system '$sys' in file '$file'\n"
          ." previously loaded from file '$docs->{$sys}{$doc}{FILE}'\n\n"
          unless (not defined $docs->{$sys}{$doc});

        $span =~ s/[\s\n\r]+/ /g;    # concatenate records
        my $nseg = 0;
        my $seg_data = $span;
        while (($tag, $span, $seg_data) = extract_sgml_tag_and_span ("Seg", $seg_data))
        {
          # BUG FIX: error message previously read "FATAIL" (typo).
          die "\n\nFATAL INPUT ERROR: no tag attribute '$name' in file '$file'\n\n"
            unless $seg = extract_sgml_tag_attribute( $name="id", $tag );
          ($docs->{$sys}{$doc}{SEGS}{$seg}) = &{ $TOKENIZATION }( $span );
          $nseg++;
        }
        die "\n\nFATAL INPUT ERROR: no segments in document '$doc' in file '$file'\n\n" if $nseg == 0;
        $docs->{$sys}{$doc}{FILE} = $file;
      }
    }
  }
  return $id;
}
|
| 491 |
+
|
| 492 |
+
#################################
|
| 493 |
+
|
| 494 |
+
# Sanity-check the loaded data: set ids must agree across source/test/ref
# files, and every source document must appear, with the same number of
# segments, in every system output and every reference.
sub check_MT_data {
    @tst_sys = sort keys %tst_data;
    @ref_sys = sort keys %ref_data;

    die "Not the same 'setid' attribute values across files" unless ( ( $src_id eq $tst_id ) && ( $src_id eq $ref_id ) );

    #every evaluation document must be represented for every system and every reference
    foreach my $doc (sort keys %eval_docs) {
        my $nseg_source = scalar( keys( %{$eval_docs{$doc}{SEGS}} ) );

        # hypothesis sides
        foreach my $sys (@tst_sys) {
            die "\n\nFATAL ERROR: no document '$doc' for system '$sys'\n\n" unless defined $tst_data{$sys}{$doc};
            my $nseg = scalar( keys( %{$tst_data{$sys}{$doc}{SEGS}} ) );
            ($nseg == $nseg_source)
                or die "\n\nFATAL ERROR: translated documents must contain the same # of segments as the source, but\n"
                      ." document '$doc' for system '$sys' contains $nseg segments, while\n"
                      ." the source document contains $nseg_source segments.\n\n";
        }

        # reference sides
        foreach my $sys (@ref_sys) {
            die "\n\nFATAL ERROR: no document '$doc' for reference '$sys'\n\n" unless defined $ref_data{$sys}{$doc};
            my $nseg = scalar( keys( %{$ref_data{$sys}{$doc}{SEGS}} ) );
            ($nseg == $nseg_source)
                or die "\n\nFATAL ERROR: translated documents must contain the same # of segments as the source, but\n"
                      ." document '$doc' for system '$sys' contains $nseg segments, while\n"
                      ." the source document contains $nseg_source segments.\n\n";
        }
    }
}
|
| 525 |
+
|
| 526 |
+
#################################
|
| 527 |
+
|
| 528 |
+
# Precompute the NIST information value of every n-gram observed in the
# reference data:  info(w1..wn) = -log2 P(wn | w1..wn-1), estimated from
# reference counts.  Fills the global %ngram_info hash.
sub compute_ngram_info {
    my ($ref, $doc, $seg);
    my (@wrds, $tot_wrds, %ngrams, $ngram, $mgram);
    my (%ngram_count, @tot_ngrams);

    # accumulate n-gram counts over every reference segment
    foreach $ref (keys %ref_data) {
        foreach $doc (keys %{$ref_data{$ref}}) {
            foreach $seg (keys %{$ref_data{$ref}{$doc}{SEGS}}) {
                @wrds = split /\s+/, $ref_data{ $ref }{ $doc }{ SEGS }{ $seg };
                $tot_wrds += @wrds;
                %ngrams = %{ Words2Ngrams(@wrds) };
                foreach $ngram (keys %ngrams) {
                    $ngram_count{$ngram} += $ngrams{$ngram};
                }
            }
        }
    }

    # info of an n-gram is relative to its (n-1)-gram prefix; a unigram is
    # relative to the total word count
    foreach $ngram (keys %ngram_count) {
        @wrds = split / /, $ngram;
        pop @wrds;
        $mgram = join " ", @wrds;    # (n-1)-gram prefix; "" for unigrams
        $ngram_info{$ngram} = - log ($mgram ? $ngram_count{$ngram}/$ngram_count{$mgram} : $ngram_count{$ngram}/$tot_wrds) / log 2;
        if (defined $opt_x and $opt_x eq "ngram info") {
            @wrds = split / /, $ngram;
            printf "ngram info:%9.4f%6d%6d%8d%3d %s\n", $ngram_info{$ngram}, $ngram_count{$ngram},
                $mgram ? $ngram_count{$mgram} : $tot_wrds, $tot_wrds, scalar @wrds, $ngram;
        }
    }
}
|
| 564 |
+
|
| 565 |
+
#################################
|
| 566 |
+
|
| 567 |
+
# Score every evaluation document for one system and fold the per-document
# n-gram statistics into a whole-set score (BLEU or NIST, per $method).
#   $sys          : system id
#   $SCOREmt      : hashref receiving per-n cumulative/individual scores
#   $overallScore : hashref receiving per-document and overall scores
sub score_system {
    my ($sys, $SCOREmt, $overallScore) = @_;
    my ($ref_length, $match_cnt, $tst_cnt, $ref_cnt, $tst_info, $ref_info);

    # whole-set accumulators, indexed by n-gram order 1..$max_Ngram
    my $cum_ref_length = 0;
    my (@cum_match, @cum_tst_cnt, @cum_ref_cnt, @cum_tst_info, @cum_ref_info);
    for my $j (1 .. $max_Ngram) {
        $cum_match[$j] = $cum_tst_cnt[$j] = $cum_ref_cnt[$j] = $cum_tst_info[$j] = $cum_ref_info[$j] = 0;
    }

    foreach my $doc (sort keys %eval_docs) {
        ($ref_length, $match_cnt, $tst_cnt, $ref_cnt, $tst_info, $ref_info) = score_document ($sys, $doc, $overallScore);

        if ( $method eq "NIST" ) {
            my %DOCmt = ();
            my $docScore = nist_score( scalar( @ref_sys ), $match_cnt, $tst_cnt, $ref_cnt, $tst_info, $ref_info, $sys, \%DOCmt );
            $overallScore->{ $sys }{ 'documents' }{ $doc }{ 'score' } = $docScore;
            if ( $detail >= 1 ) {
                printf "$method score using 5-grams = %.4f for system \"$sys\" on document \"$doc\" (%d segments, %d words)\n",
                    $docScore, scalar keys %{$tst_data{$sys}{$doc}{SEGS}}, $tst_cnt->[1];
            }
        }

        if ( $method eq "BLEU" ) {
            my %DOCmt = ();
            my $docScore = &{$BLEU_SCORE}( $ref_length, $match_cnt, $tst_cnt, $sys, \%DOCmt );
            $overallScore->{ $sys }{ 'documents' }{ $doc }{ 'score' } = $docScore;
            if ( $detail >= 1 ) {
                printf "$method score using 4-grams = %.4f for system \"$sys\" on document \"$doc\" (%d segments, %d words)\n",
                    $docScore, scalar keys %{$tst_data{$sys}{$doc}{SEGS}}, $tst_cnt->[1];
            }
        }

        # fold this document's statistics into the whole-set totals
        $cum_ref_length += $ref_length;
        for my $j (1 .. $max_Ngram) {
            $cum_match[$j]    += $match_cnt->[$j];
            $cum_tst_cnt[$j]  += $tst_cnt->[$j];
            $cum_ref_cnt[$j]  += $ref_cnt->[$j];
            $cum_tst_info[$j] += $tst_info->[$j];
            $cum_ref_info[$j] += $ref_info->[$j];
            printf "document info: $sys $doc %d-gram %d %d %d %9.4f %9.4f\n", $j, $match_cnt->[$j],
                $tst_cnt->[$j], $ref_cnt->[$j], $tst_info->[$j], $ref_info->[$j]
                if (defined $opt_x and $opt_x eq "document info");
        }
    }

    if ($method eq "BLEU") {
        $overallScore->{ $sys }{ 'score' } = &{$BLEU_SCORE}($cum_ref_length, \@cum_match, \@cum_tst_cnt, $sys, $SCOREmt);
    }
    if ($method eq "NIST") {
        $overallScore->{ $sys }{ 'score' } = nist_score (scalar @ref_sys, \@cum_match, \@cum_tst_cnt, \@cum_ref_cnt, \@cum_tst_info, \@cum_ref_info, $sys, $SCOREmt);
    }
}
|
| 629 |
+
|
| 630 |
+
#################################
|
| 631 |
+
|
| 632 |
+
# Score one document of one system, segment by segment; records per-segment
# scores in $overallScore and returns the document's cumulative n-gram
# statistics: (ref_length, match, tst_cnt, ref_cnt, tst_info, ref_info),
# each of the last five an arrayref indexed by n-gram order.
sub score_document
{
  my ($sys, $doc, $overallScore) = @_;
  my $ref;
  my ($ref_length, $match_cnt, $tst_cnt, $ref_cnt, $tst_info, $ref_info);
  my ($cum_ref_length, @cum_match, @cum_tst_cnt, @cum_ref_cnt, @cum_tst_info, @cum_ref_info);

  $cum_ref_length = 0;
  for (my $j=1; $j<=$max_Ngram; $j++)
  {
    $cum_match[$j] = $cum_tst_cnt[$j] = $cum_ref_cnt[$j] = $cum_tst_info[$j] = $cum_ref_info[$j] = 0;
  }

  # score each segment
  foreach my $seg ( nsort keys( %{$tst_data{$sys}{$doc}{SEGS}} ) )
  {
    # collect the matching segment from every reference
    my @ref_segments = ();
    foreach $ref (@ref_sys)
    {
      push @ref_segments, $ref_data{$ref}{$doc}{SEGS}{$seg};
      if ( $detail >= 3 )
      {
        printf "ref '$ref', seg $seg: %s\n", $ref_data{$ref}{$doc}{SEGS}{$seg}
      }
    }

    printf "sys '$sys', seg $seg: %s\n", $tst_data{$sys}{$doc}{SEGS}{$seg} if ( $detail >= 3 );
    ($ref_length, $match_cnt, $tst_cnt, $ref_cnt, $tst_info, $ref_info) = score_segment ($tst_data{$sys}{$doc}{SEGS}{$seg}, @ref_segments);

    if ( $method eq "BLEU" )
    {
      my %DOCmt = ();
      # BUG FIX: pass a reference (\%DOCmt) as in score_system; the original
      # flattened the (empty) hash into the argument list, so the callee's
      # $SCOREmt parameter was undefined and %DOCmt was never populated.
      my $segScore = &{$BLEU_SCORE}($ref_length, $match_cnt, $tst_cnt, $sys, \%DOCmt);
      $overallScore->{ $sys }{ 'documents' }{ $doc }{ 'segments' }{ $seg }{ 'score' } = $segScore;
      if ( $detail >= 2 )
      {
        printf " $method score using 4-grams = %.4f for system \"$sys\" on segment $seg of document \"$doc\" (%d words)\n", $segScore, $tst_cnt->[1]
      }
    }
    if ( $method eq "NIST" )
    {
      my %DOCmt = ();
      # BUG FIX: pass \%DOCmt (reference) for the same reason as above.
      my $segScore = nist_score (scalar @ref_sys, $match_cnt, $tst_cnt, $ref_cnt, $tst_info, $ref_info, $sys, \%DOCmt);
      $overallScore->{ $sys }{ 'documents' }{ $doc }{ 'segments' }{ $seg }{ 'score' } = $segScore;
      if ( $detail >= 2 )
      {
        printf " $method score using 5-grams = %.4f for system \"$sys\" on segment $seg of document \"$doc\" (%d words)\n", $segScore, $tst_cnt->[1];
      }
    }
    # accumulate per-document totals
    $cum_ref_length += $ref_length;
    for (my $j=1; $j<=$max_Ngram; $j++)
    {
      $cum_match[$j] += $match_cnt->[$j];
      $cum_tst_cnt[$j] += $tst_cnt->[$j];
      $cum_ref_cnt[$j] += $ref_cnt->[$j];
      $cum_tst_info[$j] += $tst_info->[$j];
      $cum_ref_info[$j] += $ref_info->[$j];
    }
  }
  return ($cum_ref_length, [@cum_match], [@cum_tst_cnt], [@cum_ref_cnt], [@cum_tst_info], [@cum_ref_info]);
}
|
| 695 |
+
|
| 696 |
+
###############################################################################################################################
|
| 697 |
+
# function returning the shortest reference length
|
| 698 |
+
# takes as input:
|
| 699 |
+
# - currentLength : the current (shortest) reference length
|
| 700 |
+
# - referenceSentenceLength : the current reference sentence length
|
| 701 |
+
# - candidateSentenceLength : the current candidate sentence length (unused)
|
| 702 |
+
###############################################################################################################################
|
| 703 |
+
# Reference-length policy "shortest": keep the smaller of the running
# reference length and the new reference sentence length.  The candidate
# sentence length is accepted for interface compatibility but unused.
sub brevity_penalty_shortest {
    my ( $currentLength, $referenceSentenceLength, $candidateSentenceLength ) = @_;
    return $currentLength if $referenceSentenceLength >= $currentLength;
    return $referenceSentenceLength;
}
|
| 708 |
+
|
| 709 |
+
###############################################################################################################################
|
| 710 |
+
# function returning the closest reference length (to the candidate sentence length)
|
| 711 |
+
# takes as input:
|
| 712 |
+
# - currentLength: the current (closest) reference length.
|
| 713 |
+
# - candidateSentenceLength : the current reference sentence length
|
| 714 |
+
# - candidateSentenceLength : the current candidate sentence length
|
| 715 |
+
# when two reference sentences are at the same distance, it will return the shortest reference sentence length
|
| 716 |
+
# example of 4 iterations, given:
|
| 717 |
+
# - one candidate sentence containing 7 tokens
|
| 718 |
+
# - one reference translation containing 11 tokens
|
| 719 |
+
# - one reference translation containing 8 tokens
|
| 720 |
+
# - one reference translation containing 6 tokens
|
| 721 |
+
# - one reference translation containing 7 tokens
|
| 722 |
+
# the multiple invokations will return:
|
| 723 |
+
# - currentLength is set to 11 (outside of this function)
|
| 724 |
+
# - brevity_penalty_closest( 11, 8, 7 ) returns 8, since abs( 8 - 7 ) < abs( 11 - 7 )
|
| 725 |
+
# - brevity_penalty_closest( 8, 6, 7 ) returns 6, since abs( 8 - 7 ) == abs( 6 - 7 ) AND 6 < 8
|
| 726 |
+
# - brevity_penalty_closest( 7, 6, 7 ) returns 7, since abs( 7 - 7 ) < abs( 6 - 7 )
|
| 727 |
+
###############################################################################################################################
|
| 728 |
+
# Reference-length policy "closest": keep whichever reference length is
# nearer to the candidate length; on a tie, prefer the shorter one.
#   brevity_penalty_closest(11, 8, 7) == 8   (8 is closer to 7 than 11)
#   brevity_penalty_closest( 8, 6, 7) == 6   (tie, shorter wins)
#   brevity_penalty_closest( 7, 6, 7) == 7   (7 is exact)
sub brevity_penalty_closest {
    my ( $currentLength, $referenceSentenceLength, $candidateSentenceLength ) = @_;
    my $newDistance = abs( $candidateSentenceLength - $referenceSentenceLength );
    my $oldDistance = abs( $candidateSentenceLength - $currentLength );

    # strictly closer, or equally close but shorter: take the new length
    return $referenceSentenceLength if $newDistance < $oldDistance;
    return $referenceSentenceLength
        if $newDistance == $oldDistance && $referenceSentenceLength < $currentLength;
    return $currentLength;
}
|
| 748 |
+
|
| 749 |
+
#################################
|
| 750 |
+
|
| 751 |
+
# Compute the raw n-gram statistics for one candidate segment against its
# reference translations.  Returns (ref_length, match, tst_cnt, ref_cnt,
# tst_info, ref_info); arrays are indexed by n-gram order 1..$max_Ngram.
sub score_segment {
    my ($tst_seg, @ref_segs) = @_;
    my (@tst_wrds, %tst_ngrams, @match_count, @tst_count, @tst_info);
    my (@ref_wrds, $ref_seg, %ref_ngrams, %ref_ngrams_max, @ref_count, @ref_info);
    my $ngram;
    my $ref_length;

    for my $j (1 .. $max_Ngram) {
        $match_count[$j] = $tst_count[$j] = $ref_count[$j] = $tst_info[$j] = $ref_info[$j] = 0;
    }

    # n-gram counts for the candidate segment
    @tst_wrds = split /\s+/, $tst_seg;
    %tst_ngrams = %{ Words2Ngrams(@tst_wrds) };
    for my $j (1 .. $max_Ngram) {
        $tst_count[$j] = $j <= @tst_wrds ? (@tst_wrds - $j + 1) : 0;
    }

    # n-gram counts for the reference segments
    foreach $ref_seg (@ref_segs) {
        @ref_wrds = split /\s+/, $ref_seg;
        %ref_ngrams = %{ Words2Ngrams(@ref_wrds) };
        foreach $ngram (keys %ref_ngrams) {
            # keep the maximum occurrence count over all references (clipping)
            my @wrds = split / /, $ngram;
            $ref_info[@wrds] += $ngram_info{$ngram};
            $ref_ngrams_max{$ngram} = defined $ref_ngrams_max{$ngram}
                ? max ($ref_ngrams_max{$ngram}, $ref_ngrams{$ngram})
                : $ref_ngrams{$ngram};
        }
        for my $j (1 .. $max_Ngram) {
            $ref_count[$j] += $j <= @ref_wrds ? (@ref_wrds - $j + 1) : 0;
        }
        # reference length for the brevity penalty; the selection policy
        # (shortest/closest) lives in the $BLEU_BP coderef
        if ( not defined( $ref_length ) ) {
            $ref_length = scalar( @ref_wrds );
        }
        else {
            $ref_length = &{$BLEU_BP}( $ref_length, scalar( @ref_wrds ), scalar( @tst_wrds ) );
        }
    }

    # accumulate scoring stats for candidate n-grams that match reference n-grams
    foreach $ngram (keys %tst_ngrams) {
        next unless defined $ref_ngrams_max{$ngram};
        my @wrds = split / /, $ngram;
        $tst_info[@wrds] += $ngram_info{$ngram} * min($tst_ngrams{$ngram},$ref_ngrams_max{$ngram});
        $match_count[@wrds] += my $count = min($tst_ngrams{$ngram},$ref_ngrams_max{$ngram});
        printf "%.2f info for each of $count %d-grams = '%s'\n", $ngram_info{$ngram}, scalar @wrds, $ngram
            if $detail >= 3;
    }

    return ($ref_length, [@match_count], [@tst_count], [@ref_count], [@tst_info], [@ref_info]);
}
|
| 814 |
+
|
| 815 |
+
#################################
|
| 816 |
+
|
| 817 |
+
# Classic (unsmoothed) BLEU: any n-gram order with zero matches forces the
# cumulative score at that order to 0.  Fills $SCOREmt->{n}{sys}{cum|ind}
# and returns the cumulative 4-gram BLEU.
sub bleu_score_nosmoothing {
    my ($ref_length, $matching_ngrams, $tst_ngrams, $sys, $SCOREmt) = @_;
    my $log_prec_sum = 0;
    my $log_iprec = 0;

    foreach my $j (1 .. $max_Ngram) {
        if ($matching_ngrams->[ $j ] == 0) {
            # no matches at this order: cumulative score collapses to zero
            $SCOREmt->{ $j }{ $sys }{ cum } = 0;
        }
        else {
            # log of the brevity penalty
            my $len_score = min (0, 1-$ref_length/$tst_ngrams->[1]);
            # Cumulative N-Gram score: geometric mean of precisions times BP
            $log_prec_sum += log( $matching_ngrams->[ $j ] / $tst_ngrams->[ $j ] );
            $SCOREmt->{ $j }{ $sys }{ cum } = exp( $log_prec_sum / $j + $len_score );
            # Individual N-Gram score (no brevity penalty)
            $log_iprec = log( $matching_ngrams->[ $j ] / $tst_ngrams->[ $j ] );
            $SCOREmt->{ $j }{ $sys }{ ind } = exp( $log_iprec );
        }
    }
    return $SCOREmt->{ 4 }{ $sys }{ cum };
}
|
| 842 |
+
|
| 843 |
+
###############################################################################################################################
|
| 844 |
+
# Default method used to compute the BLEU score, using smoothing.
|
| 845 |
+
# Note that the method used can be overridden using the '--no-smoothing' command-line argument
|
| 846 |
+
# The smoothing is computed by taking 1 / ( 2^k ), instead of 0, for each precision score whose matching n-gram count is null
|
| 847 |
+
# k is 1 for the first 'n' value for which the n-gram match count is null
|
| 848 |
+
# For example, if the text contains:
|
| 849 |
+
# - one 2-gram match
|
| 850 |
+
# - and (consequently) two 1-gram matches
|
| 851 |
+
# the n-gram count for each individual precision score would be:
|
| 852 |
+
# - n=1 => prec_count = 2 (two unigrams)
|
| 853 |
+
# - n=2 => prec_count = 1 (one bigram)
|
| 854 |
+
# - n=3 => prec_count = 1/2 (no trigram, taking 'smoothed' value of 1 / ( 2^k ), with k=1)
|
| 855 |
+
# - n=4 => prec_count = 1/4 (no fourgram, taking 'smoothed' value of 1 / ( 2^k ), with k=2)
|
| 856 |
+
###############################################################################################################################
|
| 857 |
+
# Default (smoothed) BLEU.  When an order has zero matches, its precision is
# smoothed to 1/(2^k * count) instead of 0, with k incremented at each such
# order.  Fills $SCOREmt->{n}{sys}{cum|ind}; returns the cumulative 4-gram
# BLEU.  Overridable via '--no-smoothing' (see bleu_score_nosmoothing).
sub bleu_score {
    my ($ref_length, $matching_ngrams, $tst_ngrams, $sys, $SCOREmt) = @_;
    my $log_prec_sum = 0;
    my $log_iprec = 0;

    # brevity penalty (0 if the candidate is empty)
    my $exp_len_score = 0;
    $exp_len_score = exp( min (0, 1 - $ref_length / $tst_ngrams->[ 1 ] ) ) if ( $tst_ngrams->[ 1 ] > 0 );

    my $smooth = 1;
    foreach my $j (1 .. $max_Ngram) {
        if ( $tst_ngrams->[ $j ] == 0 ) {
            # candidate too short to contain any j-gram
            $log_iprec = 0;
        }
        elsif ( $matching_ngrams->[ $j ] == 0 ) {
            # no matches: apply smoothing 1/(2^k)
            $smooth *= 2;
            $log_iprec = log( 1 / ( $smooth * $tst_ngrams->[ $j ] ) );
        }
        else {
            $log_iprec = log( $matching_ngrams->[ $j ] / $tst_ngrams->[ $j ] );
        }
        $SCOREmt->{ $j }{ $sys }{ ind } = exp( $log_iprec );
        $log_prec_sum += $log_iprec;
        $SCOREmt->{ $j }{ $sys }{ cum } = exp( $log_prec_sum / $j ) * $exp_len_score;
    }
    return $SCOREmt->{ 4 }{ $sys }{ cum };
}
|
| 886 |
+
|
| 887 |
+
#################################
|
| 888 |
+
|
| 889 |
+
# NIST score: sum of average n-gram information gains, scaled by the NIST
# length penalty.  Fills $SCOREmt->{n}{sys}{cum|ind}; returns the cumulative
# 5-gram score.  ($matching_ngrams is accepted for interface compatibility.)
sub nist_score {
    my ($nsys, $matching_ngrams, $tst_ngrams, $ref_ngrams, $tst_info, $ref_info, $sys, $SCOREmt) = @_;
    my $info_sum = 0;

    foreach my $n (1 .. $max_Ngram) {
        # average information gained per candidate n-gram of this order
        my $gain = $tst_info->[$n] / max($tst_ngrams->[$n], 1);
        # penalty argument: candidate length over average reference length
        my $penalty = nist_length_penalty( $tst_ngrams->[1] / ( $ref_ngrams->[1] / $nsys ) );
        $info_sum += $gain;
        $SCOREmt->{$n}{$sys}{cum} = $info_sum * $penalty;
        $SCOREmt->{$n}{$sys}{ind} = $gain * $penalty;
    }
    return $SCOREmt->{5}{$sys}{cum};
}
|
| 904 |
+
|
| 905 |
+
#################################
|
| 906 |
+
|
| 907 |
+
# Convert a list of words into a hashref mapping each n-gram (space-joined,
# lengths 1..$max_Ngram) to its occurrence count.
sub Words2Ngrams {
    my %count = ();

    for (my $start = 0; $start < @_; $start++) {
        # grow the n-gram word by word from this starting position
        my $ngram;
        my $stop = $start + $max_Ngram - 1;
        $stop = $#_ if $stop > $#_;
        for (my $k = $start; $k <= $stop; $k++) {
            $ngram = defined $ngram ? "$ngram $_[$k]" : $_[$k];
            $count{$ngram}++;
        }
    }
    return {%count};
}
|
| 923 |
+
|
| 924 |
+
#################################
|
| 925 |
+
|
| 926 |
+
# Default (Western-language) tokenizer for one segment of text: decodes
# SGML entities, lowercases (unless $preserve_case), splits off punctuation
# and collapses whitespace.  Returns the normalized text.
sub tokenization
{
    my ($norm_text) = @_;

    # language-independent part:
    $norm_text =~ s/<skipped>//g; # strip "skipped" tags
    $norm_text =~ s/-\n//g; # strip end-of-line hyphenation and join lines
    $norm_text =~ s/\n/ /g; # join lines
    # BUG FIX: the four entity substitutions below had been reduced to
    # no-ops (e.g. s/"/"/g) by an HTML-unescaping of this file; the SGML
    # entity names are restored.
    $norm_text =~ s/&quot;/"/g; # convert SGML entity for quote to "
    $norm_text =~ s/&amp;/&/g; # convert SGML entity for ampersand to &
    $norm_text =~ s/&lt;/</g; # convert SGML entity for less-than to <
    $norm_text =~ s/&gt;/>/g; # convert SGML entity for greater-than to >

    # language-dependent part (assuming Western languages):
    $norm_text = " $norm_text ";
    $norm_text =~ tr/[A-Z]/[a-z]/ unless $preserve_case;
    $norm_text =~ s/([\{-\~\[-\` -\&\(-\+\:-\@\/])/ $1 /g; # tokenize punctuation
    $norm_text =~ s/([^0-9])([\.,])/$1 $2 /g; # tokenize period and comma unless preceded by a digit
    $norm_text =~ s/([\.,])([^0-9])/ $1 $2/g; # tokenize period and comma unless followed by a digit
    $norm_text =~ s/([0-9])(-)/$1 $2 /g; # tokenize dash when preceded by a digit
    $norm_text =~ s/\s+/ /g; # one space only between words
    $norm_text =~ s/^\s+//; # no leading space
    $norm_text =~ s/\s+$//; # no trailing space

    return $norm_text;
}
|
| 952 |
+
|
| 953 |
+
|
| 954 |
+
# Language-independent ("international") tokenizer based on Unicode
# character properties: decodes SGML entities, lowercases (unless
# $preserve_case), optionally isolates non-ASCII characters, splits off
# punctuation and symbols, and collapses whitespace.
sub tokenization_international
{
    my ($norm_text) = @_;

    $norm_text =~ s/<skipped>//g; # strip "skipped" tags
    #$norm_text =~ s/\p{Hyphen}\p{Zl}//g; # strip end-of-line hyphenation and join lines
    $norm_text =~ s/\p{Zl}/ /g; # join lines

    # replace entities
    # BUG FIX: these substitutions had been reduced to no-ops (e.g. s/"/\"/g)
    # by an HTML-unescaping of this file; the SGML entity names are restored.
    $norm_text =~ s/&quot;/\"/g; # quote to "
    $norm_text =~ s/&amp;/&/g; # ampersand to &
    $norm_text =~ s/&lt;/</g; # less-than to <
    $norm_text =~ s/&gt;/>/g; # greater-than to >
    $norm_text =~ s/&apos;/\'/g; # apostrophe to '

    $norm_text = lc( $norm_text ) unless $preserve_case; # lowercasing if needed
    $norm_text =~ s/([^[:ascii:]])/ $1 /g if ( $split_non_ASCII ); # isolate each non-ASCII char as its own token

    # punctuation: tokenize any punctuation unless followed AND preceded by a digit
    $norm_text =~ s/(\P{N})(\p{P})/$1 $2 /g;
    $norm_text =~ s/(\p{P})(\P{N})/ $1 $2/g;

    $norm_text =~ s/(\p{S})/ $1 /g; # tokenize symbols

    $norm_text =~ s/\p{Z}+/ /g; # one space only between words
    $norm_text =~ s/^\p{Z}+//; # no leading space
    $norm_text =~ s/\p{Z}+$//; # no trailing space

    return $norm_text;
}
|
| 984 |
+
|
| 985 |
+
#################################
|
| 986 |
+
|
| 987 |
+
# NIST length penalty: 1 for ratio >= 1, 0 for ratio <= 0, otherwise a
# Gaussian-in-log-space falloff calibrated so that a ratio of 1/1.5 scores 0.5.
sub nist_length_penalty {
    my ($ratio) = @_;
    return 1 if $ratio >= 1;
    return 0 if $ratio <= 0;
    # calibration point: penalty(1/1.5) == 0.5
    my $ratio_x = 1.5;
    my $score_x = 0.5;
    my $beta = -log($score_x) / log($ratio_x) / log($ratio_x);
    my $log_ratio = log($ratio);
    return exp( -$beta * $log_ratio * $log_ratio );
}
|
| 997 |
+
|
| 998 |
+
#################################
|
| 999 |
+
|
| 1000 |
+
# Return the current local ("YYYY Mon D", "HH:MM:SS") pair for log banners.
sub date_time_stamp {
    my ($sec, $min, $hour, $mday, $mon, $year) = localtime();
    my @months = qw(Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec);
    my $time = sprintf "%2.2d:%2.2d:%2.2d", $hour, $min, $sec;
    my $date = sprintf "%4.4s %3.3s %s", 1900 + $year, $months[$mon], $mday;
    return ($date, $time);
}
|
| 1009 |
+
|
| 1010 |
+
#################################
|
| 1011 |
+
|
| 1012 |
+
# Find the first <$name ...>...</$name> element in $data (case-insensitive,
# dot matches newline).  Returns (attribute string, element content,
# remaining data) on a match, or the empty list otherwise.
sub extract_sgml_tag_and_span {
    my ($name, $data) = @_;
    return () unless $data =~ m|<$name\s*([^>]*)>(.*?)</$name\s*>(.*)|si;
    return ($1, $2, $3);
}
|
| 1017 |
+
|
| 1018 |
+
#################################
|
| 1019 |
+
|
| 1020 |
+
# Extract the double-quoted value of attribute $name from an SGML attribute
# string (case-insensitive).  Returns (value) or the empty list.
sub extract_sgml_tag_attribute {
    my ($name, $data) = @_;
    return () unless $data =~ m|$name\s*=\s*\"([^\"]*)\"|si;
    return ($1);
}
|
| 1025 |
+
|
| 1026 |
+
#################################
|
| 1027 |
+
|
| 1028 |
+
# Numeric maximum of the argument list; returns the empty list when called
# with no arguments.
sub max {
    my $best = pop;
    return unless defined $best;
    while (defined( my $candidate = pop )) {
        $best = $candidate if $candidate > $best;
    }
    return $best;
}
|
| 1039 |
+
|
| 1040 |
+
#################################
|
| 1041 |
+
|
| 1042 |
+
# Numeric minimum of the argument list; returns the empty list when called
# with no arguments.
sub min {
    my $best = pop;
    return unless defined $best;
    while (defined( my $candidate = pop )) {
        $best = $candidate if $candidate < $best;
    }
    return $best;
}
|
| 1053 |
+
|
| 1054 |
+
#################################
|
| 1055 |
+
|
| 1056 |
+
# Print the complete scoring report to stdout: one overall line per system
# (cumulative 5-gram NIST / 4-gram BLEU), then per-n-gram tables of
# individual and cumulative scores.
# Reads the globals $METHOD, @tst_sys, $max_Ngram, %NISTmt and %BLEUmt.
sub printout_report
{
    # Overall score line per system, depending on the selected metric(s).
    if ($METHOD eq "BOTH") {
        foreach my $sys (sort @tst_sys) {
            printf "NIST score = %2.4f BLEU score = %.4f for system \"$sys\"\n",$NISTmt{5}{$sys}{cum},$BLEUmt{4}{$sys}{cum};
        }
    }
    elsif ($METHOD eq "NIST") {
        foreach my $sys (sort @tst_sys) {
            printf "NIST score = %2.4f for system \"$sys\"\n",$NISTmt{5}{$sys}{cum};
        }
    }
    elsif ($METHOD eq "BLEU") {
        foreach my $sys (sort @tst_sys) {
            printf "\nBLEU score = %.4f for system \"$sys\"\n",$BLEUmt{4}{$sys}{cum};
        }
    }

    # Individual (per-order) n-gram scores.
    printf "\n# ------------------------------------------------------------------------\n\n";
    printf "Individual N-gram scoring\n";
    printf " 1-gram 2-gram 3-gram 4-gram 5-gram 6-gram 7-gram 8-gram 9-gram\n";
    printf " ------ ------ ------ ------ ------ ------ ------ ------ ------\n";

    if ($METHOD eq "BOTH" or $METHOD eq "NIST") {
        foreach my $sys (sort @tst_sys) {
            printf " NIST:";
            foreach my $i (1 .. $max_Ngram) {
                printf " %2.4f ",$NISTmt{$i}{$sys}{ind};
            }
            printf " \"$sys\"\n";
        }
        printf "\n";
    }

    if ($METHOD eq "BOTH" or $METHOD eq "BLEU") {
        foreach my $sys (sort @tst_sys) {
            printf " BLEU:";
            foreach my $i (1 .. $max_Ngram) {
                printf " %2.4f ",$BLEUmt{$i}{$sys}{ind};
            }
            printf " \"$sys\"\n";
        }
    }

    # Cumulative n-gram scores.
    printf "\n# ------------------------------------------------------------------------\n";
    printf "\nCumulative N-gram scoring\n";
    printf " 1-gram 2-gram 3-gram 4-gram 5-gram 6-gram 7-gram 8-gram 9-gram\n";
    printf " ------ ------ ------ ------ ------ ------ ------ ------ ------\n";

    if ($METHOD eq "BOTH" or $METHOD eq "NIST") {
        foreach my $sys (sort @tst_sys) {
            printf " NIST:";
            foreach my $i (1 .. $max_Ngram) {
                printf " %2.4f ",$NISTmt{$i}{$sys}{cum};
            }
            printf " \"$sys\"\n";
        }
    }
    # NOTE: this separator is printed unconditionally (unlike the one after
    # the individual NIST table), matching the original output exactly.
    printf "\n";

    if ($METHOD eq "BOTH" or $METHOD eq "BLEU") {
        foreach my $sys (sort @tst_sys) {
            printf " BLEU:";
            foreach my $i (1 .. $max_Ngram) {
                printf " %2.4f ",$BLEUmt{$i}{$sys}{cum};
            }
            printf " \"$sys\"\n";
        }
    }
}
|
| 1142 |
+
|
| 1143 |
+
###############################################################################################################################
|
| 1144 |
+
# Create three files, by using:
|
| 1145 |
+
# - $prefix : the prefix used for the output file names
|
| 1146 |
+
# - %overall : a hash containing seg/doc/sys-level scores:
|
| 1147 |
+
# - $overall{ $SYSTEM_ID }{ 'score' } => system-level score
|
| 1148 |
+
# - $overall{ $SYSTEM_ID }{ 'documents' }{ $DOCUMENT_ID }{ 'score' } => document-level score
|
| 1149 |
+
# - $overall{ $SYSTEM_ID }{ 'documents' }{ $DOCUMENT_ID }{ 'segments' }{ $SEGMENT_ID } => segment-level score
|
| 1150 |
+
###############################################################################################################################
|
| 1151 |
+
# Write system/document/segment-level scores to three MATR-format files
# named $prefix-sys.scr, $prefix-doc.scr and $prefix-seg.scr.
# %overall maps SYSTEM_ID -> { score, documents -> { DOC_ID -> { score,
# segments -> { SEG_ID -> { score } } } } }.  Reads the global $tst_id;
# segment ids are ordered with nsort (natural sort).
sub outputMetricsMATR
{
    my ( $prefix, %overall ) = @_;
    my $fileNameSys = $prefix . '-sys.scr';
    my $fileNameDoc = $prefix . '-doc.scr';
    my $fileNameSeg = $prefix . '-seg.scr';
    # Lexical filehandles instead of package-global barewords.
    open my $fhSys, '>', $fileNameSys or die "Could not open file: ${fileNameSys}";
    open my $fhDoc, '>', $fileNameDoc or die "Could not open file: ${fileNameDoc}";
    open my $fhSeg, '>', $fileNameSeg or die "Could not open file: ${fileNameSeg}";
    for my $sys ( sort keys %overall ) {
        my $scoreSys = $overall{ $sys }{ 'score' };
        print $fhSys "${tst_id}\t${sys}\t${scoreSys}\n";
        for my $doc ( sort keys %{ $overall{ $sys }{ 'documents' } } ) {
            my $scoreDoc = $overall{ $sys }{ 'documents' }{ $doc }{ 'score' };
            print $fhDoc "${tst_id}\t${sys}\t${doc}\t${scoreDoc}\n";
            for my $seg ( nsort keys %{ $overall{ $sys }{ 'documents' }{ $doc }{ 'segments' } } ) {
                my $scoreSeg = $overall{ $sys }{ 'documents' }{ $doc }{ 'segments' }{ $seg }{ 'score' };
                print $fhSeg "${tst_id}\t${sys}\t${doc}\t${seg}\t${scoreSeg}\n";
            }
        }
    }
    close $fhSeg;
    close $fhDoc;
    close $fhSys;
}
|
| 1179 |
+
|
mosesdecoder/scripts/generic/multi-bleu-detok.perl
ADDED
|
@@ -0,0 +1,214 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env perl
|
| 2 |
+
#
|
| 3 |
+
# This file is part of moses. Its use is licensed under the GNU Lesser General
|
| 4 |
+
# Public License version 2.1 or, at your option, any later version.
|
| 5 |
+
|
| 6 |
+
# This file uses the internal tokenization of mteval-v13a.pl,
|
| 7 |
+
# giving the exact same (case-sensitive) results on untokenized text.
|
| 8 |
+
# Using this script with detokenized output and untokenized references is
|
| 9 |
+
# preferrable over multi-bleu.perl, since scores aren't affected by tokenization differences.
|
| 10 |
+
#
|
| 11 |
+
# like multi-bleu.perl , it supports plain text input and multiple references.
|
| 12 |
+
|
| 13 |
+
# $Id$
|
| 14 |
+
use warnings;
|
| 15 |
+
use strict;
|
| 16 |
+
|
| 17 |
+
binmode(STDIN, ":utf8");
|
| 18 |
+
use open ':encoding(UTF-8)';
|
| 19 |
+
|
| 20 |
+
# Parse command-line flags: optional -lc (lowercase before scoring)
# followed by the reference-file stem.
my $lowercase = 0;
# Guard with defined: with no arguments at all, comparing $ARGV[0]
# directly would emit an "uninitialized value" warning under 'use warnings'.
if (defined $ARGV[0] && $ARGV[0] eq "-lc") {
    $lowercase = 1;
    shift;
}

my $stem = $ARGV[0];
if (!defined $stem) {
    print STDERR "usage: multi-bleu-detok.pl [-lc] reference < hypothesis\n";
    print STDERR "Reads the references from reference or reference0, reference1, ...\n";
    exit(1);
}
|
| 32 |
+
|
| 33 |
+
$stem .= ".ref" if !-e $stem && !-e $stem."0" && -e $stem.".ref0";
|
| 34 |
+
|
| 35 |
+
my @REF;
|
| 36 |
+
my $ref=0;
|
| 37 |
+
while(-e "$stem$ref") {
|
| 38 |
+
&add_to_ref("$stem$ref",\@REF);
|
| 39 |
+
$ref++;
|
| 40 |
+
}
|
| 41 |
+
&add_to_ref($stem,\@REF) if -e $stem;
|
| 42 |
+
die("ERROR: could not find reference file $stem") unless scalar @REF;
|
| 43 |
+
|
| 44 |
+
# add additional references explicitly specified on the command line
|
| 45 |
+
shift;
|
| 46 |
+
foreach my $stem (@ARGV) {
|
| 47 |
+
&add_to_ref($stem,\@REF) if -e $stem;
|
| 48 |
+
}
|
| 49 |
+
|
| 50 |
+
|
| 51 |
+
|
| 52 |
+
# Read one reference file ($file, gzipped if it ends in ".gz") and append
# each tokenized line to the per-sentence array in @$REF.
sub add_to_ref {
    my ($file,$REF) = @_;
    my $s=0;
    # Fix: the pattern was /.gz$/ with an unescaped dot, which matched any
    # character before "gz" (e.g. a file named "refagz").
    if ($file =~ /\.gz$/) {
        open(REF,"gzip -dc $file|") or die "Can't read $file";
    } else {
        open(REF,$file) or die "Can't read $file";
    }
    while(<REF>) {
        # Fix: was chop, which removes the last character unconditionally
        # and so eats a real character when the final line has no newline.
        chomp;
        $_ = tokenization($_);
        push @{$$REF[$s++]}, $_;
    }
    close(REF);
}
|
| 67 |
+
|
| 68 |
+
my(@CORRECT,@TOTAL,$length_translation,$length_reference);
|
| 69 |
+
my $s=0;
|
| 70 |
+
while(<STDIN>) {
|
| 71 |
+
chop;
|
| 72 |
+
$_ = lc if $lowercase;
|
| 73 |
+
$_ = tokenization($_);
|
| 74 |
+
my @WORD = split;
|
| 75 |
+
my %REF_NGRAM = ();
|
| 76 |
+
my $length_translation_this_sentence = scalar(@WORD);
|
| 77 |
+
my ($closest_diff,$closest_length) = (9999,9999);
|
| 78 |
+
foreach my $reference (@{$REF[$s]}) {
|
| 79 |
+
# print "$s $_ <=> $reference\n";
|
| 80 |
+
$reference = lc($reference) if $lowercase;
|
| 81 |
+
my @WORD = split(' ',$reference);
|
| 82 |
+
my $length = scalar(@WORD);
|
| 83 |
+
my $diff = abs($length_translation_this_sentence-$length);
|
| 84 |
+
if ($diff < $closest_diff) {
|
| 85 |
+
$closest_diff = $diff;
|
| 86 |
+
$closest_length = $length;
|
| 87 |
+
# print STDERR "$s: closest diff ".abs($length_translation_this_sentence-$length)." = abs($length_translation_this_sentence-$length), setting len: $closest_length\n";
|
| 88 |
+
} elsif ($diff == $closest_diff) {
|
| 89 |
+
$closest_length = $length if $length < $closest_length;
|
| 90 |
+
# from two references with the same closeness to me
|
| 91 |
+
# take the *shorter* into account, not the "first" one.
|
| 92 |
+
}
|
| 93 |
+
for(my $n=1;$n<=4;$n++) {
|
| 94 |
+
my %REF_NGRAM_N = ();
|
| 95 |
+
for(my $start=0;$start<=$#WORD-($n-1);$start++) {
|
| 96 |
+
my $ngram = "$n";
|
| 97 |
+
for(my $w=0;$w<$n;$w++) {
|
| 98 |
+
$ngram .= " ".$WORD[$start+$w];
|
| 99 |
+
}
|
| 100 |
+
$REF_NGRAM_N{$ngram}++;
|
| 101 |
+
}
|
| 102 |
+
foreach my $ngram (keys %REF_NGRAM_N) {
|
| 103 |
+
if (!defined($REF_NGRAM{$ngram}) ||
|
| 104 |
+
$REF_NGRAM{$ngram} < $REF_NGRAM_N{$ngram}) {
|
| 105 |
+
$REF_NGRAM{$ngram} = $REF_NGRAM_N{$ngram};
|
| 106 |
+
# print "$i: REF_NGRAM{$ngram} = $REF_NGRAM{$ngram}<BR>\n";
|
| 107 |
+
}
|
| 108 |
+
}
|
| 109 |
+
}
|
| 110 |
+
}
|
| 111 |
+
$length_translation += $length_translation_this_sentence;
|
| 112 |
+
$length_reference += $closest_length;
|
| 113 |
+
for(my $n=1;$n<=4;$n++) {
|
| 114 |
+
my %T_NGRAM = ();
|
| 115 |
+
for(my $start=0;$start<=$#WORD-($n-1);$start++) {
|
| 116 |
+
my $ngram = "$n";
|
| 117 |
+
for(my $w=0;$w<$n;$w++) {
|
| 118 |
+
$ngram .= " ".$WORD[$start+$w];
|
| 119 |
+
}
|
| 120 |
+
$T_NGRAM{$ngram}++;
|
| 121 |
+
}
|
| 122 |
+
foreach my $ngram (keys %T_NGRAM) {
|
| 123 |
+
$ngram =~ /^(\d+) /;
|
| 124 |
+
my $n = $1;
|
| 125 |
+
# my $corr = 0;
|
| 126 |
+
# print "$i e $ngram $T_NGRAM{$ngram}<BR>\n";
|
| 127 |
+
$TOTAL[$n] += $T_NGRAM{$ngram};
|
| 128 |
+
if (defined($REF_NGRAM{$ngram})) {
|
| 129 |
+
if ($REF_NGRAM{$ngram} >= $T_NGRAM{$ngram}) {
|
| 130 |
+
$CORRECT[$n] += $T_NGRAM{$ngram};
|
| 131 |
+
# $corr = $T_NGRAM{$ngram};
|
| 132 |
+
# print "$i e correct1 $T_NGRAM{$ngram}<BR>\n";
|
| 133 |
+
}
|
| 134 |
+
else {
|
| 135 |
+
$CORRECT[$n] += $REF_NGRAM{$ngram};
|
| 136 |
+
# $corr = $REF_NGRAM{$ngram};
|
| 137 |
+
# print "$i e correct2 $REF_NGRAM{$ngram}<BR>\n";
|
| 138 |
+
}
|
| 139 |
+
}
|
| 140 |
+
# $REF_NGRAM{$ngram} = 0 if !defined $REF_NGRAM{$ngram};
|
| 141 |
+
# print STDERR "$ngram: {$s, $REF_NGRAM{$ngram}, $T_NGRAM{$ngram}, $corr}\n"
|
| 142 |
+
}
|
| 143 |
+
}
|
| 144 |
+
$s++;
|
| 145 |
+
}
|
| 146 |
+
my $brevity_penalty = 1;
|
| 147 |
+
my $bleu = 0;
|
| 148 |
+
|
| 149 |
+
my @bleu=();
|
| 150 |
+
|
| 151 |
+
for(my $n=1;$n<=4;$n++) {
|
| 152 |
+
if (defined ($TOTAL[$n])){
|
| 153 |
+
$bleu[$n]=($TOTAL[$n])?$CORRECT[$n]/$TOTAL[$n]:0;
|
| 154 |
+
# print STDERR "CORRECT[$n]:$CORRECT[$n] TOTAL[$n]:$TOTAL[$n]\n";
|
| 155 |
+
}else{
|
| 156 |
+
$bleu[$n]=0;
|
| 157 |
+
}
|
| 158 |
+
}
|
| 159 |
+
|
| 160 |
+
if ($length_reference==0){
|
| 161 |
+
printf "BLEU = 0, 0/0/0/0 (BP=0, ratio=0, hyp_len=0, ref_len=0)\n";
|
| 162 |
+
exit(1);
|
| 163 |
+
}
|
| 164 |
+
|
| 165 |
+
if ($length_translation<$length_reference) {
|
| 166 |
+
$brevity_penalty = exp(1-$length_reference/$length_translation);
|
| 167 |
+
}
|
| 168 |
+
$bleu = $brevity_penalty * exp((my_log( $bleu[1] ) +
|
| 169 |
+
my_log( $bleu[2] ) +
|
| 170 |
+
my_log( $bleu[3] ) +
|
| 171 |
+
my_log( $bleu[4] ) ) / 4) ;
|
| 172 |
+
printf "BLEU = %.2f, %.1f/%.1f/%.1f/%.1f (BP=%.3f, ratio=%.3f, hyp_len=%d, ref_len=%d)\n",
|
| 173 |
+
100*$bleu,
|
| 174 |
+
100*$bleu[1],
|
| 175 |
+
100*$bleu[2],
|
| 176 |
+
100*$bleu[3],
|
| 177 |
+
100*$bleu[4],
|
| 178 |
+
$brevity_penalty,
|
| 179 |
+
$length_translation / $length_reference,
|
| 180 |
+
$length_translation,
|
| 181 |
+
$length_reference;
|
| 182 |
+
|
| 183 |
+
# Natural logarithm with a large negative sentinel for zero/false input,
# so a zero n-gram precision contributes a huge penalty instead of -inf.
sub my_log {
    my ($x) = @_;
    return -9999999999 unless $x;
    return log($x);
}
|
| 187 |
+
|
| 188 |
+
|
| 189 |
+
|
| 190 |
+
# Tokenize one line of text exactly as mteval-v13a.pl does, so that scores
# on detokenized output match the NIST scorer.
# Fix: the SGML entity substitutions had been mangled into no-ops
# (s/"/"/g, s/&/&/g, s/</</g, s/>/>/g) by entity decoding; the original
# mteval-v13a patterns &quot; &amp; &lt; &gt; are restored below.
sub tokenization
{
    my ($norm_text) = @_;

    # language-independent part:
    $norm_text =~ s/<skipped>//g; # strip "skipped" tags
    $norm_text =~ s/-\n//g; # strip end-of-line hyphenation and join lines
    $norm_text =~ s/\n/ /g; # join lines
    $norm_text =~ s/&quot;/"/g; # convert SGML entity for quote to "
    $norm_text =~ s/&amp;/&/g; # convert SGML entity for ampersand to &
    $norm_text =~ s/&lt;/</g; # convert SGML entity for less-than to <
    $norm_text =~ s/&gt;/>/g; # convert SGML entity for greater-than to >

    # language-dependent part (assuming Western languages):
    $norm_text = " $norm_text ";
    $norm_text =~ s/([\{-\~\[-\` -\&\(-\+\:-\@\/])/ $1 /g; # tokenize punctuation
    $norm_text =~ s/([^0-9])([\.,])/$1 $2 /g; # tokenize period and comma unless preceded by a digit
    $norm_text =~ s/([\.,])([^0-9])/ $1 $2/g; # tokenize period and comma unless followed by a digit
    $norm_text =~ s/([0-9])(-)/$1 $2 /g; # tokenize dash when preceded by a digit
    $norm_text =~ s/\s+/ /g; # one space only between words
    $norm_text =~ s/^\s+//; # no leading space
    $norm_text =~ s/\s+$//; # no trailing space

    return $norm_text;
}
|
mosesdecoder/scripts/generic/multi-bleu.perl
ADDED
|
@@ -0,0 +1,177 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env perl
|
| 2 |
+
#
|
| 3 |
+
# This file is part of moses. Its use is licensed under the GNU Lesser General
|
| 4 |
+
# Public License version 2.1 or, at your option, any later version.
|
| 5 |
+
|
| 6 |
+
# $Id$
|
| 7 |
+
use warnings;
|
| 8 |
+
use strict;
|
| 9 |
+
|
| 10 |
+
# Parse command-line flags: optional -lc (lowercase before scoring)
# followed by the reference-file stem.
my $lowercase = 0;
# Guard with defined: with no arguments at all, comparing $ARGV[0]
# directly would emit an "uninitialized value" warning under 'use warnings'.
if (defined $ARGV[0] && $ARGV[0] eq "-lc") {
    $lowercase = 1;
    shift;
}

my $stem = $ARGV[0];
if (!defined $stem) {
    print STDERR "usage: multi-bleu.pl [-lc] reference < hypothesis\n";
    print STDERR "Reads the references from reference or reference0, reference1, ...\n";
    exit(1);
}
|
| 22 |
+
|
| 23 |
+
$stem .= ".ref" if !-e $stem && !-e $stem."0" && -e $stem.".ref0";
|
| 24 |
+
|
| 25 |
+
my @REF;
|
| 26 |
+
my $ref=0;
|
| 27 |
+
while(-e "$stem$ref") {
|
| 28 |
+
&add_to_ref("$stem$ref",\@REF);
|
| 29 |
+
$ref++;
|
| 30 |
+
}
|
| 31 |
+
&add_to_ref($stem,\@REF) if -e $stem;
|
| 32 |
+
die("ERROR: could not find reference file $stem") unless scalar @REF;
|
| 33 |
+
|
| 34 |
+
# add additional references explicitly specified on the command line
|
| 35 |
+
shift;
|
| 36 |
+
foreach my $stem (@ARGV) {
|
| 37 |
+
&add_to_ref($stem,\@REF) if -e $stem;
|
| 38 |
+
}
|
| 39 |
+
|
| 40 |
+
|
| 41 |
+
|
| 42 |
+
# Read one reference file ($file, gzipped if it ends in ".gz") and append
# each line verbatim to the per-sentence array in @$REF.
sub add_to_ref {
    my ($file,$REF) = @_;
    my $s=0;
    # Fix: the pattern was /.gz$/ with an unescaped dot, which matched any
    # character before "gz" (e.g. a file named "refagz").
    if ($file =~ /\.gz$/) {
        open(REF,"gzip -dc $file|") or die "Can't read $file";
    } else {
        open(REF,$file) or die "Can't read $file";
    }
    while(<REF>) {
        chomp;
        push @{$$REF[$s++]}, $_;
    }
    close(REF);
}
|
| 56 |
+
|
| 57 |
+
my(@CORRECT,@TOTAL,$length_translation,$length_reference);
|
| 58 |
+
my $s=0;
|
| 59 |
+
while(<STDIN>) {
|
| 60 |
+
chomp;
|
| 61 |
+
$_ = lc if $lowercase;
|
| 62 |
+
my @WORD = split;
|
| 63 |
+
my %REF_NGRAM = ();
|
| 64 |
+
my $length_translation_this_sentence = scalar(@WORD);
|
| 65 |
+
my ($closest_diff,$closest_length) = (9999,9999);
|
| 66 |
+
foreach my $reference (@{$REF[$s]}) {
|
| 67 |
+
# print "$s $_ <=> $reference\n";
|
| 68 |
+
$reference = lc($reference) if $lowercase;
|
| 69 |
+
my @WORD = split(' ',$reference);
|
| 70 |
+
my $length = scalar(@WORD);
|
| 71 |
+
my $diff = abs($length_translation_this_sentence-$length);
|
| 72 |
+
if ($diff < $closest_diff) {
|
| 73 |
+
$closest_diff = $diff;
|
| 74 |
+
$closest_length = $length;
|
| 75 |
+
# print STDERR "$s: closest diff ".abs($length_translation_this_sentence-$length)." = abs($length_translation_this_sentence-$length), setting len: $closest_length\n";
|
| 76 |
+
} elsif ($diff == $closest_diff) {
|
| 77 |
+
$closest_length = $length if $length < $closest_length;
|
| 78 |
+
# from two references with the same closeness to me
|
| 79 |
+
# take the *shorter* into account, not the "first" one.
|
| 80 |
+
}
|
| 81 |
+
for(my $n=1;$n<=4;$n++) {
|
| 82 |
+
my %REF_NGRAM_N = ();
|
| 83 |
+
for(my $start=0;$start<=$#WORD-($n-1);$start++) {
|
| 84 |
+
my $ngram = "$n";
|
| 85 |
+
for(my $w=0;$w<$n;$w++) {
|
| 86 |
+
$ngram .= " ".$WORD[$start+$w];
|
| 87 |
+
}
|
| 88 |
+
$REF_NGRAM_N{$ngram}++;
|
| 89 |
+
}
|
| 90 |
+
foreach my $ngram (keys %REF_NGRAM_N) {
|
| 91 |
+
if (!defined($REF_NGRAM{$ngram}) ||
|
| 92 |
+
$REF_NGRAM{$ngram} < $REF_NGRAM_N{$ngram}) {
|
| 93 |
+
$REF_NGRAM{$ngram} = $REF_NGRAM_N{$ngram};
|
| 94 |
+
# print "$i: REF_NGRAM{$ngram} = $REF_NGRAM{$ngram}<BR>\n";
|
| 95 |
+
}
|
| 96 |
+
}
|
| 97 |
+
}
|
| 98 |
+
}
|
| 99 |
+
$length_translation += $length_translation_this_sentence;
|
| 100 |
+
$length_reference += $closest_length;
|
| 101 |
+
for(my $n=1;$n<=4;$n++) {
|
| 102 |
+
my %T_NGRAM = ();
|
| 103 |
+
for(my $start=0;$start<=$#WORD-($n-1);$start++) {
|
| 104 |
+
my $ngram = "$n";
|
| 105 |
+
for(my $w=0;$w<$n;$w++) {
|
| 106 |
+
$ngram .= " ".$WORD[$start+$w];
|
| 107 |
+
}
|
| 108 |
+
$T_NGRAM{$ngram}++;
|
| 109 |
+
}
|
| 110 |
+
foreach my $ngram (keys %T_NGRAM) {
|
| 111 |
+
$ngram =~ /^(\d+) /;
|
| 112 |
+
my $n = $1;
|
| 113 |
+
# my $corr = 0;
|
| 114 |
+
# print "$i e $ngram $T_NGRAM{$ngram}<BR>\n";
|
| 115 |
+
$TOTAL[$n] += $T_NGRAM{$ngram};
|
| 116 |
+
if (defined($REF_NGRAM{$ngram})) {
|
| 117 |
+
if ($REF_NGRAM{$ngram} >= $T_NGRAM{$ngram}) {
|
| 118 |
+
$CORRECT[$n] += $T_NGRAM{$ngram};
|
| 119 |
+
# $corr = $T_NGRAM{$ngram};
|
| 120 |
+
# print "$i e correct1 $T_NGRAM{$ngram}<BR>\n";
|
| 121 |
+
}
|
| 122 |
+
else {
|
| 123 |
+
$CORRECT[$n] += $REF_NGRAM{$ngram};
|
| 124 |
+
# $corr = $REF_NGRAM{$ngram};
|
| 125 |
+
# print "$i e correct2 $REF_NGRAM{$ngram}<BR>\n";
|
| 126 |
+
}
|
| 127 |
+
}
|
| 128 |
+
# $REF_NGRAM{$ngram} = 0 if !defined $REF_NGRAM{$ngram};
|
| 129 |
+
# print STDERR "$ngram: {$s, $REF_NGRAM{$ngram}, $T_NGRAM{$ngram}, $corr}\n"
|
| 130 |
+
}
|
| 131 |
+
}
|
| 132 |
+
$s++;
|
| 133 |
+
}
|
| 134 |
+
my $brevity_penalty = 1;
|
| 135 |
+
my $bleu = 0;
|
| 136 |
+
|
| 137 |
+
my @bleu=();
|
| 138 |
+
|
| 139 |
+
for(my $n=1;$n<=4;$n++) {
|
| 140 |
+
if (defined ($TOTAL[$n])){
|
| 141 |
+
$bleu[$n]=($TOTAL[$n])?$CORRECT[$n]/$TOTAL[$n]:0;
|
| 142 |
+
# print STDERR "CORRECT[$n]:$CORRECT[$n] TOTAL[$n]:$TOTAL[$n]\n";
|
| 143 |
+
}else{
|
| 144 |
+
$bleu[$n]=0;
|
| 145 |
+
}
|
| 146 |
+
}
|
| 147 |
+
|
| 148 |
+
if ($length_reference==0){
|
| 149 |
+
printf "BLEU = 0, 0/0/0/0 (BP=0, ratio=0, hyp_len=0, ref_len=0)\n";
|
| 150 |
+
exit(1);
|
| 151 |
+
}
|
| 152 |
+
|
| 153 |
+
if ($length_translation<$length_reference) {
|
| 154 |
+
$brevity_penalty = exp(1-$length_reference/$length_translation);
|
| 155 |
+
}
|
| 156 |
+
$bleu = $brevity_penalty * exp((my_log( $bleu[1] ) +
|
| 157 |
+
my_log( $bleu[2] ) +
|
| 158 |
+
my_log( $bleu[3] ) +
|
| 159 |
+
my_log( $bleu[4] ) ) / 4) ;
|
| 160 |
+
printf "BLEU = %.2f, %.1f/%.1f/%.1f/%.1f (BP=%.3f, ratio=%.3f, hyp_len=%d, ref_len=%d)\n",
|
| 161 |
+
100*$bleu,
|
| 162 |
+
100*$bleu[1],
|
| 163 |
+
100*$bleu[2],
|
| 164 |
+
100*$bleu[3],
|
| 165 |
+
100*$bleu[4],
|
| 166 |
+
$brevity_penalty,
|
| 167 |
+
$length_translation / $length_reference,
|
| 168 |
+
$length_translation,
|
| 169 |
+
$length_reference;
|
| 170 |
+
|
| 171 |
+
|
| 172 |
+
print STDERR "It is not advisable to publish scores from multi-bleu.perl. The scores depend on your tokenizer, which is unlikely to be reproducible from your paper or consistent across research groups. Instead you should detokenize then use mteval-v14.pl, which has a standard tokenization. Scores from multi-bleu.perl can still be used for internal purposes when you have a consistent tokenizer.\n";
|
| 173 |
+
|
| 174 |
+
# Natural logarithm with a large negative sentinel for zero/false input,
# so a zero n-gram precision contributes a huge penalty instead of -inf.
sub my_log {
    my ($x) = @_;
    return -9999999999 unless $x;
    return log($x);
}
|
mosesdecoder/scripts/generic/multi_moses.py
ADDED
|
@@ -0,0 +1,332 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python
|
| 2 |
+
|
| 3 |
+
# Written by Michael Denkowski
|
| 4 |
+
#
|
| 5 |
+
# This file is part of moses. Its use is licensed under the GNU Lesser General
|
| 6 |
+
# Public License version 2.1 or, at your option, any later version.
|
| 7 |
+
|
| 8 |
+
'''Parallelize decoding with multiple instances of moses on a local machine
|
| 9 |
+
|
| 10 |
+
To use with mert-moses.pl, activate --multi-moses and set the number of moses
|
| 11 |
+
instances and threads per instance with --decoder-flags='--threads P:T:E'
|
| 12 |
+
|
| 13 |
+
This script runs a specified number of moses instances, each using one or more
|
| 14 |
+
threads. The highest speed is generally seen with many single-threaded
|
| 15 |
+
instances while the lowest memory usage is seen with a single many-threaded
|
| 16 |
+
instance. It is recommended to use the maximum number of instances that will
|
| 17 |
+
fit into memory (up to the number of available CPUs) and distribute CPUs across
|
| 18 |
+
them equally. For example, a machine with 32 CPUs that can fit 3 copies of
|
| 19 |
+
moses into memory would use --threads 2:11:10 for 2 instances with 11 threads
|
| 20 |
+
each and an extra instance with 10 threads (3 instances total using all CPUs).
|
| 21 |
+
|
| 22 |
+
Memory mapped models can be shared by multiple processes and increase the number
|
| 23 |
+
of instances that can fit into memory:
|
| 24 |
+
|
| 25 |
+
Mmaped phrase tables (Ulrich Germann)
|
| 26 |
+
http://www.statmt.org/moses/?n=Advanced.Incremental#ntoc3
|
| 27 |
+
|
| 28 |
+
Mmaped mapped language models (Kenneth Heafield)
|
| 29 |
+
http://www.statmt.org/moses/?n=FactoredTraining.BuildingLanguageModel#ntoc19
|
| 30 |
+
'''
|
| 31 |
+
|
| 32 |
+
import collections
|
| 33 |
+
import os
|
| 34 |
+
import Queue
|
| 35 |
+
import signal
|
| 36 |
+
import subprocess
|
| 37 |
+
import sys
|
| 38 |
+
import threading
|
| 39 |
+
import time
|
| 40 |
+
|
| 41 |
+
HELP = '''Multiple process decoding with Moses
|
| 42 |
+
|
| 43 |
+
Usage:
|
| 44 |
+
{} moses --config moses.ini [options] [decoder flags]
|
| 45 |
+
|
| 46 |
+
Options:
|
| 47 |
+
--threads P:T:E
|
| 48 |
+
P: Number of parallel instances to run
|
| 49 |
+
T: Number of threads per instance
|
| 50 |
+
E: Number of threads in optional extra instance
|
| 51 |
+
(default 1:1:0, overrides [threads] in moses.ini. Specifying T
|
| 52 |
+
and E is optional, e.g. --threads 16 starts 16 single-threaded
|
| 53 |
+
instances)
|
| 54 |
+
--n-best-list nbest.out N [distinct]: location and size of N-best list
|
| 55 |
+
--show-weights: for mert-moses.pl, just call moses and exit
|
| 56 |
+
|
| 57 |
+
Other options (decoder flags) are passed through to moses instances
|
| 58 |
+
'''
|
| 59 |
+
|
| 60 |
+
# Defaults
|
| 61 |
+
INPUT = sys.stdin
|
| 62 |
+
PROCS = 1
|
| 63 |
+
THREADS = 1
|
| 64 |
+
EXTRA = 0
|
| 65 |
+
DONE = threading.Event()
|
| 66 |
+
PID = os.getpid()
|
| 67 |
+
# A very long time, used as Queue operation timeout even though we don't
|
| 68 |
+
# actually want a timeout but we do want interruptibility
|
| 69 |
+
# (https://bugs.python.org/issue1360)
|
| 70 |
+
NEVER = 60 * 60 * 24 * 365 * 1000
|
| 71 |
+
|
| 72 |
+
# Single unit of computation: decode a line, output result, signal done
|
| 73 |
+
Task = collections.namedtuple('Task', ['id', 'line', 'out', 'event'])
|
| 74 |
+
|
| 75 |
+
|
| 76 |
+
def kill_main(msg):
    '''Report msg on stderr, then SIGKILL the main process so that every
    thread stops immediately.'''
    sys.stderr.write('%s\n' % (msg,))
    os.kill(PID, signal.SIGKILL)
|
| 80 |
+
|
| 81 |
+
|
| 82 |
+
def gzopen(f):
    '''Open plain or gzipped text.

    Returns a file object: binary mode for .gz inputs, text mode otherwise
    (matching the original behavior).
    '''
    # Fix: the module never imports gzip, so any .gz input raised
    # NameError at runtime; import it locally here.
    import gzip
    return gzip.open(f, 'rb') if f.endswith('.gz') else open(f, 'r')
|
| 85 |
+
|
| 86 |
+
|
| 87 |
+
def run_instance(cmd_base, threads, tasks, cpu_affinity, cpu_offset, n_best=False):
    '''Run an instance of moses that processes tasks (input lines) from a
    queue using a specified number of threads.

    cmd_base: moses command line as a list; --threads is appended here.
    tasks: shared queue of Task tuples; a Task whose event is the DONE
        sentinel tells this instance to stop accepting work.
    n_best: when True, each input line is followed by a blank sentinel
        line and n-best output is re-numbered with the task's real id.
    On any exception the whole program is killed via kill_main(), so this
    function never reports errors to its caller.
    '''
    cmd = cmd_base[:]
    cmd.append('--threads')
    cmd.append(str(threads))

    if cpu_affinity:
        cmd.append('--cpu-affinity-offset')
        cmd.append(str(cpu_offset))

    #print 'BEFORE'
    #print cmd
    #print 'AFTER\n'

    try:
        # Queue of tasks instance is currently working on, limited to the number
        # of threads * 2 (minimal buffering). The queue should be kept full for
        # optimal CPU usage.
        work = Queue.Queue(maxsize=(threads * 2))
        # Multi-threaded instance
        moses = subprocess.Popen(cmd, stdin=subprocess.PIPE, stdout=subprocess.PIPE)

        # Read and handle instance output as available
        def handle_output():
            while True:
                # Output line triggers task completion
                line = moses.stdout.readline()
                # End of output (instance finished)
                if not line:
                    break
                # NOTE(review): matching the oldest queued task to this line
                # assumes moses emits one result per input, in input order —
                # confirm against moses's output contract.
                task = work.get(timeout=NEVER)
                if n_best:
                    # Read and copy lines until sentinel line, copy real line id
                    # id ||| hypothesis words ||| feature scores ||| total score
                    (first_i, rest) = line.split(' ||| ', 1)
                    task.out.append(' ||| '.join((task.id, rest)))
                    while True:
                        line = moses.stdout.readline()
                        (i, rest) = line.split(' ||| ', 1)
                        # Sentinel: a changed id marks the start of the next
                        # input's n-best block (presumably produced by the
                        # blank line written below — verify).
                        if i != first_i:
                            break
                        task.out.append(' ||| '.join((task.id, rest)))
                else:
                    task.out.append(line)
                # Signal task done
                task.event.set()
        # Output thread
        handler = threading.Thread(target=handle_output, args=())
        # Daemon: guaranteed to finish before non-daemons
        handler.setDaemon(True)
        handler.start()

        # Input thread: take tasks as they are available and add them to work
        # queue. Stop when DONE encountered.
        while True:
            task = tasks.get(timeout=NEVER)
            work.put(task, timeout=NEVER)
            if task.event == DONE:
                break
            if n_best:
                # Input line followed by blank line (sentinel)
                moses.stdin.write(task.line)
                moses.stdin.write('\n')
            else:
                moses.stdin.write(task.line)

        # Cleanup: close stdin so moses drains and exits, then join the
        # output reader.
        moses.stdin.close()
        moses.wait()
        handler.join()

    except:
        kill_main('Error with moses instance: see stderr')
|
| 162 |
+
|
| 163 |
+
|
| 164 |
+
def write_results(results, n_best=False, n_best_out=None):
    '''Drain the results queue, writing each finished task's output in order.'''
    while True:
        task = results.get(timeout=NEVER)
        # Sentinel task signals that no more results will arrive.
        if task.event == DONE:
            break
        # Block until the decoding instance marks this task complete.
        task.event.wait()
        if not n_best:
            # Single-best mode: the only output line goes straight to stdout.
            sys.stdout.write(task.out[0])
            sys.stdout.flush()
            continue
        # N-best mode: each stored line has the form
        #   id ||| hypothesis words ||| feature scores ||| total score
        top_best = task.out[0].split(' ||| ', 2)[1]
        # Mirror the top hypothesis on stdout unless the N-best list itself
        # is already being written to stdout ("-").
        if n_best_out != sys.stdout:
            sys.stdout.write('{}\n'.format(top_best))
            sys.stdout.flush()
        for out_line in task.out:
            n_best_out.write(out_line)
        n_best_out.flush()
|
| 185 |
+
|
| 186 |
+
|
| 187 |
+
def main(argv):
    '''Parse wrapper options, launch moses instances, and feed them input.'''
    # Defaults (module-level constants supply the fallbacks)
    moses_ini = None
    input = INPUT
    procs = PROCS
    threads = THREADS
    extra = EXTRA
    n_best = False
    n_best_file = None
    n_best_size = None
    n_best_distinct = False
    n_best_out = None
    show_weights = False
    cpu_affinity = False

    # Decoder command: everything after this script's own name
    cmd = argv[1:]

    # Scan for wrapper-specific options, removing them from cmd where moses
    # itself must not see them.
    i = 1
    while i < len(cmd):
        if cmd[i] in ('-f', '-config', '--config'):
            moses_ini = cmd[i + 1]
            # Left in cmd: moses needs its config flag too
            i += 2
        elif cmd[i] in ('-i', '-input-file', '--input-file'):
            input = gzopen(cmd[i + 1])
            cmd = cmd[:i] + cmd[i + 2:]
        elif cmd[i] in ('-th', '-threads', '--threads'):
            # Format P:T:E = processes : threads-per-process : extra threads
            args = cmd[i + 1].split(':')
            procs = int(args[0])
            if len(args) > 1:
                threads = int(args[1])
            if len(args) > 2:
                extra = int(args[2])
            cmd = cmd[:i] + cmd[i + 2:]
        elif cmd[i] in ('-n-best-list', '--n-best-list'):
            n_best = True
            n_best_file = cmd[i + 1]
            n_best_size = cmd[i + 2]
            # Optional trailing "distinct" keyword
            if i + 3 < len(cmd) and cmd[i + 3] == 'distinct':
                n_best_distinct = True
                cmd = cmd[:i] + cmd[i + 4:]
            else:
                cmd = cmd[:i] + cmd[i + 3:]
        elif cmd[i] in ('-show-weights', '--show-weights'):
            # Handled specially for mert-moses.pl; left in cmd
            show_weights = True
            i += 1
        elif cmd[i] in ('-cpu-affinity', '--cpu-affinity'):
            cpu_affinity = True
            cmd = cmd[:i] + cmd[i + 1:]
        else:
            i += 1

    # If mert-moses.pl probes weights with -show-weights, delegate to a
    # single plain moses call and exit.
    if show_weights:
        sys.stdout.write(subprocess.check_output(cmd))
        sys.stdout.flush()
        return

    # Validate inputs
    if not (len(cmd) > 0 and moses_ini):
        sys.stderr.write(HELP.format(os.path.basename(argv[0])))
        sys.exit(2)
    if not (os.path.isfile(cmd[0]) and os.access(cmd[0], os.X_OK)):
        raise Exception('moses "{}" is not executable\n'.format(cmd[0]))

    # Report settings
    sys.stderr.write('Moses flags: {}\n'.format(' '.join('\'{}\''.format(s) if ' ' in s else s for s in cmd[1:])))
    sys.stderr.write('Instances: {}\n'.format(procs))
    sys.stderr.write('Threads per: {}\n'.format(threads))
    if extra:
        sys.stderr.write('Extra: {}\n'.format(extra))
    if n_best:
        sys.stderr.write('N-best list: {} ({}{})\n'.format(n_best_file, n_best_size, ', distinct' if n_best_distinct else ''))

    # Task and result queues (buffer 8 * total threads input lines)
    tasks = Queue.Queue(maxsize=(8 * ((procs * threads) + extra)))
    results = Queue.Queue()

    # N-best capture: each instance writes its N-best list to its stdout
    if n_best:
        cmd.append('--n-best-list')
        cmd.append('-')
        cmd.append(n_best_size)
        if n_best_distinct:
            cmd.append('distinct')
        if n_best_file == '-':
            n_best_out = sys.stdout
        else:
            n_best_out = open(n_best_file, 'w')

    # Start decoder instances (plus one more when extra threads requested)
    cpu_offset = -threads
    instances = []
    for idx in range(procs + (1 if extra else 0)):
        if cpu_affinity:
            cpu_offset += threads

        worker = threading.Thread(target=run_instance, args=(cmd, (threads if idx < procs else extra), tasks, cpu_affinity, cpu_offset, n_best))
        instances.append(worker)
        # Daemon: guaranteed to finish before non-daemons
        worker.setDaemon(True)
        worker.start()

    # Results writer keeps output in input order
    writer = threading.Thread(target=write_results, args=(results, n_best, n_best_out))
    writer.start()

    # Main loop: one task per input line, pushed to both queues so the
    # writer knows the original ordering.
    next_id = 0
    while True:
        line = input.readline()
        if not line:
            break
        # (input, out lines, err lines, "done" event)
        task = Task(str(next_id), line, [], threading.Event())
        results.put(task, timeout=NEVER)
        tasks.put(task, timeout=NEVER)
        next_id += 1

    # One DONE sentinel per instance, then wait for them to drain
    for worker in instances:
        tasks.put(Task(None, None, None, DONE), timeout=NEVER)
    for worker in instances:
        worker.join()

    # Stop the results writer
    results.put(Task(None, None, None, DONE), timeout=NEVER)
    writer.join()

    # Cleanup
    if n_best:
        n_best_out.close()
|
| 326 |
+
|
| 327 |
+
|
| 328 |
+
if __name__ == '__main__':
    # Any uncaught error in the driver tears everything down via kill_main,
    # so stray moses instances do not linger.
    try:
        main(sys.argv)
    except:
        kill_main('Error with main I/O: see stderr')
|
mosesdecoder/scripts/generic/ph_numbers.perl
ADDED
|
@@ -0,0 +1,106 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env perl
|
| 2 |
+
|
| 3 |
+
package ph_numbers;
|
| 4 |
+
|
| 5 |
+
# Script to recognize and replace numbers in Moses training corpora
|
| 6 |
+
# and decoder input
|
| 7 |
+
#
|
| 8 |
+
# (c) 2013 TAUS
|
| 9 |
+
#
|
| 10 |
+
# This file is part of moses. Its use is licensed under the GNU Lesser General
|
| 11 |
+
# Public License version 2.1 or, at your option, any later version.
|
| 12 |
+
|
| 13 |
+
use warnings;
|
| 14 |
+
use strict;
|
| 15 |
+
|
| 16 |
+
run() unless caller();
|
| 17 |
+
use Getopt::Std;
|
| 18 |
+
|
| 19 |
+
my $debug = $ENV{DEBUG} || 0;
|
| 20 |
+
|
| 21 |
+
# Entry point when executed as a script: parse options and rewrite each
# input line with its numbers replaced/annotated.
#
# Options:
#   -s / -t  source/target locale (accepted but currently unused here)
#   -c       corpus mode: replace numbers with the placeholder symbol
#   -l       legacy <ne translation="..."> markup mode
#   -m SYM   placeholder symbol (default '@num@')
#   -h       print usage and exit
sub run {
  my %opts;
  if(!getopts('s:t:cm:hl',\%opts) || $opts{h}) {
    print "Usage: perl $0 [-s source_locale][-t target_locale][-c][-h][-l][-m symbol] < in > out\n";
    exit;
  }
  my $sourceLocale = $opts{s} || "";
  my $targetLocale = $opts{t} || "";
  my $numberSymbol = $opts{m} || '@num@';
  while(<>) {
    chomp;
    # mark_numbers takes four arguments; the original call passed $_ a
    # second time as a fifth argument that was silently ignored - dropped.
    print mark_numbers($_,$opts{c},$opts{l},$numberSymbol),"\n";
  }
}
|
| 35 |
+
|
| 36 |
+
# Rewrite every recognized number span in $input.
# Args: input line, corpus-mode flag, legacy-mode flag, placeholder symbol.
# Returns the rewritten line:
#   corpus mode : number -> placeholder symbol
#   legacy mode : number -> <ne translation="NUMBER">symbol</ne>
#   default     : number -> <ne translation="symbol" entity="NUMBER">symbol</ne>
# (Removed the unused $input_length local from the original.)
sub mark_numbers {
  my $input = shift;
  my $corpusMode = shift;
  my $legacyMode = shift;
  my $numberSymbol = shift || '@num@';

  my $numref = recognize($input);
  my $output = "";
  my $position = 0;
  for(my $i = 0 ; $i < scalar(@{$numref}) ; $i++) {
    my $numstart = $numref->[$i][0];
    my $numend = $numref->[$i][1];
    # Copy the unchanged text between the previous number and this one.
    if($position < $numstart) {
      $output .= substr($input,$position,$numstart-$position);
    }
    my $number = substr($input,$numstart,$numend-$numstart);
    if($corpusMode) {
      $output .= $numberSymbol;
    }
    elsif($legacyMode) {
      $output .= "<ne translation=\"$number\">$numberSymbol</ne>";
    }
    else {
      $output .= "<ne translation=\"$numberSymbol\" entity=\"$number\">$numberSymbol</ne>";
    }
    $position = $numend;
  }
  # Trailing text after the last number.
  $output .= substr($input,$position);
  return $output;
}
|
| 69 |
+
|
| 70 |
+
# Scan $input for number tokens and return a reference to an array of
# [start, end) character offsets, one entry per recognized number.
# A candidate span is kept only when it is bounded by spaces (or by the
# string edges), i.e. the entire whitespace-delimited word is numeric.
sub recognize {
  my $input = shift;

  my @spans = ();
  while($input =~ /\G(.*?)(\s*)([+\-]?\p{Digit}*[\.,]?\p{Digit}+[\p{Digit}\.,+\-eE]*)/g) {
    my $start = $-[3];
    my $end = $+[3];
    # Absorb immediately following space-separated digit groups
    # (e.g. "1 000 000") into the same span; /c keeps pos() intact when
    # this inner match eventually fails, so the outer loop resumes there.
    while($input =~ /\G(\s+)(\p{Digit}+[\p{Digit}\.,+\-eE]*)/gc) {
      $end = $+[2];
    }

    # Keep the span only when it starts at a word boundary...
    my $ok = ($start == 0 || substr($input, $start - 1, 1) eq " ") ? 1 : 0;
    # ...and also ends at one.
    $ok = 0 unless ($end == length($input) || substr($input, $end, 1) eq " ");

    push @spans, [$start, $end] if $ok;
  }
  return \@spans;
}
|
| 105 |
+
|
| 106 |
+
1;
|
mosesdecoder/scripts/generic/reverse-alignment.perl
ADDED
|
@@ -0,0 +1,24 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env perl
#
# This file is part of moses. Its use is licensed under the GNU Lesser General
# Public License version 2.1 or, at your option, any later version.

# Read word-alignment lines ("i-j k-l ...") on STDIN and print each with
# every pair reversed ("j-i l-k ..."). Output format matches the original:
# every pair is followed by a space, then a newline ends the line.

use warnings;
use strict;

while (my $line = <STDIN>)
{
  chomp($line);
  foreach my $tok (split(/ /, $line))
  {
    my @pair = split(/-/, $tok);
    (@pair == 2) or die("Something wrong");
    print $pair[1] . "-" . $pair[0] . " ";
  }
  print "\n";
}
|
| 24 |
+
|
mosesdecoder/scripts/generic/score-parallel.perl
ADDED
|
@@ -0,0 +1,428 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env perl
#
# This file is part of moses. Its use is licensed under the GNU Lesser General
# Public License version 2.1 or, at your option, any later version.

# Split a (sorted) extract file into chunks on source-phrase boundaries,
# score the chunks in parallel, then merge the partial phrase tables plus
# any .coc / label side files.
#
# example
# ./score-parallel.perl 8 "gsort --batch-size=253" ./score ./extract.2.sorted.gz ./lex.2.f2e ./phrase-table.2.half.f2e --GoodTuring ./phrase-table.2.coc 0
# ./score-parallel.perl 8 "gsort --batch-size=253" ./score ./extract.2.inv.sorted.gz ./lex.2.e2f ./phrase-table.2.half.e2f --Inverse 1

use warnings;
use strict;
use File::Basename;

sub RunFork($);
sub systemCheck($);
sub GetSourcePhrase($);
sub NumStr($);
sub CutContextFile($$$);

# Prefer parallel gzip when it is on PATH.
my $GZIP_EXEC;
if(`which pigz`) {
  $GZIP_EXEC = 'pigz';
}
else {
  $GZIP_EXEC = 'gzip';
}
print STDERR "using $GZIP_EXEC \n";

# Target number of extract lines per chunk; chunks only end on a
# source-phrase boundary, so actual sizes may run slightly over.
#my $EXTRACT_SPLIT_LINES = 5000000;
my $EXTRACT_SPLIT_LINES = 50000000;

print STDERR "Started ".localtime() ."\n";

# Positional arguments; everything between $ARGV[5] and the final
# sort flag is forwarded to the score program.
my $numParallel = $ARGV[0];
$numParallel = 1 if $numParallel < 1;

my $sortCmd = $ARGV[1];
my $scoreCmd = $ARGV[2];

my $extractFile = $ARGV[3]; # 1st arg of extract argument
my $lexFile = $ARGV[4];
my $ptHalf = $ARGV[5]; # output
my $inverse = 0;
my $sourceLabelsFile;
my $partsOfSpeechFile;
my $targetSyntacticPreferencesLabelsFile;

# Collect pass-through score arguments, peeling off the ones that name
# side-output files this wrapper must merge afterwards.
my $otherExtractArgs= "";
for (my $i = 6; $i < $#ARGV; ++$i)
{
  if ($ARGV[$i] eq '--SourceLabels') {
    $sourceLabelsFile = $ARGV[++$i];
    $otherExtractArgs .= "--SourceLabels --SourceLabelCountsLHS ";
    next;
  }
  if ($ARGV[$i] eq '--PartsOfSpeech') {
    $partsOfSpeechFile = $ARGV[++$i];
    $otherExtractArgs .= "--PartsOfSpeech ";
    next;
  }
  if ($ARGV[$i] eq '--TargetSyntacticPreferences') {
    $targetSyntacticPreferencesLabelsFile = $ARGV[++$i];
    $otherExtractArgs .= "--TargetSyntacticPreferences ";
    next;
  }
  if ($ARGV[$i] eq '--Inverse') {
    $inverse = 1;
    $otherExtractArgs .= $ARGV[$i] ." ";
    next;
  }
  $otherExtractArgs .= $ARGV[$i] ." ";
}
#$scoreCmd $extractFile $lexFile $ptHalf $otherExtractArgs

# --FlexibilityScore=CMD names an external rescoring filter; it must not
# be forwarded to the score program itself.
my $FlexibilityScore = $otherExtractArgs =~ /--FlexibilityScore/;
my $FlexibilityCmd = $otherExtractArgs;
$otherExtractArgs =~ s/--FlexibilityScore=\S+//; # don't pass flexibility_score command to score program
if ($FlexibilityCmd =~ /--FlexibilityScore=(\S+)/) {
  $FlexibilityCmd = $1;
}

my $doSort = $ARGV[$#ARGV]; # last arg

# Scratch directory next to the output, unique per process.
my $TMPDIR=dirname($ptHalf) ."/tmp.$$";
mkdir $TMPDIR;

my $cmd;

my $extractFileContext;
if ($FlexibilityScore) {
  $extractFileContext = $extractFile;
  $extractFileContext =~ s/extract./extract.context./;
}

my $fileCount = 0;
if ($numParallel <= 1)
{ # don't do parallel. Just link the extract file into place
  $cmd = "ln -s $extractFile $TMPDIR/extract.0.gz";
  if ($FlexibilityScore) {
    $cmd .= " && ln -s $extractFileContext $TMPDIR/extract.context.0.gz";
  }
  print STDERR "$cmd \n";
  systemCheck($cmd);

  $fileCount = 1;
}
else
{ # cut up extract file into smaller mini-extract files.
  if ($extractFile =~ /\.gz$/) {
    open(IN, "gunzip -c $extractFile |") || die "can't open pipe to $extractFile";
  }
  else {
    open(IN, $extractFile) || die "can't open $extractFile";
  }

  my $lastlineContext;
  if ($FlexibilityScore) {
    $lastlineContext = "";
    if ($extractFileContext =~ /\.gz$/) {
      open(IN_CONTEXT, "gunzip -c $extractFileContext |") || die "can't open pipe to $extractFileContext";
    }
    else {
      open(IN_CONTEXT, $extractFileContext) || die "can't open $extractFileContext";
    }
  }

  my $filePath = "$TMPDIR/extract.$fileCount.gz";
  open (OUT, "| $GZIP_EXEC -c > $filePath") or die "error starting $GZIP_EXEC $!";

  my $lineCount = 0;
  my $line;
  my $prevSourcePhrase = "";
  while ($line=<IN>)
  {
    chomp($line);
    ++$lineCount;

    if ($lineCount > $EXTRACT_SPLIT_LINES)
    { # over line limit. Cut off at next source phrase change
      my $sourcePhrase = GetSourcePhrase($line);

      if ($prevSourcePhrase eq "")
      { # start comparing
        $prevSourcePhrase = $sourcePhrase;
      }
      elsif ($sourcePhrase eq $prevSourcePhrase)
      { # can't cut off yet. Do nothing
      }
      else
      { # cut off, open next min-extract file & write to that instead
        close OUT;

        if ($FlexibilityScore) {
          $lastlineContext = CutContextFile($prevSourcePhrase, $fileCount, $lastlineContext);
        }
        $prevSourcePhrase = "";
        $lineCount = 0;
        ++$fileCount;
        my $filePath = $fileCount;
        $filePath = "$TMPDIR/extract.$filePath.gz";
        open (OUT, "| $GZIP_EXEC -c > $filePath") or die "error starting $GZIP_EXEC $!";
      }
    }
    else
    { # keep on writing to current mini-extract file
    }

    print OUT "$line\n";

  }
  close OUT;
  if ($FlexibilityScore) {
    $lastlineContext = CutContextFile($prevSourcePhrase, $fileCount, $lastlineContext);
  }
  ++$fileCount;
}


# create run scripts, one per parallel worker
my @runFiles = (0..($numParallel-1));
for (my $i = 0; $i < $numParallel; ++$i)
{
  my $path = "$TMPDIR/run.$i.sh";
  open(my $fh, ">", $path) or die "cannot open $path: $!";
  $runFiles[$i] = $fh;
}

# write scoring of mini-extracts to run scripts, round-robin over workers
for (my $i = 0; $i < $fileCount; ++$i)
{
  my $numStr = NumStr($i);

  my $fileInd = $i % $numParallel;
  my $fh = $runFiles[$fileInd];

  my $cmd = "$scoreCmd $TMPDIR/extract.$i.gz $lexFile $TMPDIR/phrase-table.half.$numStr.gz $otherExtractArgs 2>> /dev/stderr \n";
  print STDERR $cmd;

  if ($FlexibilityScore) {
    # Post-filter the half phrase table through the flexibility scorer.
    $cmd .= "gzip -cd $TMPDIR/phrase-table.half.$numStr.gz | $FlexibilityCmd $TMPDIR/extract.context.$i.gz";
    $cmd .= " --Inverse" if ($otherExtractArgs =~ /--Inverse/);
    $cmd .= " --Hierarchical" if ($otherExtractArgs =~ /--Hierarchical/);
    $cmd .= " | $GZIP_EXEC -c > $TMPDIR/phrase-table.half.$numStr.flex.gz\n";
    $cmd .= "mv $TMPDIR/phrase-table.half.$numStr.flex.gz $TMPDIR/phrase-table.half.$numStr.gz\n";
  }

  print $fh $cmd;
}

# close run script files
for (my $i = 0; $i < $numParallel; ++$i)
{
  close($runFiles[$i]);
  my $path = "$TMPDIR/run.$i.sh";
  systemCheck("chmod +x $path");
}

# run each score script in parallel
my @children;
for (my $i = 0; $i < $numParallel; ++$i)
{
  my $cmd = "$TMPDIR/run.$i.sh";
  my $pid = RunFork($cmd);
  push(@children, $pid);
}

# wait for everything is finished
foreach (@children) {
  waitpid($_, 0);
}

# merge & sort the partial phrase tables into the final output
$cmd = "\n\nOH SHIT. This should have been filled in \n\n";
if ($fileCount == 1 && !$doSort && !$FlexibilityScore)
{
  my $numStr = NumStr(0);
  $cmd = "mv $TMPDIR/phrase-table.half.$numStr.gz $ptHalf";
}
else
{
  $cmd = "gunzip -c $TMPDIR/phrase-table.half.*.gz 2>> /dev/stderr";

  if ($doSort) {
    $cmd .= "| LC_ALL=C $sortCmd -T $TMPDIR ";
  }

  $cmd .= " | $GZIP_EXEC -c > $ptHalf 2>> /dev/stderr ";
}
print STDERR $cmd;
systemCheck($cmd);

# merge coc (count-of-counts) files by element-wise summation
my $numStr = NumStr(0);
my $cocPath = "$TMPDIR/phrase-table.half.$numStr.gz.coc";

if (-e $cocPath)
{
  my @arrayCOC;
  my $line;

  # 1st file seeds the accumulator
  open(FHCOC, $cocPath) || die "can't open pipe to $cocPath";
  while ($line = <FHCOC>)
  {
    my $coc = int($line);
    push(@arrayCOC, $coc);
  }
  close(FHCOC);

  # all other files are added element-wise
  for (my $i = 1; $i < $fileCount; ++$i)
  {
    $numStr = NumStr($i);
    $cocPath = "$TMPDIR/phrase-table.half.$numStr.gz.coc";
    open(FHCOC, $cocPath) || die "can't open pipe to $cocPath";
    my $arrayInd = 0;
    while ($line = <FHCOC>)
    {
      my $coc = int($line);
      $arrayCOC[$arrayInd] += $coc;

      ++$arrayInd;
    }

    close(FHCOC);
  }

  # output the merged counts next to the phrase table
  $cocPath = "$ptHalf.coc";
  open(FHCOC, ">", $cocPath) or die "cannot open $cocPath: $!";
  for (my $i = 0; $i < @arrayCOC; ++$i)
  {
    print FHCOC $arrayCOC[$i]."\n";
  }
  close(FHCOC);
}

# merge source labels files
if (!$inverse && defined($sourceLabelsFile))
{
  my $cmd = "(echo \"GlueTop 0\"; echo \"GlueX 1\"; echo \"SSTART 2\"; echo \"SEND 3\"; cat $TMPDIR/phrase-table.half.*.gz.syntaxLabels.src | LC_ALL=C sort | uniq | perl -pe \"s/\$/ \@{[\$.+3]}/\") > $sourceLabelsFile";
  print STDERR "Merging source labels files: $cmd \n";
  `$cmd`;
}

# merge parts-of-speech files
if (!$inverse && defined($partsOfSpeechFile))
{
  my $cmd = "(echo \"SSTART 0\"; echo \"SEND 1\"; cat $TMPDIR/phrase-table.half.*.gz.partsOfSpeech | LC_ALL=C sort | uniq | perl -pe \"s/\$/ \@{[\$.+1]}/\") > $partsOfSpeechFile";
  print STDERR "Merging parts-of-speech files: $cmd \n";
  `$cmd`;
}

# merge target syntactic preferences labels files
if (!$inverse && defined($targetSyntacticPreferencesLabelsFile))
{
  my $cmd = "(echo \"GlueTop 0\"; echo \"GlueX 1\"; cat $TMPDIR/phrase-table.half.*.gz.syntaxLabels.tgtpref | LC_ALL=C sort | uniq | perl -pe \"s/\$/ \@{[\$.+1]}/\") > $targetSyntacticPreferencesLabelsFile";
  print STDERR "Merging target syntactic preferences labels files: $cmd \n";
  `$cmd`;
}

$cmd = "rm -rf $TMPDIR \n";
print STDERR $cmd;
systemCheck($cmd);

print STDERR "Finished ".localtime() ."\n";
|
| 327 |
+
|
| 328 |
+
# -----------------------------------------
|
| 329 |
+
# -----------------------------------------
|
| 330 |
+
|
| 331 |
+
# Fork a child that runs $cmd (via systemCheck) and exits; the parent
# receives the child's pid so it can waitpid() on it later.
sub RunFork($)
{
  my $cmd = shift;

  my $pid = fork();
  # fork() returns undef on failure; the original left this unchecked,
  # so a failed fork fell through as if it were the parent. Fail loudly.
  die "fork failed: $!" unless defined($pid);

  if ($pid == 0)
  { # child
    print STDERR $cmd;
    systemCheck($cmd);
    exit();
  }
  return $pid;
}
|
| 345 |
+
# Run a shell command; on any non-zero exit status, report the failing
# command on STDERR and abort the whole script.
sub systemCheck($)
{
  my $cmd = shift;
  my $retVal = system($cmd);
  if ($retVal != 0)
  {
    # The original exited silently, leaving no clue which step failed.
    print STDERR "command failed (status $retVal): $cmd\n";
    exit(1);
  }
}
|
| 354 |
+
|
| 355 |
+
# Return the source-phrase field of an extract-file line: everything
# before the first "|||" separator (trailing space included, matching the
# comparisons done by the chunk-splitting loop).
sub GetSourcePhrase($)
{
  my $line = shift;
  my $pos = index($line, "|||");
  # index() returns -1 when no separator is present; the original then did
  # substr($line, 0, -1), silently chopping the last character. Treat a
  # separator-less line as being entirely source phrase instead.
  return $line if $pos < 0;
  my $sourcePhrase = substr($line, 0, $pos);
  return $sourcePhrase;
}
|
| 362 |
+
|
| 363 |
+
|
| 364 |
+
# Zero-pad $i to (at least) seven digits, e.g. 5 -> "0000005", so the
# per-chunk phrase-table file names sort lexicographically in numeric
# order. Equivalent to the original seven-branch if/elsif ladder
# ("000000$i" .. "0$i"); numbers of seven or more digits are unpadded.
sub NumStr($)
{
  my $i = shift;
  return sprintf("%07d", $i);
}
|
| 391 |
+
|
| 392 |
+
|
| 393 |
+
# Write the slice of the context file (global handle IN_CONTEXT) that
# belongs with the extract chunk just closed, into
# $TMPDIR/extract.context.$fileCount.gz (global $TMPDIR / $GZIP_EXEC).
#
# Args:
#   $lastsourcePhrase - final source phrase written to the extract chunk
#   $fileCount        - index of the chunk being closed
#   $lastline         - context line read past the boundary by the
#                       previous call ("" on the first call)
# Returns the first context line belonging to the NEXT chunk (or undef at
# EOF), to be passed back as $lastline on the next call.
sub CutContextFile($$$)
{
  my($lastsourcePhrase, $fileCount, $lastline) = @_;
  my $line;
  my $sourcePhrase;

  my $filePath = "$TMPDIR/extract.context.$fileCount.gz";
  open (OUT_CONTEXT, "| $GZIP_EXEC -c > $filePath") or die "error starting $GZIP_EXEC $!";

  # The carry-over line from the previous call belongs to this chunk.
  print OUT_CONTEXT "$lastline\n" if $lastline ne "";

  # Copy context lines up to and including the first one whose source
  # phrase matches the chunk's final source phrase.
  while ($line=<IN_CONTEXT>)
  {
    chomp($line);
    $sourcePhrase = GetSourcePhrase($line);
    print OUT_CONTEXT "$line\n";
    last if $sourcePhrase eq $lastsourcePhrase;
  }

  # Copy any further lines still sharing that source phrase; the first
  # line that differs is NOT written here - it is returned for the next
  # chunk instead.
  while ($line=<IN_CONTEXT>)
  {
    chomp($line);
    $sourcePhrase = GetSourcePhrase($line);
    last if $sourcePhrase ne $lastsourcePhrase;
    print OUT_CONTEXT "$line\n";
  }

  close(OUT_CONTEXT);

  return $line;

}
|
mosesdecoder/scripts/generic/score_parallel.py
ADDED
|
@@ -0,0 +1,776 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#! /usr/bin/env python
|
| 2 |
+
#
|
| 3 |
+
# This file is part of moses. Its use is licensed under the GNU Lesser General
|
| 4 |
+
# Public License version 2.1 or, at your option, any later version.
|
| 5 |
+
#
|
| 6 |
+
# Script contributed by Precision Translation Tools.
|
| 7 |
+
|
| 8 |
+
"""Run Moses `score` jobs in parallel.
|
| 9 |
+
|
| 10 |
+
This script is a replacement for `score-parallel.perl`. The two are similar,
|
| 11 |
+
but there are differences in usage. In addition, this script can be called
|
| 12 |
+
directly from Python code without the need to run it as a separate process.
|
| 13 |
+
"""
|
| 14 |
+
|
| 15 |
+
from __future__ import (
|
| 16 |
+
absolute_import,
|
| 17 |
+
print_function,
|
| 18 |
+
unicode_literals,
|
| 19 |
+
)
|
| 20 |
+
|
| 21 |
+
__metaclass__ = type
|
| 22 |
+
|
| 23 |
+
from argparse import ArgumentParser
|
| 24 |
+
from contextlib import contextmanager
|
| 25 |
+
from datetime import datetime
|
| 26 |
+
import errno
|
| 27 |
+
import gzip
|
| 28 |
+
from multiprocessing import Pool
|
| 29 |
+
import os
|
| 30 |
+
import os.path
|
| 31 |
+
import pipes
|
| 32 |
+
from shutil import rmtree
|
| 33 |
+
from subprocess import check_call
|
| 34 |
+
import sys
|
| 35 |
+
import tempfile
|
| 36 |
+
|
| 37 |
+
|
| 38 |
+
def get_unicode_type():
|
| 39 |
+
"""Return the Unicode string type appropriate to this Python version."""
|
| 40 |
+
if sys.version_info.major <= 2:
|
| 41 |
+
# Unicode string type. In Python 2 this is the "unicode" type,
|
| 42 |
+
# while "str" is a binary string type.
|
| 43 |
+
return unicode
|
| 44 |
+
else:
|
| 45 |
+
# Unicode string type. In Python 3 this is the default "str" type.
|
| 46 |
+
# The binary string type is now called "bytes".
|
| 47 |
+
return str
|
| 48 |
+
|
| 49 |
+
|
| 50 |
+
UNICODE_TYPE = get_unicode_type()
|
| 51 |
+
|
| 52 |
+
|
| 53 |
+
class CommandLineError(Exception):
|
| 54 |
+
"""Invalid command line."""
|
| 55 |
+
|
| 56 |
+
|
| 57 |
+
class ProgramFailure(Exception):
|
| 58 |
+
"""Failure, not a bug, which is reported neatly to the user."""
|
| 59 |
+
|
| 60 |
+
|
| 61 |
+
def parse_args():
|
| 62 |
+
"""Parse command line arguments, return as `Namespace`."""
|
| 63 |
+
parser = ArgumentParser(description=__doc__)
|
| 64 |
+
parser.add_argument(
|
| 65 |
+
'--extract-file', '-e', metavar='PATH', required=True,
|
| 66 |
+
help=(
|
| 67 |
+
"Path to input file: extract file (e.g. 'extract.sorted.gz' or "
|
| 68 |
+
"'extract.inv.sorted.gz'). Required."))
|
| 69 |
+
parser.add_argument(
|
| 70 |
+
'--lex-file', '-l', metavar='PATH', required=True,
|
| 71 |
+
help=(
|
| 72 |
+
"Path to input file: lex file (e.g. 'lex.f2e' or 'lex.e2f'). "
|
| 73 |
+
"Required."))
|
| 74 |
+
parser.add_argument(
|
| 75 |
+
'--output', '-o', metavar='PATH', required=True,
|
| 76 |
+
help=(
|
| 77 |
+
"Write phrase table to file PATH (e.g. 'phrase-table.half.f2e' "
|
| 78 |
+
"or 'phrase-table.half.e2f'). Required."))
|
| 79 |
+
parser.add_argument(
|
| 80 |
+
'--inverse', '-i', action='store_true',
|
| 81 |
+
help="Inverse scoring. Defaults to direct scoring.")
|
| 82 |
+
parser.add_argument(
|
| 83 |
+
'--labels-file', '-L', metavar='PATH',
|
| 84 |
+
help="Also write source labels to file PATH.")
|
| 85 |
+
parser.add_argument(
|
| 86 |
+
'--parts-of-speech', '-p', metavar='PATH',
|
| 87 |
+
help="Also write parts-of-speech file to PATH.")
|
| 88 |
+
parser.add_argument(
|
| 89 |
+
'--flexibility-score', '-F', metavar='PATH',
|
| 90 |
+
help="Path to the 'flexibility_score.py' script. Defaults to none.")
|
| 91 |
+
parser.add_argument(
|
| 92 |
+
'--hierarchical', '-H', action='store_true',
|
| 93 |
+
help="Process hierarchical rules.")
|
| 94 |
+
parser.add_argument(
|
| 95 |
+
'--args', '-a', metavar='ARGUMENTS',
|
| 96 |
+
help="Additional arguments for `score` and `flexibility_score`.")
|
| 97 |
+
parser.add_argument(
|
| 98 |
+
'--sort', '-s', action='store_true',
|
| 99 |
+
help="Sort output file.")
|
| 100 |
+
parser.add_argument(
|
| 101 |
+
'--jobs', '-j', metavar='N', type=int, default=1,
|
| 102 |
+
help="Run up to N jobs in parallel. Defaults to %(default)s.")
|
| 103 |
+
parser.add_argument(
|
| 104 |
+
'--score-exe', '-x', metavar='PROGRAM',
|
| 105 |
+
help="Name of, or path to, the 'score' executable.")
|
| 106 |
+
parser.add_argument(
|
| 107 |
+
'--sort-command', '-S', metavar='COMMAND-LINE',
|
| 108 |
+
help=(
|
| 109 |
+
"Command line for sorting text files to standard output. "
|
| 110 |
+
"Must support operation as a pipe, as well as input files named "
|
| 111 |
+
"as command-line arguments."))
|
| 112 |
+
parser.add_argument(
|
| 113 |
+
'--gzip-command', '-z', metavar='PROGRAM',
|
| 114 |
+
help="Path to a gzip or pigz executable.")
|
| 115 |
+
parser.add_argument(
|
| 116 |
+
'--verbose', '-v', action='store_true',
|
| 117 |
+
help="Print what's going on.")
|
| 118 |
+
parser.add_argument(
|
| 119 |
+
'--debug', '-d', action='store_true',
|
| 120 |
+
help="Don't delete temporary directories when done.")
|
| 121 |
+
return parser.parse_args()
|
| 122 |
+
|
| 123 |
+
|
| 124 |
+
def normalize_path(optional_path=None):
|
| 125 |
+
"""Return a cleaned-up version of a given filesystem path, or None.
|
| 126 |
+
|
| 127 |
+
Converts the path to the operating system's native conventions, and
|
| 128 |
+
removes redundancies like `.`.
|
| 129 |
+
|
| 130 |
+
The return value will be `None`, an absolute path, or a relative path,
|
| 131 |
+
same as the argument. But it will have redundant path separators,
|
| 132 |
+
unnecessary detours through parent directories, and use of the current
|
| 133 |
+
directory "." removed.
|
| 134 |
+
"""
|
| 135 |
+
if optional_path is None:
|
| 136 |
+
return None
|
| 137 |
+
else:
|
| 138 |
+
path = os.path.normpath(optional_path)
|
| 139 |
+
path = path.replace('/', os.path.sep)
|
| 140 |
+
path = path.replace('\\', os.path.sep)
|
| 141 |
+
return path
|
| 142 |
+
|
| 143 |
+
|
| 144 |
+
def quote(path):
|
| 145 |
+
"""Quote and escape a filename for use in a shell command.
|
| 146 |
+
|
| 147 |
+
The Windows implementation is very limited and will break on anything
|
| 148 |
+
more advanced than a space.
|
| 149 |
+
"""
|
| 150 |
+
if os.name == 'posix':
|
| 151 |
+
return pipes.quote(path)
|
| 152 |
+
else:
|
| 153 |
+
# TODO: Improve escaping for Windows.
|
| 154 |
+
return '"%s"' % path
|
| 155 |
+
|
| 156 |
+
|
| 157 |
+
def sanitize_args(args):
|
| 158 |
+
"""Check `args` for sanity, clean up, and set nontrivial defaults."""
|
| 159 |
+
if args.jobs < 1:
|
| 160 |
+
raise CommandLineError("Number of parallel jobs must be 1 or more.")
|
| 161 |
+
if args.sort_command is None:
|
| 162 |
+
args.sort_command = find_first_executable(
|
| 163 |
+
['neandersort', 'gsort', 'sort'])
|
| 164 |
+
if args.sort_command is None:
|
| 165 |
+
raise CommandLineError(
|
| 166 |
+
"No 'sort' command is available. "
|
| 167 |
+
"Choose one using the --sort-command option.")
|
| 168 |
+
if args.gzip_command is None:
|
| 169 |
+
args.gzip_command = find_first_executable(['pigz', 'gzip'])
|
| 170 |
+
if args.gzip_command is None:
|
| 171 |
+
raise CommandLineError(
|
| 172 |
+
"No 'gzip' or 'pigz' command is available. "
|
| 173 |
+
"Choose one using the --gzip-command option.")
|
| 174 |
+
if args.score_exe is None:
|
| 175 |
+
# Look for "score" executable. It may be in the current project
|
| 176 |
+
# directory somewhere, or in the PATH.
|
| 177 |
+
moses_dir = os.path.dirname(os.path.dirname(
|
| 178 |
+
os.path.abspath(__file__)))
|
| 179 |
+
args.score_exe = find_first_executable(
|
| 180 |
+
['score'],
|
| 181 |
+
[
|
| 182 |
+
moses_dir,
|
| 183 |
+
os.path.join(moses_dir, 'phrase-extract'),
|
| 184 |
+
os.path.join(moses_dir, 'binaries'),
|
| 185 |
+
])
|
| 186 |
+
args.extract_file = normalize_path(args.extract_file)
|
| 187 |
+
args.lex_file = normalize_path(args.lex_file)
|
| 188 |
+
args.output = normalize_path(args.output)
|
| 189 |
+
args.labels_file = normalize_path(args.labels_file)
|
| 190 |
+
args.parts_of_speech = normalize_path(args.parts_of_speech)
|
| 191 |
+
args.flexibility_score = normalize_path(args.flexibility_score)
|
| 192 |
+
args.score_exe = normalize_path(args.score_exe)
|
| 193 |
+
|
| 194 |
+
|
| 195 |
+
def add_exe_suffix(program):
|
| 196 |
+
"""Return the full filename for an executable.
|
| 197 |
+
|
| 198 |
+
On Windows, this adds a `.exe` suffix to the name. On other
|
| 199 |
+
systems, it returns the original name unchanged.
|
| 200 |
+
"""
|
| 201 |
+
if os.name == 'nt':
|
| 202 |
+
# Windows.
|
| 203 |
+
return program + '.exe'
|
| 204 |
+
else:
|
| 205 |
+
# Assume POSIX or similar.
|
| 206 |
+
return program
|
| 207 |
+
|
| 208 |
+
|
| 209 |
+
def find_executable(exe, extra_path=None):
|
| 210 |
+
"""Return full path to an executable of the given name, or `None`.
|
| 211 |
+
|
| 212 |
+
If the given name is a qualified path to an executable, it will be returned
|
| 213 |
+
unchanged. A qualified path where no executable is found results in a
|
| 214 |
+
`CommandLineError`.
|
| 215 |
+
"""
|
| 216 |
+
if extra_path is None:
|
| 217 |
+
extra_path = []
|
| 218 |
+
|
| 219 |
+
if os.path.sep in exe:
|
| 220 |
+
# The executable name includes a path. Only one place it can be.
|
| 221 |
+
if not os.path.isfile(exe) or not os.access(exe, os.X_OK):
|
| 222 |
+
raise CommandLineError("Not an executable: '%s'." % exe)
|
| 223 |
+
return exe
|
| 224 |
+
|
| 225 |
+
for path in extra_path + os.getenv('PATH').split(os.pathsep):
|
| 226 |
+
full_path = os.path.join(path, exe)
|
| 227 |
+
if os.access(full_path, os.X_OK):
|
| 228 |
+
return full_path
|
| 229 |
+
return None
|
| 230 |
+
|
| 231 |
+
|
| 232 |
+
def find_first_executable(candidates, extra_path=None):
|
| 233 |
+
"""Find the first available of the given candidate programs.
|
| 234 |
+
|
| 235 |
+
:raise ProgramFailure: If none of `candidates` was found.
|
| 236 |
+
"""
|
| 237 |
+
for program in candidates:
|
| 238 |
+
executable = find_executable(add_exe_suffix(program), extra_path)
|
| 239 |
+
if executable is not None:
|
| 240 |
+
return executable
|
| 241 |
+
raise ProgramFailure(
|
| 242 |
+
"Could not find any of these executables in path: %s."
|
| 243 |
+
% ', '.join(candidates))
|
| 244 |
+
|
| 245 |
+
|
| 246 |
+
def execute_shell(command, verbose=False):
|
| 247 |
+
"""Run `command` string through the shell.
|
| 248 |
+
|
| 249 |
+
Inherits environment, but sets `LC_ALL` to `C` for predictable results,
|
| 250 |
+
especially from sort commands.
|
| 251 |
+
|
| 252 |
+
This uses a full-featured shell, including pipes, substitution, etc. So
|
| 253 |
+
remember to quote/escape arguments where appropriate!
|
| 254 |
+
"""
|
| 255 |
+
assert isinstance(command, UNICODE_TYPE), (
|
| 256 |
+
"Wrong argument for execute_shell.")
|
| 257 |
+
if verbose:
|
| 258 |
+
print("Executing: %s" % command)
|
| 259 |
+
env = os.environ.copy()
|
| 260 |
+
if os.name == 'posix':
|
| 261 |
+
env['LC_ALL'] = 'C'
|
| 262 |
+
check_call(command, shell=True, env=env)
|
| 263 |
+
|
| 264 |
+
|
| 265 |
+
@contextmanager
|
| 266 |
+
def tempdir(keep=False):
|
| 267 |
+
"""Context manager: temporary directory."""
|
| 268 |
+
directory = tempfile.mkdtemp()
|
| 269 |
+
yield directory
|
| 270 |
+
if not keep:
|
| 271 |
+
rmtree(directory)
|
| 272 |
+
|
| 273 |
+
|
| 274 |
+
def make_dirs(path):
|
| 275 |
+
"""Equivalent to `mkdir -p -- path`."""
|
| 276 |
+
try:
|
| 277 |
+
os.makedirs(path)
|
| 278 |
+
except OSError as error:
|
| 279 |
+
if error.errno != errno.EEXIST:
|
| 280 |
+
raise
|
| 281 |
+
|
| 282 |
+
|
| 283 |
+
def open_file(path, mode='r'):
|
| 284 |
+
"""Open a file, which may be gzip-compressed."""
|
| 285 |
+
if path.endswith('.gz'):
|
| 286 |
+
return gzip.open(path, mode)
|
| 287 |
+
else:
|
| 288 |
+
return open(path, mode)
|
| 289 |
+
|
| 290 |
+
|
| 291 |
+
def count_lines(filename):
|
| 292 |
+
"""Count the number of lines in `filename` (may be gzip-compressed)."""
|
| 293 |
+
count = 0
|
| 294 |
+
with open_file(filename) as stream:
|
| 295 |
+
for _ in stream:
|
| 296 |
+
count += 1
|
| 297 |
+
return count
|
| 298 |
+
|
| 299 |
+
|
| 300 |
+
def set_temp_dir():
|
| 301 |
+
"""Set temporary directory to `$MOSES_TEMP_DIR`, if set.
|
| 302 |
+
|
| 303 |
+
Create the directory if necessary.
|
| 304 |
+
"""
|
| 305 |
+
temp_dir = os.getenv('MOSES_TEMP_DIR')
|
| 306 |
+
if temp_dir is not None:
|
| 307 |
+
make_dirs(temp_dir)
|
| 308 |
+
tempfile.tempdir = temp_dir
|
| 309 |
+
|
| 310 |
+
|
| 311 |
+
def strip_newline(line):
|
| 312 |
+
"""Remove trailing carriage return and/or line feed, if present."""
|
| 313 |
+
if line.endswith('\n'):
|
| 314 |
+
line = line[:-1]
|
| 315 |
+
if line.endswith('\r'):
|
| 316 |
+
line = line[:-1]
|
| 317 |
+
return line
|
| 318 |
+
|
| 319 |
+
|
| 320 |
+
def open_chunk_file(split_dir, chunk_number):
|
| 321 |
+
"""Open a file to write one chunk of the extract file."""
|
| 322 |
+
return open_file(
|
| 323 |
+
os.path.join(split_dir, 'extract.%d.gz' % chunk_number), 'w')
|
| 324 |
+
|
| 325 |
+
|
| 326 |
+
def name_context_chunk_file(split_dir, chunk_number):
|
| 327 |
+
"""Compose file name for one chunk of the extract context file."""
|
| 328 |
+
return os.path.join(
|
| 329 |
+
split_dir, 'extract.context.%d.gz' % chunk_number)
|
| 330 |
+
|
| 331 |
+
|
| 332 |
+
def extract_source_phrase(line):
|
| 333 |
+
"""Extract the source phrase from an extract-file line."""
|
| 334 |
+
return line.split(b'|||', 1)[0]
|
| 335 |
+
|
| 336 |
+
|
| 337 |
+
def cut_context_file(last_source_phrase, chunk_file, last_line,
|
| 338 |
+
context_stream):
|
| 339 |
+
"""Write one chunk of extract context file into its own file.
|
| 340 |
+
|
| 341 |
+
:param last_source_phrase: Last source phrase that should be in the
|
| 342 |
+
chunk. Stop processing after this source phrase.
|
| 343 |
+
:param chunk_file: Path to the extract context file for this chunk.
|
| 344 |
+
:param last_line: Previously read line that may still need writing.
|
| 345 |
+
:param context_stream: Extract context file, opened for reading.
|
| 346 |
+
:return: Last line read from `context_stream`. This line will still
|
| 347 |
+
need processing.
|
| 348 |
+
"""
|
| 349 |
+
# TODO: Use open_file.
|
| 350 |
+
with gzip.open(chunk_file, 'w') as chunk:
|
| 351 |
+
if last_line is not None:
|
| 352 |
+
chunk.write('%s\n' % last_line)
|
| 353 |
+
|
| 354 |
+
# Are we processing our last source phrase yet?
|
| 355 |
+
on_last_source_phrase = False
|
| 356 |
+
|
| 357 |
+
# Write all lines in context file until we meet last source phrase
|
| 358 |
+
# in extract file.
|
| 359 |
+
for line in context_stream:
|
| 360 |
+
# Reading from a gzip file returns lines *including the newline*.
|
| 361 |
+
# Either way, we want to ignore carriage returns as well.
|
| 362 |
+
line = strip_newline(line)
|
| 363 |
+
source_phrase = extract_source_phrase(line)
|
| 364 |
+
if on_last_source_phrase and source_phrase != last_source_phrase:
|
| 365 |
+
# First new source phrase after our last one. We're done.
|
| 366 |
+
return line
|
| 367 |
+
else:
|
| 368 |
+
# Still adding lines to our chunk.
|
| 369 |
+
chunk.write('%s\n' % line)
|
| 370 |
+
if source_phrase == last_source_phrase:
|
| 371 |
+
# We're on our last source phrase now.
|
| 372 |
+
on_last_source_phrase = True
|
| 373 |
+
|
| 374 |
+
|
| 375 |
+
def split_extract_files(split_dir, extract_file, extract_context_file=None,
|
| 376 |
+
jobs=1):
|
| 377 |
+
"""Split extract file into chunks, so we can process them in parallel.
|
| 378 |
+
|
| 379 |
+
:param split_dir: A temporary directory where this function can write
|
| 380 |
+
temporary files. The caller must ensure that this directory will be
|
| 381 |
+
cleaned up after it's done with the files.
|
| 382 |
+
:return: An iterable of tuples. Each tuple hols a partial extract file,
|
| 383 |
+
and the corresponding context file. The files may be in `split_dir`,
|
| 384 |
+
or there may just be the original extract file.
|
| 385 |
+
"""
|
| 386 |
+
if jobs == 1:
|
| 387 |
+
# No splitting needed. Read the original file(s).
|
| 388 |
+
return [(extract_file, extract_context_file)]
|
| 389 |
+
|
| 390 |
+
# Otherwise: split files.
|
| 391 |
+
files = []
|
| 392 |
+
num_lines = count_lines(extract_file)
|
| 393 |
+
chunk_size = (num_lines + jobs - 1) / jobs
|
| 394 |
+
assert isinstance(chunk_size, int)
|
| 395 |
+
|
| 396 |
+
line_count = 0
|
| 397 |
+
chunk_number = 0
|
| 398 |
+
prev_source_phrase = None
|
| 399 |
+
last_line_context = None
|
| 400 |
+
extract_stream = open_file(extract_file)
|
| 401 |
+
chunk_file = open_chunk_file(split_dir, chunk_number)
|
| 402 |
+
if extract_context_file is None:
|
| 403 |
+
chunk_context_file = None
|
| 404 |
+
if extract_context_file is not None:
|
| 405 |
+
context_stream = open_file(extract_context_file)
|
| 406 |
+
|
| 407 |
+
for line in extract_stream:
|
| 408 |
+
line_count += 1
|
| 409 |
+
line = line.decode('utf-8')
|
| 410 |
+
line = strip_newline(line)
|
| 411 |
+
if line_count >= chunk_size:
|
| 412 |
+
# At or over chunk size. Cut off at next source phrase change.
|
| 413 |
+
source_phrase = extract_source_phrase(line)
|
| 414 |
+
if prev_source_phrase is None:
|
| 415 |
+
# Start looking for a different source phrase.
|
| 416 |
+
prev_source_phrase = source_phrase
|
| 417 |
+
elif source_phrase == prev_source_phrase:
|
| 418 |
+
# Can't cut yet. Still working on the same source phrase.
|
| 419 |
+
pass
|
| 420 |
+
else:
|
| 421 |
+
# Hit first new source phrase after chunk limit. Cut new
|
| 422 |
+
# file(s).
|
| 423 |
+
chunk_file.close()
|
| 424 |
+
if extract_context_file is not None:
|
| 425 |
+
chunk_context_file = name_context_chunk_file(
|
| 426 |
+
split_dir, chunk_number)
|
| 427 |
+
last_line_context = cut_context_file(
|
| 428 |
+
prev_source_phrase, chunk_context_file,
|
| 429 |
+
last_line_context, context_stream)
|
| 430 |
+
files.append((chunk_file.name, chunk_context_file))
|
| 431 |
+
|
| 432 |
+
# Start on new chunk.
|
| 433 |
+
prev_source_phrase = None
|
| 434 |
+
line_count = 0
|
| 435 |
+
chunk_number += 1
|
| 436 |
+
chunk_file = open_chunk_file(split_dir, chunk_number)
|
| 437 |
+
chunk_file.write(('%s\n' % line).encode('utf-8'))
|
| 438 |
+
|
| 439 |
+
chunk_file.close()
|
| 440 |
+
if extract_context_file is not None:
|
| 441 |
+
chunk_context_file = name_context_chunk_file(split_dir, chunk_number)
|
| 442 |
+
last_line_context = cut_context_file(
|
| 443 |
+
prev_source_phrase, chunk_number, last_line_context,
|
| 444 |
+
context_stream)
|
| 445 |
+
files.append((chunk_file.name, chunk_context_file))
|
| 446 |
+
return files
|
| 447 |
+
|
| 448 |
+
|
| 449 |
+
def compose_score_command(extract_file, context_file, half_file,
|
| 450 |
+
flex_half_file, args):
|
| 451 |
+
"""Compose command line text to run one instance of `score`.
|
| 452 |
+
|
| 453 |
+
:param extract_file: One chunk of extract file.
|
| 454 |
+
:param context_file: If doing flexibility scoring, one chunk of
|
| 455 |
+
extract context file. Otherwise, None.
|
| 456 |
+
:param half_file: ???
|
| 457 |
+
:param flex_half_file: ???
|
| 458 |
+
:param args: Arguments namespace.
|
| 459 |
+
"""
|
| 460 |
+
command = [
|
| 461 |
+
args.score_exe,
|
| 462 |
+
extract_file,
|
| 463 |
+
args.lex_file,
|
| 464 |
+
half_file,
|
| 465 |
+
]
|
| 466 |
+
if args.args not in (None, ''):
|
| 467 |
+
command.append(args.args)
|
| 468 |
+
other_args = build_score_args(args)
|
| 469 |
+
if other_args != '':
|
| 470 |
+
command.append(other_args)
|
| 471 |
+
if context_file is not None:
|
| 472 |
+
command += [
|
| 473 |
+
'&&',
|
| 474 |
+
find_first_executable(['bzcat']),
|
| 475 |
+
half_file,
|
| 476 |
+
'|',
|
| 477 |
+
quote(args.flexibility_score),
|
| 478 |
+
quote(context_file),
|
| 479 |
+
]
|
| 480 |
+
if args.inverse:
|
| 481 |
+
command.append('--Inverse')
|
| 482 |
+
if args.hierarchical:
|
| 483 |
+
command.append('--Hierarchical')
|
| 484 |
+
command += [
|
| 485 |
+
'|',
|
| 486 |
+
quote(args.gzip_command),
|
| 487 |
+
'-c',
|
| 488 |
+
'>%s' % quote(flex_half_file),
|
| 489 |
+
]
|
| 490 |
+
return ' '.join(command)
|
| 491 |
+
|
| 492 |
+
|
| 493 |
+
def score_parallel(split_dir, file_pairs, args):
|
| 494 |
+
"""Run the `score` command in parallel.
|
| 495 |
+
|
| 496 |
+
:param split_dir: Temporary directory where we can create split files.
|
| 497 |
+
:param file_pairs: Sequence of tuples for the input files, one tuple
|
| 498 |
+
per chunk of the work. Each tuple consists of a partial extract
|
| 499 |
+
file, and optionally a partial extract context file.
|
| 500 |
+
:param args: Arguments namespace.
|
| 501 |
+
:return: A list of tuples. Each tuple contains two file paths. The first
|
| 502 |
+
is for a partial half-phrase-table file. The second is for the
|
| 503 |
+
corresponding partial flex file, if a context file is given; or
|
| 504 |
+
`None` otherwise.
|
| 505 |
+
"""
|
| 506 |
+
partial_files = []
|
| 507 |
+
# Pool of worker processes for executing the partial "score" invocations
|
| 508 |
+
# concurrently.
|
| 509 |
+
pool = Pool(args.jobs)
|
| 510 |
+
try:
|
| 511 |
+
for chunk_num, file_pair in enumerate(file_pairs):
|
| 512 |
+
half_file = os.path.join(
|
| 513 |
+
split_dir, 'phrase-table.half.%06d.gz' % chunk_num)
|
| 514 |
+
extract_file, context_file = file_pair
|
| 515 |
+
if context_file is None:
|
| 516 |
+
flex_half_file = None
|
| 517 |
+
else:
|
| 518 |
+
flex_half_file = os.path.join(
|
| 519 |
+
split_dir, 'phrase-table.half.%06d.flex.gz' % chunk_num)
|
| 520 |
+
# Pickling of arguments for the pool is awkward on Windows, so
|
| 521 |
+
# keep them simple. Compose the command line in the parent
|
| 522 |
+
# process, then hand them to worker processes which execute them.
|
| 523 |
+
command_line = compose_score_command(
|
| 524 |
+
extract_file, context_file, half_file, flex_half_file, args)
|
| 525 |
+
pool.apply_async(
|
| 526 |
+
execute_shell, (command_line, ), {'verbose': args.verbose})
|
| 527 |
+
partial_files.append((half_file, flex_half_file))
|
| 528 |
+
pool.close()
|
| 529 |
+
except BaseException:
|
| 530 |
+
pool.terminate()
|
| 531 |
+
raise
|
| 532 |
+
finally:
|
| 533 |
+
pool.join()
|
| 534 |
+
return partial_files
|
| 535 |
+
|
| 536 |
+
|
| 537 |
+
def merge_and_sort(files, output, sort_command=None, gzip_exe=None,
|
| 538 |
+
verbose=False):
|
| 539 |
+
"""Merge partial files.
|
| 540 |
+
|
| 541 |
+
:param files: List of partial half-phrase-table files.
|
| 542 |
+
:param output: Path for resulting combined phrase-table file.
|
| 543 |
+
"""
|
| 544 |
+
# TODO: The Perl code mentioned "sort" and "flexibility_score" here.
|
| 545 |
+
# What do we do with those?
|
| 546 |
+
|
| 547 |
+
# Sort whether we're asked to or not, as a way of combining the input
|
| 548 |
+
# files.
|
| 549 |
+
if sort_command == 'neandersort':
|
| 550 |
+
# Neandersort transparently decompresses input and compresses output.
|
| 551 |
+
check_call([
|
| 552 |
+
'neandersort',
|
| 553 |
+
'-o', output,
|
| 554 |
+
] + files)
|
| 555 |
+
else:
|
| 556 |
+
command = (
|
| 557 |
+
"%(gzip)s -c -d %(files)s | "
|
| 558 |
+
"%(sort)s | "
|
| 559 |
+
"%(gzip)s -c >>%(output)s"
|
| 560 |
+
% {
|
| 561 |
+
'gzip': quote(gzip_exe),
|
| 562 |
+
'sort': sort_command,
|
| 563 |
+
'files': ' '.join(map(quote, files)),
|
| 564 |
+
'output': quote(output),
|
| 565 |
+
})
|
| 566 |
+
execute_shell(command, verbose=verbose)
|
| 567 |
+
|
| 568 |
+
|
| 569 |
+
def build_score_args(args):
|
| 570 |
+
"""Compose command line for the `score` program."""
|
| 571 |
+
command_line = []
|
| 572 |
+
if args.labels_file:
|
| 573 |
+
command_line += [
|
| 574 |
+
'--SourceLabels',
|
| 575 |
+
'--SourceLabelCountsLHS',
|
| 576 |
+
'--SourceLabelSet',
|
| 577 |
+
]
|
| 578 |
+
if args.parts_of_speech:
|
| 579 |
+
command_line.append('--PartsOfSpeech')
|
| 580 |
+
if args.inverse:
|
| 581 |
+
command_line.append('--Inverse')
|
| 582 |
+
if args.args is not None:
|
| 583 |
+
command_line.append(args.args)
|
| 584 |
+
return ' '.join(command_line)
|
| 585 |
+
|
| 586 |
+
|
| 587 |
+
def list_existing(paths):
|
| 588 |
+
"""Return, in the same order, those of the given files which exist."""
|
| 589 |
+
return filter(os.path.exists, paths)
|
| 590 |
+
|
| 591 |
+
|
| 592 |
+
def compose_coc_path_for(path):
|
| 593 |
+
"""Compose COC-file path for the given file."""
|
| 594 |
+
return '%s.coc' % path
|
| 595 |
+
|
| 596 |
+
|
| 597 |
+
def read_cocs(path):
|
| 598 |
+
"""Read COC file at `path`, return contents as tuple of ints."""
|
| 599 |
+
with open(path) as lines:
|
| 600 |
+
return tuple(
|
| 601 |
+
int(line.rstrip('\r\n'))
|
| 602 |
+
for line in lines
|
| 603 |
+
)
|
| 604 |
+
|
| 605 |
+
|
| 606 |
+
def add_cocs(original, additional):
|
| 607 |
+
"""Add two tuples of COCs. Extend as needed."""
|
| 608 |
+
assert not (original is None and additional is None), "No COCs to add!"
|
| 609 |
+
if original is None:
|
| 610 |
+
return additional
|
| 611 |
+
elif additional is None:
|
| 612 |
+
return original
|
| 613 |
+
else:
|
| 614 |
+
common = tuple(lhs + rhs for lhs, rhs in zip(original, additional))
|
| 615 |
+
return (
|
| 616 |
+
common +
|
| 617 |
+
tuple(original[len(common):]) +
|
| 618 |
+
tuple(additional[len(common):]))
|
| 619 |
+
|
| 620 |
+
|
| 621 |
+
def merge_coc(files, output):
|
| 622 |
+
"""Merge COC files for the given partial files.
|
| 623 |
+
|
| 624 |
+
Each COC file is a series of integers, one per line. This reads them, and
|
| 625 |
+
adds them up line-wise into one file of the same format: the sum of the
|
| 626 |
+
numbers the respective files have at line 1, the sum of the numbers the
|
| 627 |
+
respective files have at line 2, and so on.
|
| 628 |
+
"""
|
| 629 |
+
assert len(files) > 0, "No partial files - no work to do."
|
| 630 |
+
extract_files = [extract_file for extract_file, _ in files]
|
| 631 |
+
if not os.path.exists(compose_coc_path_for(extract_files[0])):
|
| 632 |
+
# Nothing to merge.
|
| 633 |
+
return
|
| 634 |
+
totals = None
|
| 635 |
+
# TODO: Shouldn't we just fail if any of these files is missing?
|
| 636 |
+
for coc_path in list_existing(map(compose_coc_path_for, extract_files)):
|
| 637 |
+
totals = add_cocs(totals, read_cocs(coc_path))
|
| 638 |
+
|
| 639 |
+
# Write to output file.
|
| 640 |
+
with open(output, 'w') as output_stream:
|
| 641 |
+
for entry in totals:
|
| 642 |
+
output_stream.write('%d\n' % entry)
|
| 643 |
+
|
| 644 |
+
|
| 645 |
+
def suffix_line_numbers(infile, outfile):
|
| 646 |
+
"""Rewrite `infile` to `outfile`; suffix line number to each line.
|
| 647 |
+
|
| 648 |
+
The line number is zero-based, and separated from the rest of the line
|
| 649 |
+
by a single space.
|
| 650 |
+
"""
|
| 651 |
+
temp_file = '%s.numbering' % outfile
|
| 652 |
+
with open(infile, 'r') as instream, open(outfile, 'w') as outstream:
|
| 653 |
+
line_no = 0
|
| 654 |
+
for line in instream:
|
| 655 |
+
outstream.write(line)
|
| 656 |
+
outstream.write(' %d\n' % line_no)
|
| 657 |
+
line_no += 1
|
| 658 |
+
os.rename(temp_file, outfile)
|
| 659 |
+
|
| 660 |
+
|
| 661 |
+
def compose_source_labels_path_for(path):
|
| 662 |
+
"""Return source labels file path for given file."""
|
| 663 |
+
return '%s.syntaxLabels.src' % path
|
| 664 |
+
|
| 665 |
+
|
| 666 |
+
def merge_numbered_files(inputs, output, header_lines, sort_command,
|
| 667 |
+
verbose=False):
|
| 668 |
+
"""Sort and merge files `inputs`, add header and line numbers.
|
| 669 |
+
|
| 670 |
+
:param inputs: Iterable of input files.
|
| 671 |
+
:param output: Output file.
|
| 672 |
+
:header_lines: Iterable of header lines.
|
| 673 |
+
:sort_command: Command line for sorting input files.
|
| 674 |
+
"""
|
| 675 |
+
sort_temp = '%s.sorting' % output
|
| 676 |
+
with open(sort_temp, 'w') as stream:
|
| 677 |
+
for line in header_lines:
|
| 678 |
+
stream.write(line)
|
| 679 |
+
stream.write('\n')
|
| 680 |
+
execute_shell(
|
| 681 |
+
"%s %s >>%s" % (
|
| 682 |
+
sort_command,
|
| 683 |
+
' '.join(map(quote, inputs)),
|
| 684 |
+
quote(sort_temp)),
|
| 685 |
+
verbose=verbose)
|
| 686 |
+
suffix_line_numbers(sort_temp, output)
|
| 687 |
+
|
| 688 |
+
|
| 689 |
+
def merge_source_labels(files, output, sort_command, verbose=False):
|
| 690 |
+
"""Merge source labels files."""
|
| 691 |
+
# TODO: Shouldn't we just fail if any of these files is missing?
|
| 692 |
+
labels_files = list_existing(map(compose_source_labels_path_for, files))
|
| 693 |
+
header = [
|
| 694 |
+
'GlueTop',
|
| 695 |
+
'GlueX',
|
| 696 |
+
'SSTART',
|
| 697 |
+
'SEND',
|
| 698 |
+
]
|
| 699 |
+
merge_numbered_files(
|
| 700 |
+
labels_files, output, header, sort_command, verbose=verbose)
|
| 701 |
+
|
| 702 |
+
|
| 703 |
+
def compose_parts_of_speech_path_for(path):
|
| 704 |
+
"""Return parts-of-speech file path for given file."""
|
| 705 |
+
return '%s.partsOfSpeech' % path
|
| 706 |
+
|
| 707 |
+
|
| 708 |
+
def merge_parts_of_speech(files, output, sort_command, verbose=False):
|
| 709 |
+
"""Merge parts-of-speech files into output."""
|
| 710 |
+
# TODO: Shouldn't we just fail if any of these files is missing?
|
| 711 |
+
parts_files = list_existing(map(compose_parts_of_speech_path_for, files))
|
| 712 |
+
header = [
|
| 713 |
+
'SSTART',
|
| 714 |
+
'SEND',
|
| 715 |
+
]
|
| 716 |
+
merge_numbered_files(
|
| 717 |
+
parts_files, output, header, sort_command, verbose=verbose)
|
| 718 |
+
|
| 719 |
+
|
| 720 |
+
def main():
|
| 721 |
+
"""Command-line entry point. Marshals and forwards to `score_parallel`."""
|
| 722 |
+
args = parse_args()
|
| 723 |
+
sanitize_args(args)
|
| 724 |
+
set_temp_dir()
|
| 725 |
+
|
| 726 |
+
if args.flexibility_score is None:
|
| 727 |
+
extract_context_file = None
|
| 728 |
+
else:
|
| 729 |
+
extract_context_file = args.extract_file.replace(
|
| 730 |
+
'extract.', 'extract.context.')
|
| 731 |
+
|
| 732 |
+
if args.verbose:
|
| 733 |
+
print("Started %s." % datetime.now())
|
| 734 |
+
print("Using '%s' for gzip." % args.gzip_command)
|
| 735 |
+
|
| 736 |
+
with tempdir(args.debug) as split_dir:
|
| 737 |
+
extract_files = split_extract_files(
|
| 738 |
+
split_dir, args.extract_file,
|
| 739 |
+
extract_context_file=extract_context_file, jobs=args.jobs)
|
| 740 |
+
|
| 741 |
+
scored_files = score_parallel(split_dir, extract_files, args)
|
| 742 |
+
|
| 743 |
+
if args.verbose:
|
| 744 |
+
sys.stderr.write("Finished score %s.\n" % datetime.now())
|
| 745 |
+
|
| 746 |
+
# TODO: Pass on "sort" and "flexibility-score" arguments?
|
| 747 |
+
merge_and_sort(
|
| 748 |
+
[phrase_chunk for phrase_chunk, _ in scored_files], args.output,
|
| 749 |
+
sort_command=args.sort_command, gzip_exe=args.gzip_command,
|
| 750 |
+
verbose=args.verbose)
|
| 751 |
+
merge_coc(extract_files, compose_coc_path_for(args.output))
|
| 752 |
+
|
| 753 |
+
if not args.inverse and args.labels_file is not None:
|
| 754 |
+
if args.verbose:
|
| 755 |
+
print("Merging source labels files.")
|
| 756 |
+
merge_source_labels(
|
| 757 |
+
extract_files, args.labels_file,
|
| 758 |
+
sort_command=args.sort_command, verbose=args.verbose)
|
| 759 |
+
|
| 760 |
+
if not args.inverse and args.parts_of_speech is not None:
|
| 761 |
+
if args.verbose:
|
| 762 |
+
print("Merging parts-of-speech files.")
|
| 763 |
+
merge_parts_of_speech(
|
| 764 |
+
extract_files, args.parts_of_speech,
|
| 765 |
+
sort_command=args.sort_command, verbose=args.verbose)
|
| 766 |
+
|
| 767 |
+
|
| 768 |
+
if __name__ == '__main__':
|
| 769 |
+
try:
|
| 770 |
+
main()
|
| 771 |
+
except ProgramFailure as error:
|
| 772 |
+
sys.stderr.write('%s\n' % error)
|
| 773 |
+
sys.exit(1)
|
| 774 |
+
except CommandLineError as error:
|
| 775 |
+
sys.stderr.write("Command line error: %s\n" % error)
|
| 776 |
+
sys.exit(2)
|
mosesdecoder/scripts/generic/strip-xml.perl
ADDED
|
@@ -0,0 +1,48 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env perl
#
# This file is part of moses. Its use is licensed under the GNU Lesser General
# Public License version 2.1 or, at your option, any later version.

use warnings;
use strict;

# Read sentences from STDIN, drop anything inside XML tags, and collapse
# runs of spaces outside tags into a single space.
#
# A "<" immediately preceded by a factor separator "|" is NOT treated as a
# tag opener, so factored tokens such as "word|<tag>" survive intact.
while (defined(my $input = <STDIN>)) {
  chomp($input);

  my $depth          = 0;  # current XML tag nesting level
  my $last_was_space = 1;  # start-of-line counts as a space (drops leading spaces)
  my $last_was_bar   = 0;  # was the previous emitted character a "|"?

  foreach my $ch (split //, $input) {
    if ($ch eq "<" && !$last_was_bar) {
      ++$depth;
    }
    elsif ($ch eq ">" && $depth > 0) {
      --$depth;
    }
    elsif ($last_was_space == 1 && $ch eq " ") {
      # duplicate space: emit nothing
    }
    elsif ($depth == 0) {
      if ($ch eq " ") {
        ($last_was_space, $last_was_bar) = (1, 0);
      }
      elsif ($ch eq "|") {
        ($last_was_space, $last_was_bar) = (0, 1);
      }
      else {
        ($last_was_space, $last_was_bar) = (0, 0);
      }
      print $ch;
    }
  }

  print "\n";
}
|
| 48 |
+
|
mosesdecoder/scripts/generic/trainlm-irst2.perl
ADDED
|
@@ -0,0 +1,72 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env perl
#
# This file is part of moses. Its use is licensed under the GNU Lesser General
# Public License version 2.1 or, at your option, any later version.

# Compatible with sri LM-creating script, eg.
# ngram-count -order 5 -interpolate -wbdiscount -unk -text corpus.txt -lm lm.txt
# To use it in the EMS, add this to the [LM] section
# lm-training = "$moses-script-dir/generic/trainlm-irst2.perl -cores $cores -irst-dir $irst-dir"
# settings = ""
# Also, make sure that $irst-dir is defined (in the [LM] or [GENERAL] section.
# It should point to the root of the LM toolkit, eg
# irst-dir = /Users/hieu/workspace/irstlm/trunk/bin
# Set smoothing method in settings, if different from modified Kneser-Ney

use warnings;
use strict;
use FindBin qw($RealBin);
use Getopt::Long;

my $order = 3;           # order of language model (default trigram)
my $corpusPath;          # input text data
my $lmPath;              # generated language model
my $cores = 2;           # number of CPUs used (accepted for EMS compatibility)
my $irstPath;            # bin directory of IRSTLM
my $tempPath = "tmp";    # temp dir
my $pruneSingletons = 1; # 1 = prune singletons, 0 = keep singletons
my $smoothing = "msb";   # IRSTLM smoothing: wb = witten-bell, sb = shift-beta
                         # (Kneser-Ney-like), msb = modified shift-beta
                         # (modified-Kneser-Ney-like)
my $dummy;

GetOptions("order=s" => \$order,
           "text=s" => \$corpusPath,
           "lm=s" => \$lmPath,
           "cores=s" => \$cores,
           "irst-dir=s" => \$irstPath,
           "temp-dir=s" => \$tempPath,
           "p=i" => \$pruneSingletons, # irstlm parameter: prune singletons
           "s=s" => \$smoothing, # irstlm parameter: smoothing method
           "interpolate!" => \$dummy, #ignore
           "kndiscount!" => \$dummy #ignore
          ) or exit 1;

die("ERROR: please set text") unless defined($corpusPath);
die("ERROR: please set lm") unless defined($lmPath);
die("ERROR: please set irst-dir") unless defined($irstPath);

# Per-process temp directory so concurrent runs do not collide.
$tempPath .= "/irstlm-build-tmp.$$";
safe_system("mkdir -p $tempPath");

# add <s> and </s> sentence-boundary markers
safe_system("cat $corpusPath | $irstPath/add-start-end.sh > $tempPath/setagged");

# collect n-gram counts
safe_system("$irstPath/ngt -i=$tempPath/setagged -n=$order -b=yes -o=$tempPath/counts");

# build lm
my $cmd = "$irstPath/tlm -o=$lmPath -lm=$smoothing -bo=yes -n=$order -tr=$tempPath/counts";
$cmd .= " -ps=no" unless $pruneSingletons;
safe_system($cmd);

# clean up the temp directory
safe_system("rm -rf $tempPath");

print STDERR "FINISH.\n";

# Echo a shell command to STDERR, run it, and die if it exits non-zero.
# Previously commands were run with backticks and their exit status was
# ignored, so a failed IRSTLM step was silently swallowed and the script
# still reported FINISH with a missing or truncated LM.
sub safe_system {
    my ($command) = @_;
    print STDERR "EXECUTING $command\n";
    system($command) == 0
        or die "ERROR: command failed with exit code " . ($? >> 8) . ": $command\n";
}
|
mosesdecoder/scripts/share/nonbreaking_prefixes/README.txt
ADDED
|
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
The language suffix can be found here:
|
| 2 |
+
|
| 3 |
+
http://www.loc.gov/standards/iso639-2/php/code_list.php
|
| 4 |
+
|
| 5 |
+
This code includes data from Daniel Naber's Language Tools (Czech abbreviations).
|
| 6 |
+
This code includes data from the Czech Wiktionary (also Czech abbreviations).
|
| 7 |
+
|
| 8 |
+
|
mosesdecoder/scripts/share/nonbreaking_prefixes/nonbreaking_prefix.as
ADDED
|
@@ -0,0 +1,65 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker.
|
| 2 |
+
|
| 3 |
+
#common exceptions
|
| 4 |
+
# Dr
|
| 5 |
+
ড
|
| 6 |
+
|
| 7 |
+
#others
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
#phonetics
|
| 11 |
+
# A
|
| 12 |
+
এ
|
| 13 |
+
# B
|
| 14 |
+
বি
|
| 15 |
+
# C
|
| 16 |
+
সি
|
| 17 |
+
# D
|
| 18 |
+
ডি
|
| 19 |
+
# E
|
| 20 |
+
ই
|
| 21 |
+
# F
|
| 22 |
+
এফ
|
| 23 |
+
# G
|
| 24 |
+
জি
|
| 25 |
+
# H
|
| 26 |
+
এইচ
|
| 27 |
+
# I
|
| 28 |
+
আম
|
| 29 |
+
# J
|
| 30 |
+
জে
|
| 31 |
+
# K
|
| 32 |
+
কে
|
| 33 |
+
# L
|
| 34 |
+
এল
|
| 35 |
+
# M
|
| 36 |
+
এম
|
| 37 |
+
# N
|
| 38 |
+
এন
|
| 39 |
+
# O
|
| 40 |
+
হে
|
| 41 |
+
# P
|
| 42 |
+
পি
|
| 43 |
+
# Q
|
| 44 |
+
কিউ
|
| 45 |
+
# R
|
| 46 |
+
আর
|
| 47 |
+
# S
|
| 48 |
+
এস
|
| 49 |
+
# T
|
| 50 |
+
টি
|
| 51 |
+
# U
|
| 52 |
+
ইউ
|
| 53 |
+
# V
|
| 54 |
+
ভি
|
| 55 |
+
# W
|
| 56 |
+
ডব্লু
|
| 57 |
+
# X
|
| 58 |
+
এক্স
|
| 59 |
+
# Y
|
| 60 |
+
ওয়াই
|
| 61 |
+
# Z
|
| 62 |
+
জেড
|
| 63 |
+
|
| 64 |
+
#consonants
|
| 65 |
+
|
mosesdecoder/scripts/share/nonbreaking_prefixes/nonbreaking_prefix.ca
ADDED
|
@@ -0,0 +1,75 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
Dr
|
| 2 |
+
Dra
|
| 3 |
+
pàg
|
| 4 |
+
p
|
| 5 |
+
c
|
| 6 |
+
av
|
| 7 |
+
Sr
|
| 8 |
+
Sra
|
| 9 |
+
adm
|
| 10 |
+
esq
|
| 11 |
+
Prof
|
| 12 |
+
S.A
|
| 13 |
+
S.L
|
| 14 |
+
p.e
|
| 15 |
+
ptes
|
| 16 |
+
Sta
|
| 17 |
+
St
|
| 18 |
+
pl
|
| 19 |
+
màx
|
| 20 |
+
cast
|
| 21 |
+
dir
|
| 22 |
+
nre
|
| 23 |
+
fra
|
| 24 |
+
admdora
|
| 25 |
+
Emm
|
| 26 |
+
Excma
|
| 27 |
+
espf
|
| 28 |
+
dc
|
| 29 |
+
admdor
|
| 30 |
+
tel
|
| 31 |
+
angl
|
| 32 |
+
aprox
|
| 33 |
+
ca
|
| 34 |
+
dept
|
| 35 |
+
dj
|
| 36 |
+
dl
|
| 37 |
+
dt
|
| 38 |
+
ds
|
| 39 |
+
dg
|
| 40 |
+
dv
|
| 41 |
+
ed
|
| 42 |
+
entl
|
| 43 |
+
al
|
| 44 |
+
i.e
|
| 45 |
+
maj
|
| 46 |
+
smin
|
| 47 |
+
n
|
| 48 |
+
núm
|
| 49 |
+
pta
|
| 50 |
+
A
|
| 51 |
+
B
|
| 52 |
+
C
|
| 53 |
+
D
|
| 54 |
+
E
|
| 55 |
+
F
|
| 56 |
+
G
|
| 57 |
+
H
|
| 58 |
+
I
|
| 59 |
+
J
|
| 60 |
+
K
|
| 61 |
+
L
|
| 62 |
+
M
|
| 63 |
+
N
|
| 64 |
+
O
|
| 65 |
+
P
|
| 66 |
+
Q
|
| 67 |
+
R
|
| 68 |
+
S
|
| 69 |
+
T
|
| 70 |
+
U
|
| 71 |
+
V
|
| 72 |
+
W
|
| 73 |
+
X
|
| 74 |
+
Y
|
| 75 |
+
Z
|
mosesdecoder/scripts/share/nonbreaking_prefixes/nonbreaking_prefix.cs
ADDED
|
@@ -0,0 +1,390 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
Bc
|
| 2 |
+
BcA
|
| 3 |
+
Ing
|
| 4 |
+
Ing.arch
|
| 5 |
+
MUDr
|
| 6 |
+
MVDr
|
| 7 |
+
MgA
|
| 8 |
+
Mgr
|
| 9 |
+
JUDr
|
| 10 |
+
PhDr
|
| 11 |
+
RNDr
|
| 12 |
+
PharmDr
|
| 13 |
+
ThLic
|
| 14 |
+
ThDr
|
| 15 |
+
Ph.D
|
| 16 |
+
Th.D
|
| 17 |
+
prof
|
| 18 |
+
doc
|
| 19 |
+
CSc
|
| 20 |
+
DrSc
|
| 21 |
+
dr. h. c
|
| 22 |
+
PaedDr
|
| 23 |
+
Dr
|
| 24 |
+
PhMr
|
| 25 |
+
DiS
|
| 26 |
+
abt
|
| 27 |
+
ad
|
| 28 |
+
a.i
|
| 29 |
+
aj
|
| 30 |
+
angl
|
| 31 |
+
anon
|
| 32 |
+
apod
|
| 33 |
+
atd
|
| 34 |
+
atp
|
| 35 |
+
aut
|
| 36 |
+
bd
|
| 37 |
+
biogr
|
| 38 |
+
b.m
|
| 39 |
+
b.p
|
| 40 |
+
b.r
|
| 41 |
+
cca
|
| 42 |
+
cit
|
| 43 |
+
cizojaz
|
| 44 |
+
c.k
|
| 45 |
+
col
|
| 46 |
+
čes
|
| 47 |
+
čín
|
| 48 |
+
čj
|
| 49 |
+
ed
|
| 50 |
+
facs
|
| 51 |
+
fasc
|
| 52 |
+
fol
|
| 53 |
+
fot
|
| 54 |
+
franc
|
| 55 |
+
h.c
|
| 56 |
+
hist
|
| 57 |
+
hl
|
| 58 |
+
hrsg
|
| 59 |
+
ibid
|
| 60 |
+
il
|
| 61 |
+
ind
|
| 62 |
+
inv.č
|
| 63 |
+
jap
|
| 64 |
+
jhdt
|
| 65 |
+
jv
|
| 66 |
+
koed
|
| 67 |
+
kol
|
| 68 |
+
korej
|
| 69 |
+
kl
|
| 70 |
+
krit
|
| 71 |
+
lat
|
| 72 |
+
lit
|
| 73 |
+
m.a
|
| 74 |
+
maď
|
| 75 |
+
mj
|
| 76 |
+
mp
|
| 77 |
+
násl
|
| 78 |
+
např
|
| 79 |
+
nepubl
|
| 80 |
+
něm
|
| 81 |
+
no
|
| 82 |
+
nr
|
| 83 |
+
n.s
|
| 84 |
+
okr
|
| 85 |
+
odd
|
| 86 |
+
odp
|
| 87 |
+
obr
|
| 88 |
+
opr
|
| 89 |
+
orig
|
| 90 |
+
phil
|
| 91 |
+
pl
|
| 92 |
+
pokrač
|
| 93 |
+
pol
|
| 94 |
+
port
|
| 95 |
+
pozn
|
| 96 |
+
př.kr
|
| 97 |
+
př.n.l
|
| 98 |
+
přel
|
| 99 |
+
přeprac
|
| 100 |
+
příl
|
| 101 |
+
pseud
|
| 102 |
+
pt
|
| 103 |
+
red
|
| 104 |
+
repr
|
| 105 |
+
resp
|
| 106 |
+
revid
|
| 107 |
+
rkp
|
| 108 |
+
roč
|
| 109 |
+
roz
|
| 110 |
+
rozš
|
| 111 |
+
samost
|
| 112 |
+
sect
|
| 113 |
+
sest
|
| 114 |
+
seš
|
| 115 |
+
sign
|
| 116 |
+
sl
|
| 117 |
+
srv
|
| 118 |
+
stol
|
| 119 |
+
sv
|
| 120 |
+
šk
|
| 121 |
+
šk.ro
|
| 122 |
+
špan
|
| 123 |
+
tab
|
| 124 |
+
t.č
|
| 125 |
+
tis
|
| 126 |
+
tj
|
| 127 |
+
tř
|
| 128 |
+
tzv
|
| 129 |
+
univ
|
| 130 |
+
uspoř
|
| 131 |
+
vol
|
| 132 |
+
vl.jm
|
| 133 |
+
vs
|
| 134 |
+
vyd
|
| 135 |
+
vyobr
|
| 136 |
+
zal
|
| 137 |
+
zejm
|
| 138 |
+
zkr
|
| 139 |
+
zprac
|
| 140 |
+
zvl
|
| 141 |
+
n.p
|
| 142 |
+
např
|
| 143 |
+
než
|
| 144 |
+
MUDr
|
| 145 |
+
abl
|
| 146 |
+
absol
|
| 147 |
+
adj
|
| 148 |
+
adv
|
| 149 |
+
ak
|
| 150 |
+
ak. sl
|
| 151 |
+
akt
|
| 152 |
+
alch
|
| 153 |
+
amer
|
| 154 |
+
anat
|
| 155 |
+
angl
|
| 156 |
+
anglosas
|
| 157 |
+
arab
|
| 158 |
+
arch
|
| 159 |
+
archit
|
| 160 |
+
arg
|
| 161 |
+
astr
|
| 162 |
+
astrol
|
| 163 |
+
att
|
| 164 |
+
bás
|
| 165 |
+
belg
|
| 166 |
+
bibl
|
| 167 |
+
biol
|
| 168 |
+
boh
|
| 169 |
+
bot
|
| 170 |
+
bulh
|
| 171 |
+
círk
|
| 172 |
+
csl
|
| 173 |
+
č
|
| 174 |
+
čas
|
| 175 |
+
čes
|
| 176 |
+
dat
|
| 177 |
+
děj
|
| 178 |
+
dep
|
| 179 |
+
dět
|
| 180 |
+
dial
|
| 181 |
+
dór
|
| 182 |
+
dopr
|
| 183 |
+
dosl
|
| 184 |
+
ekon
|
| 185 |
+
epic
|
| 186 |
+
etnonym
|
| 187 |
+
eufem
|
| 188 |
+
f
|
| 189 |
+
fam
|
| 190 |
+
fem
|
| 191 |
+
fil
|
| 192 |
+
film
|
| 193 |
+
form
|
| 194 |
+
fot
|
| 195 |
+
fr
|
| 196 |
+
fut
|
| 197 |
+
fyz
|
| 198 |
+
gen
|
| 199 |
+
geogr
|
| 200 |
+
geol
|
| 201 |
+
geom
|
| 202 |
+
germ
|
| 203 |
+
gram
|
| 204 |
+
hebr
|
| 205 |
+
herald
|
| 206 |
+
hist
|
| 207 |
+
hl
|
| 208 |
+
hovor
|
| 209 |
+
hud
|
| 210 |
+
hut
|
| 211 |
+
chcsl
|
| 212 |
+
chem
|
| 213 |
+
ie
|
| 214 |
+
imp
|
| 215 |
+
impf
|
| 216 |
+
ind
|
| 217 |
+
indoevr
|
| 218 |
+
inf
|
| 219 |
+
instr
|
| 220 |
+
interj
|
| 221 |
+
ión
|
| 222 |
+
iron
|
| 223 |
+
it
|
| 224 |
+
kanad
|
| 225 |
+
katalán
|
| 226 |
+
klas
|
| 227 |
+
kniž
|
| 228 |
+
komp
|
| 229 |
+
konj
|
| 230 |
+
|
| 231 |
+
konkr
|
| 232 |
+
kř
|
| 233 |
+
kuch
|
| 234 |
+
lat
|
| 235 |
+
lék
|
| 236 |
+
les
|
| 237 |
+
lid
|
| 238 |
+
lit
|
| 239 |
+
liturg
|
| 240 |
+
lok
|
| 241 |
+
log
|
| 242 |
+
m
|
| 243 |
+
mat
|
| 244 |
+
meteor
|
| 245 |
+
metr
|
| 246 |
+
mod
|
| 247 |
+
ms
|
| 248 |
+
mysl
|
| 249 |
+
n
|
| 250 |
+
náb
|
| 251 |
+
námoř
|
| 252 |
+
neklas
|
| 253 |
+
něm
|
| 254 |
+
nesklon
|
| 255 |
+
nom
|
| 256 |
+
ob
|
| 257 |
+
obch
|
| 258 |
+
obyč
|
| 259 |
+
ojed
|
| 260 |
+
opt
|
| 261 |
+
part
|
| 262 |
+
pas
|
| 263 |
+
pejor
|
| 264 |
+
pers
|
| 265 |
+
pf
|
| 266 |
+
pl
|
| 267 |
+
plpf
|
| 268 |
+
|
| 269 |
+
práv
|
| 270 |
+
prep
|
| 271 |
+
předl
|
| 272 |
+
přivl
|
| 273 |
+
r
|
| 274 |
+
rcsl
|
| 275 |
+
refl
|
| 276 |
+
reg
|
| 277 |
+
rkp
|
| 278 |
+
ř
|
| 279 |
+
řec
|
| 280 |
+
s
|
| 281 |
+
samohl
|
| 282 |
+
sg
|
| 283 |
+
sl
|
| 284 |
+
souhl
|
| 285 |
+
spec
|
| 286 |
+
srov
|
| 287 |
+
stfr
|
| 288 |
+
střv
|
| 289 |
+
stsl
|
| 290 |
+
subj
|
| 291 |
+
subst
|
| 292 |
+
superl
|
| 293 |
+
sv
|
| 294 |
+
sz
|
| 295 |
+
táz
|
| 296 |
+
tech
|
| 297 |
+
telev
|
| 298 |
+
teol
|
| 299 |
+
trans
|
| 300 |
+
typogr
|
| 301 |
+
var
|
| 302 |
+
vedl
|
| 303 |
+
verb
|
| 304 |
+
vl. jm
|
| 305 |
+
voj
|
| 306 |
+
vok
|
| 307 |
+
vůb
|
| 308 |
+
vulg
|
| 309 |
+
výtv
|
| 310 |
+
vztaž
|
| 311 |
+
zahr
|
| 312 |
+
zájm
|
| 313 |
+
zast
|
| 314 |
+
zejm
|
| 315 |
+
|
| 316 |
+
zeměd
|
| 317 |
+
zkr
|
| 318 |
+
zř
|
| 319 |
+
mj
|
| 320 |
+
dl
|
| 321 |
+
atp
|
| 322 |
+
sport
|
| 323 |
+
Mgr
|
| 324 |
+
horn
|
| 325 |
+
MVDr
|
| 326 |
+
JUDr
|
| 327 |
+
RSDr
|
| 328 |
+
Bc
|
| 329 |
+
PhDr
|
| 330 |
+
ThDr
|
| 331 |
+
Ing
|
| 332 |
+
aj
|
| 333 |
+
apod
|
| 334 |
+
PharmDr
|
| 335 |
+
pomn
|
| 336 |
+
ev
|
| 337 |
+
slang
|
| 338 |
+
nprap
|
| 339 |
+
odp
|
| 340 |
+
dop
|
| 341 |
+
pol
|
| 342 |
+
st
|
| 343 |
+
stol
|
| 344 |
+
p. n. l
|
| 345 |
+
před n. l
|
| 346 |
+
n. l
|
| 347 |
+
př. Kr
|
| 348 |
+
po Kr
|
| 349 |
+
př. n. l
|
| 350 |
+
odd
|
| 351 |
+
RNDr
|
| 352 |
+
tzv
|
| 353 |
+
atd
|
| 354 |
+
tzn
|
| 355 |
+
resp
|
| 356 |
+
tj
|
| 357 |
+
p
|
| 358 |
+
br
|
| 359 |
+
č. j
|
| 360 |
+
čj
|
| 361 |
+
č. p
|
| 362 |
+
čp
|
| 363 |
+
a. s
|
| 364 |
+
s. r. o
|
| 365 |
+
spol. s r. o
|
| 366 |
+
p. o
|
| 367 |
+
s. p
|
| 368 |
+
v. o. s
|
| 369 |
+
k. s
|
| 370 |
+
o. p. s
|
| 371 |
+
o. s
|
| 372 |
+
v. r
|
| 373 |
+
v z
|
| 374 |
+
ml
|
| 375 |
+
vč
|
| 376 |
+
kr
|
| 377 |
+
mld
|
| 378 |
+
hod
|
| 379 |
+
popř
|
| 380 |
+
ap
|
| 381 |
+
event
|
| 382 |
+
rus
|
| 383 |
+
slov
|
| 384 |
+
rum
|
| 385 |
+
švýc
|
| 386 |
+
P. T
|
| 387 |
+
zvl
|
| 388 |
+
hor
|
| 389 |
+
dol
|
| 390 |
+
S.O.S
|