diff --git a/.gitattributes b/.gitattributes index c6a471db69a530b6eb02f9ce4d0d64a84d61b6da..37f366b5661f02780ee7ab15e6b4dcca68eee036 100644 --- a/.gitattributes +++ b/.gitattributes @@ -37,3 +37,4 @@ fairseq-0.10.2/fairseq/libbleu.cpython-310-x86_64-linux-gnu.so filter=lfs diff=l fairseq-0.10.2/fairseq/data/data_utils_fast.cpython-310-x86_64-linux-gnu.so filter=lfs diff=lfs merge=lfs -text fairseq-0.10.2/fairseq/data/token_block_utils_fast.cpython-310-x86_64-linux-gnu.so filter=lfs diff=lfs merge=lfs -text fairseq-0.10.2/docs/fairseq.gif filter=lfs diff=lfs merge=lfs -text +mosesdecoder/moses/TranslationModel/UG/util/ibm1-align filter=lfs diff=lfs merge=lfs -text diff --git a/mosesdecoder/defer/Joint.h b/mosesdecoder/defer/Joint.h new file mode 100644 index 0000000000000000000000000000000000000000..88547585af5786bdb3dbb54f6b3fe5ceacbbcdfd --- /dev/null +++ b/mosesdecoder/defer/Joint.h @@ -0,0 +1,139 @@ +// $Id$ + +/*********************************************************************** +Moses - factored phrase-based language decoder +Copyright (C) 2006 University of Edinburgh + +This library is free software; you can redistribute it and/or +modify it under the terms of the GNU Lesser General Public +License as published by the Free Software Foundation; either +version 2.1 of the License, or (at your option) any later version. + +This library is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +Lesser General Public License for more details. 
+ +You should have received a copy of the GNU Lesser General Public +License along with this library; if not, write to the Free Software +Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +***********************************************************************/ + +#ifndef moses_LanguageModelJoint_h +#define moses_LanguageModelJoint_h + +#include +#include +#include +#include "SingleFactor.h" +#include "MultiFactor.h" +#include "moses/Word.h" +#include "moses/FactorTypeSet.h" +#include "moses/FactorCollection.h" + +namespace Moses +{ + +class Phrase; +class FactorCollection; + +/** LM of multiple factors. A simple extension of single factor LM - factors backoff together. + * Rather slow as this uses string concatenation/split. + * Not used for a long time + */ +class LanguageModelJoint : public LanguageModelMultiFactor +{ +protected: + LanguageModelSingleFactor *m_lmImpl; + std::vector m_factorTypesOrdered; + + size_t m_implFactor; +public: + LanguageModelJoint(const std::string &line, LanguageModelSingleFactor *lmImpl) + :LanguageModelMultiFactor(line) { + m_lmImpl = lmImpl; + } + + ~LanguageModelJoint() { + delete m_lmImpl; + } + + bool Load(AllOptions const& opts, const std::string &filePath + , const std::vector &factorTypes + , size_t nGramOrder) { + m_factorTypes = FactorMask(factorTypes); + m_filePath = filePath; + m_nGramOrder = nGramOrder; + + m_factorTypesOrdered= factorTypes; + m_implFactor = 0; + + FactorCollection &factorCollection = FactorCollection::Instance(); + + // sentence markers + for (size_t index = 0 ; index < factorTypes.size() ; ++index) { + FactorType factorType = factorTypes[index]; + m_sentenceStartWord[factorType] = factorCollection.AddFactor(Output, factorType, BOS_); + m_sentenceEndWord[factorType] = factorCollection.AddFactor(Output, factorType, EOS_); + } + + m_lmImpl->Load(AllOptions const& opts); + } + + LMResult GetValueForgotState(const std::vector &contextFactor, FFState &outState) const { + if 
(contextFactor.size() == 0) { + LMResult ret; + ret.score = 0.0; + ret.unknown = false; + return ret; + } + + // joint context for internal LM + std::vector jointContext; + + for (size_t currPos = 0 ; currPos < m_nGramOrder ; ++currPos ) { + const Word &word = *contextFactor[currPos]; + + // add word to chunked context + std::stringstream stream(""); + + const Factor *factor = word[ m_factorTypesOrdered[0] ]; + stream << factor->GetString(); + + for (size_t index = 1 ; index < m_factorTypesOrdered.size() ; ++index) { + FactorType factorType = m_factorTypesOrdered[index]; + const Factor *factor = word[factorType]; + stream << "|" << factor->GetString(); + } + + factor = FactorCollection::Instance().AddFactor(Output, m_implFactor, stream.str()); + + Word* jointWord = new Word; + jointWord->SetFactor(m_implFactor, factor); + jointContext.push_back(jointWord); + } + + // calc score on chunked phrase + LMResult ret = m_lmImpl->GetValueForgotState(jointContext, outState); + + RemoveAllInColl(jointContext); + + return ret; + } + + const FFState *GetNullContextState() const { + return m_lmImpl->GetNullContextState(); + } + + const FFState *GetBeginSentenceState() const { + return m_lmImpl->GetBeginSentenceState(); + } + + FFState *NewState(const FFState *from) const { + return m_lmImpl->NewState(from); + } + +}; + +} +#endif diff --git a/mosesdecoder/defer/PhraseDictionaryInterpolated.cpp b/mosesdecoder/defer/PhraseDictionaryInterpolated.cpp new file mode 100644 index 0000000000000000000000000000000000000000..892e5f98f17e89d88f15137cbce6866a2ac04416 --- /dev/null +++ b/mosesdecoder/defer/PhraseDictionaryInterpolated.cpp @@ -0,0 +1,186 @@ +/*********************************************************************** +Moses - factored phrase-based language decoder +Copyright (C) 2013- University of Edinburgh + +This library is free software; you can redistribute it and/or +modify it under the terms of the GNU Lesser General Public +License as published by the Free Software 
Foundation; either +version 2.1 of the License, or (at your option) any later version. + +This library is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +Lesser General Public License for more details. + +You should have received a copy of the GNU Lesser General Public +License along with this library; if not, write to the Free Software +Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +***********************************************************************/ + +#include +#include + +#include "util/exception.hh" +#include "util/tokenize_piece.hh" +#include "moses/TranslationModel/PhraseDictionaryInterpolated.h" + +using namespace std; + +namespace Moses +{ + +PhraseDictionaryInterpolated::PhraseDictionaryInterpolated +(size_t numScoreComponent,size_t numInputScores,const PhraseDictionaryFeature* feature): + PhraseDictionary(numScoreComponent,feature), + m_targetPhrases(NULL), + m_languageModels(NULL) {} + +bool PhraseDictionaryInterpolated::Load( + const std::vector &input + , const std::vector &output + , const std::vector& config + , const std::vector &weightT + , size_t tableLimit + , const LMList &languageModels + , float weightWP) +{ + + m_languageModels = &languageModels; + m_weightT = weightT; + m_tableLimit = tableLimit; + m_weightWP = weightWP; + + //The config should be as follows: + //0-3: type factor factor num-components (as usual) + //4: combination mode (e.g. 
naive) + //5-(length-2): List of phrase-table files + //length-1: Weight string, in the same format as used for tmcombine + + UTIL_THROW_IF(config.size() < 7, util::Exception, "Missing fields from phrase table configuration: expected at least 7"); + UTIL_THROW_IF(config[4] != "naive", util::Exception, "Unsupported combination mode: '" << config[4] << "'"); + + // Create the dictionaries + for (size_t i = 5; i < config.size()-1; ++i) { + m_dictionaries.push_back(DictionaryHandle(new PhraseDictionaryTreeAdaptor( + GetFeature()->GetNumScoreComponents(), + GetFeature()->GetNumInputScores(), + GetFeature()))); + bool ret = m_dictionaries.back()->Load( + input, + output, + config[i], + weightT, + 0, + languageModels, + weightWP); + if (!ret) return ret; + } + + //Parse the weight strings + for (util::TokenIter featureWeights(config.back(), util::SingleCharacter(';')); featureWeights; ++featureWeights) { + m_weights.push_back(vector()); + float sum = 0; + for (util::TokenIter tableWeights(*featureWeights, util::SingleCharacter(',')); tableWeights; ++tableWeights) { + const float weight = boost::lexical_cast(*tableWeights); + m_weights.back().push_back(weight); + sum += weight; + } + UTIL_THROW_IF(m_weights.back().size() != m_dictionaries.size(), util::Exception, + "Number of weights (" << m_weights.back().size() << + ") does not match number of dictionaries to combine (" << m_dictionaries.size() << ")"); + UTIL_THROW_IF(abs(sum - 1) > 0.01, util::Exception, "Weights not normalised"); + + } + + //check number of weight sets. Make sure there is a weight for every score component + //except for the last - which is assumed to be the phrase penalty. 
+ UTIL_THROW_IF(m_weights.size() != 1 && m_weights.size() != GetFeature()->GetNumScoreComponents()-1, util::Exception, "Unexpected number of weight sets"); + //if 1 weight set, then repeat + if (m_weights.size() == 1) { + while(m_weights.size() < GetFeature()->GetNumScoreComponents()-1) { + m_weights.push_back(m_weights[0]); + } + } + + return true; +} + +void PhraseDictionaryInterpolated::InitializeForInput(ttasksptr const& ttask) +{ + for (size_t i = 0; i < m_dictionaries.size(); ++i) { + m_dictionaries[i]->InitializeForInput(ttask); + } +} + +typedef +boost::unordered_set PhraseSet; + + +TargetPhraseCollection::shared_ptr +PhraseDictionaryInterpolated::GetTargetPhraseCollection(const Phrase& src) const +{ + + delete m_targetPhrases; + m_targetPhrases = new TargetPhraseCollection(); + PhraseSet allPhrases; + vector phrasesByTable(m_dictionaries.size()); + for (size_t i = 0; i < m_dictionaries.size(); ++i) { + TargetPhraseCollection::shared_ptr phrases = m_dictionaries[i]->GetTargetPhraseCollection(src); + if (phrases) { + for (TargetPhraseCollection::const_iterator j = phrases->begin(); + j != phrases->end(); ++j) { + allPhrases.insert(*j); + phrasesByTable[i].insert(*j); + } + } + } + ScoreComponentCollection sparseVector; + for (PhraseSet::const_iterator i = allPhrases.begin(); i != allPhrases.end(); ++i) { + TargetPhrase* combinedPhrase = new TargetPhrase((Phrase)**i); + //combinedPhrase->ResetScore(); + //cerr << *combinedPhrase << " " << combinedPhrase->GetScoreBreakdown() << endl; + combinedPhrase->SetSourcePhrase((*i)->GetSourcePhrase()); + combinedPhrase->SetAlignTerm(&((*i)->GetAlignTerm())); + combinedPhrase->SetAlignNonTerm(&((*i)->GetAlignTerm())); + Scores combinedScores(GetFeature()->GetNumScoreComponents()); + for (size_t j = 0; j < phrasesByTable.size(); ++j) { + PhraseSet::const_iterator tablePhrase = phrasesByTable[j].find(combinedPhrase); + if (tablePhrase != phrasesByTable[j].end()) { + Scores tableScores = (*tablePhrase)->GetScoreBreakdown() 
+ .GetScoresForProducer(GetFeature()); + //cerr << "Scores from " << j << " table: "; + for (size_t k = 0; k < tableScores.size()-1; ++k) { + //cerr << tableScores[k] << "(" << exp(tableScores[k]) << ") "; + combinedScores[k] += m_weights[k][j] * exp(tableScores[k]); + //cerr << m_weights[k][j] * exp(tableScores[k]) << " "; + } + //cerr << endl; + } + } + //map back to log space + //cerr << "Combined "; + for (size_t k = 0; k < combinedScores.size()-1; ++k) { + //cerr << combinedScores[k] << " "; + combinedScores[k] = log(combinedScores[k]); + //cerr << combinedScores[k] << " "; + } + //cerr << endl; + combinedScores.back() = 1; //assume last is penalty + combinedPhrase->SetScore( + GetFeature(), + combinedScores, + sparseVector, + m_weightT, + m_weightWP, + *m_languageModels); + //cerr << *combinedPhrase << " " << combinedPhrase->GetScoreBreakdown() << endl; + m_targetPhrases->Add(combinedPhrase); + } + + m_targetPhrases->Prune(true,m_tableLimit); + + + return m_targetPhrases; +} + +} diff --git a/mosesdecoder/defer/PhraseLengthFeatureTest.cpp b/mosesdecoder/defer/PhraseLengthFeatureTest.cpp new file mode 100644 index 0000000000000000000000000000000000000000..6fb15e71e83233b7612f2b5253eb586d87e960c3 --- /dev/null +++ b/mosesdecoder/defer/PhraseLengthFeatureTest.cpp @@ -0,0 +1,104 @@ +/*********************************************************************** +Moses - factored phrase-based language decoder +Copyright (C) 2010 University of Edinburgh + +This library is free software; you can redistribute it and/or +modify it under the terms of the GNU Lesser General Public +License as published by the Free Software Foundation; either +version 2.1 of the License, or (at your option) any later version. + +This library is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +Lesser General Public License for more details. 
+ +You should have received a copy of the GNU Lesser General Public +License along with this library; if not, write to the Free Software +Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +***********************************************************************/ +#include + +#include "moses/FF/PhraseLengthFeature.h" +#include "moses/FactorCollection.h" +#include "moses/Sentence.h" +#include "moses/TargetPhrase.h" +#include "moses/TranslationOption.h" + +using namespace Moses; +using namespace std; + +BOOST_AUTO_TEST_SUITE(phrase_length_feature) + +//TODO: Factor out setup code so that it can be reused + +static Word MakeWord(string text) +{ + FactorCollection &factorCollection = FactorCollection::Instance(); + const Factor* f = factorCollection.AddFactor(Input,0,text); + Word w; + w.SetFactor(0,f); + return w; +} + + +BOOST_AUTO_TEST_CASE(evaluate) +{ + Word w1 = MakeWord("w1"); + Word w2 = MakeWord("y2"); + Word w3 = MakeWord("x3"); + Word w4 = MakeWord("w4"); + + Phrase p1; + p1.AddWord(w1); + p1.AddWord(w3); + p1.AddWord(w4); + + Phrase p2; + p2.AddWord(w1); + p2.AddWord(w2); + + Phrase p3; + p3.AddWord(w2); + p3.AddWord(w1); + p3.AddWord(w4); + p3.AddWord(w4); + + TargetPhrase tp1(p1); + TargetPhrase tp2(p2); + TargetPhrase tp3(p3); + + Sentence sentence; + vector order; + order.push_back(0); + stringstream in("the input sentence has 6 words"); + sentence.Read(in, order); + + TranslationOption topt1(WordsRange(0,0), tp1); + TranslationOption topt2(WordsRange(1,3), tp2); + TranslationOption topt3(WordsRange(2,3), tp3); + + PhraseBasedFeatureContext context1(topt1,sentence); + PhraseBasedFeatureContext context2(topt2,sentence); + PhraseBasedFeatureContext context3(topt3,sentence); + + PhraseLengthFeature plf; + + ScoreComponentCollection acc1,acc2,acc3; + + plf.Evaluate(context1, &acc1); + BOOST_CHECK_EQUAL(acc1.GetScoreForProducer(&plf, "s1"),1); + BOOST_CHECK_EQUAL(acc1.GetScoreForProducer(&plf, "t3"),1); + 
BOOST_CHECK_EQUAL(acc1.GetScoreForProducer(&plf, "1,3"),1); + + plf.Evaluate(context2, &acc2); + BOOST_CHECK_EQUAL(acc2.GetScoreForProducer(&plf, "s3"),1); + BOOST_CHECK_EQUAL(acc2.GetScoreForProducer(&plf, "t2"),1); + BOOST_CHECK_EQUAL(acc2.GetScoreForProducer(&plf, "3,2"),1); + + plf.Evaluate(context3, &acc3); + BOOST_CHECK_EQUAL(acc3.GetScoreForProducer(&plf, "s2"),1); + BOOST_CHECK_EQUAL(acc3.GetScoreForProducer(&plf, "t4"),1); + BOOST_CHECK_EQUAL(acc3.GetScoreForProducer(&plf, "2,4"),1); +} + +BOOST_AUTO_TEST_SUITE_END() diff --git a/mosesdecoder/lm/builder/corpus_count.hh b/mosesdecoder/lm/builder/corpus_count.hh new file mode 100644 index 0000000000000000000000000000000000000000..165505c4a06f9c4411882c2d7df65fd740c44afc --- /dev/null +++ b/mosesdecoder/lm/builder/corpus_count.hh @@ -0,0 +1,53 @@ +#ifndef LM_BUILDER_CORPUS_COUNT_H +#define LM_BUILDER_CORPUS_COUNT_H + +#include "lm/lm_exception.hh" +#include "lm/word_index.hh" +#include "util/scoped.hh" + +#include +#include +#include +#include + +namespace util { +class FilePiece; +namespace stream { +class ChainPosition; +} // namespace stream +} // namespace util + +namespace lm { +namespace builder { + +class CorpusCount { + public: + // Memory usage will be DedupeMultipler(order) * block_size + total_chain_size + unknown vocab_hash_size + static float DedupeMultiplier(std::size_t order); + + // How much memory vocabulary will use based on estimated size of the vocab. + static std::size_t VocabUsage(std::size_t vocab_estimate); + + // token_count: out. + // type_count aka vocabulary size. Initialize to an estimate. It is set to the exact value. 
+ CorpusCount(util::FilePiece &from, int vocab_write, uint64_t &token_count, WordIndex &type_count, std::vector &prune_words, const std::string& prune_vocab_filename, std::size_t entries_per_block, WarningAction disallowed_symbol); + + void Run(const util::stream::ChainPosition &position); + + private: + util::FilePiece &from_; + int vocab_write_; + uint64_t &token_count_; + WordIndex &type_count_; + std::vector& prune_words_; + const std::string& prune_vocab_filename_; + + std::size_t dedupe_mem_size_; + util::scoped_malloc dedupe_mem_; + + WarningAction disallowed_symbol_action_; +}; + +} // namespace builder +} // namespace lm +#endif // LM_BUILDER_CORPUS_COUNT_H diff --git a/mosesdecoder/lm/builder/dump_counts_main.cc b/mosesdecoder/lm/builder/dump_counts_main.cc new file mode 100644 index 0000000000000000000000000000000000000000..26078d0e7195429ed9c54d0c0c37d4509edbaf6d --- /dev/null +++ b/mosesdecoder/lm/builder/dump_counts_main.cc @@ -0,0 +1,36 @@ +#include "lm/common/print.hh" +#include "lm/word_index.hh" +#include "util/file.hh" +#include "util/read_compressed.hh" + +#include + +#include +#include + +int main(int argc, char *argv[]) { + if (argc != 4) { + std::cerr << "Usage: " << argv[0] << " counts vocabulary order\n" + "The counts file contains records with 4-byte vocabulary ids followed by 8-byte\n" + "counts. 
Each record has order many vocabulary ids.\n" + "The vocabulary file contains the words delimited by NULL in order of id.\n" + "The vocabulary file may not be compressed because it is mmapped but the counts\n" + "file can be compressed.\n"; + return 1; + } + util::ReadCompressed counts(util::OpenReadOrThrow(argv[1])); + util::scoped_fd vocab_file(util::OpenReadOrThrow(argv[2])); + lm::VocabReconstitute vocab(vocab_file.get()); + unsigned int order = boost::lexical_cast(argv[3]); + std::vector record(sizeof(uint32_t) * order + sizeof(uint64_t)); + while (std::size_t got = counts.ReadOrEOF(&*record.begin(), record.size())) { + UTIL_THROW_IF(got != record.size(), util::Exception, "Read " << got << " bytes at the end of file, which is not a complete record of length " << record.size()); + const lm::WordIndex *words = reinterpret_cast(&*record.begin()); + for (const lm::WordIndex *i = words; i != words + order; ++i) { + UTIL_THROW_IF(*i >= vocab.Size(), util::Exception, "Vocab ID " << *i << " is larger than the vocab file's maximum of " << vocab.Size() << ". Are you sure you have the right order and vocab file for these counts?"); + std::cout << vocab.Lookup(*i) << ' '; + } + // TODO don't use std::cout because it is slow. Add fast uint64_t printing support to FileStream. 
+ std::cout << *reinterpret_cast(words + order) << '\n'; + } +} diff --git a/mosesdecoder/lm/builder/lmplz_main.cc b/mosesdecoder/lm/builder/lmplz_main.cc new file mode 100644 index 0000000000000000000000000000000000000000..cc3f381caad4d9989d2aedaa4c691b3231f92a7d --- /dev/null +++ b/mosesdecoder/lm/builder/lmplz_main.cc @@ -0,0 +1,220 @@ +#include "lm/builder/output.hh" +#include "lm/builder/pipeline.hh" +#include "lm/common/size_option.hh" +#include "lm/lm_exception.hh" +#include "util/file.hh" +#include "util/file_piece.hh" +#include "util/usage.hh" + +#include + +#include +#include +#include + +namespace { + +// Parse and validate pruning thresholds then return vector of threshold counts +// for each n-grams order. +std::vector ParsePruning(const std::vector ¶m, std::size_t order) { + // convert to vector of integers + std::vector prune_thresholds; + prune_thresholds.reserve(order); + for (std::vector::const_iterator it(param.begin()); it != param.end(); ++it) { + try { + prune_thresholds.push_back(boost::lexical_cast(*it)); + } catch(const boost::bad_lexical_cast &) { + UTIL_THROW(util::Exception, "Bad pruning threshold " << *it); + } + } + + // Fill with zeros by default. + if (prune_thresholds.empty()) { + prune_thresholds.resize(order, 0); + return prune_thresholds; + } + + // validate pruning threshold if specified + // throw if each n-gram order has not threshold specified + UTIL_THROW_IF(prune_thresholds.size() > order, util::Exception, "You specified pruning thresholds for orders 1 through " << prune_thresholds.size() << " but the model only has order " << order); + // threshold for unigram can only be 0 (no pruning) + + // check if threshold are not in decreasing order + uint64_t lower_threshold = 0; + for (std::vector::iterator it = prune_thresholds.begin(); it != prune_thresholds.end(); ++it) { + UTIL_THROW_IF(lower_threshold > *it, util::Exception, "Pruning thresholds should be in non-decreasing order. 
Otherwise substrings would be removed, which is bad for query-time data structures."); + lower_threshold = *it; + } + + // Pad to all orders using the last value. + prune_thresholds.resize(order, prune_thresholds.back()); + return prune_thresholds; +} + +lm::builder::Discount ParseDiscountFallback(const std::vector ¶m) { + lm::builder::Discount ret; + UTIL_THROW_IF(param.size() > 3, util::Exception, "Specify at most three fallback discounts: 1, 2, and 3+"); + UTIL_THROW_IF(param.empty(), util::Exception, "Fallback discounting enabled, but no discount specified"); + ret.amount[0] = 0.0; + for (unsigned i = 0; i < 3; ++i) { + float discount = boost::lexical_cast(param[i < param.size() ? i : (param.size() - 1)]); + UTIL_THROW_IF(discount < 0.0 || discount > static_cast(i+1), util::Exception, "The discount for count " << (i+1) << " was parsed as " << discount << " which is not in the range [0, " << (i+1) << "]."); + ret.amount[i + 1] = discount; + } + return ret; +} + +} // namespace + +int main(int argc, char *argv[]) { + try { + namespace po = boost::program_options; + po::options_description options("Language model building options"); + lm::builder::PipelineConfig pipeline; + + std::string text, intermediate, arpa; + std::vector pruning; + std::vector discount_fallback; + std::vector discount_fallback_default; + discount_fallback_default.push_back("0.5"); + discount_fallback_default.push_back("1"); + discount_fallback_default.push_back("1.5"); + bool verbose_header; + + options.add_options() + ("help,h", po::bool_switch(), "Show this help message") + ("order,o", po::value(&pipeline.order) +#if BOOST_VERSION >= 104200 + ->required() +#endif + , "Order of the model") + ("interpolate_unigrams", po::value(&pipeline.initial_probs.interpolate_unigrams)->default_value(true)->implicit_value(true), "Interpolate the unigrams (default) as opposed to giving lots of mass to like SRI. 
If you want SRI's behavior with a large and the old lmplz default, use --interpolate_unigrams 0.") + ("skip_symbols", po::bool_switch(), "Treat , , and as whitespace instead of throwing an exception") + ("temp_prefix,T", po::value(&pipeline.sort.temp_prefix)->default_value("/tmp/lm"), "Temporary file prefix") + ("memory,S", lm:: SizeOption(pipeline.sort.total_memory, util::GuessPhysicalMemory() ? "80%" : "1G"), "Sorting memory") + ("minimum_block", lm::SizeOption(pipeline.minimum_block, "8K"), "Minimum block size to allow") + ("sort_block", lm::SizeOption(pipeline.sort.buffer_size, "64M"), "Size of IO operations for sort (determines arity)") + ("block_count", po::value(&pipeline.block_count)->default_value(2), "Block count (per order)") + ("vocab_estimate", po::value(&pipeline.vocab_estimate)->default_value(1000000), "Assume this vocabulary size for purposes of calculating memory in step 1 (corpus count) and pre-sizing the hash table") + ("vocab_pad", po::value(&pipeline.vocab_size_for_unk)->default_value(0), "If the vocabulary is smaller than this value, pad with to reach this size. Requires --interpolate_unigrams") + ("verbose_header", po::bool_switch(&verbose_header), "Add a verbose header to the ARPA file that includes information such as token count, smoothing type, etc.") + ("text", po::value(&text), "Read text from a file instead of stdin") + ("arpa", po::value(&arpa), "Write ARPA to a file instead of stdout") + ("intermediate", po::value(&intermediate), "Write ngrams to intermediate files. Turns off ARPA output (which can be reactivated by --arpa file). Forces --renumber on.") + ("renumber", po::bool_switch(&pipeline.renumber_vocabulary), "Rrenumber the vocabulary identifiers so that they are monotone with the hash of each string. 
This is consistent with the ordering used by the trie data structure.") + ("collapse_values", po::bool_switch(&pipeline.output_q), "Collapse probability and backoff into a single value, q that yields the same sentence-level probabilities. See http://kheafield.com/professional/edinburgh/rest_paper.pdf for more details, including a proof.") + ("prune", po::value >(&pruning)->multitoken(), "Prune n-grams with count less than or equal to the given threshold. Specify one value for each order i.e. 0 0 1 to prune singleton trigrams and above. The sequence of values must be non-decreasing and the last value applies to any remaining orders. Default is to not prune, which is equivalent to --prune 0.") + ("limit_vocab_file", po::value(&pipeline.prune_vocab_file)->default_value(""), "Read allowed vocabulary separated by whitespace. N-grams that contain vocabulary items not in this list will be pruned. Can be combined with --prune arg") + ("discount_fallback", po::value >(&discount_fallback)->multitoken()->implicit_value(discount_fallback_default, "0.5 1 1.5"), "The closed-form estimate for Kneser-Ney discounts does not work without singletons or doubletons. It can also fail if these values are out of range. This option falls back to user-specified discounts when the closed-form estimate fails. Note that this option is generally a bad idea: you should deduplicate your corpus instead. However, class-based models need custom discounts because they lack singleton unigrams. Provide up to three discounts (for adjusted counts 1, 2, and 3+), which will be applied to all orders where the closed-form estimates fail."); + po::variables_map vm; + po::store(po::parse_command_line(argc, argv, options), vm); + + if (argc == 1 || vm["help"].as()) { + std::cerr << + "Builds unpruned language models with modified Kneser-Ney smoothing.\n\n" + "Please cite:\n" + "@inproceedings{Heafield-estimate,\n" + " author = {Kenneth Heafield and Ivan Pouzyrevsky and Jonathan H. 
Clark and Philipp Koehn},\n" + " title = {Scalable Modified {Kneser-Ney} Language Model Estimation},\n" + " year = {2013},\n" + " month = {8},\n" + " booktitle = {Proceedings of the 51st Annual Meeting of the Association for Computational Linguistics},\n" + " address = {Sofia, Bulgaria},\n" + " url = {http://kheafield.com/professional/edinburgh/estimate\\_paper.pdf},\n" + "}\n\n" + "Provide the corpus on stdin. The ARPA file will be written to stdout. Order of\n" + "the model (-o) is the only mandatory option. As this is an on-disk program,\n" + "setting the temporary file location (-T) and sorting memory (-S) is recommended.\n\n" + "Memory sizes are specified like GNU sort: a number followed by a unit character.\n" + "Valid units are \% for percentage of memory (supported platforms only) and (in\n" + "increasing powers of 1024): b, K, M, G, T, P, E, Z, Y. Default is K (*1024).\n"; + uint64_t mem = util::GuessPhysicalMemory(); + if (mem) { + std::cerr << "This machine has " << mem << " bytes of memory.\n\n"; + } else { + std::cerr << "Unable to determine the amount of memory on this machine.\n\n"; + } + std::cerr << options << std::endl; + return 1; + } + + po::notify(vm); + + // required() appeared in Boost 1.42.0. +#if BOOST_VERSION < 104200 + if (!vm.count("order")) { + std::cerr << "the option '--order' is required but missing" << std::endl; + return 1; + } +#endif + + if (pipeline.vocab_size_for_unk && !pipeline.initial_probs.interpolate_unigrams) { + std::cerr << "--vocab_pad requires --interpolate_unigrams be on" << std::endl; + return 1; + } + + if (vm["skip_symbols"].as()) { + pipeline.disallowed_symbol_action = lm::COMPLAIN; + } else { + pipeline.disallowed_symbol_action = lm::THROW_UP; + } + + if (vm.count("discount_fallback")) { + pipeline.discount.fallback = ParseDiscountFallback(discount_fallback); + pipeline.discount.bad_action = lm::COMPLAIN; + } else { + // Unused, just here to prevent the compiler from complaining about uninitialized. 
+ pipeline.discount.fallback = lm::builder::Discount(); + pipeline.discount.bad_action = lm::THROW_UP; + } + + // parse pruning thresholds. These depend on order, so it is not done as a notifier. + pipeline.prune_thresholds = ParsePruning(pruning, pipeline.order); + + if (!vm["limit_vocab_file"].as().empty()) { + pipeline.prune_vocab = true; + } + else { + pipeline.prune_vocab = false; + } + + util::NormalizeTempPrefix(pipeline.sort.temp_prefix); + + lm::builder::InitialProbabilitiesConfig &initial = pipeline.initial_probs; + // TODO: evaluate options for these. + initial.adder_in.total_memory = 32768; + initial.adder_in.block_count = 2; + initial.adder_out.total_memory = 32768; + initial.adder_out.block_count = 2; + pipeline.read_backoffs = initial.adder_out; + + // Read from stdin, write to stdout by default + util::scoped_fd in(0), out(1); + if (vm.count("text")) { + in.reset(util::OpenReadOrThrow(text.c_str())); + } + if (vm.count("arpa")) { + out.reset(util::CreateOrThrow(arpa.c_str())); + } + + try { + bool writing_intermediate = vm.count("intermediate"); + if (writing_intermediate) { + pipeline.renumber_vocabulary = true; + } + lm::builder::Output output(writing_intermediate ? 
intermediate : pipeline.sort.temp_prefix, writing_intermediate, pipeline.output_q); + if (!writing_intermediate || vm.count("arpa")) { + output.Add(new lm::builder::PrintHook(out.release(), verbose_header)); + } + lm::builder::Pipeline(pipeline, in.release(), output); + } catch (const util::MallocException &e) { + std::cerr << e.what() << std::endl; + std::cerr << "Try rerunning with a more conservative -S setting than " << vm["memory"].as() << std::endl; + return 1; + } + util::PrintUsage(std::cerr); + } catch (const std::exception &e) { + std::cerr << e.what() << std::endl; + return 1; + } +} diff --git a/mosesdecoder/lm/common/CMakeLists.txt b/mosesdecoder/lm/common/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..942e24bdcb370702f4748253477807f6ac830e5d --- /dev/null +++ b/mosesdecoder/lm/common/CMakeLists.txt @@ -0,0 +1,40 @@ +cmake_minimum_required(VERSION 2.8.8) +# +# The KenLM cmake files make use of add_library(... OBJECTS ...) +# +# This syntax allows grouping of source files when compiling +# (effectively creating "fake" libraries based on source subdirs). +# +# This syntax was only added in cmake version 2.8.8 +# +# see http://www.cmake.org/Wiki/CMake/Tutorials/Object_Library + + +# This CMake file was created by Lane Schwartz + +# Explicitly list the source files for this subdirectory +# +# If you add any source files to this subdirectory +# that should be included in the kenlm library, +# (this excludes any unit test files) +# you should add them to the following list: +# +# In order to set correct paths to these files +# in case this variable is referenced by CMake files in the parent directory, +# we prefix all files with ${CMAKE_CURRENT_SOURCE_DIR}. +# +set(KENLM_COMMON_SOURCE + ${CMAKE_CURRENT_SOURCE_DIR}/model_buffer.cc + ${CMAKE_CURRENT_SOURCE_DIR}/print.cc + ${CMAKE_CURRENT_SOURCE_DIR}/renumber.cc + ${CMAKE_CURRENT_SOURCE_DIR}/size_option.cc + ) + + +# Group these objects together for later use. 
+# +# Given add_library(foo OBJECT ${my_foo_sources}), +# refer to these objects as $ +# +add_library(kenlm_common OBJECT ${KENLM_COMMON_SOURCE}) + diff --git a/mosesdecoder/lm/common/Jamfile b/mosesdecoder/lm/common/Jamfile new file mode 100644 index 0000000000000000000000000000000000000000..c9bdfd0dfda5ed3f875913b9259ba72abec8f43e --- /dev/null +++ b/mosesdecoder/lm/common/Jamfile @@ -0,0 +1,2 @@ +fakelib common : [ glob *.cc : *test.cc *main.cc ] + ../../util//kenutil ../../util/stream//stream ../../util/double-conversion//double-conversion ..//kenlm /top//boost_program_options ; diff --git a/mosesdecoder/lm/common/joint_order.hh b/mosesdecoder/lm/common/joint_order.hh new file mode 100644 index 0000000000000000000000000000000000000000..6113bb8f145167c5e629bbf31629d0e41a5a53c2 --- /dev/null +++ b/mosesdecoder/lm/common/joint_order.hh @@ -0,0 +1,71 @@ +#ifndef LM_COMMON_JOINT_ORDER_H +#define LM_COMMON_JOINT_ORDER_H + +#include "lm/common/ngram_stream.hh" +#include "lm/lm_exception.hh" + +#ifdef DEBUG +#include "util/fixed_array.hh" +#include +#endif + +#include + +namespace lm { + +template void JointOrder(const util::stream::ChainPositions &positions, Callback &callback) { + // Allow matching to reference streams[-1]. + util::FixedArray > streams_with_dummy(positions.size() + 1); + // A bogus stream for [-1]. + streams_with_dummy.push_back(); + for (std::size_t i = 0; i < positions.size(); ++i) { + streams_with_dummy.push_back(positions[i], NGramHeader(NULL, i + 1)); + } + ProxyStream *streams = streams_with_dummy.begin() + 1; + + std::size_t order; + for (order = 0; order < positions.size() && streams[order]; ++order) {} + assert(order); // should always have . + + // Debugging only: call comparison function to sanity check order. +#ifdef DEBUG + util::FixedArray less_compare(order); + for (unsigned i = 0; i < order; ++i) + less_compare.push_back(i + 1); +#endif // DEBUG + + std::size_t current = 0; + while (true) { + // Does the context match the lower one? 
+ if (!memcmp(streams[static_cast(current) - 1]->begin(), streams[current]->begin() + Compare::kMatchOffset, sizeof(WordIndex) * current)) { + callback.Enter(current, streams[current].Get()); + // Transition to looking for extensions. + if (++current < order) continue; + } +#ifdef DEBUG + // match_check[current - 1] matches current-grams + // The lower-order stream (which skips fewer current-grams) should always be <= the higher order-stream (which can skip current-grams). + else if (!less_compare[current - 1](streams[static_cast(current) - 1]->begin(), streams[current]->begin() + Compare::kMatchOffset)) { + std::cerr << "Stream out of order detected" << std::endl; + abort(); + } +#endif // DEBUG + // No extension left. + while(true) { + assert(current > 0); + --current; + callback.Exit(current, streams[current].Get()); + + if (++streams[current]) break; + + UTIL_THROW_IF(order != current + 1, FormatLoadException, "Detected n-gram without matching suffix"); + + order = current; + if (!order) return; + } + } +} + +} // namespaces + +#endif // LM_COMMON_JOINT_ORDER_H diff --git a/mosesdecoder/lm/common/ngram.hh b/mosesdecoder/lm/common/ngram.hh new file mode 100644 index 0000000000000000000000000000000000000000..7a6d1c358a838935b38301e985beb0d922531b0e --- /dev/null +++ b/mosesdecoder/lm/common/ngram.hh @@ -0,0 +1,77 @@ +#ifndef LM_COMMON_NGRAM_H +#define LM_COMMON_NGRAM_H + +#include "lm/weights.hh" +#include "lm/word_index.hh" + +#include +#include +#include +#include + +namespace lm { + +class NGramHeader { + public: + NGramHeader(void *begin, std::size_t order) + : begin_(static_cast(begin)), end_(begin_ + order) {} + + NGramHeader() : begin_(NULL), end_(NULL) {} + + const uint8_t *Base() const { return reinterpret_cast(begin_); } + uint8_t *Base() { return reinterpret_cast(begin_); } + + void ReBase(void *to) { + std::size_t difference = end_ - begin_; + begin_ = reinterpret_cast(to); + end_ = begin_ + difference; + } + + // These are for the vocab index. 
+ // Lower-case in deference to STL. + const WordIndex *begin() const { return begin_; } + WordIndex *begin() { return begin_; } + const WordIndex *end() const { return end_; } + WordIndex *end() { return end_; } + + std::size_t size() const { return end_ - begin_; } + std::size_t Order() const { return end_ - begin_; } + + private: + WordIndex *begin_, *end_; +}; + +template class NGram : public NGramHeader { + public: + typedef PayloadT Payload; + + NGram() : NGramHeader(NULL, 0) {} + + NGram(void *begin, std::size_t order) : NGramHeader(begin, order) {} + + // Would do operator++ but that can get confusing for a stream. + void NextInMemory() { + ReBase(&Value() + 1); + } + + static std::size_t TotalSize(std::size_t order) { + return order * sizeof(WordIndex) + sizeof(Payload); + } + std::size_t TotalSize() const { + // Compiler should optimize this. + return TotalSize(Order()); + } + + static std::size_t OrderFromSize(std::size_t size) { + std::size_t ret = (size - sizeof(Payload)) / sizeof(WordIndex); + assert(size == TotalSize(ret)); + return ret; + } + + const Payload &Value() const { return *reinterpret_cast(end()); } + Payload &Value() { return *reinterpret_cast(end()); } +}; + +} // namespace lm + +#endif // LM_COMMON_NGRAM_H diff --git a/mosesdecoder/lm/common/print.cc b/mosesdecoder/lm/common/print.cc new file mode 100644 index 0000000000000000000000000000000000000000..518b62f51716f066f5cd608024eed3ccaa81abe1 --- /dev/null +++ b/mosesdecoder/lm/common/print.cc @@ -0,0 +1,62 @@ +#include "lm/common/print.hh" + +#include "lm/common/ngram_stream.hh" +#include "util/file_stream.hh" +#include "util/file.hh" +#include "util/mmap.hh" +#include "util/scoped.hh" + +#include +#include + +namespace lm { + +VocabReconstitute::VocabReconstitute(int fd) { + uint64_t size = util::SizeOrThrow(fd); + util::MapRead(util::POPULATE_OR_READ, fd, 0, size, memory_); + const char *const start = static_cast(memory_.get()); + const char *i; + for (i = start; i != start + size; i 
+= strlen(i) + 1) { + map_.push_back(i); + } + // Last one for LookupPiece. + map_.push_back(i); +} + +namespace { +template void PrintLead(const VocabReconstitute &vocab, ProxyStream &stream, util::FileStream &out) { + out << stream->Value().prob << '\t' << vocab.Lookup(*stream->begin()); + for (const WordIndex *i = stream->begin() + 1; i != stream->end(); ++i) { + out << ' ' << vocab.Lookup(*i); + } +} +} // namespace + +void PrintARPA::Run(const util::stream::ChainPositions &positions) { + VocabReconstitute vocab(vocab_fd_); + util::FileStream out(out_fd_); + out << "\\data\\\n"; + for (size_t i = 0; i < positions.size(); ++i) { + out << "ngram " << (i+1) << '=' << counts_[i] << '\n'; + } + out << '\n'; + + for (unsigned order = 1; order < positions.size(); ++order) { + out << "\\" << order << "-grams:" << '\n'; + for (ProxyStream > stream(positions[order - 1], NGram(NULL, order)); stream; ++stream) { + PrintLead(vocab, stream, out); + out << '\t' << stream->Value().backoff << '\n'; + } + out << '\n'; + } + + out << "\\" << positions.size() << "-grams:" << '\n'; + for (ProxyStream > stream(positions.back(), NGram(NULL, positions.size())); stream; ++stream) { + PrintLead(vocab, stream, out); + out << '\n'; + } + out << '\n'; + out << "\\end\\\n"; +} + +} // namespace lm diff --git a/mosesdecoder/lm/common/renumber.cc b/mosesdecoder/lm/common/renumber.cc new file mode 100644 index 0000000000000000000000000000000000000000..0632a149b90097ace2c2387222ba5a51d40e4cab --- /dev/null +++ b/mosesdecoder/lm/common/renumber.cc @@ -0,0 +1,17 @@ +#include "lm/common/renumber.hh" +#include "lm/common/ngram.hh" + +#include "util/stream/stream.hh" + +namespace lm { + +void Renumber::Run(const util::stream::ChainPosition &position) { + for (util::stream::Stream stream(position); stream; ++stream) { + NGramHeader gram(stream.Get(), order_); + for (WordIndex *w = gram.begin(); w != gram.end(); ++w) { + *w = new_numbers_[*w]; + } + } +} + +} // namespace lm diff --git 
a/mosesdecoder/lm/common/renumber.hh b/mosesdecoder/lm/common/renumber.hh new file mode 100644 index 0000000000000000000000000000000000000000..ca25c4dc6e1618c93a9dd77c0cdca72d107bc3f9 --- /dev/null +++ b/mosesdecoder/lm/common/renumber.hh @@ -0,0 +1,30 @@ +/* Map vocab ids. This is useful to merge independently collected counts or + * change the vocab ids to the order used by the trie. + */ +#ifndef LM_COMMON_RENUMBER_H +#define LM_COMMON_RENUMBER_H + +#include "lm/word_index.hh" + +#include + +namespace util { namespace stream { class ChainPosition; }} + +namespace lm { + +class Renumber { + public: + // Assumes the array is large enough to map all words and stays alive while + // the thread is active. + Renumber(const WordIndex *new_numbers, std::size_t order) + : new_numbers_(new_numbers), order_(order) {} + + void Run(const util::stream::ChainPosition &position); + + private: + const WordIndex *new_numbers_; + std::size_t order_; +}; + +} // namespace lm +#endif // LM_COMMON_RENUMBER_H diff --git a/mosesdecoder/mert/ReferenceTest.cpp b/mosesdecoder/mert/ReferenceTest.cpp new file mode 100644 index 0000000000000000000000000000000000000000..c333212279acaa74a548219abc92158e4da24679 --- /dev/null +++ b/mosesdecoder/mert/ReferenceTest.cpp @@ -0,0 +1,123 @@ +#include "Reference.h" + +#define BOOST_TEST_MODULE MertReference +#include + +using namespace MosesTuning; + +BOOST_AUTO_TEST_CASE(refernece_count) +{ + Reference ref; + BOOST_CHECK(ref.get_counts() != NULL); +} + +BOOST_AUTO_TEST_CASE(refernece_length_iterator) +{ + Reference ref; + ref.push_back(4); + ref.push_back(2); + BOOST_REQUIRE(ref.num_references() == 2); + + Reference::iterator it = ref.begin(); + BOOST_CHECK_EQUAL(*it, 4); + ++it; + BOOST_CHECK_EQUAL(*it, 2); + ++it; + BOOST_CHECK(it == ref.end()); +} + +BOOST_AUTO_TEST_CASE(refernece_length_average) +{ + { + Reference ref; + ref.push_back(4); + ref.push_back(1); + BOOST_CHECK_EQUAL(2, ref.CalcAverage()); + } + + { + Reference ref; + ref.push_back(4); 
+ ref.push_back(3); + BOOST_CHECK_EQUAL(3, ref.CalcAverage()); + } + + { + Reference ref; + ref.push_back(4); + ref.push_back(3); + ref.push_back(4); + ref.push_back(5); + BOOST_CHECK_EQUAL(4, ref.CalcAverage()); + } +} + +BOOST_AUTO_TEST_CASE(refernece_length_closest) +{ + { + Reference ref; + ref.push_back(4); + ref.push_back(1); + BOOST_REQUIRE(ref.num_references() == 2); + + BOOST_CHECK_EQUAL(1, ref.CalcClosest(2)); + BOOST_CHECK_EQUAL(1, ref.CalcClosest(1)); + BOOST_CHECK_EQUAL(4, ref.CalcClosest(3)); + BOOST_CHECK_EQUAL(4, ref.CalcClosest(4)); + BOOST_CHECK_EQUAL(4, ref.CalcClosest(5)); + } + + { + Reference ref; + ref.push_back(4); + ref.push_back(3); + BOOST_REQUIRE(ref.num_references() == 2); + + BOOST_CHECK_EQUAL(3, ref.CalcClosest(1)); + BOOST_CHECK_EQUAL(3, ref.CalcClosest(2)); + BOOST_CHECK_EQUAL(3, ref.CalcClosest(3)); + BOOST_CHECK_EQUAL(4, ref.CalcClosest(4)); + BOOST_CHECK_EQUAL(4, ref.CalcClosest(5)); + } + + { + Reference ref; + ref.push_back(4); + ref.push_back(3); + ref.push_back(4); + ref.push_back(5); + BOOST_REQUIRE(ref.num_references() == 4); + + BOOST_CHECK_EQUAL(3, ref.CalcClosest(1)); + BOOST_CHECK_EQUAL(3, ref.CalcClosest(2)); + BOOST_CHECK_EQUAL(3, ref.CalcClosest(3)); + BOOST_CHECK_EQUAL(4, ref.CalcClosest(4)); + BOOST_CHECK_EQUAL(5, ref.CalcClosest(5)); + } +} + +BOOST_AUTO_TEST_CASE(refernece_length_shortest) +{ + { + Reference ref; + ref.push_back(4); + ref.push_back(1); + BOOST_CHECK_EQUAL(1, ref.CalcShortest()); + } + + { + Reference ref; + ref.push_back(4); + ref.push_back(3); + BOOST_CHECK_EQUAL(3, ref.CalcShortest()); + } + + { + Reference ref; + ref.push_back(4); + ref.push_back(3); + ref.push_back(4); + ref.push_back(5); + BOOST_CHECK_EQUAL(3, ref.CalcShortest()); + } +} diff --git a/mosesdecoder/mert/ScoreArray.cpp b/mosesdecoder/mert/ScoreArray.cpp new file mode 100644 index 0000000000000000000000000000000000000000..dd9aa5b07443a67e3780379521fbe4db21a1444f --- /dev/null +++ b/mosesdecoder/mert/ScoreArray.cpp @@ -0,0 +1,169 
@@ +/* + * ScoreArray.cpp + * mert - Minimum Error Rate Training + * + * Created by Nicola Bertoldi on 13/05/08. + * + */ + +#include "ScoreArray.h" +#include "Util.h" +#include "FileStream.h" + +using namespace std; + +namespace MosesTuning +{ + + +ScoreArray::ScoreArray() + : m_num_scores(0), m_index(0) {} + +void ScoreArray::savetxt(ostream* os, const string& sctype) +{ + *os << SCORES_TXT_BEGIN << " " << m_index << " " << m_array.size() + << " " << m_num_scores << " " << sctype << endl; + for (scorearray_t::iterator i = m_array.begin(); i !=m_array.end(); i++) { + i->savetxt(os); + *os << endl; + } + *os << SCORES_TXT_END << endl; +} + +void ScoreArray::savebin(ostream* os, const string& score_type) +{ + *os << SCORES_BIN_BEGIN << " " << m_index << " " << m_array.size() + << " " << m_num_scores << " " << score_type << endl; + for (scorearray_t::iterator i = m_array.begin(); + i != m_array.end(); i++) { + i->savebin(os); + } + *os << SCORES_BIN_END << endl; +} + +void ScoreArray::save(ostream* os, const string& score_type, bool bin) +{ + if (size() <= 0) return; + if (bin) { + savebin(os, score_type); + } else { + savetxt(os, score_type); + } +} + +void ScoreArray::save(const string &file, const string& score_type, bool bin) +{ + ofstream ofs(file.c_str(), ios::out); + if (!ofs) { + cerr << "Failed to open " << file << endl; + exit(1); + } + ostream* os = &ofs; + save(os, score_type, bin); + ofs.close(); +} + +void ScoreArray::save(const string& score_type, bool bin) +{ + save(&cout, score_type, bin); +} + +void ScoreArray::loadbin(istream* is, size_t n) +{ + ScoreStats entry(m_num_scores); + for (size_t i = 0; i < n; i++) { + entry.loadbin(is); + add(entry); + } +} + +void ScoreArray::loadtxt(istream* is, size_t n) +{ + ScoreStats entry(m_num_scores); + for (size_t i = 0; i < n; i++) { + entry.loadtxt(is); + add(entry); + } +} + +void ScoreArray::load(istream* is) +{ + size_t number_of_entries = 0; + bool binmode = false; + + string substring, stringBuf; + 
string::size_type loc; + + getline(*is, stringBuf); + if (!is->good()) { + return; + } + + if (!stringBuf.empty()) { + if ((loc = stringBuf.find(SCORES_TXT_BEGIN)) == 0) { + binmode=false; + } else if ((loc = stringBuf.find(SCORES_BIN_BEGIN)) == 0) { + binmode=true; + } else { + TRACE_ERR("ERROR: ScoreArray::load(): Wrong header"); + return; + } + getNextPound(stringBuf, substring); + getNextPound(stringBuf, substring); + m_index = atoi(substring.c_str()); + getNextPound(stringBuf, substring); + number_of_entries = atoi(substring.c_str()); + getNextPound(stringBuf, substring); + m_num_scores = atoi(substring.c_str()); + getNextPound(stringBuf, substring); + m_score_type = substring; + } + + if (binmode) { + loadbin(is, number_of_entries); + } else { + loadtxt(is, number_of_entries); + } + + getline(*is, stringBuf); + if (!stringBuf.empty()) { + if ((loc = stringBuf.find(SCORES_TXT_END)) != 0 && + (loc = stringBuf.find(SCORES_BIN_END)) != 0) { + TRACE_ERR("ERROR: ScoreArray::load(): Wrong footer"); + return; + } + } +} + +void ScoreArray::load(const string &file) +{ + TRACE_ERR("loading data from " << file << endl); + inputfilestream input_stream(file); // matches a stream with a file. Opens the file + istream* is = &input_stream; + load(is); + input_stream.close(); +} + + +void ScoreArray::merge(ScoreArray& e) +{ + //dummy implementation + for (size_t i=0; isize() != sz) + return false; + } + return true; +} + +} diff --git a/mosesdecoder/mert/ScoreArray.h b/mosesdecoder/mert/ScoreArray.h new file mode 100644 index 0000000000000000000000000000000000000000..438b57e3feba2aa6516a50e12e2666cf54dab778 --- /dev/null +++ b/mosesdecoder/mert/ScoreArray.h @@ -0,0 +1,113 @@ +/* + * ScoreArray.h + * mert - Minimum Error Rate Training + * + * Created by Nicola Bertoldi on 13/05/08. 
+ * + */ + +#ifndef MERT_SCORE_ARRAY_H_ +#define MERT_SCORE_ARRAY_H_ + +#include +#include +#include + +#include "ScoreStats.h" + +namespace MosesTuning +{ + +const char SCORES_TXT_BEGIN[] = "SCORES_TXT_BEGIN_0"; +const char SCORES_TXT_END[] = "SCORES_TXT_END_0"; +const char SCORES_BIN_BEGIN[] = "SCORES_BIN_BEGIN_0"; +const char SCORES_BIN_END[] = "SCORES_BIN_END_0"; + +class ScoreArray +{ +private: + scorearray_t m_array; + std::string m_score_type; + std::size_t m_num_scores; + + // indexx to identify the utterance. + // It can differ from the index inside the vector. + int m_index; + +public: + ScoreArray(); + ~ScoreArray() {} + + void clear() { + m_array.clear(); + } + + int getIndex() const { + return m_index; + } + + void setIndex(int value) { + m_index = value; + } + + ScoreStats& get(std::size_t i) { + return m_array.at(i); + } + + const ScoreStats& get(std::size_t i) const { + return m_array.at(i); + } + + void add(const ScoreStats& e) { + m_array.push_back(e); + } + + //ADDED BY TS + void swap(std::size_t i, std::size_t j) { + std::swap(m_array[i], m_array[j]); + } + + void resize(std::size_t new_size) { + m_array.resize(std::min(new_size, m_array.size())); + } + //END_ADDED + + void merge(ScoreArray& e); + + std::string name() const { + return m_score_type; + } + + void name(std::string &score_type) { + m_score_type = score_type; + } + + std::size_t size() const { + return m_array.size(); + } + + std::size_t NumberOfScores() const { + return m_num_scores; + } + + void NumberOfScores(std::size_t v) { + m_num_scores = v; + } + + void savetxt(std::ostream* os, const std::string& score_type); + void savebin(std::ostream* os, const std::string& score_type); + void save(std::ostream* os, const std::string& score_type, bool bin=false); + void save(const std::string &file, const std::string& score_type, bool bin=false); + void save(const std::string& score_type, bool bin=false); + + void loadtxt(std::istream* is, std::size_t n); + void loadbin(std::istream* is, 
std::size_t n); + void load(std::istream* is); + void load(const std::string &file); + + bool check_consistency() const; +}; + +} + +#endif // MERT_SCORE_ARRAY_H_ diff --git a/mosesdecoder/mert/Util.h b/mosesdecoder/mert/Util.h new file mode 100644 index 0000000000000000000000000000000000000000..7e6926d197ed4be00b2901563058ed08f8206234 --- /dev/null +++ b/mosesdecoder/mert/Util.h @@ -0,0 +1,149 @@ +/* + * Util.h + * mert - Minimum Error Rate Training + * + * Created by Nicola Bertoldi on 13/05/08. + * + */ + +#ifndef MERT_UTIL_H_ +#define MERT_UTIL_H_ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "Types.h" + +namespace MosesTuning +{ + +#ifdef TRACE_ENABLE +#define TRACE_ERR(str) { std::cerr << str; } +#else +#define TRACE_ERR(str) { } +#endif + +#if __GNUC__ == 4 && __GNUC_MINOR__ == 8 && (__GNUC_PATCHLEVEL__ == 1 || __GNUC_PATCHLEVEL__ == 2) +// gcc nth_element() bug +#define NTH_ELEMENT3(begin, middle, end) std::sort(begin, end) +#define NTH_ELEMENT4(begin, middle, end, orderer) std::sort(begin, end, orderer) +#else +#define NTH_ELEMENT3(begin, middle, end) std::nth_element(begin, middle, end) +#define NTH_ELEMENT4(begin, middle, end, orderer) std::nth_element(begin, middle, end, orderer) +#endif + +const char kDefaultDelimiterSymbol[] = " "; + +int verboselevel(); +int setverboselevel(int v); + + +const float kEPS = 0.0001f; + +template +bool IsAlmostEqual(T expected, T actual, float round=kEPS) +{ + if (std::abs(expected - actual) < round) { + return true; + } else { + std::cerr << "Fail: expected = " << expected + << " (actual = " << actual << ")" << std::endl; + return false; + } +} + +/** + * Find the specified delimiter for the string 'str', and 'str' is assigned + * to a substring object that starts at the position of first occurrence of + * the delimiter in 'str'. 
'substr' is copied from 'str' ranging from + * the start position of 'str' to the position of first occurrence of + * the delimiter. + * + * It returns the position of first occurrence in the queried string. + * If the content is not found, std::string::npos is returned. + */ +size_t getNextPound(std::string &str, std::string &substr, + const std::string &delimiter = kDefaultDelimiterSymbol); + +void split(const std::string &s, char delim, std::vector &elems); + +/** + * Split the string 'str' with specified delimitter 'delim' into tokens. + * The resulting tokens are set to 'res'. + * + * ex. "a,b,c" => {"a", "b", "c"}. + */ +void Tokenize(const char *str, const char delim, std::vector *res); + +template +inline T Scan(const std::string &input) +{ + std::stringstream stream(input); + T ret; + stream >> ret; + return ret; +} + +/** + * Returns true iff "str" ends with "suffix". + * e.g., Given str = "abc:" and suffix = ":", this function returns true. + */ +inline bool EndsWith(const std::string& str, const char* suffix) +{ + return str.find_last_of(suffix) == str.size() - 1; +} + +template +inline std::string stringify(T x) +{ + std::ostringstream o; + if (!(o << x)) + throw std::runtime_error("stringify(template)"); + return o.str(); +} + +inline ScoreStatsType ConvertCharToScoreStatsType(const char *str) +{ + return std::atoi(str); +} + +inline ScoreStatsType ConvertStringToScoreStatsType(const std::string& str) +{ + return ConvertCharToScoreStatsType(str.c_str()); +} + +inline FeatureStatsType ConvertCharToFeatureStatsType(const char *str) +{ + return static_cast(std::atof(str)); +} + +inline FeatureStatsType ConvertStringToFeatureStatsType(const std::string &str) +{ + return ConvertCharToFeatureStatsType(str.c_str()); +} + +inline std::string trimStr(const std::string& Src, const std::string& c = " \r\n") +{ + size_t p2 = Src.find_last_not_of(c); + if (p2 == std::string::npos) return std::string(); + size_t p1 = Src.find_first_not_of(c); + if (p1 == 
std::string::npos) p1 = 0; + return Src.substr(p1, (p2-p1)+1); +} + +// Utilities to measure decoding time +void ResetUserTime(); +void PrintUserTime(const std::string &message); +double GetUserTime(); + +} + +#endif // MERT_UTIL_H_ diff --git a/mosesdecoder/moses/TranslationModel/UG/util/ibm1-align b/mosesdecoder/moses/TranslationModel/UG/util/ibm1-align new file mode 100644 index 0000000000000000000000000000000000000000..003049a9c447f2e745f8be92b21d56588f0c7fe0 --- /dev/null +++ b/mosesdecoder/moses/TranslationModel/UG/util/ibm1-align @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:67f9b51b84f1b18fefcfe58feba9a9879648529fed29fbfb90ec0cec4f42a80e +size 1062799 diff --git a/mosesdecoder/scripts/Jamfile b/mosesdecoder/scripts/Jamfile new file mode 100644 index 0000000000000000000000000000000000000000..b76152d08bcc5d4ef97983aefb3b42b7545a5b84 --- /dev/null +++ b/mosesdecoder/scripts/Jamfile @@ -0,0 +1,23 @@ +#See ../Jamroot for options. +import option path ; + +build-project training ; + +prefix = [ option.get "prefix" ] ; +if $(prefix) { + prefix = [ path.root $(prefix) [ path.pwd ] ] ; + location = [ option.get "install-scripts" : : $(prefix)$(GITTAG)/scripts ] ; +} else { + location = [ option.get "install-scripts" ] ; +} + +if $(location) { + location = [ path.root $(location) [ path.pwd ] ] ; + install scripts : + [ glob-tree README *.js *.pl *.perl *.pm *.py *.sh *.php : tests regression-testing other bin ] + [ glob share/nonbreaking_prefixes/* ems/example/*.* ems/example/data/* ems/web/* analysis/smtgui/* : ems/web/javascripts ] + generic/fsa-sample.fsa + ems/experiment.machines + ems/experiment.meta + : . 
$(location) ; +} diff --git a/mosesdecoder/scripts/README b/mosesdecoder/scripts/README new file mode 100644 index 0000000000000000000000000000000000000000..35dac9dd0217f2fd7903a38bddddcbff223255c2 --- /dev/null +++ b/mosesdecoder/scripts/README @@ -0,0 +1,15 @@ +2006-07-29 + +This directory should contain all multi-purpose scripts for: + +- training ... training moses (including BLEU evaluation needed for MERT) +- analysis ... analyzing MT output (for human analysis) +- generic ... script for handling generic issues (parallelization) +- lib ... perl modules used by various scripts + + +The Jamfile then takes care of proper 'release' from your git directory to +the shared directories. + +The released scripts should remain in the *same directory structure*. + diff --git a/mosesdecoder/scripts/generic/binarize4moses2.perl b/mosesdecoder/scripts/generic/binarize4moses2.perl new file mode 100644 index 0000000000000000000000000000000000000000..4f4fff32f459eb9f3564b0d53f273bd122a3fd9b --- /dev/null +++ b/mosesdecoder/scripts/generic/binarize4moses2.perl @@ -0,0 +1,88 @@ +#!/usr/bin/env perl + +use strict; + +use Getopt::Long; +use File::Basename; +use FindBin qw($RealBin); + +sub systemCheck($); + +my $mosesDir = "$RealBin/../.."; +my $ptPath; +my $lexRoPath; +my $outPath; +my $numScores = 4; +my $numLexScores; +my $pruneNum = 100; +my $scfg = 0; + +GetOptions("phrase-table=s" => \$ptPath, + "lex-ro=s" => \$lexRoPath, + "output-dir=s" => \$outPath, + "num-scores=s" => \$numScores, + "num-lex-scores=i" => \$numLexScores, + "prune=i" => \$pruneNum, + "scfg" => \$scfg + ) or exit 1; + +#print STDERR "scfg=$scfg \n"; +die("ERROR: please set --phrase-table") unless defined($ptPath); +#die("ERROR: please set --lex-ro") unless defined($lexRoPath); +die("ERROR: please set --output-dir") unless defined($outPath); +#die("ERROR: please set --num-lex-scores") unless defined($numLexScores); +die("ERROR: compile contrib/sigtest-filter") if (!-X 
"$mosesDir/contrib/sigtest-filter/filter-pt"); +die("ERROR: compile with bjam --with-cmph") if (!-X "$mosesDir/bin/processLexicalTableMin"); +die("ERROR: compile with bjam --with-xmlrpc-c") if (!-X "$mosesDir/bin/CreateProbingPT"); + +my $cmd; + +my $tempPath = dirname($outPath) ."/tmp.$$"; +`mkdir -p $tempPath`; + +$cmd = "gzip -dc $ptPath | $mosesDir/contrib/sigtest-filter/filter-pt -n $pruneNum | gzip -c > $tempPath/pt.gz"; +systemCheck($cmd); + +if (defined($lexRoPath)) { + die("ERROR: please set --num-lex-scores") unless defined($numLexScores); + + $cmd = "$mosesDir/bin/processLexicalTableMin -in $lexRoPath -out $tempPath/lex-ro -T . -threads all"; + systemCheck($cmd); + + $cmd = "$mosesDir/bin/addLexROtoPT $tempPath/pt.gz $tempPath/lex-ro.minlexr | gzip -c > $tempPath/pt.withLexRO.gz"; + systemCheck($cmd); + + $cmd = "ln -s pt.withLexRO.gz $tempPath/pt.txt.gz"; + systemCheck($cmd); +} +else { + $cmd = "ln -s pt.gz $tempPath/pt.txt.gz"; + systemCheck($cmd); +} + +$cmd = "$mosesDir/bin/CreateProbingPT2 --num-scores $numScores --log-prob --input-pt $tempPath/pt.txt.gz --output-dir $outPath"; + +if (defined($lexRoPath)) { + $cmd .= " --num-lex-scores $numLexScores"; +} + +if ($scfg) { + $cmd .= " --scfg"; +} + +systemCheck($cmd); + +exit(0); + +##################################################### +sub systemCheck($) +{ + my $cmd = shift; + print STDERR "Executing: $cmd\n"; + + my $retVal = system($cmd); + if ($retVal != 0) + { + exit(1); + } +} diff --git a/mosesdecoder/scripts/generic/bsbleu.py b/mosesdecoder/scripts/generic/bsbleu.py new file mode 100644 index 0000000000000000000000000000000000000000..d40a28e6e5325c74e11b959024673fefa3dd03ac --- /dev/null +++ b/mosesdecoder/scripts/generic/bsbleu.py @@ -0,0 +1,179 @@ +#!/usr/bin/env python +# compute Bleu scores with confidence intervals via boostrap resampling +# written by Ulrich Germann +# +# This file is part of moses. 
Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. + +from argparse import ArgumentParser +import math +import os +from random import randint +import sys, gzip + + +def count_ngrams(snt, max_n): + """ + Return a dictionary of ngram counts (up to length /max_n/) + for sentence (list of words) /snt/. + """ + ret = {} + for i in xrange(len(snt)): + for k in xrange(i + 1, min(i + max_n + 1, len(snt) + 1)): + key = tuple(snt[i:k]) + ret[key] = ret.get(key, 0) + 1 + return ret + + +def max_counts(ng1, ng2): + """ + Return a dicitonary of ngram counts such that + each count is the greater of the two individual counts + for each ngram in the input ngram count dictionaries + /ng1/ and /ng2/. + """ + ret = ng1.copy() + for k, v in ng2.items(): + ret[k] = max(ret.get(k, 0), v) + return ret + + +def ng_hits(hyp, ref, max_n): + """ + Return a list of ngram counts such that each ngram count + is the minimum of the counts in hyp and ref, up to ngram + length /max_n/. 
+ """ + ret = [0 for i in xrange(max_n)] + for ng, cnt in hyp.items(): + k = ng + if len(k) <= max_n: + ret[len(k) - 1] += min(cnt, ref.get(ng, 0)) + return ret + + +class BleuScore: + def __init__(self, hyp, ref, max_n=4, bootstrap=1000): + # print len(hyp.ngrams), len(ref.ngrams), "X" + self.hits = [ + ng_hits(hyp.ngrams[i], ref.ngrams[i], max_n) + for i in xrange(len(hyp.ngrams))] + self.max_n = max_n + self.hyp = hyp + self.ref = ref + self.lower = None + self.upper = None + self.median = None + self.actual = self.score([i for i in xrange(len(hyp.snt))]) + if bootstrap: + self.bootstrap = [self.score([randint(0, len(hyp.snt) - 1) + for s in hyp.snt]) + for i in xrange(bootstrap)] + self.bootstrap.sort() + else: + self.bootstrap = [self.actual] + pass + + def score(self, sample): + hits = [0 for i in xrange(self.max_n)] + self.hyplen = 0 + self.reflen = 0 + self.total = [0 for i in hits] + for i in sample: + self.hyplen += len(self.hyp.snt[i]) + self.reflen += len(self.ref.snt[i]) + for n in xrange(self.max_n): + hits[n] += self.hits[i][n] + self.total[n] += max(len(self.hyp.snt[i]) - n, 0) + pass + self.prec = [float(hits[n]) / self.total[n] + for n in xrange(self.max_n)] + ret = sum([math.log(x) for x in self.prec]) / self.max_n + self.BP = min( + 1, math.exp(1. 
- float(self.reflen) / float(self.hyplen))) + ret += math.log(self.BP) + return math.exp(ret) + + +class Document: + def __init__(self, fname=None): + self.fname = fname + if fname: + if fname[-3:] == ".gz": + self.snt = [line.strip().split() for line in gzip.open(fname).readlines()] + else: + self.snt = [line.strip().split() for line in open(fname)] + pass + self.ngrams = [count_ngrams(snt, 4) for snt in self.snt] + # print self.snt + else: + self.snt = None + self.ngrams = None + + def merge(self, R): + self.fname = "multi-ref" + self.ngrams = [x for x in R[0].ngrams] + self.snt = [x for x in R[0].snt] + for i in xrange(len(R[0].ngrams)): + for k in xrange(1, len(R)): + self.ngrams[i] = max_counts(self.ngrams[i], R[k].ngrams[i]) + + def update(self, hyp, R): + for i, hyp_snt in enumerate(hyp.snt): + clen = len(hyp_snt) + K = 0 + for k in xrange(1, len(R)): + k_snt = R[k].snt[i] + assert len(R[k].snt) == len(hyp.snt), ( + "Mismatch in number of sentences " + + "between reference and candidate") + if abs(len(k_snt) - clen) == abs(len(R[K].snt[i]) - clen): + if len(k_snt) < len(R[K].snt[i]): + K = k + elif abs(len(k_snt) - clen) < abs(len(R[K].snt[i]) - clen): + K = k + self.snt[i] = R[K].snt[i] + + +if __name__ == "__main__": + argparser = ArgumentParser() + argparser.add_argument( + "-r", "--ref", nargs='+', help="Reference translation(s).") + argparser.add_argument( + "-c", "--cand", nargs='+', help="Candidate translations.") + argparser.add_argument( + "-i", "--individual", action='store_true', + help="Compute BLEU scores for individual references.") + argparser.add_argument( + "-b", "--bootstrap", type=int, default=1000, + help="Sample size for bootstrap resampling.") + argparser.add_argument( + "-a", "--alpha", type=float, default=.05, + help="1-alpha = confidence interval.") + args = argparser.parse_args(sys.argv[1:]) + R = [Document(fname) for fname in args.ref] + C = [Document(fname) for fname in args.cand] + Rx = Document() # for multi-reference BLEU + 
Rx.merge(R) + for c in C: + # compute multi-reference BLEU + Rx.update(c, R) + bleu = BleuScore(c, Rx, bootstrap=args.bootstrap) + print "%5.2f %s [%5.2f-%5.2f; %5.2f] %s" % ( + 100 * bleu.actual, + os.path.basename(Rx.fname), + 100 * bleu.bootstrap[int((args.alpha / 2) * args.bootstrap)], + 100 * bleu.bootstrap[int((1 - (args.alpha / 2)) * args.bootstrap)], + 100 * bleu.bootstrap[int(.5 * args.bootstrap)], + c.fname) # os.path.basename(c.fname)) + + if args.individual: + for r in R: + bleu = BleuScore(c, r, bootstrap=args.bootstrap) + print " %5.2f %s" % ( + 100 * bleu.actual, os.path.basename(r.fname)) + # print bleu.prec, bleu.hyplen, bleu.reflen, bleu.BP + + # print [ + # sum([bleu.hits[i][n] for i in xrange(len(bleu.hits))]) + # for n in xrange(4)] diff --git a/mosesdecoder/scripts/generic/compound-splitter.perl b/mosesdecoder/scripts/generic/compound-splitter.perl new file mode 100644 index 0000000000000000000000000000000000000000..2ece80a601f4f176a98469a74b38224f43cea879 --- /dev/null +++ b/mosesdecoder/scripts/generic/compound-splitter.perl @@ -0,0 +1,295 @@ +#!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. 
+ +use warnings; +use strict; +use Getopt::Long "GetOptions"; + +my ($CORPUS,$MODEL,$TRAIN,$HELP,$VERBOSE); +my $FILLER = ":s:es"; +my $MIN_SIZE = 3; +my $MIN_COUNT = 5; +my $MAX_COUNT = 5; +my $FACTORED = 0; +my $SYNTAX = 0; +my $MARK_SPLIT = 0; +my $BINARIZE = 0; +$HELP = 1 + unless &GetOptions('corpus=s' => \$CORPUS, + 'model=s' => \$MODEL, + 'filler=s' => \$FILLER, + 'factored' => \$FACTORED, + 'min-size=i' => \$MIN_SIZE, + 'min-count=i' => \$MIN_COUNT, + 'max-count=i' => \$MAX_COUNT, + 'help' => \$HELP, + 'verbose' => \$VERBOSE, + 'syntax' => \$SYNTAX, + 'binarize' => \$BINARIZE, + 'mark-split' => \$MARK_SPLIT, + 'train' => \$TRAIN); + +if ($HELP || + ( $TRAIN && !$CORPUS) || + (!$TRAIN && !$MODEL)) { + print "Compound splitter\n"; + print "-----------------\n\n"; + print "train: compound-splitter -train -corpus txt-file -model new-model\n"; + print "apply: compound-splitter -model trained-model < in > out\n"; + print "options: -min-size: minimum word size (default $MIN_SIZE)\n"; + print " -min-count: minimum word count (default $MIN_COUNT)\n"; + print " -filler: filler letters between words (default $FILLER)\n"; + print " -factor: factored data, assuming factor 0 as surface (default $FACTORED)\n"; + print " -syntax: syntactically parsed data (default $SYNTAX)\n"; + print " -mark-split: mark non-terminal label of split words (default $MARK_SPLIT)\n"; + print " -binarize: binarize subtree for split word (default $BINARIZE)\n"; + exit; +} + +if ($TRAIN) { + if ($SYNTAX) { &train_syntax(); } + elsif ($FACTORED) { &train_factored(); } + else { &train(); } +} +else { + &apply(); +} + +sub train { + my %COUNT; + open(CORPUS,$CORPUS) || die("ERROR: could not open corpus '$CORPUS'"); + while() { + chop; s/\s+/ /g; s/^ //; s/ $//; + foreach (split) { + $COUNT{$_}++; + } + } + close(CORPUS); + &save_trained_model(\%COUNT); +} + +sub save_trained_model { + my ($COUNT) = @_; + my $id = 0; + open(MODEL,">".$MODEL); + foreach my $word (keys %$COUNT) { + print MODEL 
"".(++$id)."\t".$word."\t".$$COUNT{$word}."\n"; + } + close(MODEL); + print STDERR "written model file with ".(scalar keys %$COUNT)." words.\n"; +} + +sub train_factored { + my (%COUNT,%FACTORED_COUNT); + # collect counts for interpretations for each surface word + open(CORPUS,$CORPUS) || die("ERROR: could not open corpus '$CORPUS'"); + while() { + chop; s/\s+/ /g; s/^ //; s/ $//; + foreach my $factored_word (split) { + my $word = $factored_word; + $word =~ s/\|.+//g; # just first factor + $FACTORED_COUNT{$word}{$factored_word}++; + } + } + close(CORPUS); + # only preserve most frequent interpretation, assign sum of counts + foreach my $word (keys %FACTORED_COUNT) { + my ($max,$best,$total) = (0,"",0); + foreach my $factored_word (keys %{$FACTORED_COUNT{$word}}) { + my $count = $FACTORED_COUNT{$word}{$factored_word}; + $total += $count; + if ($count > $max) { + $max = $count; + $best = $factored_word; + } + } + $COUNT{$best} = $total; + } + &save_trained_model(\%COUNT); +} + +sub train_syntax { + my (%COUNT,%LABELED_COUNT); + # collect counts for interpretations for each surface word + open(CORPUS,$CORPUS) || die("ERROR: could not open corpus '$CORPUS'"); + while() { + chop; s/\s+/ /g; s/^ //; s/ $//; + my $label; + foreach (split) { + if (/^label="([^\"]+)"/) { + $label = $1; + } + elsif (! 
/^ $max) { + $max = $count; + $best = "$word $label"; + } + } + $COUNT{$best} = $total; + } + &save_trained_model(\%COUNT); +} + +sub apply { + my (%COUNT,%TRUECASE,%LABEL); + open(MODEL,$MODEL) || die("ERROR: could not open model '$MODEL'"); + while() { + chomp; + my ($id,$factored_word,$count) = split(/\t/); + my $label; + ($factored_word,$label) = split(/ /,$factored_word); + my $word = $factored_word; + $word =~ s/\|.+//g; # just first factor + my $lc = lc($word); + # if word exists with multipe casings, only record most frequent + next if defined($COUNT{$lc}) && $COUNT{$lc} > $count; + $COUNT{$lc} = $count; + $TRUECASE{$lc} = $factored_word; + $LABEL{$lc} = $label if $SYNTAX; + } + close(MODEL); + + while() { + my $first = 1; + chop; s/\s+/ /g; s/^ //; s/ $//; + my @BUFFER; # for xml tags + foreach my $factored_word (split) { + print " " unless $first; + $first = 0; + + # syntax: don't split xml + if ($SYNTAX && ($factored_word =~ /^$/)) { + push @BUFFER,$factored_word; + $first = 1; + next; + } + + # get case class + my $word = $factored_word; + $word =~ s/\|.+//g; # just first factor + my $lc = lc($word); + + print STDERR "considering $word ($lc)...\n" if $VERBOSE; + # don't split frequent words + if ((defined($COUNT{$lc}) && $COUNT{$lc}>=$MAX_COUNT) || + $lc !~ /[a-zA-Z]/) {; # has to have at least one letter + print join(" ",@BUFFER)." " if scalar(@BUFFER); @BUFFER = (); # clear buffer + print $factored_word; + print STDERR "\tfrequent word ($COUNT{$lc}>=$MAX_COUNT), skipping\n" if $VERBOSE; + next; + } + + # consider possible splits + my $final = length($word)-1; + my %REACHABLE; + for(my $i=0;$i<=$final;$i++) { $REACHABLE{$i} = (); } + + print STDERR "splitting $word:\n" if $VERBOSE; + for(my $end=$MIN_SIZE;$end= $MIN_COUNT; + print STDERR "\tmatching word $start .. $end ($filler)$subword $COUNT{$subword}\n" if $VERBOSE; + push @{$REACHABLE{$end}},"$start $TRUECASE{$subword} $COUNT{$subword}"; + } + } + } + + # no matches at all? 
+ if (!defined($REACHABLE{$final})) { + print join(" ",@BUFFER)." " if scalar(@BUFFER); @BUFFER = (); # clear buffer + print $factored_word; + next; + } + + my ($best_split,$best_score) = ("",0); + + my %ITERATOR; + for(my $i=0;$i<=$final;$i++) { $ITERATOR{$i}=0; } + my $done = 0; + while(1) { + # read off word + my ($pos,$decomp,$score,$num,@INDEX) = ($final,"",1,0); + while($pos>0) { + last unless scalar @{$REACHABLE{$pos}} > $ITERATOR{$pos}; # dead end? + my ($nextpos,$subword,$count) + = split(/ /,$REACHABLE{$pos}[ $ITERATOR{$pos} ]); + $decomp = $subword." ".$decomp; + $score *= $count; + $num++; + push @INDEX,$pos; +# print STDERR "($nextpos-$pos,$decomp,$score,$num)\n"; + $pos = $nextpos-1; + } + + chop($decomp); + print STDERR "\tsplit: $decomp ($score ** 1/$num) = ".($score ** (1/$num))."\n" if $VERBOSE; + $score **= 1/$num; + if ($score>$best_score) { + $best_score = $score; + $best_split = $decomp; + } + + # increase iterator + my $increase = -1; + while($increase<$final) { + $increase = pop @INDEX; + $ITERATOR{$increase}++; + last if scalar @{$REACHABLE{$increase}} > $ITERATOR{$increase}; + } + last unless scalar @{$REACHABLE{$final}} > $ITERATOR{$final}; + for(my $i=0;$i<$increase;$i++) { $ITERATOR{$i}=0; } + } + if ($best_split !~ / /) { + print join(" ",@BUFFER)." " if scalar(@BUFFER); @BUFFER = (); # clear buffer + print $factored_word; # do not change case for unsplit words + next; + } + if (!$SYNTAX) { + print $best_split; + } + else { + $BUFFER[$#BUFFER] =~ s/label=\"/label=\"SPLIT-/ if $MARK_SPLIT; + $BUFFER[$#BUFFER] =~ /label=\"([^\"]+)\"/ || die("ERROR: $BUFFER[$#BUFFER]\n"); + my $pos = $1; + print join(" ",@BUFFER)." 
" if scalar(@BUFFER); @BUFFER = (); # clear buffer + + my @SPLIT = split(/ /,$best_split); + my @OUT = (); + if ($BINARIZE) { + for(my $w=0;$w"; + } + } + for(my $w=0;$w=2) { push @OUT, ""; } + push @OUT," $SPLIT[$w] "; + } + print join(" ",@OUT); + } + } + print " ".join(" ",@BUFFER) if scalar(@BUFFER); @BUFFER = (); # clear buffer + print "\n"; + } +} diff --git a/mosesdecoder/scripts/generic/extract-factors.pl b/mosesdecoder/scripts/generic/extract-factors.pl new file mode 100644 index 0000000000000000000000000000000000000000..2b1c51cd193f53e0b25b3d122c7f2e5edb7abfab --- /dev/null +++ b/mosesdecoder/scripts/generic/extract-factors.pl @@ -0,0 +1,24 @@ +#!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. + +# $Id$ +#extract-factors.pl: extract only the desired factors from a factored corpus +#usage: extract-factors corpusfile factor-index factor-index ... > outfile +#factor indices start at 0 +#factor indices too large ought to be ignored + +use warnings; +use strict; + +my ($filename, @factors) = @ARGV; +my %indices = map {$_ => 1} @factors; + +open(INFILE, "<$filename") or die "couldn't open '$filename' for read: $!\n"; +while(my $line = ) +{ + chop $line; + print join(' ', map {my $i = 0; join('|', grep($indices{$i++}, split(/\|/, $_)))} split(/\s+/, $line)) . "\n"; +} +close(INFILE); diff --git a/mosesdecoder/scripts/generic/extract-parallel.perl b/mosesdecoder/scripts/generic/extract-parallel.perl new file mode 100644 index 0000000000000000000000000000000000000000..87b6a8deb1e6f286da8e7d662e0086c9f5e42695 --- /dev/null +++ b/mosesdecoder/scripts/generic/extract-parallel.perl @@ -0,0 +1,385 @@ +#!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. 
+ +# example +# ./extract-parallel.perl 8 ./coreutils-8.9/src/split "./coreutils-8.9/src/sort --batch-size=253" ./extract ./corpus.5.en ./corpus.5.ar ./align.ar-en.grow-diag-final-and ./extracted 7 --NoFileLimit orientation --GZOutput + +use warnings; +use strict; +use File::Basename; + +sub RunFork($); +sub systemCheck($); +sub NumStr($); +sub DigitStr($); +sub CharStr($); +sub GetSplitVersion($); + +my $alph = "abcdefghijklmnopqrstuvwxyz"; +my @alph = (split(//,$alph)); + +print "Started ".localtime() ."\n"; + +my $numParallel= $ARGV[0]; +$numParallel = 1 if $numParallel < 1; + +my $splitCmd= $ARGV[1]; +my $sortCmd= $ARGV[2]; +my $extractCmd= $ARGV[3]; + +my $target = $ARGV[4]; # 1st arg of extract argument +my $source = $ARGV[5]; # 2nd arg of extract argument +my $align = $ARGV[6]; # 3rd arg of extract argument +my $extract = $ARGV[7]; # 4th arg of extract argument + +my $makeTTable = 1; # whether to build the ttable extract files +my $otherExtractArgs= ""; +my $weights = ""; +my $baselineExtract; +my $glueFile; +my $phraseOrientation = 0; +my $phraseOrientationPriorsFile; +my $splitCmdOption = ""; + +my $GZIP_EXEC; +if(`which pigz 2> /dev/null`) { + $GZIP_EXEC = 'pigz'; +} +else { + $GZIP_EXEC = 'gzip'; +} +print STDERR "using $GZIP_EXEC \n"; + +my $isBSDSplit = GetSplitVersion($splitCmd); +print STDERR "isBSDSplit=$isBSDSplit \n"; + +if ($isBSDSplit == 0) { + $splitCmdOption .= "-d"; +} + +my $gzOut = 0; + +for (my $i = 8; $i < $#ARGV + 1; ++$i) +{ + $makeTTable = 0 if $ARGV[$i] eq "--NoTTable"; + if ($ARGV[$i] eq '--BaselineExtract') { + $baselineExtract = $ARGV[++$i]; + next; + } + if ($ARGV[$i] eq '--InstanceWeights') { + $weights = $ARGV[++$i]; + next; + } + if ($ARGV[$i] eq '--GlueGrammar') { + $glueFile = $ARGV[++$i]; + next; + } + $phraseOrientation = 1 if $ARGV[$i] eq "--PhraseOrientation"; + if ($ARGV[$i] eq '--PhraseOrientationPriors') { + $phraseOrientationPriorsFile = $ARGV[++$i]; + next; + } + if ($ARGV[$i] eq '--GZOutput') { + $gzOut = 1; + } + + 
$otherExtractArgs .= $ARGV[$i] ." "; +} + +die("Need to specify --GZOutput for parallel extract") if ($gzOut == 0); + +my $cmd; +my $TMPDIR=dirname($extract) ."/tmp.$$"; +$cmd = "mkdir -p $TMPDIR; ls -l $TMPDIR"; +print STDERR "Executing: $cmd \n"; +`$cmd`; + +my $totalLines = int(`cat $align | wc -l`); +my $linesPerSplit = int($totalLines / $numParallel) + 1; + +print "total=$totalLines line-per-split=$linesPerSplit \n"; + +my @children; +my $pid; + +if ($numParallel > 1) +{ + $cmd = "$splitCmd $splitCmdOption -l $linesPerSplit -a 7 $target $TMPDIR/target."; + $pid = RunFork($cmd); + push(@children, $pid); + + $cmd = "$splitCmd $splitCmdOption -l $linesPerSplit -a 7 $source $TMPDIR/source."; + $pid = RunFork($cmd); + push(@children, $pid); + + $cmd = "$splitCmd $splitCmdOption -l $linesPerSplit -a 7 $align $TMPDIR/align."; + $pid = RunFork($cmd); + push(@children, $pid); + + if ($weights) { + $cmd = "$splitCmd $splitCmdOption -l $linesPerSplit -a 7 $weights $TMPDIR/weights."; + $pid = RunFork($cmd); + push(@children, $pid); + } + + # wait for everything is finished + foreach (@children) { + waitpid($_, 0); + } + +} +else +{ + my $numStr = NumStr(0); + + $cmd = "ln -s $target $TMPDIR/target.$numStr"; + `$cmd`; + + $cmd = "ln -s $source $TMPDIR/source.$numStr"; + `$cmd`; + + $cmd = "ln -s $align $TMPDIR/align.$numStr"; + `$cmd`; + + if ($weights) { + $cmd = "ln -s $weights $TMPDIR/weights.$numStr"; + `$cmd`; + } +} + +# run extract +@children = (); +for (my $i = 0; $i < $numParallel; ++$i) +{ + my $pid = fork(); + + if ($pid == 0) + { # child + my $numStr = NumStr($i); + my $weightsCmd = ""; + if ($weights) { + $weightsCmd = "--InstanceWeights $TMPDIR/weights.$numStr"; + } + + my $glueArg = ""; + if (defined($glueFile)) { + $glueArg = "--GlueGrammar $TMPDIR/glue.$numStr"; + } + #print STDERR "glueArg=$glueArg \n"; + + my $cmd = "$extractCmd $TMPDIR/target.$numStr $TMPDIR/source.$numStr $TMPDIR/align.$numStr $TMPDIR/extract.$numStr $glueArg $otherExtractArgs 
$weightsCmd --SentenceOffset ".($i*$linesPerSplit)." 2>> /dev/stderr \n"; + `$cmd`; + + exit(); + } + else + { # parent + push(@children, $pid); + } +} + +# wait for everything is finished +foreach (@children) { + waitpid($_, 0); +} + +# merge +my $catCmd = "gunzip -c "; +my $catInvCmd = $catCmd; +my $catOCmd = $catCmd; +my $catContextCmd = $catCmd; +my $catContextInvCmd = $catCmd; + +for (my $i = 0; $i < $numParallel; ++$i) +{ + my $numStr = NumStr($i); + $catCmd .= "$TMPDIR/extract.$numStr.gz "; + $catInvCmd .= "$TMPDIR/extract.$numStr.inv.gz "; + $catOCmd .= "$TMPDIR/extract.$numStr.o.gz "; + $catContextCmd .= "$TMPDIR/extract.$numStr.context "; + $catContextInvCmd .= "$TMPDIR/extract.$numStr.context.inv "; +} +if (defined($baselineExtract)) { + my $sorted = -e "$baselineExtract.sorted.gz" ? ".sorted" : ""; + $catCmd .= "$baselineExtract$sorted.gz "; + $catInvCmd .= "$baselineExtract.inv$sorted.gz "; + $catOCmd .= "$baselineExtract.o$sorted.gz "; +} + +$catCmd .= " | LC_ALL=C $sortCmd -T $TMPDIR 2>> /dev/stderr | $GZIP_EXEC -c > $extract.sorted.gz 2>> /dev/stderr \n"; +$catInvCmd .= " | LC_ALL=C $sortCmd -T $TMPDIR 2>> /dev/stderr | $GZIP_EXEC -c > $extract.inv.sorted.gz 2>> /dev/stderr \n"; +$catOCmd .= " | LC_ALL=C $sortCmd -T $TMPDIR 2>> /dev/stderr | $GZIP_EXEC -c > $extract.o.sorted.gz 2>> /dev/stderr \n"; +$catContextCmd .= " | LC_ALL=C $sortCmd -T $TMPDIR 2>> /dev/stderr | uniq | $GZIP_EXEC -c > $extract.context.sorted.gz 2>> /dev/stderr \n"; +$catContextInvCmd .= " | LC_ALL=C $sortCmd -T $TMPDIR 2>> /dev/stderr | uniq | $GZIP_EXEC -c > $extract.context.inv.sorted.gz 2>> /dev/stderr \n"; + + +@children = (); +if ($makeTTable) +{ + print STDERR "merging extract / extract.inv\n"; + $pid = RunFork($catCmd); + push(@children, $pid); + + $pid = RunFork($catInvCmd); + push(@children, $pid); +} +else { + print STDERR "skipping extract, doing only extract.o\n"; +} + +if ($otherExtractArgs =~ /--FlexibilityScore/) { + $pid = RunFork($catContextCmd); + 
push(@children, $pid); + + $pid = RunFork($catContextInvCmd); + push(@children, $pid); + } + +my $numStr = NumStr(0); +if (-e "$TMPDIR/extract.$numStr.o.gz") +{ + $pid = RunFork($catOCmd); + push(@children, $pid); +} + +# wait for all sorting to finish +foreach (@children) { + waitpid($_, 0); +} + +# merge glue rules +if (defined($glueFile)) { + my $cmd = "cat $TMPDIR/glue.* | LC_ALL=C sort | uniq > $glueFile"; + print STDERR "Merging glue rules: $cmd \n"; + print STDERR `$cmd`; +} + +# merge phrase orientation priors (GHKM extraction) +if ($phraseOrientation && defined($phraseOrientationPriorsFile)) { + print STDERR "Merging phrase orientation priors\n"; + + my @orientationPriorsCountFiles = glob("$TMPDIR/*.phraseOrientationPriors"); + my %priorCounts; + + foreach my $filenamePhraseOrientationPriors (@orientationPriorsCountFiles) { + if (-f $filenamePhraseOrientationPriors) { + open my $infilePhraseOrientationPriors, '<', $filenamePhraseOrientationPriors or die "cannot open $filenamePhraseOrientationPriors: $!"; + while (my $line = <$infilePhraseOrientationPriors>) { + print $line; + my ($key, $value) = split / /, $line; + $priorCounts{$key} += $value; + } + close $infilePhraseOrientationPriors; + } + } + + open my $outPhraseOrientationPriors, '>', $phraseOrientationPriorsFile or die "cannot open $phraseOrientationPriorsFile: $!"; + foreach my $key (sort keys %priorCounts) { + print $outPhraseOrientationPriors $key." 
".$priorCounts{$key}."\n"; + } + close($outPhraseOrientationPriors); +} + +# delete temporary files +$cmd = "rm -rf $TMPDIR \n"; +systemCheck($cmd); + +print STDERR "Finished ".localtime() ."\n"; + +# ----------------------------------------- +# ----------------------------------------- + +sub RunFork($) +{ + my $cmd = shift; + + my $pid = fork(); + + if ($pid == 0) + { # child + print STDERR $cmd; + systemCheck($cmd); + exit(); + } + return $pid; +} + +sub systemCheck($) +{ + my $cmd = shift; + my $retVal = system($cmd); + if ($retVal != 0) + { + exit(1); + } +} + +sub DigitStr($) +{ + my $i = shift; + my $numStr; + if ($i < 10) { + $numStr = "000000$i"; + } + elsif ($i < 100) { + $numStr = "00000$i"; + } + elsif ($i < 1000) { + $numStr = "0000$i"; + } + elsif ($i < 10000) { + $numStr = "000$i"; + } + elsif ($i < 100000) { + $numStr = "00$i"; + } + elsif ($i < 1000000) { + $numStr = "0$i"; + } + else { + $numStr = $i; + } + return $numStr; +} + +sub CharStr($) +{ + my $i = shift; + my $charStr; + my @bit=(); + + while ($i>0){ + push @bit, $i%26; + $i=int($i/26); + } + my $offset=scalar(@bit); + my $h; + for ($h=6;$h>=$offset;--$h) { $charStr.="a"; } + for ($h=$offset-1;$h>=0;--$h) { $charStr.="$alph[$bit[$h]]"; } + return $charStr; +} + +sub NumStr($) +{ + my $i = shift; + if ($isBSDSplit){ + return CharStr($i); + }else{ + return DigitStr($i); + } +} + +sub GetSplitVersion($) +{ + my $splitCmd = shift; + my $retVal = system("$splitCmd --help > /dev/null"); + if ($retVal != 0) { + return 1; + } + else { + return 0; + } +} + diff --git a/mosesdecoder/scripts/generic/fsa-sample.fsa b/mosesdecoder/scripts/generic/fsa-sample.fsa new file mode 100644 index 0000000000000000000000000000000000000000..9d2e0a6abcbb603a187ef94694c153150e05034a --- /dev/null +++ b/mosesdecoder/scripts/generic/fsa-sample.fsa @@ -0,0 +1,10 @@ +0 1 Prague 0.5 +1 2 Stock 1 +2 6 Market 1 +0 3 New 0.5 +3 4 York 1 +4 5 Stock 1 +5 6 Exchange 1 +6 7 falls 0.5 +6 7 drops 0.5 +7 8 . 
1 diff --git a/mosesdecoder/scripts/generic/fsa2fsal.pl b/mosesdecoder/scripts/generic/fsa2fsal.pl new file mode 100644 index 0000000000000000000000000000000000000000..28ec28a261f76a71f149196c9db36e4f3fe36df5 --- /dev/null +++ b/mosesdecoder/scripts/generic/fsa2fsal.pl @@ -0,0 +1,53 @@ +#!/usr/bin/env perl +# A very simple script that converts fsa format (openfst lattices) to the same +# thing represented one sentence per line. It uses '|||' to delimit columns and +# ' ' to delimit nodes (i.e. original lines). +# Some rudimentary sanity checks are done on the fly. +# Ondrej Bojar, bojar@ufal.mff.cuni.cz +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. + +use warnings; +use strict; + +my $errs = 0; +sub err { + my $nr = shift; + my $msg = shift; + print STDERR "$nr:$msg\n"; + $errs++; +} + +my $onr = 0; +my @lines = (); +sub flush { + return if 0 == scalar @lines; + print join(" ", @lines); + print "\n"; + $onr++; + @lines = (); +} + +my $nr = 0; +my $numscores = undef; +while (<>) { + chomp; + if ($_ eq "") { + flush(); + next; + } + my ($a, $b, $label, $scores, $rest) = split /\s+/, $_, 5; + err($nr, "The delimiter '|||' can't appear in the input!") if /\|\|\|/; + err($nr, "Node id not numeric: $a") if $a !~ /^\d+$/; + err($nr, "Node id not numeric: $b") if $b !~ /^\d+$/; + err($nr, "Unexpected tail: '$rest'") if defined $rest && $rest !~ /^\s*$/; + my $thisnumscores = ($scores =~ tr/,/,/); + $numscores = $thisnumscores if !defined $numscores; + err($nr, "Incompatible number of arc scores, previous lines had ".($numscores+1).", now ".($thisnumscores+1)) + if $numscores != $thisnumscores; + push @lines, join("|||", ($a,$b,$label,$scores)); +} +flush(); + +exit 1 if $errs; diff --git a/mosesdecoder/scripts/generic/fsa2plf.pl b/mosesdecoder/scripts/generic/fsa2plf.pl new file mode 100644 index 
0000000000000000000000000000000000000000..4b9474d5ac222f47ff70f6bb86ef3fc142a65c04 --- /dev/null +++ b/mosesdecoder/scripts/generic/fsa2plf.pl @@ -0,0 +1,182 @@ +#!/usr/bin/env perl +# Converts AT&T FSA format to 'python lattice format'. +# Note that the input FSA needs to be epsilon-free and topologically sorted. +# This script checks for topological sortedness. +# The start node has to have the index 0. +# All path ends are assumed to be final nodes, not just the explicitly stated +# final nodes. +# Note that the output format may not contain any spaces. +# Ondrej Bojar, bojar@ufal.mff.cuni.cz +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. + +use warnings; +use strict; +use Getopt::Long; + +binmode(STDIN, ":utf8"); +binmode(STDOUT, ":utf8"); +binmode(STDERR, ":utf8"); + +my $filelist; +my $ignore_final_state_cost = 0; +my $mangle_weights = undef; +GetOptions( + "ignore-final-state-cost" => \$ignore_final_state_cost, + # sometimes, final states have a cost (e.g. 
"45 0.05\n") + # instead of dying there, ignore the problem + "filelist|fl=s" => \$filelist, + "mangle-weights=s" => \$mangle_weights, +) or exit 1; + +my @infiles; +if (defined $filelist) { + my $fh = my_open($filelist); + while (<$fh>) { + chomp; + push @infiles, $_; + } + close $fh; +} +push @infiles, @ARGV; +@ARGV = (); +if (0 == scalar(@infiles)) { + print STDERR "Reading input from stdin\n"; + push @infiles, "-"; +} + +my $err = 0; +foreach my $inf (@infiles) { + my $nr = 0; + NEXTLATTICE: + my %usedids = (); # collect all used ids for densification + my %usedtgtids = (); # collect all used ids for densification + my @outnodes = (); + my $fh = my_open($inf); + my %is_final; # remember which nodes were final + while (<$fh>) { + chomp; + $nr++; + last if $_ eq ""; # assume a blank line delimits lattices + my ($src, $tgt, $label, $weight) = split /\s+/; + die "$inf:$nr:Bad src node index: $src" if $src !~ /^[0-9]+$/; + + if (!defined $label && !defined $weight) { + # explicit final node, warn at the end if there are any intermed. final + # nodes + $is_final{$src}; + # final nodes can have a cost + die "$inf:$nr:Final state $src has cost $tgt. 
Unsupported, use --ignore-final-state-cost" + if defined $tgt && !$ignore_final_state_cost; + + next; + } + $weight = 0 if !defined $weight; + + $usedids{$src} = 1; + $usedtgtids{$tgt} = 1; + + # process the weight + # when reading RWTH FSA output, the weights are negated natural logarithms + # we need to negate them back + if (defined $mangle_weights) { + if ($mangle_weights eq "expneg") { + $weight = join(",", map {exp(-$_)} split /,/, $weight); + } else { + die "Bad weights mangling: $mangle_weights"; + } + } + # remember the node + my $targetnode = $tgt-$src; + die "$inf:$nr:Not topologically sorted, got arc from $src to $tgt" + if $targetnode <= 0; + push @{$outnodes[$src]}, [ $label, $weight, $tgt ]; + } + if (eof($fh)) { + close $fh; + $fh = undef; + } + + # Assign our dense IDs: source node ids are assigned first + my %denseids = (); # maps node ids from the file to dense ids + my $nextid = 0; + foreach my $id (sort {$a<=>$b} keys %usedids) { + $denseids{$id} = $nextid; + $nextid++; + } + # All unseen target nodes then get the same next id, the final node id + foreach my $id (keys %usedtgtids) { + next if defined $denseids{$id}; + $denseids{$id} = $nextid; + } + + foreach my $f (keys %is_final) { + if (defined $outnodes[$f]) { + print STDERR "$inf:Node $f is final but it has outgoing edges!\n"; + $err = 1; + } + } +# # Verbose: print original to dense IDs mapping +# foreach my $src (sort {$a<=>$b} keys %denseids) { +# print STDERR "$src ...> $denseids{$src}\n"; +# } + + print "("; + for(my $origsrc = 0; $origsrc < @outnodes; $origsrc++) { + my $src = $denseids{$origsrc}; + next if !defined $src; # this original node ID is not used at all + next if $src == $nextid; # this is the ultimate merged final node + my $outnode = $outnodes[$origsrc]; + print "("; + foreach my $arc (@$outnode) { + my $origtgt = $arc->[2]; + my $tgt = $denseids{$origtgt}; + if (!defined $tgt) { + # this was a final node only + $tgt = $denseids{$origtgt} = $nextid; + $nextid++; + } + my 
$step_to_target = $tgt - $src; + die "$inf:Bug, I damaged top-sortedness (orig $origsrc .. $origtgt; curr $src .. $tgt)." if $step_to_target <= 0; + print "('".apo($arc->[0])."',$arc->[1],$step_to_target),"; + } + print "),"; + } + print ")\n"; + goto NEXTLATTICE if defined $fh && ! eof($fh); +} +die "There were errors." if $err; + +sub apo { + my $s = shift; + # protects apostrophy and backslash + $s =~ s/\\/\\\\/g; + $s =~ s/(['])/\\$1/g; + return $s; +} + +sub my_open { + my $f = shift; + if ($f eq "-") { + binmode(STDIN, ":utf8"); + return *STDIN; + } + + die "Not found: $f" if ! -e $f; + + my $opn; + my $hdl; + my $ft = `file '$f'`; + # file might not recognize some files! + if ($f =~ /\.gz$/ || $ft =~ /gzip compressed data/) { + $opn = "zcat '$f' |"; + } elsif ($f =~ /\.bz2$/ || $ft =~ /bzip2 compressed data/) { + $opn = "bzcat '$f' |"; + } else { + $opn = "$f"; + } + open $hdl, $opn or die "Can't open '$opn': $!"; + binmode $hdl, ":utf8"; + return $hdl; +} diff --git a/mosesdecoder/scripts/generic/generic-parallel.perl b/mosesdecoder/scripts/generic/generic-parallel.perl new file mode 100644 index 0000000000000000000000000000000000000000..07f6a210aed23b4eaba74081f5f4afd713fcc0b2 --- /dev/null +++ b/mosesdecoder/scripts/generic/generic-parallel.perl @@ -0,0 +1,119 @@ +#!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. + +use warnings; +use strict; +use utf8; + +binmode STDIN, ":utf8"; +binmode STDOUT, ":utf8"; +binmode STDERR, ":utf8"; + +sub NumStr($); + +my $NUM_SPLIT_LINES = $ARGV[0]; + +my $TMPDIR = $ARGV[1]; +$TMPDIR = "$TMPDIR/tmp.$$"; +mkdir $TMPDIR; +print STDERR "TMPDIR=$TMPDIR \n"; + +my $cmd = ""; +for (my $i = 2; $i < scalar(@ARGV); ++$i) +{ + $cmd .= $ARGV[$i] ." 
"; +} + +# split input file +open (INPUT_ALL, "> $TMPDIR/input.all"); +binmode INPUT_ALL, ":utf8"; +while (my $line = ) +{ + chomp($line); + print INPUT_ALL $line."\n"; +} +close(INPUT_ALL); + +my $cmd2 = "split -l $NUM_SPLIT_LINES -a 5 -d $TMPDIR/input.all $TMPDIR/x"; +`$cmd2`; + +# create exec file +open (EXEC, "> $TMPDIR/exec"); +binmode EXEC, ":utf8"; + +# execute in parallel +print STDERR "executing\n"; + +my $i = 0; +my $filePath = "$TMPDIR/x" .NumStr($i); +while (-f $filePath) +{ + print EXEC "$cmd < $filePath > $filePath.out\n"; + + ++$i; + $filePath = "$TMPDIR/x" .NumStr($i); +} +close (EXEC); + +$cmd2 = "parallel < $TMPDIR/exec"; +`$cmd2`; + +# concatenate +print STDERR "concatenating\n"; + +$i = 1; +my $firstPath = "$TMPDIR/x" .NumStr(0) .".out"; +$filePath = "$TMPDIR/x" .NumStr($i) .".out"; +while (-f $filePath) +{ + $cmd = "cat $filePath >> $firstPath"; + `$cmd`; + + ++$i; + $filePath = "$TMPDIR/x" .NumStr($i) .".out"; +} + +# output +open (OUTPUT_ALL, "$firstPath"); +binmode OUTPUT_ALL, ":utf8"; +while (my $line = ) +{ + chomp($line); + print "$line\n"; +} +close(OUTPUT_ALL); + +$cmd = "rm -rf $TMPDIR/"; +`$cmd`; + +########################################### +sub NumStr($) +{ + my $i = shift; + my $numStr; + if ($i < 10) { + $numStr = "000000$i"; + } + elsif ($i < 100) { + $numStr = "00000$i"; + } + elsif ($i < 1000) { + $numStr = "0000$i"; + } + elsif ($i < 10000) { + $numStr = "000$i"; + } + elsif ($i < 100000) { + $numStr = "00$i"; + } + elsif ($i < 1000000) { + $numStr = "0$i"; + } + else { + $numStr = $i; + } + return $numStr; +} + diff --git a/mosesdecoder/scripts/generic/giza-parallel.perl b/mosesdecoder/scripts/generic/giza-parallel.perl new file mode 100644 index 0000000000000000000000000000000000000000..a9921a99293be4d071cfa5153fe1ec047a757388 --- /dev/null +++ b/mosesdecoder/scripts/generic/giza-parallel.perl @@ -0,0 +1,134 @@ +#!/usr/bin/env perl +# +# This file is part of moses. 
Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. + +# example +# ~/giza-parallel.perl 10 split ~/workspace/sourceforge/trunk/scripts/training/train-model.perl ar en train align + +use warnings; +use strict; +use File::Basename; + +sub NumStr($); + +print "Started ".localtime() ."\n"; + +my $numParallel = $ARGV[0]; +my $splitCmd = $ARGV[1]; +my $trainCmd = $ARGV[2]; +my $inputExt = $ARGV[3]; +my $outputExt = $ARGV[4]; +my $corpus = $ARGV[5]; +my $align = $ARGV[6]; + +my $TMPDIR=dirname($align) ."/tmp.$$"; +mkdir $TMPDIR; + +my $scriptDir=dirname($trainCmd) ."/.."; + +# split corpus file +my $totalLines = int(`wc -l $corpus.$inputExt`); +my $linesPerSplit = int($totalLines / $numParallel) + 1; + +my $cmd = "$splitCmd -d -l $linesPerSplit -a 5 $corpus.$inputExt $TMPDIR/source."; +`$cmd`; + +$cmd = "$splitCmd -d -l $linesPerSplit -a 5 $corpus.$outputExt $TMPDIR/target."; +`$cmd`; + +for (my $i = 0; $i < $numParallel; ++$i) +{ + my $numStr = NumStr($i); + rename("$TMPDIR/source.$numStr", "$TMPDIR/$numStr.source"); + rename("$TMPDIR/target.$numStr", "$TMPDIR/$numStr.target"); +} + +#fork & run giza & friends +my $isParent = 1; +my @childs; +for (my $i = 0; $i < $numParallel; ++$i) +{ + my $pid = fork(); + + if ($pid == 0) + { # child + $isParent = 0; + + my $numStr = NumStr($i); + my $cmd = "$trainCmd -dont-zip -last-step 1 -scripts-root-dir $scriptDir -f source -e target -alignment grow-diag-final-and -parts 3 -reordering msd-bidirectional-fe -corpus $TMPDIR/$numStr -corpus-dir $TMPDIR/prepared.$numStr \n"; + print $cmd; + `$cmd`; + + $cmd = "$trainCmd -dont-zip -first-step 2 -last-step 2 -scripts-root-dir $scriptDir -f source -e target -alignment grow-diag-final-and -parts 3 -reordering msd-bidirectional-fe -corpus-dir $TMPDIR/prepared.$numStr -giza-e2f $TMPDIR/giza.$numStr -direction 2 \n"; + print $cmd; + `$cmd`; + + $cmd = "$trainCmd -dont-zip -first-step 2 -last-step 2 -scripts-root-dir $scriptDir 
-f source -e target -alignment grow-diag-final-and -parts 3 -reordering msd-bidirectional-fe -corpus-dir $TMPDIR/prepared.$numStr -giza-f2e $TMPDIR/giza-inverse.$numStr -direction 1 \n"; + print $cmd; + `$cmd`; + + $cmd = "$trainCmd -dont-zip -first-step 3 -last-step 3 -scripts-root-dir $scriptDir -f source -e target -alignment grow-diag-final-and -parts 3 -reordering msd-bidirectional-fe -giza-e2f $TMPDIR/giza.$numStr -giza-f2e $TMPDIR/giza-inverse.$numStr -alignment-file $TMPDIR/aligned.$numStr -alignment grow-diag-final-and \n"; + print $cmd; + `$cmd`; + + exit(); + } + else + { # parent + push(@childs, $pid); + } + +} + +# wait for everything is finished +if ($isParent) +{ + foreach (@childs) { + waitpid($_, 0); + } +} +else +{ + die "shouldn't be here"; +} + +# cat all aligned files together. Voila +my $cmd = "cat "; +for (my $i = 0; $i < $numParallel; ++$i) +{ + my $numStr = NumStr($i); + $cmd .= "$TMPDIR/aligned.$numStr.grow-diag-final-and "; +} +$cmd .= " > $align \n"; +print $cmd; +`$cmd`; + +sub NumStr($) +{ + my $i = shift; + my $numStr; + if ($i < 10) { + $numStr = "000000$i"; + } + elsif ($i < 100) { + $numStr = "00000$i"; + } + elsif ($i < 1000) { + $numStr = "0000$i"; + } + elsif ($i < 10000) { + $numStr = "000$i"; + } + elsif ($i < 100000) { + $numStr = "00$i"; + } + elsif ($i < 1000000) { + $numStr = "0$i"; + } + else { + $numStr = $i; + } + return $numStr; +} + diff --git a/mosesdecoder/scripts/generic/lopar2pos.pl b/mosesdecoder/scripts/generic/lopar2pos.pl new file mode 100644 index 0000000000000000000000000000000000000000..fc2c35c7f98ce26442c016a658c9a3f7d069b26a --- /dev/null +++ b/mosesdecoder/scripts/generic/lopar2pos.pl @@ -0,0 +1,20 @@ +#!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. 
+ +# $Id$ +#lopar2pos: extract POSs from LOPAR output +#usage: lopar2pos.pl CORPUS.lopar > CORPUS.pos + +use warnings; + +my $infilename = shift @ARGV; +open(INFILE, "<$infilename") or die "couldn't open '$infilename' for read: $!\n"; +while(my $line = ) +{ + my @words = split(/\s+/, $line); + my @tags = map {$_ =~ /^[^_]*_([A-Z]+)/; $1} @words; + print join(' ', @tags) . "\n"; +} +close(INFILE); diff --git a/mosesdecoder/scripts/generic/moses_sim_pe.py b/mosesdecoder/scripts/generic/moses_sim_pe.py new file mode 100644 index 0000000000000000000000000000000000000000..3497ca5589ae5c8a59ec85049bcc39caded16c8b --- /dev/null +++ b/mosesdecoder/scripts/generic/moses_sim_pe.py @@ -0,0 +1,452 @@ +#!/usr/bin/env python + +# Written by Michael Denkowski +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. + +"""Parallelize decoding with simulated post-editing via moses XML input. + +(XML entities need to be escaped in tokenization). Memory mapped +dynamic phrase tables (Ulrich Germann, +www.statmt.org/moses/?n=Moses.AdvancedFeatures#ntoc40) and language models +(Kenneth Heafield, +http://www.statmt.org/moses/?n=FactoredTraining.BuildingLanguageModel#ntoc19) +facilitate memory efficient multi process decoding. Input is divided into +batches, each of which is decoded sequentially. Each batch pre-loads the +data from previous batches. + +To use in tuning, run mert-moses.pl with --sim-pe=SYMAL where SYMAL is the +alignment from input to references. Specify the number of jobs with +--decoder-flags="-threads N". 
+""" + +import gzip +import itertools +import math +import os +import shutil +import subprocess +import sys +import tempfile +import threading + +HELP = '''Moses with simulated post-editing + +Usage: + {} moses-cmd -config moses.ini -input-file text.src -ref text.tgt \ + -symal text.src-tgt.symal [options] [decoder flags] + +Options: + -threads N: number of decoders to run in parallel \ +(default read from moses.ini, 1 if not present) + -n-best-list nbest.out N [distinct]: location and size of N-best list + -show-weights: for mert-moses.pl, just call moses and exit + -tmp: location of temp directory (default /tmp) + +Other options (decoder flags) are passed through to moses-cmd\n''' + + +class ProgramFailure(Exception): + """Known kind of failure, with a known presentation to the user. + + Error message will be printed, and the program will return an error, + but no traceback will be shown to the user. + """ + + +class Progress: + """Provides progress bar.""" + + def __init__(self): + self.i = 0 + self.lock = threading.Lock() + + def inc(self): + self.lock.acquire() + self.i += 1 + if self.i % 100 == 0: + sys.stderr.write('.') + if self.i % 1000 == 0: + sys.stderr.write(' [{}]\n'.format(self.i)) + sys.stderr.flush() + self.lock.release() + + def done(self): + self.lock.acquire() + if self.i % 1000 != 0: + sys.stderr.write('\n') + self.lock.release() + + +def atomic_io(cmd, in_file, out_file, err_file, prog=None): + """Run with atomic (synchronous) I/O.""" + with open(in_file, 'r') as inp, open(out_file, 'w') as out, open(err_file, 'w') as err: + p = subprocess.Popen( + cmd, stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=err) + while True: + line = inp.readline() + if not line: + break + p.stdin.write(line) + out.write(p.stdout.readline()) + out.flush() + if prog: + prog.inc() + p.stdin.close() + p.wait() + + +def gzopen(f): + """Open plain or gzipped text.""" + return gzip.open(f, 'rb') if f.endswith('.gz') else open(f, 'r') + + +def wc(f): + """Word 
def write_gzfile(lines, f):
    """Write each line to a gzipped file, one line per record.

    Args:
        lines: iterable of strings without trailing newlines.
        f: path of the gzip file to create (overwritten if present).
    """
    # gzip handles are binary streams.  Under Python 2 the formatted str is
    # already bytes and is written untouched (original behavior); under
    # Python 3 it must be encoded first or write() raises TypeError.
    # try/finally also fixes a handle leak when a write fails mid-way.
    out = gzip.open(f, 'wb')
    try:
        for line in lines:
            data = '{}\n'.format(line)
            if not isinstance(data, bytes):
                data = data.encode('utf-8')
            out.write(data)
    finally:
        out.close()
cmd[i + 1] + cmd = cmd[:i] + cmd[i + 2:] + # Handled specially for mert-moses.pl + elif cmd[i] == '-show-weights': + show_weights = True + # Do not remove from cmd + i += 1 + else: + i += 1 + + # Read moses.ini + if moses_ini: + moses_ini_lines = [line.strip() for line in open(moses_ini, 'r')] + i = 0 + while i < len(moses_ini_lines): + # PhraseDictionaryBitextSampling name=TranslationModel0 + # output-factor=0 num-features=7 path=corpus. L1=src L2=tgt + # pfwd=g pbwd=g smooth=0 sample=1000 workers=1 + if moses_ini_lines[i].startswith('PhraseDictionaryBitextSampling'): + for (k, v) in (pair.split('=') for pair in moses_ini_lines[i].split()[1:]): + if k == 'name': + # Dynamic means update this model + if v.startswith('Dynamic'): + mmsapt_dynamic.append(v) + moses_ini_lines[i] += '{mmsapt_extra}' + else: + mmsapt_static.append(v) + elif k == 'L1': + if mmsapt_l1 and v != mmsapt_l1: + raise ProgramFailure( + 'Error: All PhraseDictionaryBitextSampling ' + 'entries should have same L1: ' + '{} != {}\n'.format(v, mmsapt_l1)) + mmsapt_l1 = v + elif k == 'L2': + if mmsapt_l2 and v != mmsapt_l2: + raise ProgramFailure( + 'Error: All PhraseDictionaryBitextSampling ' + 'entries should have same L2: ' + '{} != {}\n'.format(v, mmsapt_l2)) + mmsapt_l2 = v + # [threads] + # 8 + elif moses_ini_lines[i] == '[threads]': + # Prefer command line over moses.ini + if not threads_found: + threads = int(moses_ini_lines[i + 1]) + i += 1 + # [xml-input] + # exclusive + elif moses_ini_lines[i] == '[xml-input]': + # Prefer command line over moses.ini + if not xml_found: + xml_found = True + xml_input = moses_ini_lines[i + 1] + i += 1 + i += 1 + + # If mert-moses.pl passes -show-weights, just call moses + if show_weights: + # re-append original moses.ini + cmd.append('-config') + cmd.append(moses_ini) + sys.stdout.write(subprocess.check_output(cmd)) + sys.stdout.flush() + sys.exit(0) + + # Input length + if text_src: + text_len = wc(text_src) + + # Check inputs + if not (len(cmd) > 0 and 
all((moses_ini, text_src, text_tgt, text_symal))): + sys.stderr.write(HELP.format(argv[0])) + sys.exit(2) + if not (os.path.isfile(cmd[0]) and os.access(cmd[0], os.X_OK)): + raise ProgramFailure( + 'Error: moses-cmd "{}" is not executable\n'.format(cmd[0])) + if not mmsapt_dynamic: + raise ProgramFailure(( + 'Error: no PhraseDictionaryBitextSampling entries named ' + '"Dynamic..." found in {}. See ' + 'http://www.statmt.org/moses/?n=Moses.AdvancedFeatures#ntoc40\n' + ).format(moses_ini)) + if wc(text_tgt) != text_len or wc(text_symal) != text_len: + raise ProgramFailure( + 'Error: length mismatch between "{}", "{}", and "{}"\n'.format( + text_src, text_tgt, text_symal)) + + # Setup + work_dir = tempfile.mkdtemp(prefix='moses.', dir=os.path.abspath(tmp_dir)) + threads = min(threads, text_len) + batch_size = int(math.ceil(float(text_len) / threads)) + + # Report settings + sys.stderr.write( + 'Moses flags: {}\n'.format( + ' '.join('\'{}\''.format(s) if ' ' in s else s for s in cmd[1:]))) + for (i, n) in enumerate(mmsapt_dynamic): + sys.stderr.write( + 'Dynamic mmsapt {}: {} {} {}\n'.format( + i, n, mmsapt_l1, mmsapt_l2)) + for (i, n) in enumerate(mmsapt_static): + sys.stderr.write( + 'Static mmsapt {}: {} {} {}\n'.format(i, n, mmsapt_l1, mmsapt_l2)) + sys.stderr.write('XML mode: {}\n'.format(xml_input)) + sys.stderr.write( + 'Inputs: {} {} {} ({})\n'.format( + text_src, text_tgt, text_symal, text_len)) + sys.stderr.write('Jobs: {}\n'.format(threads)) + sys.stderr.write('Batch size: {}\n'.format(batch_size)) + if n_best_out: + sys.stderr.write( + 'N-best list: {} ({}{})\n'.format( + n_best_out, n_best_size, + ', distinct' if n_best_distinct else '')) + if hg_dir: + sys.stderr.write('Hypergraph dir: {} ({})\n'.format(hg_dir, hg_ext)) + sys.stderr.write('Temp dir: {}\n'.format(work_dir)) + + # Accumulate seen lines + src_lines = [] + tgt_lines = [] + symal_lines = [] + + # Current XML source file + xml_out = None + + # Split into batches. 
Each batch after 0 gets extra files with data from + # previous batches. + # Data from previous lines in the current batch is added using XML input. + job = -1 + lc = -1 + lines = itertools.izip( + gzopen(text_src), gzopen(text_tgt), gzopen(text_symal)) + for (src, tgt, symal) in lines: + (src, tgt, symal) = (src.strip(), tgt.strip(), symal.strip()) + lc += 1 + if lc % batch_size == 0: + job += 1 + xml_file = os.path.join(work_dir, 'input.{}.xml'.format(job)) + extra_src_file = os.path.join( + work_dir, 'extra.{}.{}.txt.gz'.format(job, mmsapt_l1)) + extra_tgt_file = os.path.join( + work_dir, 'extra.{}.{}.txt.gz'.format(job, mmsapt_l2)) + extra_symal_file = os.path.join( + work_dir, 'extra.{}.{}-{}.symal.gz'.format( + job, mmsapt_l1, mmsapt_l2)) + if job > 0: + xml_out.close() + write_gzfile(src_lines, extra_src_file) + write_gzfile(tgt_lines, extra_tgt_file) + write_gzfile(symal_lines, extra_symal_file) + xml_out = open(xml_file, 'w') + ini_file = os.path.join(work_dir, 'moses.{}.ini'.format(job)) + with open(ini_file, 'w') as moses_ini_out: + if job == 0: + extra = '' + else: + extra = ' extra={}'.format( + os.path.join(work_dir, 'extra.{}.'.format(job))) + moses_ini_out.write( + '{}\n'.format( + '\n'.join(moses_ini_lines).format(mmsapt_extra=extra))) + src_lines.append(src) + tgt_lines.append(tgt) + symal_lines.append(symal) + # Lines after first start with update tag including previous + # translation. + # Translation of last line of each batch is included in extra for + # next batch. + xml_tags = [] + if lc % batch_size != 0: + tag_template = ( + ' ') + for n in mmsapt_dynamic: + # Note: space after tag. 
+ xml_tags.append( + tag_template.format( + n, src_lines[-2], tgt_lines[-2], symal_lines[-2])) + xml_out.write('{}{}\n'.format(''.join(xml_tags), src)) + xml_out.close() + + # Run decoders in parallel + workers = [] + prog = Progress() + for i in range(threads): + work_cmd = cmd[:] + work_cmd.append('-config') + work_cmd.append(os.path.join(work_dir, 'moses.{}.ini'.format(i))) + # Workers use 1 CPU each + work_cmd.append('-threads') + work_cmd.append('1') + if not xml_found: + work_cmd.append('-xml-input') + work_cmd.append(xml_input) + if n_best_out: + work_cmd.append('-n-best-list') + work_cmd.append(os.path.join(work_dir, 'nbest.{}'.format(i))) + work_cmd.append(str(n_best_size)) + if n_best_distinct: + work_cmd.append('distinct') + if hg_dir: + work_cmd.append('-output-search-graph-hypergraph') + work_cmd.append('true') + work_cmd.append(hg_ext) + work_cmd.append(os.path.join(work_dir, 'hg.{}'.format(i))) + in_file = os.path.join(work_dir, 'input.{}.xml'.format(i)) + out_file = os.path.join(work_dir, 'out.{}'.format(i)) + err_file = os.path.join(work_dir, 'err.{}'.format(i)) + t = threading.Thread( + target=atomic_io, + args=(work_cmd, in_file, out_file, err_file, prog)) + workers.append(t) + t.start() + # Wait for all to finish + for t in workers: + t.join() + prog.done() + + # Gather N-best lists + if n_best_out: + with open(n_best_out, 'w') as out: + for i in range(threads): + path = os.path.join(work_dir, 'nbest.{}'.format(i)) + for line in open(path, 'r'): + entry = line.partition(' ') + out.write( + '{} {}'.format( + int(entry[0]) + (i * batch_size), entry[2])) + + # Gather hypergraphs + if hg_dir: + if not os.path.exists(hg_dir): + os.mkdir(hg_dir) + shutil.copy( + os.path.join(work_dir, 'hg.0', 'weights'), + os.path.join(hg_dir, 'weights')) + for i in range(threads): + for j in range(batch_size): + shutil.copy( + os.path.join( + work_dir, 'hg.{}'.format(i), + '{}.{}'.format(j, hg_ext)), + os.path.join( + hg_dir, '{}.{}'.format((i * batch_size) + j, 
hg_ext))) + + # Gather stdout + for i in range(threads): + for line in open(os.path.join(work_dir, 'out.{}'.format(i)), 'r'): + sys.stdout.write(line) + + # Cleanup + shutil.rmtree(work_dir) + +if __name__ == '__main__': + try: + main(sys.argv) + except ProgramFailure as error: + sys.stderr.write("%s\n" % error) + sys.exit(1) diff --git a/mosesdecoder/scripts/generic/mteval-v11b.pl b/mosesdecoder/scripts/generic/mteval-v11b.pl new file mode 100644 index 0000000000000000000000000000000000000000..2dc2f77cb3792c34e8086695e5ab61c3b665cab1 --- /dev/null +++ b/mosesdecoder/scripts/generic/mteval-v11b.pl @@ -0,0 +1,761 @@ +#!/usr/bin/perl -w + +use strict; + +################################# +# History: +# +# version 11b -- text normalization modified: +# * take out the join digit line because it joins digits +# when it shouldn't have +# $norm_text =~ s/(\d)\s+(?=\d)/$1/g; #join digits +# +# version 11a -- corrected output of individual n-gram precision values +# +# version 11 -- bug fixes: +# * make filehandle operate in binary mode to prevent Perl from operating +# (by default in Red Hat 9) in UTF-8 +# * fix failure on joining digits +# version 10 -- updated output to include more details of n-gram scoring. +# Defaults to generate both NIST and BLEU scores. Use -b for BLEU +# only, use -n for NIST only +# +# version 09d -- bug fix (for BLEU scoring, ngrams were fixed at 4 +# being the max, regardless what was entered on the command line.) +# +# version 09c -- bug fix (During the calculation of ngram information, +# each ngram was being counted only once for each segment. This has +# been fixed so that each ngram is counted correctly in each segment.) +# +# version 09b -- text normalization modified: +# * option flag added to preserve upper case +# * non-ASCII characters left in place. 
+# +# version 09a -- text normalization modified: +# * " and & converted to "" and &, respectively +# * non-ASCII characters kept together (bug fix) +# +# version 09 -- modified to accommodate sgml tag and attribute +# names revised to conform to default SGML conventions. +# +# version 08 -- modifies the NIST metric in accordance with the +# findings on the 2001 Chinese-English dry run corpus. Also +# incorporates the BLEU metric as an option and supports the +# output of ngram detail. +# +# version 07 -- in response to the MT meeting on 28 Jan 2002 at ISI +# Keep strings of non-ASCII characters together as one word +# (rather than splitting them into one-character words). +# Change length penalty so that translations that are longer than +# the average reference translation are not penalized. +# +# version 06 +# Prevent divide-by-zero when a segment has no evaluation N-grams. +# Correct segment index for level 3 debug output. +# +# version 05 +# improve diagnostic error messages +# +# version 04 +# tag segments +# +# version 03 +# add detailed output option (intermediate document and segment scores) +# +# version 02 +# accommodation of modified sgml tags and attributes +# +# version 01 +# same as bleu version 15, but modified to provide formal score output. +# +# original IBM version +# Author: Kishore Papineni +# Date: 06/10/2001 +################################# + +###### +# Intro +my ($date, $time) = date_time_stamp(); +print "MT evaluation scorer began on $date at $time\n"; +print "command line: ", $0, " ", join(" ", @ARGV), "\n"; +my $usage = "\n\nUsage: $0 [-h] -r -s src_file -t \n\n". + "Description: This Perl script evaluates MT system performance.\n". + "\n". + "Required arguments:\n". + " -r is a file containing the reference translations for\n". + " the documents to be evaluated.\n". + " -s is a file containing the source documents for which\n". + " translations are to be evaluated\n". + " -t is a file containing the translations to be evaluated\n". 
+ "\n". + "Optional arguments:\n". + " -c preserves upper-case alphabetic characters\n". + " -b generate BLEU scores only\n". + " -n generate NIST scores only\n". + " -d detailed output flag used in conjunction with \"-b\" or \"-n\" flags:\n". + " 0 (default) for system-level score only\n". + " 1 to include document-level scores\n". + " 2 to include segment-level scores\n". + " 3 to include ngram-level scores\n". + " -h prints this help message to STDOUT\n". + "\n"; + +use vars qw ($opt_r $opt_s $opt_t $opt_d $opt_h $opt_b $opt_n $opt_c $opt_x); +use Getopt::Std; +getopts ('r:s:t:d:hbncx:'); +die $usage if defined($opt_h); +die "Error in command line: ref_file not defined$usage" unless defined $opt_r; +die "Error in command line: src_file not defined$usage" unless defined $opt_s; +die "Error in command line: tst_file not defined$usage" unless defined $opt_t; +my $max_Ngram = 9; +my $detail = defined $opt_d ? $opt_d : 0; +my $preserve_case = defined $opt_c ? 1 : 0; + +my $METHOD = "BOTH"; +if (defined $opt_b) { $METHOD = "BLEU"; } +if (defined $opt_n) { $METHOD = "NIST"; } +my $method; + +my ($ref_file) = $opt_r; +my ($src_file) = $opt_s; +my ($tst_file) = $opt_t; + +###### +# Global variables +my ($src_lang, $tgt_lang, @tst_sys, @ref_sys); # evaluation parameters +my (%tst_data, %ref_data); # the data -- with structure: {system}{document}[segments] +my ($src_id, $ref_id, $tst_id); # unique identifiers for ref and tst translation sets +my %eval_docs; # document information for the evaluation data set +my %ngram_info; # the information obtained from (the last word in) the ngram + +###### +# Get source document ID's +($src_id) = get_source_info ($src_file); + +###### +# Get reference translations +($ref_id) = get_MT_data (\%ref_data, "RefSet", $ref_file); + +compute_ngram_info (); + +###### +# Get translations to evaluate +($tst_id) = get_MT_data (\%tst_data, "TstSet", $tst_file); + +###### +# Check data for completeness and correctness +check_MT_data (); + +###### +# 
#################################

# Parse the SGML source file: returns the evaluation set ID and fills the
# global %eval_docs hash with the normalized segments of every document.
# Also sets/validates the global $src_lang from the SrcLang attribute.
sub get_source_info {

  my ($file) = @_;
  my ($name, $id, $src, $doc);
  my ($data, $tag, $span);

#read data from file (binmode keeps Perl from applying a locale/UTF-8 layer)
  open (FILE, $file) or die "\nUnable to open translation data file '$file'", $usage;
  binmode FILE;
  # FIX: restore the readline operator <FILE>; it had been stripped by an
  # earlier escaping pass, leaving "$data .= $_ while ;" — a syntax error.
  $data .= $_ while <FILE>;
  close (FILE);

#get source set info
  die "\n\nFATAL INPUT ERROR: no 'src_set' tag in src_file '$file'\n\n"
    unless ($tag, $span, $data) = extract_sgml_tag_and_span ("SrcSet", $data);

  die "\n\nFATAL INPUT ERROR: no tag attribute '$name' in file '$file'\n\n"
    unless ($id) = extract_sgml_tag_attribute ($name="SetID", $tag);

  die "\n\nFATAL INPUT ERROR: no tag attribute '$name' in file '$file'\n\n"
    unless ($src) = extract_sgml_tag_attribute ($name="SrcLang", $tag);
  die "\n\nFATAL INPUT ERROR: $name ('$src') in file '$file' inconsistent\n"
    ." with $name in previous input data ('$src_lang')\n\n"
    unless (not defined $src_lang or $src eq $src_lang);
  $src_lang = $src;

#get doc info -- ID and # of segs
  $data = $span;
  while (($tag, $span, $data) = extract_sgml_tag_and_span ("Doc", $data)) {
    die "\n\nFATAL INPUT ERROR: no tag attribute '$name' in file '$file'\n\n"
      unless ($doc) = extract_sgml_tag_attribute ($name="DocID", $tag);
    die "\n\nFATAL INPUT ERROR: duplicate '$name' in file '$file'\n\n"
      if defined $eval_docs{$doc};
    $span =~ s/[\s\n\r]+/ /g; # concatenate records
    my $jseg=0, my $seg_data = $span;
    while (($tag, $span, $seg_data) = extract_sgml_tag_and_span ("Seg", $seg_data)) {
      ($eval_docs{$doc}{SEGS}[$jseg++]) = NormalizeText ($span);
    }
    die "\n\nFATAL INPUT ERROR: no segments in document '$doc' in file '$file'\n\n"
      if $jseg == 0;
  }
  die "\n\nFATAL INPUT ERROR: no documents in file '$file'\n\n"
    unless keys %eval_docs > 0;
  return $id;
}
#################################

# Sanity-check the loaded data: every evaluation document must be present,
# with the same number of segments as the source, in every system output
# (%tst_data) and every reference set (%ref_data).  Also populates the
# global @tst_sys and @ref_sys system-name lists.
sub check_MT_data {

  @tst_sys = sort keys %tst_data;
  @ref_sys = sort keys %ref_data;

#every evaluation document must be represented for every system and every reference
  foreach my $doc (sort keys %eval_docs) {
    my $nseg_source = @{$eval_docs{$doc}{SEGS}};

    foreach my $sys (@tst_sys) {
      defined $tst_data{$sys}{$doc}
        or die "\n\nFATAL ERROR: no document '$doc' for system '$sys'\n\n";
      my $nseg = @{$tst_data{$sys}{$doc}{SEGS}};
      $nseg == $nseg_source
        or die "\n\nFATAL ERROR: translated documents must contain the same # of segments as the source, but\n"
          ." document '$doc' for system '$sys' contains $nseg segments, while\n"
          ." the source document contains $nseg_source segments.\n\n";
    }

    foreach my $sys (@ref_sys) {
      defined $ref_data{$sys}{$doc}
        or die "\n\nFATAL ERROR: no document '$doc' for reference '$sys'\n\n";
      my $nseg = @{$ref_data{$sys}{$doc}{SEGS}};
      $nseg == $nseg_source
        or die "\n\nFATAL ERROR: translated documents must contain the same # of segments as the source, but\n"
          ." document '$doc' for system '$sys' contains $nseg segments, while\n"
          ." the source document contains $nseg_source segments.\n\n";
    }
  }
}
#################################

# Score one system over all evaluation documents: accumulates per-document
# n-gram statistics, then computes the system-level BLEU or NIST score
# (selected by the global $method) into the caller's score hash.
sub score_system {

  my ($sys, $ref, $doc, %SCOREmt);
  # NOTE: %SCOREmt is a flattened COPY of the caller's %NISTmt/%BLEUmt hash,
  # but the per-n-gram inner hashrefs were pre-initialized by the caller
  # ($NISTmt{$n}{$sys}{cum} = 0), so those refs are shared and writes to
  # $SCOREmt{$j}{$sys}{...} in bleu_score/nist_score ARE visible to the caller.
  ($sys, %SCOREmt) = @_;
  my ($shortest_ref_length, $match_cnt, $tst_cnt, $ref_cnt, $tst_info, $ref_info);
  my ($cum_ref_length, @cum_match, @cum_tst_cnt, @cum_ref_cnt, @cum_tst_info, @cum_ref_info);

  # Zero the cumulative counters, one slot per n-gram order (1..$max_Ngram).
  $cum_ref_length = 0;
  for (my $j=1; $j<=$max_Ngram; $j++) {
    $cum_match[$j] = $cum_tst_cnt[$j] = $cum_ref_cnt[$j] = $cum_tst_info[$j] = $cum_ref_info[$j] = 0;
  }

  foreach $doc (sort keys %eval_docs) {
    # score_document returns (shortest-ref-length, arrayrefs of per-order
    # match/test/ref counts and test/ref information) for this document.
    ($shortest_ref_length, $match_cnt, $tst_cnt, $ref_cnt, $tst_info, $ref_info) = score_document ($sys, $doc);

#output document summary score
    if (($detail >= 1 ) && ($METHOD eq "NIST")) {
      my %DOCmt = ();
      printf "$method score using 5-grams = %.4f for system \"$sys\" on document \"$doc\" (%d segments, %d words)\n",
	nist_score (scalar @ref_sys, $match_cnt, $tst_cnt, $ref_cnt, $tst_info, $ref_info, $sys, %DOCmt),
	scalar @{$tst_data{$sys}{$doc}{SEGS}}, $tst_cnt->[1];
    }
    if (($detail >= 1 ) && ($METHOD eq "BLEU")) {
      my %DOCmt = ();
      printf "$method score using 4-grams = %.4f for system \"$sys\" on document \"$doc\" (%d segments, %d words)\n",
	bleu_score($shortest_ref_length, $match_cnt, $tst_cnt, $sys, %DOCmt),
	scalar @{$tst_data{$sys}{$doc}{SEGS}}, $tst_cnt->[1];
    }

    # Fold this document's counts into the system-level totals.
    $cum_ref_length += $shortest_ref_length;
    for (my $j=1; $j<=$max_Ngram; $j++) {
      $cum_match[$j] += $match_cnt->[$j];
      $cum_tst_cnt[$j] += $tst_cnt->[$j];
      $cum_ref_cnt[$j] += $ref_cnt->[$j];
      $cum_tst_info[$j] += $tst_info->[$j];
      $cum_ref_info[$j] += $ref_info->[$j];
      printf "document info: $sys $doc %d-gram %d %d %d %9.4f %9.4f\n", $j, $match_cnt->[$j],
	$tst_cnt->[$j], $ref_cnt->[$j], $tst_info->[$j], $ref_info->[$j]
	if (defined $opt_x and $opt_x eq "document info");
    }
  }

#x #output system summary score
#x printf "$method score = %.4f for system \"$sys\"\n",
#x $method eq "BLEU" ? bleu_score($cum_ref_length, \@cum_match, \@cum_tst_cnt) :
#x nist_score (scalar @ref_sys, \@cum_match, \@cum_tst_cnt, \@cum_ref_cnt, \@cum_tst_info, \@cum_ref_info, $sys, %SCOREmt);
  # System-level score: stored into the (shared) inner hashrefs of %SCOREmt.
  if ($method eq "BLEU") {
    bleu_score($cum_ref_length, \@cum_match, \@cum_tst_cnt, $sys, %SCOREmt);
  }
  if ($method eq "NIST") {
    nist_score (scalar @ref_sys, \@cum_match, \@cum_tst_cnt, \@cum_ref_cnt, \@cum_tst_info, \@cum_ref_info, $sys, %SCOREmt);
  }
}
#################################

# Score one test segment against its reference segments.  Returns a sextuple:
# (shortest reference length, and per-n-gram-order arrayrefs of matching
# counts, test counts, reference counts, test information, reference
# information), indexed 1..$max_Ngram.
sub score_segment {

  my ($tst_seg, @ref_segs) = @_;
  my (@tst_wrds, %tst_ngrams, @match_count, @tst_count, @tst_info);
  my (@ref_wrds, $ref_seg, %ref_ngrams, %ref_ngrams_max, @ref_count, @ref_info);
  my ($ngram);
  my (@nwrds_ref);
  my $shortest_ref_length;

  for (my $j=1; $j<= $max_Ngram; $j++) {
    $match_count[$j] = $tst_count[$j] = $ref_count[$j] = $tst_info[$j] = $ref_info[$j] = 0;
  }

# get the ngram counts for the test segment
  @tst_wrds = split /\s+/, $tst_seg;
  %tst_ngrams = %{Words2Ngrams (@tst_wrds)};
  for (my $j=1; $j<=$max_Ngram; $j++) { # compute ngram counts
    $tst_count[$j] = $j<=@tst_wrds ? (@tst_wrds - $j + 1) : 0;
  }

# get the ngram counts for the reference segments
  foreach $ref_seg (@ref_segs) {
    @ref_wrds = split /\s+/, $ref_seg;
    %ref_ngrams = %{Words2Ngrams (@ref_wrds)};
    foreach $ngram (keys %ref_ngrams) { # find the maximum # of occurrences
      # @wrds in scalar context is the n-gram order, so it doubles as the
      # index into the per-order accumulator arrays.
      my @wrds = split / /, $ngram;
      $ref_info[@wrds] += $ngram_info{$ngram};
      # Clipping count: keep, per ngram, the max occurrence count over refs.
      $ref_ngrams_max{$ngram} = defined $ref_ngrams_max{$ngram} ?
	max ($ref_ngrams_max{$ngram}, $ref_ngrams{$ngram}) :
	$ref_ngrams{$ngram};
    }
    for (my $j=1; $j<=$max_Ngram; $j++) { # update ngram counts
      $ref_count[$j] += $j<=@ref_wrds ? (@ref_wrds - $j + 1) : 0;
    }
    $shortest_ref_length = scalar @ref_wrds # find the shortest reference segment
      if (not defined $shortest_ref_length) or @ref_wrds < $shortest_ref_length;
  }

# accumulate scoring stats for tst_seg ngrams that match ref_seg ngrams
  foreach $ngram (keys %tst_ngrams) {
    next unless defined $ref_ngrams_max{$ngram};
    my @wrds = split / /, $ngram;
    # min() clips the test count at the reference maximum (standard BLEU/NIST
    # clipped-match counting).
    $tst_info[@wrds] += $ngram_info{$ngram} * min($tst_ngrams{$ngram},$ref_ngrams_max{$ngram});
    $match_count[@wrds] += my $count = min($tst_ngrams{$ngram},$ref_ngrams_max{$ngram});
    printf "%.2f info for each of $count %d-grams = '%s'\n", $ngram_info{$ngram}, scalar @wrds, $ngram
      if $detail >= 3;
  }

  return ($shortest_ref_length, [@match_count], [@tst_count], [@ref_count], [@tst_info], [@ref_info]);
}
#################################

# Normalize one SGML segment for scoring: decode SGML entities, lowercase
# (unless -c), and tokenize punctuation.  Returns the normalized string.
sub NormalizeText {
  my ($norm_text) = @_;

# language-independent part:
  # FIX: the entity patterns below had been destroyed by an earlier escaping
  # pass (e.g. "s/&/&/g" — a no-op — and a truncated "s/</" that was a syntax
  # error).  Restored from the surviving per-line comments: strip <skipped>
  # tags and decode the four SGML entities.
  $norm_text =~ s/<skipped>//g; # strip "skipped" tags
  $norm_text =~ s/-\n//g; # strip end-of-line hyphenation and join lines
  $norm_text =~ s/\n/ /g; # join lines
  $norm_text =~ s/&quot;/"/g; # convert SGML entity for quote to "
  $norm_text =~ s/&amp;/&/g; # convert SGML entity for ampersand to &
  $norm_text =~ s/&lt;/</g; # convert SGML entity for less-than to <
  $norm_text =~ s/&gt;/>/g; # convert SGML entity for greater-than to >

# language-dependent part (assuming Western languages):
  $norm_text = " $norm_text ";
  $norm_text =~ tr/[A-Z]/[a-z]/ unless $preserve_case;
  $norm_text =~ s/([\{-\~\[-\` -\&\(-\+\:-\@\/])/ $1 /g; # tokenize punctuation
  $norm_text =~ s/([^0-9])([\.,])/$1 $2 /g; # tokenize period and comma unless preceded by a digit
  $norm_text =~ s/([\.,])([^0-9])/ $1 $2/g; # tokenize period and comma unless followed by a digit
  $norm_text =~ s/([0-9])(-)/$1 $2 /g; # tokenize dash when preceded by a digit
  $norm_text =~ s/\s+/ /g; # one space only between words
  $norm_text =~ s/^\s+//; # no leading space
  $norm_text =~ s/\s+$//; # no trailing space

  return $norm_text;
}
($1) : (); +} + +################################# + +sub max { + + my ($max, $next); + + return unless defined ($max=pop); + while (defined ($next=pop)) { + $max = $next if $next > $max; + } + return $max; +} + +################################# + +sub min { + + my ($min, $next); + + return unless defined ($min=pop); + while (defined ($next=pop)) { + $min = $next if $next < $min; + } + return $min; +} + +################################# + +sub printout_report +{ + + if ( $METHOD eq "BOTH" ) { + foreach my $sys (sort @tst_sys) { + printf "NIST score = %2.4f BLEU score = %.4f for system \"$sys\"\n",$NISTmt{5}{$sys}{cum},$BLEUmt{4}{$sys}{cum}; + } + } elsif ($METHOD eq "NIST" ) { + foreach my $sys (sort @tst_sys) { + printf "NIST score = %2.4f for system \"$sys\"\n",$NISTmt{5}{$sys}{cum}; + } + } elsif ($METHOD eq "BLEU" ) { + foreach my $sys (sort @tst_sys) { + printf "\nBLEU score = %.4f for system \"$sys\"\n",$BLEUmt{4}{$sys}{cum}; + } + } + + + printf "\n# ------------------------------------------------------------------------\n\n"; + printf "Individual N-gram scoring\n"; + printf " 1-gram 2-gram 3-gram 4-gram 5-gram 6-gram 7-gram 8-gram 9-gram\n"; + printf " ------ ------ ------ ------ ------ ------ ------ ------ ------\n"; + + if (( $METHOD eq "BOTH" ) || ($METHOD eq "NIST")) { + foreach my $sys (sort @tst_sys) { + printf " NIST:"; + for (my $i=1; $i<=$max_Ngram; $i++) { + printf " %2.4f ",$NISTmt{$i}{$sys}{ind} + } + printf " \"$sys\"\n"; + } + printf "\n"; + } + + if (( $METHOD eq "BOTH" ) || ($METHOD eq "BLEU")) { + foreach my $sys (sort @tst_sys) { + printf " BLEU:"; + for (my $i=1; $i<=$max_Ngram; $i++) { + printf " %2.4f ",$BLEUmt{$i}{$sys}{ind} + } + printf " \"$sys\"\n"; + } + } + + printf "\n# ------------------------------------------------------------------------\n"; + printf "Cumulative N-gram scoring\n"; + printf " 1-gram 2-gram 3-gram 4-gram 5-gram 6-gram 7-gram 8-gram 9-gram\n"; + printf " ------ ------ ------ ------ ------ ------ ------ ------ 
------\n"; + + if (( $METHOD eq "BOTH" ) || ($METHOD eq "NIST")) { + foreach my $sys (sort @tst_sys) { + printf " NIST:"; + for (my $i=1; $i<=$max_Ngram; $i++) { + printf " %2.4f ",$NISTmt{$i}{$sys}{cum} + } + printf " \"$sys\"\n"; + } + } + printf "\n"; + + + if (( $METHOD eq "BOTH" ) || ($METHOD eq "BLEU")) { + foreach my $sys (sort @tst_sys) { + printf " BLEU:"; + for (my $i=1; $i<=$max_Ngram; $i++) { + printf " %2.4f ",$BLEUmt{$i}{$sys}{cum} + } + printf " \"$sys\"\n"; + } + } +} diff --git a/mosesdecoder/scripts/generic/mteval-v12.pl b/mosesdecoder/scripts/generic/mteval-v12.pl new file mode 100644 index 0000000000000000000000000000000000000000..2666c80125031e3d00c145b0b9570a6a970a1a76 --- /dev/null +++ b/mosesdecoder/scripts/generic/mteval-v12.pl @@ -0,0 +1,784 @@ +#!/usr/bin/env perl + +use warnings; +use strict; +use utf8; +use Encode; + +binmode STDOUT, ":utf8"; +binmode STDERR, ":utf8"; + +################################# +# History: +# +# version 12 +# * Text normalization changes: +# * convert entity references (only the entities declared in the DTD) +# * now uses unicode categories +# * tokenize punctuation unless followed AND preceded by digits +# * tokenize symbols +# * UTF-8 handling: +# * files are now read using utf8 mode +# * Added the '-e' command-line option to enclose non-ASCII characters between spaces +# +# version 11b -- text normalization modified: +# * take out the join digit line because it joins digits +# when it shouldn't have +# $norm_text =~ s/(\d)\s+(?=\d)/$1/g; #join digits +# +# version 11a -- corrected output of individual n-gram precision values +# +# version 11 -- bug fixes: +# * make filehandle operate in binary mode to prevent Perl from operating +# (by default in Red Hat 9) in UTF-8 +# * fix failure on joining digits +# version 10 -- updated output to include more details of n-gram scoring. +# Defaults to generate both NIST and BLEU scores. 
Use -b for BLEU +# only, use -n for NIST only +# +# version 09d -- bug fix (for BLEU scoring, ngrams were fixed at 4 +# being the max, regardless what was entered on the command line.) +# +# version 09c -- bug fix (During the calculation of ngram information, +# each ngram was being counted only once for each segment. This has +# been fixed so that each ngram is counted correctly in each segment.) +# +# version 09b -- text normalization modified: +# * option flag added to preserve upper case +# * non-ASCII characters left in place. +# +# version 09a -- text normalization modified: +# * &quot; and &amp; converted to "" and &, respectively +# * non-ASCII characters kept together (bug fix) +# +# version 09 -- modified to accommodate sgml tag and attribute +# names revised to conform to default SGML conventions. +# +# version 08 -- modifies the NIST metric in accordance with the +# findings on the 2001 Chinese-English dry run corpus. Also +# incorporates the BLEU metric as an option and supports the +# output of ngram detail. +# +# version 07 -- in response to the MT meeting on 28 Jan 2002 at ISI +# Keep strings of non-ASCII characters together as one word +# (rather than splitting them into one-character words). +# Change length penalty so that translations that are longer than +# the average reference translation are not penalized. +# +# version 06 +# Prevent divide-by-zero when a segment has no evaluation N-grams. +# Correct segment index for level 3 debug output. +# +# version 05 +# improve diagnostic error messages +# +# version 04 +# tag segments +# +# version 03 +# add detailed output option (intermediate document and segment scores) +# +# version 02 +# accommodation of modified sgml tags and attributes +# +# version 01 +# same as bleu version 15, but modified to provide formal score output. 
+# +# original IBM version +# Author: Kishore Papineni +# Date: 06/10/2001 +################################# + +###### +# Intro +my ($date, $time) = date_time_stamp(); +print "MT evaluation scorer began on $date at $time\n"; +print "command line: ", $0, " ", join(" ", @ARGV), "\n"; +my $usage = "\n\nUsage: $0 [-h] -r -s -t \n\n". + "Description: This Perl script evaluates MT system performance.\n". + "\n". + "Required arguments:\n". + " -r is a file containing the reference translations for\n". + " the documents to be evaluated.\n". + " -s is a file containing the source documents for which\n". + " translations are to be evaluated\n". + " -t is a file containing the translations to be evaluated\n". + "\n". + "Optional arguments:\n". + " -c preserves upper-case alphabetic characters\n". + " -b generate BLEU scores only\n". + " -n generate NIST scores only\n". + " -d detailed output flag used in conjunction with \"-b\" or \"-n\" flags:\n". + " 0 (default) for system-level score only\n". + " 1 to include document-level scores\n". + " 2 to include segment-level scores\n". + " 3 to include ngram-level scores\n". + " -e enclose non-ASCII characters between spaces\n". + " -h prints this help message to STDOUT\n". + "\n"; + +use vars qw ($opt_r $opt_s $opt_t $opt_d $opt_h $opt_b $opt_n $opt_c $opt_x $opt_e); +use Getopt::Std; +getopts ('r:s:t:d:hbncx:e'); +die $usage if defined($opt_h); +die "Error in command line: ref_file not defined$usage" unless defined $opt_r; +die "Error in command line: src_file not defined$usage" unless defined $opt_s; +die "Error in command line: tst_file not defined$usage" unless defined $opt_t; +my $max_Ngram = 9; +my $detail = defined $opt_d ? $opt_d : 0; +my $preserve_case = defined $opt_c ? 1 : 0; +my $split_non_ASCII = defined $opt_e ? 
1 : 0; + +my $METHOD = "BOTH"; +if (defined $opt_b) { $METHOD = "BLEU"; } +if (defined $opt_n) { $METHOD = "NIST"; } +my $method; + +my ($ref_file) = $opt_r; +my ($src_file) = $opt_s; +my ($tst_file) = $opt_t; + +###### +# Global variables +my ($src_lang, $tgt_lang, @tst_sys, @ref_sys); # evaluation parameters +my (%tst_data, %ref_data); # the data -- with structure: {system}{document}[segments] +my ($src_id, $ref_id, $tst_id); # unique identifiers for ref and tst translation sets +my %eval_docs; # document information for the evaluation data set +my %ngram_info; # the information obtained from (the last word in) the ngram + +###### +# Get source document ID's +($src_id) = get_source_info ($src_file); + +###### +# Get reference translations +($ref_id) = get_MT_data (\%ref_data, "RefSet", $ref_file); + +compute_ngram_info (); + +###### +# Get translations to evaluate +($tst_id) = get_MT_data (\%tst_data, "TstSet", $tst_file); + +###### +# Check data for completeness and correctness +check_MT_data (); + +###### +# +my %NISTmt = (); +my %BLEUmt = (); + +###### +# Evaluate +print " Evaluation of $src_lang-to-$tgt_lang translation using:\n"; +my $cum_seg = 0; +foreach my $doc (sort keys %eval_docs) { + $cum_seg += @{$eval_docs{$doc}{SEGS}}; +} +print " src set \"$src_id\" (", scalar keys %eval_docs, " docs, $cum_seg segs)\n"; +print " ref set \"$ref_id\" (", scalar keys %ref_data, " refs)\n"; +print " tst set \"$tst_id\" (", scalar keys %tst_data, " systems)\n\n"; + +foreach my $sys (sort @tst_sys) { + for (my $n=1; $n<=$max_Ngram; $n++) { + $NISTmt{$n}{$sys}{cum} = 0; + $NISTmt{$n}{$sys}{ind} = 0; + $BLEUmt{$n}{$sys}{cum} = 0; + $BLEUmt{$n}{$sys}{ind} = 0; + } + + if (($METHOD eq "BOTH") || ($METHOD eq "NIST")) { + $method="NIST"; + score_system ($sys, %NISTmt); + } + if (($METHOD eq "BOTH") || ($METHOD eq "BLEU")) { + $method="BLEU"; + score_system ($sys, %BLEUmt); + } +} + +###### +printout_report (); + +($date, $time) = date_time_stamp(); +print "MT evaluation 
scorer ended on $date at $time\n"; + +exit 0; + +################################# + +sub get_source_info { + + my ($file) = @_; + my ($name, $id, $src, $doc); + my ($data, $tag, $span); + + +#read data from file + open (FILE, $file) or die "\nUnable to open translation data file '$file'", $usage; + binmode FILE, ":utf8"; + $data .= $_ while ; + close (FILE); + +#get source set info + die "\n\nFATAL INPUT ERROR: no 'src_set' tag in src_file '$file'\n\n" + unless ($tag, $span, $data) = extract_sgml_tag_and_span ("SrcSet", $data); + + die "\n\nFATAL INPUT ERROR: no tag attribute '$name' in file '$file'\n\n" + unless ($id) = extract_sgml_tag_attribute ($name="SetID", $tag); + + die "\n\nFATAL INPUT ERROR: no tag attribute '$name' in file '$file'\n\n" + unless ($src) = extract_sgml_tag_attribute ($name="SrcLang", $tag); + die "\n\nFATAL INPUT ERROR: $name ('$src') in file '$file' inconsistent\n" + ." with $name in previous input data ('$src_lang')\n\n" + unless (not defined $src_lang or $src eq $src_lang); + $src_lang = $src; + +#get doc info -- ID and # of segs + $data = $span; + while (($tag, $span, $data) = extract_sgml_tag_and_span ("Doc", $data)) { + die "\n\nFATAL INPUT ERROR: no tag attribute '$name' in file '$file'\n\n" + unless ($doc) = extract_sgml_tag_attribute ($name="DocID", $tag); + die "\n\nFATAL INPUT ERROR: duplicate '$name' in file '$file'\n\n" + if defined $eval_docs{$doc}; + $span =~ s/[\s\n\r]+/ /g; # concatenate records + my $jseg=0, my $seg_data = $span; + while (($tag, $span, $seg_data) = extract_sgml_tag_and_span ("Seg", $seg_data)) { + ($eval_docs{$doc}{SEGS}[$jseg++]) = NormalizeText ($span); + } + die "\n\nFATAL INPUT ERROR: no segments in document '$doc' in file '$file'\n\n" + if $jseg == 0; + } + die "\n\nFATAL INPUT ERROR: no documents in file '$file'\n\n" + unless keys %eval_docs > 0; + return $id; +} + +################################# + +sub get_MT_data { + + my ($docs, $set_tag, $file) = @_; + my ($name, $id, $src, $tgt, $sys, $doc); 
+ my ($tag, $span, $data); + +#read data from file + open (FILE, $file) or die "\nUnable to open translation data file '$file'", $usage; + binmode FILE, ":utf8"; + $data .= $_ while ; + close (FILE); + +#get tag info + while (($tag, $span, $data) = extract_sgml_tag_and_span ($set_tag, $data)) { + die "\n\nFATAL INPUT ERROR: no tag attribute '$name' in file '$file'\n\n" unless + ($id) = extract_sgml_tag_attribute ($name="SetID", $tag); + + die "\n\nFATAL INPUT ERROR: no tag attribute '$name' in file '$file'\n\n" unless + ($src) = extract_sgml_tag_attribute ($name="SrcLang", $tag); + die "\n\nFATAL INPUT ERROR: $name ('$src') in file '$file' inconsistent\n" + ." with $name of source ('$src_lang')\n\n" + unless $src eq $src_lang; + + die "\n\nFATAL INPUT ERROR: no tag attribute '$name' in file '$file'\n\n" unless + ($tgt) = extract_sgml_tag_attribute ($name="TrgLang", $tag); + die "\n\nFATAL INPUT ERROR: $name ('$tgt') in file '$file' inconsistent\n" + ." with $name of the evaluation ('$tgt_lang')\n\n" + unless (not defined $tgt_lang or $tgt eq $tgt_lang); + $tgt_lang = $tgt; + + my $mtdata = $span; + while (($tag, $span, $mtdata) = extract_sgml_tag_and_span ("Doc", $mtdata)) { + die "\n\nFATAL INPUT ERROR: no tag attribute '$name' in file '$file'\n\n" unless + (my $sys) = extract_sgml_tag_attribute ($name="SysID", $tag); + + die "\n\nFATAL INPUT ERROR: no tag attribute '$name' in file '$file'\n\n" unless + $doc = extract_sgml_tag_attribute ($name="DocID", $tag); + + die "\n\nFATAL INPUT ERROR: document '$doc' for system '$sys' in file '$file'\n" + ." 
previously loaded from file '$docs->{$sys}{$doc}{FILE}'\n\n" + unless (not defined $docs->{$sys}{$doc}); + + $span =~ s/[\s\n\r]+/ /g; # concatenate records + my $jseg=0, my $seg_data = $span; + while (($tag, $span, $seg_data) = extract_sgml_tag_and_span ("Seg", $seg_data)) { + ($docs->{$sys}{$doc}{SEGS}[$jseg++]) = NormalizeText ($span); + } + die "\n\nFATAL INPUT ERROR: no segments in document '$doc' in file '$file'\n\n" + if $jseg == 0; + $docs->{$sys}{$doc}{FILE} = $file; + } + } + return $id; +} + +################################# + +sub check_MT_data { + + @tst_sys = sort keys %tst_data; + @ref_sys = sort keys %ref_data; + +#every evaluation document must be represented for every system and every reference + foreach my $doc (sort keys %eval_docs) { + my $nseg_source = @{$eval_docs{$doc}{SEGS}}; + foreach my $sys (@tst_sys) { + die "\n\nFATAL ERROR: no document '$doc' for system '$sys'\n\n" + unless defined $tst_data{$sys}{$doc}; + my $nseg = @{$tst_data{$sys}{$doc}{SEGS}}; + die "\n\nFATAL ERROR: translated documents must contain the same # of segments as the source, but\n" + ." document '$doc' for system '$sys' contains $nseg segments, while\n" + ." the source document contains $nseg_source segments.\n\n" + unless $nseg == $nseg_source; + } + + foreach my $sys (@ref_sys) { + die "\n\nFATAL ERROR: no document '$doc' for reference '$sys'\n\n" + unless defined $ref_data{$sys}{$doc}; + my $nseg = @{$ref_data{$sys}{$doc}{SEGS}}; + die "\n\nFATAL ERROR: translated documents must contain the same # of segments as the source, but\n" + ." document '$doc' for system '$sys' contains $nseg segments, while\n" + ." 
the source document contains $nseg_source segments.\n\n" + unless $nseg == $nseg_source; + } + } +} + +################################# + +sub compute_ngram_info { + + my ($ref, $doc, $seg); + my (@wrds, $tot_wrds, %ngrams, $ngram, $mgram); + my (%ngram_count, @tot_ngrams); + + foreach $ref (keys %ref_data) { + foreach $doc (keys %{$ref_data{$ref}}) { + foreach $seg (@{$ref_data{$ref}{$doc}{SEGS}}) { + @wrds = split /\s+/, $seg; + $tot_wrds += @wrds; + %ngrams = %{Words2Ngrams (@wrds)}; + foreach $ngram (keys %ngrams) { + $ngram_count{$ngram} += $ngrams{$ngram}; + } + } + } + } + + foreach $ngram (keys %ngram_count) { + @wrds = split / /, $ngram; + pop @wrds, $mgram = join " ", @wrds; + $ngram_info{$ngram} = - log + ($mgram ? $ngram_count{$ngram}/$ngram_count{$mgram} + : $ngram_count{$ngram}/$tot_wrds) / log 2; + if (defined $opt_x and $opt_x eq "ngram info") { + @wrds = split / /, $ngram; + printf "ngram info:%9.4f%6d%6d%8d%3d %s\n", $ngram_info{$ngram}, $ngram_count{$ngram}, + $mgram ? 
$ngram_count{$mgram} : $tot_wrds, $tot_wrds, scalar @wrds, $ngram; + } + } +} + +################################# + +sub score_system { + + my ($sys, $ref, $doc, %SCOREmt); + ($sys, %SCOREmt) = @_; + my ($shortest_ref_length, $match_cnt, $tst_cnt, $ref_cnt, $tst_info, $ref_info); + my ($cum_ref_length, @cum_match, @cum_tst_cnt, @cum_ref_cnt, @cum_tst_info, @cum_ref_info); + + $cum_ref_length = 0; + for (my $j=1; $j<=$max_Ngram; $j++) { + $cum_match[$j] = $cum_tst_cnt[$j] = $cum_ref_cnt[$j] = $cum_tst_info[$j] = $cum_ref_info[$j] = 0; + } + + foreach $doc (sort keys %eval_docs) { + ($shortest_ref_length, $match_cnt, $tst_cnt, $ref_cnt, $tst_info, $ref_info) = score_document ($sys, $doc); + +#output document summary score + if (($detail >= 1 ) && ($METHOD eq "NIST")) { + my %DOCmt = (); + printf "$method score using 5-grams = %.4f for system \"$sys\" on document \"$doc\" (%d segments, %d words)\n", + nist_score (scalar @ref_sys, $match_cnt, $tst_cnt, $ref_cnt, $tst_info, $ref_info, $sys, %DOCmt), + scalar @{$tst_data{$sys}{$doc}{SEGS}}, $tst_cnt->[1]; + } + if (($detail >= 1 ) && ($METHOD eq "BLEU")) { + my %DOCmt = (); + printf "$method score using 4-grams = %.4f for system \"$sys\" on document \"$doc\" (%d segments, %d words)\n", + bleu_score($shortest_ref_length, $match_cnt, $tst_cnt, $sys, %DOCmt), + scalar @{$tst_data{$sys}{$doc}{SEGS}}, $tst_cnt->[1]; + } + + $cum_ref_length += $shortest_ref_length; + for (my $j=1; $j<=$max_Ngram; $j++) { + $cum_match[$j] += $match_cnt->[$j]; + $cum_tst_cnt[$j] += $tst_cnt->[$j]; + $cum_ref_cnt[$j] += $ref_cnt->[$j]; + $cum_tst_info[$j] += $tst_info->[$j]; + $cum_ref_info[$j] += $ref_info->[$j]; + printf "document info: $sys $doc %d-gram %d %d %d %9.4f %9.4f\n", $j, $match_cnt->[$j], + $tst_cnt->[$j], $ref_cnt->[$j], $tst_info->[$j], $ref_info->[$j] + if (defined $opt_x and $opt_x eq "document info"); + } + } + +#x #output system summary score +#x printf "$method score = %.4f for system \"$sys\"\n", +#x $method eq "BLEU" ? 
bleu_score($cum_ref_length, \@cum_match, \@cum_tst_cnt) : +#x nist_score (scalar @ref_sys, \@cum_match, \@cum_tst_cnt, \@cum_ref_cnt, \@cum_tst_info, \@cum_ref_info, $sys, %SCOREmt); + if ($method eq "BLEU") { + bleu_score($cum_ref_length, \@cum_match, \@cum_tst_cnt, $sys, %SCOREmt); + } + if ($method eq "NIST") { + nist_score (scalar @ref_sys, \@cum_match, \@cum_tst_cnt, \@cum_ref_cnt, \@cum_tst_info, \@cum_ref_info, $sys, %SCOREmt); + } +} + +################################# + +sub score_document { + + my ($sys, $ref, $doc); + ($sys, $doc) = @_; + my ($shortest_ref_length, $match_cnt, $tst_cnt, $ref_cnt, $tst_info, $ref_info); + my ($cum_ref_length, @cum_match, @cum_tst_cnt, @cum_ref_cnt, @cum_tst_info, @cum_ref_info); + + $cum_ref_length = 0; + for (my $j=1; $j<=$max_Ngram; $j++) { + $cum_match[$j] = $cum_tst_cnt[$j] = $cum_ref_cnt[$j] = $cum_tst_info[$j] = $cum_ref_info[$j] = 0; + } + +#score each segment + for (my $jseg=0; $jseg<@{$tst_data{$sys}{$doc}{SEGS}}; $jseg++) { + my @ref_segments = (); + foreach $ref (@ref_sys) { + push @ref_segments, $ref_data{$ref}{$doc}{SEGS}[$jseg]; + printf "ref '$ref', seg %d: %s\n", $jseg+1, $ref_data{$ref}{$doc}{SEGS}[$jseg] + if $detail >= 3; + } + printf "sys '$sys', seg %d: %s\n", $jseg+1, $tst_data{$sys}{$doc}{SEGS}[$jseg] + if $detail >= 3; + ($shortest_ref_length, $match_cnt, $tst_cnt, $ref_cnt, $tst_info, $ref_info) = + score_segment ($tst_data{$sys}{$doc}{SEGS}[$jseg], @ref_segments); + +#output segment summary score +#x printf "$method score = %.4f for system \"$sys\" on segment %d of document \"$doc\" (%d words)\n", +#x $method eq "BLEU" ? 
bleu_score($shortest_ref_length, $match_cnt, $tst_cnt) : +#x nist_score (scalar @ref_sys, $match_cnt, $tst_cnt, $ref_cnt, $tst_info, $ref_info), +#x $jseg+1, $tst_cnt->[1] +#x if $detail >= 2; + if (($detail >=2) && ($METHOD eq "BLEU")) { + my %DOCmt = (); + printf " $method score using 4-grams = %.4f for system \"$sys\" on segment %d of document \"$doc\" (%d words)\n", + bleu_score($shortest_ref_length, $match_cnt, $tst_cnt, $sys, %DOCmt), $jseg+1, $tst_cnt->[1]; + } + if (($detail >=2) && ($METHOD eq "NIST")) { + my %DOCmt = (); + printf " $method score using 5-grams = %.4f for system \"$sys\" on segment %d of document \"$doc\" (%d words)\n", + nist_score (scalar @ref_sys, $match_cnt, $tst_cnt, $ref_cnt, $tst_info, $ref_info, $sys, %DOCmt), $jseg+1, $tst_cnt->[1]; + } + + + $cum_ref_length += $shortest_ref_length; + for (my $j=1; $j<=$max_Ngram; $j++) { + $cum_match[$j] += $match_cnt->[$j]; + $cum_tst_cnt[$j] += $tst_cnt->[$j]; + $cum_ref_cnt[$j] += $ref_cnt->[$j]; + $cum_tst_info[$j] += $tst_info->[$j]; + $cum_ref_info[$j] += $ref_info->[$j]; + } + } + return ($cum_ref_length, [@cum_match], [@cum_tst_cnt], [@cum_ref_cnt], [@cum_tst_info], [@cum_ref_info]); +} + +################################# + +sub score_segment { + + my ($tst_seg, @ref_segs) = @_; + my (@tst_wrds, %tst_ngrams, @match_count, @tst_count, @tst_info); + my (@ref_wrds, $ref_seg, %ref_ngrams, %ref_ngrams_max, @ref_count, @ref_info); + my ($ngram); + my (@nwrds_ref); + my $shortest_ref_length; + + for (my $j=1; $j<= $max_Ngram; $j++) { + $match_count[$j] = $tst_count[$j] = $ref_count[$j] = $tst_info[$j] = $ref_info[$j] = 0; + } + +# get the ngram counts for the test segment + @tst_wrds = split /\s+/, $tst_seg; + %tst_ngrams = %{Words2Ngrams (@tst_wrds)}; + for (my $j=1; $j<=$max_Ngram; $j++) { # compute ngram counts + $tst_count[$j] = $j<=@tst_wrds ? 
(@tst_wrds - $j + 1) : 0; + } + +# get the ngram counts for the reference segments + foreach $ref_seg (@ref_segs) { + @ref_wrds = split /\s+/, $ref_seg; + %ref_ngrams = %{Words2Ngrams (@ref_wrds)}; + foreach $ngram (keys %ref_ngrams) { # find the maximum # of occurrences + my @wrds = split / /, $ngram; + $ref_info[@wrds] += $ngram_info{$ngram}; + $ref_ngrams_max{$ngram} = defined $ref_ngrams_max{$ngram} ? + max ($ref_ngrams_max{$ngram}, $ref_ngrams{$ngram}) : + $ref_ngrams{$ngram}; + } + for (my $j=1; $j<=$max_Ngram; $j++) { # update ngram counts + $ref_count[$j] += $j<=@ref_wrds ? (@ref_wrds - $j + 1) : 0; + } + $shortest_ref_length = scalar @ref_wrds # find the shortest reference segment + if (not defined $shortest_ref_length) or @ref_wrds < $shortest_ref_length; + } + +# accumulate scoring stats for tst_seg ngrams that match ref_seg ngrams + foreach $ngram (keys %tst_ngrams) { + next unless defined $ref_ngrams_max{$ngram}; + my @wrds = split / /, $ngram; + $tst_info[@wrds] += $ngram_info{$ngram} * min($tst_ngrams{$ngram},$ref_ngrams_max{$ngram}); + $match_count[@wrds] += my $count = min($tst_ngrams{$ngram},$ref_ngrams_max{$ngram}); + printf "%.2f info for each of $count %d-grams = '%s'\n", $ngram_info{$ngram}, scalar @wrds, $ngram + if $detail >= 3; + } + + return ($shortest_ref_length, [@match_count], [@tst_count], [@ref_count], [@tst_info], [@ref_info]); +} + +################################# + +sub bleu_score { + + my ($shortest_ref_length, $matching_ngrams, $tst_ngrams, $sys, %SCOREmt) = @_; + + my $score = 0; + my $iscore = 0; + my $len_score = min (0, 1-$shortest_ref_length/$tst_ngrams->[1]); + print "length ratio: ".($tst_ngrams->[1]/$shortest_ref_length)." 
($tst_ngrams->[1]/$shortest_ref_length), penalty (log): $len_score\n"; + + for (my $j=1; $j<=$max_Ngram; $j++) { + if ($matching_ngrams->[$j] == 0) { + $SCOREmt{$j}{$sys}{cum}=0; + } else { +# Cumulative N-Gram score + $score += log ($matching_ngrams->[$j]/$tst_ngrams->[$j]); + $SCOREmt{$j}{$sys}{cum} = exp($score/$j + $len_score); +# Individual N-Gram score + $iscore = log ($matching_ngrams->[$j]/$tst_ngrams->[$j]); + $SCOREmt{$j}{$sys}{ind} = exp($iscore); + } + } + return $SCOREmt{4}{$sys}{cum}; +} + +################################# + +sub nist_score { + + my ($nsys, $matching_ngrams, $tst_ngrams, $ref_ngrams, $tst_info, $ref_info, $sys, %SCOREmt) = @_; + + my $score = 0; + my $iscore = 0; + + + for (my $n=1; $n<=$max_Ngram; $n++) { + $score += $tst_info->[$n]/max($tst_ngrams->[$n],1); + $SCOREmt{$n}{$sys}{cum} = $score * nist_length_penalty($tst_ngrams->[1]/($ref_ngrams->[1]/$nsys)); + + $iscore = $tst_info->[$n]/max($tst_ngrams->[$n],1); + $SCOREmt{$n}{$sys}{ind} = $iscore * nist_length_penalty($tst_ngrams->[1]/($ref_ngrams->[1]/$nsys)); + } + return $SCOREmt{5}{$sys}{cum}; +} + +################################# + +sub Words2Ngrams { #convert a string of words to an Ngram count hash + + my %count = (); + + for (; @_; shift) { + my ($j, $ngram, $word); + for ($j=0; $j<$max_Ngram and defined($word=$_[$j]); $j++) { + $ngram .= defined $ngram ? 
" $word" : $word; + $count{$ngram}++; + } + } + return {%count}; +} + +################################# + +sub NormalizeText { + my ($norm_text) = @_; + + $norm_text =~ s///g; # strip "skipped" tags + $norm_text =~ s/\p{Hyphen}\p{Zl}//g; # strip end-of-line hyphenation and join lines + $norm_text =~ s/\p{Zl}/ /g; # join lines + + # replace entities + $norm_text =~ s/"/\"/g; # quote to " + $norm_text =~ s/&/&/g; # ampersand to & + $norm_text =~ s/<//g; # greater-than to > + $norm_text =~ s/'/\'/g; # apostrophe to ' + + $norm_text = lc( $norm_text ) unless $preserve_case; # lowercasing if needed + $norm_text =~ s/([^[:ascii:]])/ $1 /g if ( $split_non_ASCII ); + + # punctuation: tokenize any punctuation unless followed AND preceded by a digit + $norm_text =~ s/(\P{N})(\p{P})/$1 $2 /g; + $norm_text =~ s/(\p{P})(\P{N})/ $1 $2/g; + + $norm_text =~ s/(\p{S})/ $1 /g; # tokenize symbols + + $norm_text =~ s/\p{Z}+/ /g; # one space only between words + $norm_text =~ s/^\p{Z}+//; # no leading space + $norm_text =~ s/\p{Z}+$//; # no trailing space + + return $norm_text; +} + +################################# + +sub nist_length_penalty { + + my ($ratio) = @_; + return 1 if $ratio >= 1; + return 0 if $ratio <= 0; + my $ratio_x = 1.5; + my $score_x = 0.5; + my $beta = -log($score_x)/log($ratio_x)/log($ratio_x); + return exp (-$beta*log($ratio)*log($ratio)); +} + +################################# + +sub date_time_stamp { + + my ($sec, $min, $hour, $mday, $mon, $year, $wday, $yday, $isdst) = localtime(); + my @months = qw(Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec); + my ($date, $time); + + $time = sprintf "%2.2d:%2.2d:%2.2d", $hour, $min, $sec; + $date = sprintf "%4.4s %3.3s %s", 1900+$year, $months[$mon], $mday; + return ($date, $time); +} + +################################# + +sub extract_sgml_tag_and_span { + + my ($name, $data) = @_; + + ($data =~ m|<$name\s*([^>]*)>(.*?)(.*)|si) ? 
($1, $2, $3) : (); +} + +################################# + +sub extract_sgml_tag_attribute { + + my ($name, $data) = @_; + + ($data =~ m|$name\s*=\s*\"([^\"]*)\"|si) ? ($1) : (); +} + +################################# + +sub max { + + my ($max, $next); + + return unless defined ($max=pop); + while (defined ($next=pop)) { + $max = $next if $next > $max; + } + return $max; +} + +################################# + +sub min { + + my ($min, $next); + + return unless defined ($min=pop); + while (defined ($next=pop)) { + $min = $next if $next < $min; + } + return $min; +} + +################################# + +sub printout_report +{ + + if ( $METHOD eq "BOTH" ) { + foreach my $sys (sort @tst_sys) { + printf "NIST score = %2.4f BLEU score = %.4f for system \"$sys\"\n",$NISTmt{5}{$sys}{cum},$BLEUmt{4}{$sys}{cum}; + } + } elsif ($METHOD eq "NIST" ) { + foreach my $sys (sort @tst_sys) { + printf "NIST score = %2.4f for system \"$sys\"\n",$NISTmt{5}{$sys}{cum}; + } + } elsif ($METHOD eq "BLEU" ) { + foreach my $sys (sort @tst_sys) { + printf "\nBLEU score = %.4f for system \"$sys\"\n",$BLEUmt{4}{$sys}{cum}; + } + } + + + printf "\n# ------------------------------------------------------------------------\n\n"; + printf "Individual N-gram scoring\n"; + printf " 1-gram 2-gram 3-gram 4-gram 5-gram 6-gram 7-gram 8-gram 9-gram\n"; + printf " ------ ------ ------ ------ ------ ------ ------ ------ ------\n"; + + if (( $METHOD eq "BOTH" ) || ($METHOD eq "NIST")) { + foreach my $sys (sort @tst_sys) { + printf " NIST:"; + for (my $i=1; $i<=$max_Ngram; $i++) { + printf " %2.4f ",$NISTmt{$i}{$sys}{ind} + } + printf " \"$sys\"\n"; + } + printf "\n"; + } + + if (( $METHOD eq "BOTH" ) || ($METHOD eq "BLEU")) { + foreach my $sys (sort @tst_sys) { + printf " BLEU:"; + for (my $i=1; $i<=$max_Ngram; $i++) { + printf " %2.4f ",$BLEUmt{$i}{$sys}{ind} + } + printf " \"$sys\"\n"; + } + } + + printf "\n# ------------------------------------------------------------------------\n"; + printf 
"Cumulative N-gram scoring\n"; + printf " 1-gram 2-gram 3-gram 4-gram 5-gram 6-gram 7-gram 8-gram 9-gram\n"; + printf " ------ ------ ------ ------ ------ ------ ------ ------ ------\n"; + + if (( $METHOD eq "BOTH" ) || ($METHOD eq "NIST")) { + foreach my $sys (sort @tst_sys) { + printf " NIST:"; + for (my $i=1; $i<=$max_Ngram; $i++) { + printf " %2.4f ",$NISTmt{$i}{$sys}{cum} + } + printf " \"$sys\"\n"; + } + } + printf "\n"; + + + if (( $METHOD eq "BOTH" ) || ($METHOD eq "BLEU")) { + foreach my $sys (sort @tst_sys) { + printf " BLEU:"; + for (my $i=1; $i<=$max_Ngram; $i++) { + printf " %2.4f ",$BLEUmt{$i}{$sys}{cum} + } + printf " \"$sys\"\n"; + } + } +} diff --git a/mosesdecoder/scripts/generic/mteval-v13a.pl b/mosesdecoder/scripts/generic/mteval-v13a.pl new file mode 100644 index 0000000000000000000000000000000000000000..c7749269886c2a006b6211dc139a0340fb8d65a8 --- /dev/null +++ b/mosesdecoder/scripts/generic/mteval-v13a.pl @@ -0,0 +1,1170 @@ +#!/usr/bin/env perl + +use warnings; +use strict; +use utf8; +use Encode; +use XML::Twig; + +binmode STDOUT, ":utf8"; +binmode STDERR, ":utf8"; + + +################################# +# History: +# +# version 13a +# * modified the scoring functions to prevent division-by-zero errors when a system segment is empty +# * affected methods: 'bleu_score' and 'bleu_score_smoothing' +# * use \p{Line_Breaks} instead of \p{Hyphen} when stripping end-of-line hyphenation and join lines +# * because \p{Hyphen} is deprecated since 2016-06-01, see http://www.unicode.org/reports/tr14/#Hyphen +# +# version 13 +# * Uses a XML parser to read data (only when extension is .xml) +# * Smoothing of the segment-level BLEU scores, done by default +# * smoothing method similar to that of bleu-1.04.pl (IBM) +# * see comments above the 'bleu_score' method for more details on how the smoothing is computed +# * added a '--no-smoothing' option to simulate old scripts behavior +# * Introduction of the 'brevity-penalty' option, taking one of two values: 
+# * 'closest' (default) : act as IBM BLEU (taking the closest reference translation length) +# * in case two reference translations are at the same distance, will take the shortest one +# * for more details regarding how the BP is computed, see comments of the 'brevity_penalty_closest' function +# * 'shortest' : act as previous versions of the script (taking shortest reference translation length) +# * Introduction of the 'international-tokenization' option, boolean, disabled by default +# by default (when the option is not provided), uses 11b's tokenization function +# when option specified, uses v12's tokenization function +# * Introduction of a 'Metrics MATR output' flag (option '--metricsMATR') +# when used, creates three files for both BLEU score and NIST score: +# * BLEU-seg.scr and NIST-seg.scr: contain segment-level scores +# * BLEU-doc.scr and NIST-doc.scr: contain document-level scores +# * BLEU-sys.scr and NIST-sys.scr: contain system-level scores +# * SGML parsing +# * script will halt if source, reference and test files don't share the same setid attribute value (used for metricsMATR output) +# * correct segment IDs extracted from the files (was previously using an array, and using the index as a segID for output) +# * detailed output flag (-d) can now be used when running both BLEU and NIST +# +# version 12 +# * Text normalization changes: +# * convert entity references (only the entities declared in the DTD) +# * now uses unicode categories +# * tokenize punctuation unless followed AND preceded by digits +# * tokenize symbols +# * UTF-8 handling: +# * files are now read using utf8 mode +# * Added the '-e' command-line option to enclose non-ASCII characters between spaces +# +# version 11b -- text normalization modified: +# * take out the join digit line because it joins digits +# when it shouldn't have +# $norm_text =~ s/(\d)\s+(?=\d)/$1/g; #join digits +# +# version 11a -- corrected output of individual n-gram precision values +# +# version 11 -- 
bug fixes: +# * make filehandle operate in binary mode to prevent Perl from operating +# (by default in Red Hat 9) in UTF-8 +# * fix failure on joining digits +# version 10 -- updated output to include more details of n-gram scoring. +# Defaults to generate both NIST and BLEU scores. Use -b for BLEU +# only, use -n for NIST only +# +# version 09d -- bug fix (for BLEU scoring, ngrams were fixed at 4 +# being the max, regardless what was entered on the command line.) +# +# version 09c -- bug fix (During the calculation of ngram information, +# each ngram was being counted only once for each segment. This has +# been fixed so that each ngram is counted correctly in each segment.) +# +# version 09b -- text normalization modified: +# * option flag added to preserve upper case +# * non-ASCII characters left in place. +# +# version 09a -- text normalization modified: +# * " and & converted to "" and &, respectively +# * non-ASCII characters kept together (bug fix) +# +# version 09 -- modified to accommodate sgml tag and attribute +# names revised to conform to default SGML conventions. +# +# version 08 -- modifies the NIST metric in accordance with the +# findings on the 2001 Chinese-English dry run corpus. Also +# incorporates the BLEU metric as an option and supports the +# output of ngram detail. +# +# version 07 -- in response to the MT meeting on 28 Jan 2002 at ISI +# Keep strings of non-ASCII characters together as one word +# (rather than splitting them into one-character words). +# Change length penalty so that translations that are longer than +# the average reference translation are not penalized. +# +# version 06 +# Prevent divide-by-zero when a segment has no evaluation N-grams. +# Correct segment index for level 3 debug output. 
+# +# version 05 +# improve diagnostic error messages +# +# version 04 +# tag segments +# +# version 03 +# add detailed output option (intermediate document and segment scores) +# +# version 02 +# accommodation of modified sgml tags and attributes +# +# version 01 +# same as bleu version 15, but modified to provide formal score output. +# +# original IBM version +# Author: Kishore Papineni +# Date: 06/10/2001 +################################# + +###### +# Intro +my ($date, $time) = date_time_stamp(); +print "MT evaluation scorer began on $date at $time\n"; +print "command line: ", $0, " ", join(" ", @ARGV), "\n"; +my $usage = "\n\nUsage: $0 -r -s -t \n\n". + "Description: This Perl script evaluates MT system performance.\n". + "\n". + "Required arguments:\n". + " -r is a file containing the reference translations for\n". + " the documents to be evaluated.\n". + " -s is a file containing the source documents for which\n". + " translations are to be evaluated\n". + " -t is a file containing the translations to be evaluated\n". + "\n". + "Optional arguments:\n". + " -h prints this help message to STDOUT\n". + " -c preserves upper-case alphabetic characters\n". + " -b generate BLEU scores only\n". + " -n generate NIST scores only\n". + " -d detailed output flag:\n". + " 0 (default) for system-level score only\n". + " 1 to include document-level scores\n". + " 2 to include segment-level scores\n". + " 3 to include ngram-level scores\n". + " -e enclose non-ASCII characters between spaces\n". + " --brevity-penalty ( closest | shortest )\n" . + " closest (default) : acts as IBM BLEU (takes the closest reference translation length)\n" . + " shortest : acts as previous versions of the script (takes the shortest reference translation length)\n" . + " --international-tokenization\n" . + " when specified, uses Unicode-based (only) tokenization rules\n" . + " when not specified (default), uses default tokenization (some language-dependant rules)\n" . 
+ " --metricsMATR : create three files for both BLEU scores and NIST scores:\n" . + " BLEU-seg.scr and NIST-seg.scr : segment-level scores\n" . + " BLEU-doc.scr and NIST-doc.scr : document-level scores\n" . + " BLEU-sys.scr and NIST-sys.scr : system-level scores\n" . + " --no-smoothing : disable smoothing on BLEU scores\n" . + "\n"; + +use vars qw ($opt_r $opt_s $opt_t $opt_d $opt_h $opt_b $opt_n $opt_c $opt_x $opt_e); +use Getopt::Long; +my $ref_file = ''; +my $src_file = ''; +my $tst_file = ''; +my $detail = 0; +my $help = ''; +my $preserve_case = ''; +my $split_non_ASCII = ''; +my $brevity_penalty = 'closest'; +my $international_tokenization; +my $metricsMATR_output = ''; +my $no_smoothing = ''; +our $opt_x = ''; +our $opt_b = ''; +our $opt_n = ''; +GetOptions( + 'r=s' => \$ref_file, + 's=s' => \$src_file, + 't=s' => \$tst_file, + 'd:i' => \$detail, + 'h|help' => \$help, + 'b', + 'n', + 'c' => \$preserve_case, + 'x:s', + 'e' => \$split_non_ASCII, + 'brevity-penalty:s' => \$brevity_penalty, + 'international-tokenization' => \$international_tokenization, + 'metricsMATR-output' => \$metricsMATR_output, + 'no-smoothing' => \$no_smoothing +); +die $usage if $help; + +die "Error in command line: ref_file not defined$usage" unless ( $ref_file ); +die "Error in command line: src_file not defined$usage" unless ( $src_file ); +die "Error in command line: tst_file not defined$usage" unless ( $tst_file ); +my $BLEU_BP; +if ( !( $brevity_penalty cmp 'closest' ) ) +{ + $BLEU_BP = \&brevity_penalty_closest; +} +elsif ( !( $brevity_penalty cmp 'shortest' ) ) +{ + $BLEU_BP = \&brevity_penalty_shortest; +} +else +{ + die "Incorrect value supplied for 'brevity_penalty'$usage"; +} +my $TOKENIZATION = \&tokenization; +$TOKENIZATION = \&tokenization_international if ( $international_tokenization ); + +my $BLEU_SCORE = \&bleu_score; +$BLEU_SCORE = \&bleu_score_nosmoothing if ( $no_smoothing ); + +my $max_Ngram = 9; + +my $METHOD = "BOTH"; +if ( $opt_b ) { $METHOD = "BLEU"; } +if ( 
$opt_n ) { $METHOD = "NIST"; } +my $method; + +###### +# Global variables +my ($src_lang, $tgt_lang, @tst_sys, @ref_sys); # evaluation parameters +my (%tst_data, %ref_data); # the data -- with structure: {system}{document}{segments} +my ($src_id, $ref_id, $tst_id); # unique identifiers for ref and tst translation sets +my %eval_docs; # document information for the evaluation data set +my %ngram_info; # the information obtained from (the last word in) the ngram + +###### +# Get source document ID's +($src_id) = get_source_info ($src_file); + +###### +# Get reference translations +($ref_id) = get_MT_data (\%ref_data, "RefSet", $ref_file); + +compute_ngram_info (); + +###### +# Get translations to evaluate +($tst_id) = get_MT_data (\%tst_data, "TstSet", $tst_file); + +###### +# Check data for completeness and correctness +check_MT_data (); + +###### +# +my %NISTmt; +my %NISTOverall; +my %BLEUmt; +my %BLEUOverall; + +###### +# Evaluate +print " Evaluation of $src_lang-to-$tgt_lang translation using:\n"; +my $cum_seg = 0; +foreach my $doc (sort keys %eval_docs) +{ + $cum_seg += scalar( keys( %{$eval_docs{$doc}{SEGS}} ) ); +} +print " src set \"$src_id\" (", scalar keys %eval_docs, " docs, $cum_seg segs)\n"; +print " ref set \"$ref_id\" (", scalar keys %ref_data, " refs)\n"; +print " tst set \"$tst_id\" (", scalar keys %tst_data, " systems)\n\n"; + +foreach my $sys (sort @tst_sys) +{ + for (my $n=1; $n<=$max_Ngram; $n++) + { + $NISTmt{$n}{$sys}{cum} = 0; + $NISTmt{$n}{$sys}{ind} = 0; + $BLEUmt{$n}{$sys}{cum} = 0; + $BLEUmt{$n}{$sys}{ind} = 0; + } + if ( ($METHOD eq "BOTH") || ($METHOD eq "NIST") ) + { + $method="NIST"; + score_system ($sys, \%NISTmt, \%NISTOverall); + } + if ( ($METHOD eq "BOTH") || ($METHOD eq "BLEU") ) + { + $method="BLEU"; + score_system ($sys, \%BLEUmt, \%BLEUOverall); + } +} + +###### +printout_report (); +if ( $metricsMATR_output ) +{ + outputMetricsMATR( 'NIST', %NISTOverall ) if ( ( $METHOD eq 'BOTH' ) || ( $METHOD eq 'NIST' ) ); + 
outputMetricsMATR( 'BLEU', %BLEUOverall ) if ( ( $METHOD eq 'BOTH' ) || ( $METHOD eq 'BLEU' ) ); +} + +($date, $time) = date_time_stamp(); +print "MT evaluation scorer ended on $date at $time\n"; + +exit 0; + +################################# + +sub get_source_info +{ + my ($file) = @_; + my ($name, $id, $src, $doc, $seg); + my ($data, $tag, $span); + + # Extension of the file determines the parser used: + # .xml : XML::Twig + # otherwise : simple SGML parsing functions + if ( $file =~ /\.xml$/i ) + { + my $twig = XML::Twig->new(); + $twig->parsefile( $file ); + my $root = $twig->root; + my $currentSet = $root->first_child( 'srcset' ); + die "Source XML file '$file' does not contain the 'srcset' element" if ( not $currentSet ); + $id = $currentSet->{ 'att' }->{ 'setid' } or die "No 'setid' attribute value in '$file'"; + $src = $currentSet->{ 'att' }->{ 'srclang' } or die "No srcset 'srclang' attribute value in '$file'"; + die "Not the same srclang attribute values across sets" unless ( not defined $src_lang or $src eq $src_lang ); + $src_lang = $src; + foreach my $currentDoc ( $currentSet->get_xpath( './/doc' ) ) + { + my $docID = $currentDoc->{ 'att' }->{ 'docid' } or die "No document 'docid' attribute value in '$file'"; + foreach my $currentSeg ( $currentDoc->get_xpath( './/seg' ) ) + { + my $segID = $currentSeg->{ 'att' }->{ 'id' } or die "No segment 'id' attribute value in '$file'"; + my $segData = $currentSeg->text; + ($eval_docs{$docID}{SEGS}{$segID}) = &{ $TOKENIZATION }( $segData ); + } + } + } + else + { + #read data from file + open (FILE, $file) or die "\nUnable to open translation data file '$file'", $usage; + binmode FILE, ":utf8"; + $data .= $_ while ; + close (FILE); + + #get source set info + die "\n\nFATAL INPUT ERROR: no 'src_set' tag in src_file '$file'\n\n" + unless ($tag, $span, $data) = extract_sgml_tag_and_span ("SrcSet", $data); + die "\n\nFATAL INPUT ERROR: no tag attribute '$name' in file '$file'\n\n" + unless ($id) = 
extract_sgml_tag_attribute ($name="SetID", $tag); + die "\n\nFATAL INPUT ERROR: no tag attribute '$name' in file '$file'\n\n" + unless ($src) = extract_sgml_tag_attribute ($name="SrcLang", $tag); + die "\n\nFATAL INPUT ERROR: $name ('$src') in file '$file' inconsistent\n" + ." with $name in previous input data ('$src_lang')\n\n" + unless (not defined $src_lang or $src eq $src_lang); + $src_lang = $src; + + #get doc info -- ID and # of segs + $data = $span; + while (($tag, $span, $data) = extract_sgml_tag_and_span ("Doc", $data)) + { + die "\n\nFATAL INPUT ERROR: no tag attribute '$name' in file '$file'\n\n" + unless ($doc) = extract_sgml_tag_attribute ($name="DocID", $tag); + die "\n\nFATAL INPUT ERROR: duplicate '$name' in file '$file'\n\n" + if defined $eval_docs{$doc}; + $span =~ s/[\s\n\r]+/ /g; # concatenate records + my $nseg=0, my $seg_data = $span; + while (($tag, $span, $seg_data) = extract_sgml_tag_and_span ("Seg", $seg_data)) + { + die "\n\nFATAL INPUT ERROR: no attribute '$name' in file '$file'\n\n" + unless ($seg) = extract_sgml_tag_attribute( $name='id', $tag ); + ($eval_docs{$doc}{SEGS}{$seg}) = &{ $TOKENIZATION }( $span ); + $nseg++; + } + die "\n\nFATAL INPUT ERROR: no segments in document '$doc' in file '$file'\n\n" + if $nseg == 0; + } + die "\n\nFATAL INPUT ERROR: no documents in file '$file'\n\n" + unless keys %eval_docs > 0; + } + return $id; +} + +################################# + +sub get_MT_data +{ + my ($docs, $set_tag, $file) = @_; + my ($name, $id, $src, $tgt, $sys, $doc, $seg); + my ($tag, $span, $data); + + # Extension of the file determines the parser used: + # .xml : XML::Twig + # otherwise : simple SGML parsing functions + if ( $file =~ /\.xml$/i ) + { + my $twig = XML::Twig->new(); + $twig->parsefile( $file ); + my $root = $twig->root; + foreach my $currentSet ( $root->get_xpath( 'refset' ), $root->get_xpath( 'tstset' ) ) + { + $id = $currentSet->{ 'att' }->{ 'setid' } or die "No 'setid' attribute value in '$file'"; + $src = 
$currentSet->{ 'att' }->{ 'srclang' } or die "No 'srclang' attribute value in '$file'";
+            $tgt = $currentSet->{ 'att' }->{ 'trglang' } or die "No 'trglang' attribute value in '$file'";
+            die "Not the same 'srclang' attribute value across sets" unless ( $src eq $src_lang );
+            # BUGFIX: this condition used '$tgt = $tgt_lang' (assignment), which made the
+            # check always pass once $tgt_lang was set and clobbered $tgt; 'eq' compares.
+            die "Not the same 'trglang' attribute value across sets" unless ( ( not defined $tgt_lang ) or ( $tgt eq $tgt_lang ) );
+            $tgt_lang = $tgt;
+            my $sys;
+            if ( $currentSet->name eq 'tstset' )
+            {
+                $sys = $currentSet->{ 'att' }->{ 'sysid' } or die "No 'sysid' attribute value in '$file'";
+            }
+            else
+            {
+                $sys = $currentSet->{ 'att' }->{ 'refid' } or die "No 'refid' attribute value in '$file'";
+            }
+            # one entry per document, one tokenized string per segment
+            foreach my $currentDoc ( $currentSet->get_xpath( './/doc' ) )
+            {
+                my $docID = $currentDoc->{ 'att' }->{ 'docid' } or die "No document 'docid' attribute value in '$file'";
+                $docs->{ $sys }{ $docID }{ FILE } = $file;
+                foreach my $currentSeg ( $currentDoc->get_xpath( './/seg' ) )
+                {
+                    my $segID = $currentSeg->{ 'att' }->{ 'id' } or die "No segment 'id' attribute value in '$file'";
+                    my $segData = $currentSeg->text;
+                    ($docs->{$sys}{$docID}{SEGS}{$segID}) = &{ $TOKENIZATION }( $segData );
+                }
+            }
+        }
+    }
+    else
+    {
+        #read data from file
+        open (FILE, $file) or die "\nUnable to open translation data file '$file'", $usage;
+        binmode FILE, ":utf8";
+        # NOTE(review): the '<FILE>' readline operand appears to have been stripped by an
+        # earlier tag-eating pass ('while ;'); restored so the slurp loop actually reads.
+        $data .= $_ while <FILE>;
+        close (FILE);
+
+        #get tag info
+        while (($tag, $span, $data) = extract_sgml_tag_and_span ($set_tag, $data))
+        {
+            die "\n\nFATAL INPUT ERROR: no tag attribute '$name' in file '$file'\n\n"
+                unless ($id) = extract_sgml_tag_attribute ($name="SetID", $tag);
+            die "\n\nFATAL INPUT ERROR: no tag attribute '$name' in file '$file'\n\n"
+                unless ($src) = extract_sgml_tag_attribute ($name="SrcLang", $tag);
+            die "\n\nFATAL INPUT ERROR: $name ('$src') in file '$file' inconsistent\n"
+                ." with $name of source ('$src_lang')\n\n"
+                unless $src eq $src_lang;
+            die "\n\nFATAL INPUT ERROR: no tag attribute '$name' in file '$file'\n\n"
+                unless ($tgt) = extract_sgml_tag_attribute ($name="TrgLang", $tag);
+            die "\n\nFATAL INPUT ERROR: $name ('$tgt') in file '$file' inconsistent\n"
+                ." with $name of the evaluation ('$tgt_lang')\n\n"
+                unless (not defined $tgt_lang or $tgt eq $tgt_lang);
+            $tgt_lang = $tgt;
+
+            my $mtdata = $span;
+            while (($tag, $span, $mtdata) = extract_sgml_tag_and_span ("Doc", $mtdata))
+            {
+                die "\n\nFATAL INPUT ERROR: no tag attribute '$name' in file '$file'\n\n"
+                    unless (my $sys) = extract_sgml_tag_attribute ($name="SysID", $tag);
+                die "\n\nFATAL INPUT ERROR: no tag attribute '$name' in file '$file'\n\n"
+                    unless $doc = extract_sgml_tag_attribute ($name="DocID", $tag);
+                die "\n\nFATAL INPUT ERROR: document '$doc' for system '$sys' in file '$file'\n"
+                    ." previously loaded from file '$docs->{$sys}{$doc}{FILE}'\n\n"
+                    unless (not defined $docs->{$sys}{$doc});
+
+                $span =~ s/[\s\n\r]+/ /g; # concatenate records
+                my $nseg=0, my $seg_data = $span;
+                while (($tag, $span, $seg_data) = extract_sgml_tag_and_span ("Seg", $seg_data))
+                {
+                    # BUGFIX: diagnostic prefix read "FATAIL INPUT ERROR" (typo);
+                    # normalized to "FATAL" to match every other error message.
+                    die "\n\nFATAL INPUT ERROR: no tag attribute '$name' in file '$file'\n\n"
+                        unless $seg = extract_sgml_tag_attribute( $name="id", $tag );
+                    ($docs->{$sys}{$doc}{SEGS}{$seg}) = &{ $TOKENIZATION }( $span );
+                    $nseg++;
+                }
+                die "\n\nFATAL INPUT ERROR: no segments in document '$doc' in file '$file'\n\n" if $nseg == 0;
+                $docs->{$sys}{$doc}{FILE} = $file;
+            }
+        }
+    }
+    return $id;
+}
+
+#################################
+
+# Verify the loaded data is complete and mutually consistent: all three files
+# share one 'setid', and every source document appears, with an identical
+# segment count, in every system output and every reference set.
+sub check_MT_data
+{
+    @tst_sys = sort keys %tst_data;
+    @ref_sys = sort keys %ref_data;
+
+    die "Not the same 'setid' attribute values across files" unless ( ( $src_id eq $tst_id ) && ( $src_id eq $ref_id ) );
+
+#every evaluation document must be represented for every system and every reference
+    foreach my $doc (sort keys %eval_docs)
+    {
+        my $nseg_source = scalar( keys( 
%{$eval_docs{$doc}{SEGS}} ) ); + foreach my $sys (@tst_sys) + { + die "\n\nFATAL ERROR: no document '$doc' for system '$sys'\n\n" unless defined $tst_data{$sys}{$doc}; + my $nseg = scalar( keys( %{$tst_data{$sys}{$doc}{SEGS}} ) ); + die "\n\nFATAL ERROR: translated documents must contain the same # of segments as the source, but\n" + ." document '$doc' for system '$sys' contains $nseg segments, while\n" + ." the source document contains $nseg_source segments.\n\n" + unless $nseg == $nseg_source; + } + foreach my $sys (@ref_sys) + { + die "\n\nFATAL ERROR: no document '$doc' for reference '$sys'\n\n" unless defined $ref_data{$sys}{$doc}; + my $nseg = scalar( keys( %{$ref_data{$sys}{$doc}{SEGS}} ) ); + die "\n\nFATAL ERROR: translated documents must contain the same # of segments as the source, but\n" + ." document '$doc' for system '$sys' contains $nseg segments, while\n" + ." the source document contains $nseg_source segments.\n\n" + unless $nseg == $nseg_source; + } + } +} + +################################# + +sub compute_ngram_info +{ + my ($ref, $doc, $seg); + my (@wrds, $tot_wrds, %ngrams, $ngram, $mgram); + my (%ngram_count, @tot_ngrams); + + foreach $ref (keys %ref_data) + { + foreach $doc (keys %{$ref_data{$ref}}) + { + foreach $seg ( keys %{$ref_data{$ref}{$doc}{SEGS}}) + { + @wrds = split /\s+/, $ref_data{ $ref }{ $doc }{ SEGS }{ $seg }; + $tot_wrds += @wrds; + %ngrams = %{Words2Ngrams (@wrds)}; + foreach $ngram (keys %ngrams) + { + $ngram_count{$ngram} += $ngrams{$ngram}; + } + } + } + } + + foreach $ngram (keys %ngram_count) + { + @wrds = split / /, $ngram; + pop @wrds, $mgram = join " ", @wrds; + $ngram_info{$ngram} = - log ($mgram ? $ngram_count{$ngram}/$ngram_count{$mgram} : $ngram_count{$ngram}/$tot_wrds) / log 2; + if (defined $opt_x and $opt_x eq "ngram info") + { + @wrds = split / /, $ngram; + printf "ngram info:%9.4f%6d%6d%8d%3d %s\n", $ngram_info{$ngram}, $ngram_count{$ngram}, + $mgram ? 
$ngram_count{$mgram} : $tot_wrds, $tot_wrds, scalar @wrds, $ngram; + } + } +} + +################################# + +sub score_system +{ + my ($sys, $ref, $doc, $SCOREmt, $overallScore); + ($sys, $SCOREmt, $overallScore) = @_; + my ($ref_length, $match_cnt, $tst_cnt, $ref_cnt, $tst_info, $ref_info); + my ($cum_ref_length, @cum_match, @cum_tst_cnt, @cum_ref_cnt, @cum_tst_info, @cum_ref_info); + + $cum_ref_length = 0; + for (my $j=1; $j<=$max_Ngram; $j++) + { + $cum_match[$j] = $cum_tst_cnt[$j] = $cum_ref_cnt[$j] = $cum_tst_info[$j] = $cum_ref_info[$j] = 0; + } + foreach $doc (sort keys %eval_docs) + { + ($ref_length, $match_cnt, $tst_cnt, $ref_cnt, $tst_info, $ref_info) = score_document ($sys, $doc, $overallScore); + if ( $method eq "NIST" ) + { + my %DOCmt = (); + my $docScore = nist_score( scalar( @ref_sys ), $match_cnt, $tst_cnt, $ref_cnt, $tst_info, $ref_info, $sys, \%DOCmt ); + $overallScore->{ $sys }{ 'documents' }{ $doc }{ 'score' } = $docScore; + if ( $detail >= 1 ) + { + printf "$method score using 5-grams = %.4f for system \"$sys\" on document \"$doc\" (%d segments, %d words)\n", + $docScore, scalar keys %{$tst_data{$sys}{$doc}{SEGS}}, $tst_cnt->[1]; + } + } + + if ( $method eq "BLEU" ) + { + my %DOCmt = (); + my $docScore = &{$BLEU_SCORE}( $ref_length, $match_cnt, $tst_cnt, $sys, \%DOCmt ); + $overallScore->{ $sys }{ 'documents' }{ $doc }{ 'score' } = $docScore; + if ( $detail >= 1 ) + { + printf "$method score using 4-grams = %.4f for system \"$sys\" on document \"$doc\" (%d segments, %d words)\n", + $docScore, scalar keys %{$tst_data{$sys}{$doc}{SEGS}}, $tst_cnt->[1]; + } + } + + $cum_ref_length += $ref_length; + for (my $j=1; $j<=$max_Ngram; $j++) + { + $cum_match[$j] += $match_cnt->[$j]; + $cum_tst_cnt[$j] += $tst_cnt->[$j]; + $cum_ref_cnt[$j] += $ref_cnt->[$j]; + $cum_tst_info[$j] += $tst_info->[$j]; + $cum_ref_info[$j] += $ref_info->[$j]; + printf "document info: $sys $doc %d-gram %d %d %d %9.4f %9.4f\n", $j, $match_cnt->[$j], + $tst_cnt->[$j], 
$ref_cnt->[$j], $tst_info->[$j], $ref_info->[$j]
+      if (defined $opt_x and $opt_x eq "document info");
+  }
+  }
+
+  if ($method eq "BLEU")
+  {
+    $overallScore->{ $sys }{ 'score' } = &{$BLEU_SCORE}($cum_ref_length, \@cum_match, \@cum_tst_cnt, $sys, $SCOREmt, 1);
+  }
+  if ($method eq "NIST")
+  {
+    $overallScore->{ $sys }{ 'score' } = nist_score (scalar @ref_sys, \@cum_match, \@cum_tst_cnt, \@cum_ref_cnt, \@cum_tst_info, \@cum_ref_info, $sys, $SCOREmt);
+  }
+}
+
+#################################
+
+# Score one document for one system: scores each segment (recording the
+# segment-level score into $overallScore), accumulates the per-n-gram
+# match/count/information statistics, and returns the cumulative counts so
+# the caller can compute document- and system-level scores.
+sub score_document
+{
+  my ($sys, $ref, $doc, $overallScore);
+  ($sys, $doc, $overallScore) = @_;
+  my ($ref_length, $match_cnt, $tst_cnt, $ref_cnt, $tst_info, $ref_info);
+  my ($cum_ref_length, @cum_match, @cum_tst_cnt, @cum_ref_cnt, @cum_tst_info, @cum_ref_info);
+
+  $cum_ref_length = 0;
+  for (my $j=1; $j<=$max_Ngram; $j++)
+  {
+    $cum_match[$j] = $cum_tst_cnt[$j] = $cum_ref_cnt[$j] = $cum_tst_info[$j] = $cum_ref_info[$j] = 0;
+  }
+
+#score each segment
+  foreach my $seg ( sort{ $a <=> $b } keys( %{$tst_data{$sys}{$doc}{SEGS}} ) )
+  {
+    my @ref_segments = ();
+    foreach $ref (@ref_sys)
+    {
+      push @ref_segments, $ref_data{$ref}{$doc}{SEGS}{$seg};
+      if ( $detail >= 3 )
+      {
+        printf "ref '$ref', seg $seg: %s\n", $ref_data{$ref}{$doc}{SEGS}{$seg}
+      }
+
+    }
+
+    printf "sys '$sys', seg $seg: %s\n", $tst_data{$sys}{$doc}{SEGS}{$seg} if ( $detail >= 3 );
+    ($ref_length, $match_cnt, $tst_cnt, $ref_cnt, $tst_info, $ref_info) = score_segment ($tst_data{$sys}{$doc}{SEGS}{$seg}, @ref_segments);
+
+    if ( $method eq "BLEU" )
+    {
+      my %DOCmt = ();
+      # BUGFIX: %DOCmt was passed flattened (an empty hash flattens to no
+      # argument at all), leaving the callee's $SCOREmt parameter undef.
+      # Pass a reference, consistent with the calls in score_system().
+      my $segScore = &{$BLEU_SCORE}($ref_length, $match_cnt, $tst_cnt, $sys, \%DOCmt);
+      $overallScore->{ $sys }{ 'documents' }{ $doc }{ 'segments' }{ $seg }{ 'score' } = $segScore;
+      if ( $detail >= 2 )
+      {
+        printf " $method score using 4-grams = %.4f for system \"$sys\" on segment $seg of document \"$doc\" (%d words)\n", $segScore, $tst_cnt->[1]
+      }
+    }
+    if ( $method eq "NIST" )
+    {
+      my %DOCmt = ();
+      # BUGFIX: same flattened-hash problem as the BLEU branch; pass a reference.
+      my $segScore = nist_score (scalar @ref_sys, $match_cnt, $tst_cnt, $ref_cnt, $tst_info, $ref_info, $sys, \%DOCmt);
+      $overallScore->{ $sys }{ 'documents' }{ $doc }{ 'segments' }{ $seg }{ 'score' } = $segScore;
+      if ( $detail >= 2 )
+      {
+        printf " $method score using 5-grams = %.4f for system \"$sys\" on segment $seg of document \"$doc\" (%d words)\n", $segScore, $tst_cnt->[1];
+      }
+    }
+    $cum_ref_length += $ref_length;
+    for (my $j=1; $j<=$max_Ngram; $j++)
+    {
+      $cum_match[$j] += $match_cnt->[$j];
+      $cum_tst_cnt[$j] += $tst_cnt->[$j];
+      $cum_ref_cnt[$j] += $ref_cnt->[$j];
+      $cum_tst_info[$j] += $tst_info->[$j];
+      $cum_ref_info[$j] += $ref_info->[$j];
+    }
+  }
+  return ($cum_ref_length, [@cum_match], [@cum_tst_cnt], [@cum_ref_cnt], [@cum_tst_info], [@cum_ref_info]);
+}
+
+###############################################################################################################################
+# function returning the shortest reference length
+# takes as input:
+# - currentLength : the current (shortest) reference length
+# - referenceSentenceLength : the current reference sentence length
+# - candidateSentenceLength : the current candidate sentence length (unused)
+###############################################################################################################################
+sub brevity_penalty_shortest
+{
+  my ( $currentLength, $referenceSentenceLength, $candidateSentenceLength ) = @_;
+  return ( $referenceSentenceLength < $currentLength ? $referenceSentenceLength : $currentLength );
+}
+
+###############################################################################################################################
+# function returning the closest reference length (to the candidate sentence length)
+# takes as input:
+# - currentLength: the current (closest) reference length.
+# - candidateSentenceLength : the current reference sentence length +# - candidateSentenceLength : the current candidate sentence length +# when two reference sentences are at the same distance, it will return the shortest reference sentence length +# example of 4 iterations, given: +# - one candidate sentence containing 7 tokens +# - one reference translation containing 11 tokens +# - one reference translation containing 8 tokens +# - one reference translation containing 6 tokens +# - one reference translation containing 7 tokens +# the multiple invokations will return: +# - currentLength is set to 11 (outside of this function) +# - brevity_penalty_closest( 11, 8, 7 ) returns 8, since abs( 8 - 7 ) < abs( 11 - 7 ) +# - brevity_penalty_closest( 8, 6, 7 ) returns 6, since abs( 8 - 7 ) == abs( 6 - 7 ) AND 6 < 8 +# - brevity_penalty_closest( 7, 6, 7 ) returns 7, since abs( 7 - 7 ) < abs( 6 - 7 ) +############################################################################################################################### +sub brevity_penalty_closest +{ + my ( $currentLength, $referenceSentenceLength, $candidateSentenceLength ) = @_; + my $result = $currentLength; + if ( abs( $candidateSentenceLength - $referenceSentenceLength ) <= abs( $candidateSentenceLength - $currentLength ) ) + { + if ( abs( $candidateSentenceLength - $referenceSentenceLength ) == abs( $candidateSentenceLength - $currentLength ) ) + { + if ( $currentLength > $referenceSentenceLength ) + { + $result = $referenceSentenceLength; + } + } + else + { + $result = $referenceSentenceLength; + } + } + return $result; +} + +################################# + +sub score_segment +{ + my ($tst_seg, @ref_segs) = @_; + my (@tst_wrds, %tst_ngrams, @match_count, @tst_count, @tst_info); + my (@ref_wrds, $ref_seg, %ref_ngrams, %ref_ngrams_max, @ref_count, @ref_info); + my ($ngram); + my (@nwrds_ref); + my $ref_length; + + for (my $j=1; $j<= $max_Ngram; $j++) + { + $match_count[$j] = $tst_count[$j] = $ref_count[$j] 
= $tst_info[$j] = $ref_info[$j] = 0; + } + +# get the ngram counts for the test segment + @tst_wrds = split /\s+/, $tst_seg; + %tst_ngrams = %{Words2Ngrams (@tst_wrds)}; + for (my $j=1; $j<=$max_Ngram; $j++) + { + # compute ngram counts + $tst_count[$j] = $j<=@tst_wrds ? (@tst_wrds - $j + 1) : 0; + } + +# get the ngram counts for the reference segments + foreach $ref_seg (@ref_segs) + { + @ref_wrds = split /\s+/, $ref_seg; + %ref_ngrams = %{Words2Ngrams (@ref_wrds)}; + foreach $ngram (keys %ref_ngrams) + { + # find the maximum # of occurrences + my @wrds = split / /, $ngram; + $ref_info[@wrds] += $ngram_info{$ngram}; + $ref_ngrams_max{$ngram} = defined $ref_ngrams_max{$ngram} ? max ($ref_ngrams_max{$ngram}, $ref_ngrams{$ngram}) : $ref_ngrams{$ngram}; + } + for (my $j=1; $j<=$max_Ngram; $j++) + { + # update ngram counts + $ref_count[$j] += $j<=@ref_wrds ? (@ref_wrds - $j + 1) : 0; + } + if ( not defined( $ref_length ) ) + { + $ref_length = scalar( @ref_wrds ); + } + else + { + $ref_length = &{$BLEU_BP}( $ref_length, scalar( @ref_wrds ), scalar( @tst_wrds ) ); + } + } + +# accumulate scoring stats for tst_seg ngrams that match ref_seg ngrams + foreach $ngram (keys %tst_ngrams) + { + next unless defined $ref_ngrams_max{$ngram}; + my @wrds = split / /, $ngram; + $tst_info[@wrds] += $ngram_info{$ngram} * min($tst_ngrams{$ngram},$ref_ngrams_max{$ngram}); + $match_count[@wrds] += my $count = min($tst_ngrams{$ngram},$ref_ngrams_max{$ngram}); + printf "%.2f info for each of $count %d-grams = '%s'\n", $ngram_info{$ngram}, scalar @wrds, $ngram + if $detail >= 3; + } + + return ($ref_length, [@match_count], [@tst_count], [@ref_count], [@tst_info], [@ref_info]); +} + +################################# + +sub bleu_score_nosmoothing +{ + my ($ref_length, $matching_ngrams, $tst_ngrams, $sys, $SCOREmt) = @_; + my $score = 0; + my $iscore = 0; + + for ( my $j = 1; $j <= $max_Ngram; ++$j ) + { + if ($matching_ngrams->[ $j ] == 0) + { + $SCOREmt->{ $j }{ $sys }{ cum }=0; + } + else + 
{ + my $len_score = min (0, 1-$ref_length/$tst_ngrams->[1]); + # Cumulative N-Gram score + $score += log( $matching_ngrams->[ $j ] / $tst_ngrams->[ $j ] ); + $SCOREmt->{ $j }{ $sys }{ cum } = exp( $score / $j + $len_score ); + # Individual N-Gram score + $iscore = log( $matching_ngrams->[ $j ] / $tst_ngrams->[ $j ] ); + $SCOREmt->{ $j }{ $sys }{ ind } = exp( $iscore ); + } + } + return $SCOREmt->{ 4 }{ $sys }{ cum }; +} + +############################################################################################################################### +# Default method used to compute the BLEU score, using smoothing. +# Note that the method used can be overridden using the '--no-smoothing' command-line argument +# The smoothing is computed by taking 1 / ( 2^k ), instead of 0, for each precision score whose matching n-gram count is null +# k is 1 for the first 'n' value for which the n-gram match count is null +# For example, if the text contains: +# - one 2-gram match +# - and (consequently) two 1-gram matches +# the n-gram count for each individual precision score would be: +# - n=1 => prec_count = 2 (two unigrams) +# - n=2 => prec_count = 1 (one bigram) +# - n=3 => prec_count = 1/2 (no trigram, taking 'smoothed' value of 1 / ( 2^k ), with k=1) +# - n=4 => prec_count = 1/4 (no fourgram, taking 'smoothed' value of 1 / ( 2^k ), with k=2) +############################################################################################################################### +sub bleu_score +{ + my ($ref_length, $matching_ngrams, $tst_ngrams, $sys, $SCOREmt,$report_length) = @_; + my $score = 0; + my $iscore = 0; + my $exp_len_score = 0; + $exp_len_score = exp( min (0, 1 - $ref_length / $tst_ngrams->[ 1 ] ) ) if ( $tst_ngrams->[ 1 ] > 0 ); + print "length ratio: ".($tst_ngrams->[1]/$ref_length)." 
($tst_ngrams->[1]/$ref_length), penalty (log): ".log($exp_len_score)."\n" if $report_length; + my $smooth = 1; + for ( my $j = 1; $j <= $max_Ngram; ++$j ) + { + if ( $tst_ngrams->[ $j ] == 0 ) + { + $iscore = 0; + } + elsif ( $matching_ngrams->[ $j ] == 0 ) + { + $smooth *= 2; + $iscore = log( 1 / ( $smooth * $tst_ngrams->[ $j ] ) ); + } + else + { + $iscore = log( $matching_ngrams->[ $j ] / $tst_ngrams->[ $j ] ); + } + $SCOREmt->{ $j }{ $sys }{ ind } = exp( $iscore ); + $score += $iscore; + $SCOREmt->{ $j }{ $sys }{ cum } = exp( $score / $j ) * $exp_len_score; + } + return $SCOREmt->{ 4 }{ $sys }{ cum }; +} + +################################# + +sub nist_score +{ + my ($nsys, $matching_ngrams, $tst_ngrams, $ref_ngrams, $tst_info, $ref_info, $sys, $SCOREmt) = @_; + my $score = 0; + my $iscore = 0; + + for (my $n=1; $n<=$max_Ngram; $n++) + { + $score += $tst_info->[$n]/max($tst_ngrams->[$n],1); + $SCOREmt->{$n}{$sys}{cum} = $score * nist_length_penalty($tst_ngrams->[1]/($ref_ngrams->[1]/$nsys)); + $iscore = $tst_info->[$n]/max($tst_ngrams->[$n],1); + $SCOREmt->{$n}{$sys}{ind} = $iscore * nist_length_penalty($tst_ngrams->[1]/($ref_ngrams->[1]/$nsys)); + } + return $SCOREmt->{5}{$sys}{cum}; +} + +################################# + +sub Words2Ngrams +{ + #convert a string of words to an Ngram count hash + my %count = (); + + for (; @_; shift) + { + my ($j, $ngram, $word); + for ($j=0; $j<$max_Ngram and defined($word=$_[$j]); $j++) + { + $ngram .= defined $ngram ? 
" $word" : $word; + $count{$ngram}++; + } + } + return {%count}; +} + +################################# + +sub tokenization +{ + my ($norm_text) = @_; + +# language-independent part: + $norm_text =~ s///g; # strip "skipped" tags + $norm_text =~ s/-\n//g; # strip end-of-line hyphenation and join lines + $norm_text =~ s/\n/ /g; # join lines + $norm_text =~ s/"/"/g; # convert SGML tag for quote to " + $norm_text =~ s/&/&/g; # convert SGML tag for ampersand to & + $norm_text =~ s/</ + $norm_text =~ s/>/>/g; # convert SGML tag for greater-than to < + +# language-dependent part (assuming Western languages): + $norm_text = " $norm_text "; + $norm_text =~ tr/[A-Z]/[a-z]/ unless $preserve_case; + $norm_text =~ s/([\{-\~\[-\` -\&\(-\+\:-\@\/])/ $1 /g; # tokenize punctuation + $norm_text =~ s/([^0-9])([\.,])/$1 $2 /g; # tokenize period and comma unless preceded by a digit + $norm_text =~ s/([\.,])([^0-9])/ $1 $2/g; # tokenize period and comma unless followed by a digit + $norm_text =~ s/([0-9])(-)/$1 $2 /g; # tokenize dash when preceded by a digit + $norm_text =~ s/\s+/ /g; # one space only between words + $norm_text =~ s/^\s+//; # no leading space + $norm_text =~ s/\s+$//; # no trailing space + + return $norm_text; +} + + +sub tokenization_international +{ + my ($norm_text) = @_; + + $norm_text =~ s///g; # strip "skipped" tags + $norm_text =~ s/\p{Line_Break: Hyphen}\p{Zl}//g; # strip end-of-line hyphenation and join lines + $norm_text =~ s/\p{Zl}/ /g; # join lines + + # replace entities + $norm_text =~ s/"/\"/g; # quote to " + $norm_text =~ s/&/&/g; # ampersand to & + $norm_text =~ s/<//g; # greater-than to > + $norm_text =~ s/'/\'/g; # apostrophe to ' + + $norm_text = lc( $norm_text ) unless $preserve_case; # lowercasing if needed + $norm_text =~ s/([^[:ascii:]])/ $1 /g if ( $split_non_ASCII ); + + # punctuation: tokenize any punctuation unless followed AND preceded by a digit + $norm_text =~ s/(\P{N})(\p{P})/$1 $2 /g; + $norm_text =~ s/(\p{P})(\P{N})/ $1 $2/g; + + 
$norm_text =~ s/(\p{S})/ $1 /g; # tokenize symbols + + $norm_text =~ s/\p{Z}+/ /g; # one space only between words + $norm_text =~ s/^\p{Z}+//; # no leading space + $norm_text =~ s/\p{Z}+$//; # no trailing space + + return $norm_text; +} + +################################# + +sub nist_length_penalty +{ + my ($ratio) = @_; + return 1 if $ratio >= 1; + return 0 if $ratio <= 0; + my $ratio_x = 1.5; + my $score_x = 0.5; + my $beta = -log($score_x)/log($ratio_x)/log($ratio_x); + return exp (-$beta*log($ratio)*log($ratio)); +} + +################################# + +sub date_time_stamp +{ + my ($sec, $min, $hour, $mday, $mon, $year, $wday, $yday, $isdst) = localtime(); + my @months = qw(Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec); + my ($date, $time); + $time = sprintf "%2.2d:%2.2d:%2.2d", $hour, $min, $sec; + $date = sprintf "%4.4s %3.3s %s", 1900+$year, $months[$mon], $mday; + return ($date, $time); +} + +################################# + +sub extract_sgml_tag_and_span +{ + my ($name, $data) = @_; + ($data =~ m|<$name\s*([^>]*)>(.*?)(.*)|si) ? ($1, $2, $3) : (); +} + +################################# + +sub extract_sgml_tag_attribute +{ + my ($name, $data) = @_; + ($data =~ m|$name\s*=\s*\"?([^\"]*)\"?|si) ? 
($1) : (); +} + +################################# + +sub max +{ + my ($max, $next); + + return unless defined ($max=pop); + while (defined ($next=pop)) + { + $max = $next if $next > $max; + } + return $max; +} + +################################# + +sub min +{ + my ($min, $next); + + return unless defined ($min=pop); + while (defined ($next=pop)) + { + $min = $next if $next < $min; + } + return $min; +} + +################################# + +sub printout_report +{ + if ( $METHOD eq "BOTH" ) + { + foreach my $sys (sort @tst_sys) + { + printf "NIST score = %2.4f BLEU score = %.4f for system \"$sys\"\n",$NISTmt{5}{$sys}{cum},$BLEUmt{4}{$sys}{cum}; + } + } + elsif ($METHOD eq "NIST" ) + { + foreach my $sys (sort @tst_sys) + { + printf "NIST score = %2.4f for system \"$sys\"\n",$NISTmt{5}{$sys}{cum}; + } + } + elsif ($METHOD eq "BLEU" ) + { + foreach my $sys (sort @tst_sys) + { + printf "\nBLEU score = %.4f for system \"$sys\"\n",$BLEUmt{4}{$sys}{cum}; + } + } + printf "\n# ------------------------------------------------------------------------\n\n"; + printf "Individual N-gram scoring\n"; + printf " 1-gram 2-gram 3-gram 4-gram 5-gram 6-gram 7-gram 8-gram 9-gram\n"; + printf " ------ ------ ------ ------ ------ ------ ------ ------ ------\n"; + + if ( ( $METHOD eq "BOTH" ) || ($METHOD eq "NIST") ) + { + foreach my $sys (sort @tst_sys) + { + printf " NIST:"; + for (my $i=1; $i<=$max_Ngram; $i++) + { + printf " %2.4f ",$NISTmt{$i}{$sys}{ind} + } + printf " \"$sys\"\n"; + } + printf "\n"; + } + + if ( ( $METHOD eq "BOTH" ) || ($METHOD eq "BLEU") ) + { + foreach my $sys (sort @tst_sys) + { + printf " BLEU:"; + for (my $i=1; $i<=$max_Ngram; $i++) + { + printf " %2.4f ",$BLEUmt{$i}{$sys}{ind} + } + printf " \"$sys\"\n"; + } + } + + printf "\n# ------------------------------------------------------------------------\n"; + printf "Cumulative N-gram scoring\n"; + printf " 1-gram 2-gram 3-gram 4-gram 5-gram 6-gram 7-gram 8-gram 9-gram\n"; + printf " ------ ------ ------ ------ 
------ ------ ------ ------ ------\n"; + + if (( $METHOD eq "BOTH" ) || ($METHOD eq "NIST")) + { + foreach my $sys (sort @tst_sys) + { + printf " NIST:"; + for (my $i=1; $i<=$max_Ngram; $i++) + { + printf " %2.4f ",$NISTmt{$i}{$sys}{cum} + } + printf " \"$sys\"\n"; + } + } + printf "\n"; + if ( ( $METHOD eq "BOTH" ) || ($METHOD eq "BLEU") ) + { + foreach my $sys (sort @tst_sys) + { + printf " BLEU:"; + for (my $i=1; $i<=$max_Ngram; $i++) + { + printf " %2.4f ",$BLEUmt{$i}{$sys}{cum} + } + printf " \"$sys\"\n"; + } + } +} + +############################################################################################################################### +# Create three files, by using: +# - $prefix : the prefix used for the output file names +# - %overall : a hash containing seg/doc/sys-level scores: +# - $overall{ $SYSTEM_ID }{ 'score' } => system-level score +# - $overall{ $SYSTEM_ID }{ 'documents' }{ $DOCUMENT_ID }{ 'score' } => document-level score +# - $overall{ $SYSTEM_ID }{ 'documents' }{ $DOCUMENT_ID }{ 'segments' }{ $SEGMENT_ID } => segment-level score +############################################################################################################################### +sub outputMetricsMATR +{ + my ( $prefix, %overall ) = @_; + my $fileNameSys = $prefix . '-sys.scr'; + my $fileNameDoc = $prefix . '-doc.scr'; + my $fileNameSeg = $prefix . 
'-seg.scr'; + open FILEOUT_SYS, '>', $fileNameSys or die "Could not open file: ${fileNameSys}"; + open FILEOUT_DOC, '>', $fileNameDoc or die "Could not open file: ${fileNameDoc}"; + open FILEOUT_SEG, '>', $fileNameSeg or die "Could not open file: ${fileNameSeg}"; + foreach my $sys ( sort( keys( %overall ) ) ) + { + my $scoreSys = $overall{ $sys }{ 'score' }; + print FILEOUT_SYS "${tst_id}\t${sys}\t${scoreSys}\n"; + foreach my $doc ( sort( keys( %{$overall{ $sys }{ 'documents' }} ) ) ) + { + my $scoreDoc = $overall{ $sys }{ 'documents' }{ $doc }{ 'score' }; + print FILEOUT_DOC "${tst_id}\t${sys}\t${doc}\t${scoreDoc}\n"; + foreach my $seg ( sort{ $a <=> $b }( keys( %{$overall{ $sys }{ 'documents' }{ $doc }{ 'segments' }} ) ) ) + { + my $scoreSeg = $overall{ $sys }{ 'documents' }{ $doc }{ 'segments' }{ $seg }{ 'score' }; + print FILEOUT_SEG "${tst_id}\t${sys}\t${doc}\t${seg}\t${scoreSeg}\n"; + } + } + } + close FILEOUT_SEG; + close FILEOUT_DOC; + close FILEOUT_SYS; +} + diff --git a/mosesdecoder/scripts/generic/mteval-v14.pl b/mosesdecoder/scripts/generic/mteval-v14.pl new file mode 100644 index 0000000000000000000000000000000000000000..84a7549acf3380ad344733df52b32ad99d16e590 --- /dev/null +++ b/mosesdecoder/scripts/generic/mteval-v14.pl @@ -0,0 +1,1179 @@ +#!/usr/bin/env perl + +use warnings; +use strict; +use utf8; +use Encode; +use XML::Twig; +use Sort::Naturally; + +binmode STDOUT, ":utf8"; +binmode STDERR, ":utf8"; + + +################################# +# History: +# +# version 14 +# (2016-03-29 lukas.diduch@nist.gov) +# * Fixed warning message in case seg-id is a string, by sorting in correct order using Sort::Naturally. 
+# +# version 13b +# * Fixed die 'bug' in case seg->id = 0 +# +# version 13a +# * modified the scoring functions to prevent division-by-zero errors when a system segment is empty +# * affected methods: 'bleu_score' and 'bleu_score_smoothing' +# +# version 13 +# * Uses a XML parser to read data (only when extension is .xml) +# * Smoothing of the segment-level BLEU scores, done by default +# * smoothing method similar to that of bleu-1.04.pl (IBM) +# * see comments above the 'bleu_score' method for more details on how the smoothing is computed +# * added a '--no-smoothing' option to simulate old scripts behavior +# * Introduction of the 'brevity-penalty' option, taking one of two values: +# * 'closest' (default) : act as IBM BLEU (taking the closest reference translation length) +# * in case two reference translations are at the same distance, will take the shortest one +# * for more details regarding how the BP is computed, see comments of the 'brevity_penalty_closest' function +# * 'shortest' : act as previous versions of the script (taking shortest reference translation length) +# * Introduction of the 'international-tokenization' option, boolean, disabled by default +# by default (when the option is not provided), uses 11b's tokenization function +# when option specified, uses v12's tokenization function +# * Introduction of a 'Metrics MATR output' flag (option '--metricsMATR') +# when used, creates three files for both BLEU score and NIST score: +# * BLEU-seg.scr and NIST-seg.scr: contain segment-level scores +# * BLEU-doc.scr and NIST-doc.scr: contain document-level scores +# * BLEU-sys.scr and NIST-sys.scr: contain system-level scores +# * SGML parsing +# * script will halt if source, reference and test files don't share the same setid attribute value (used for metricsMATR output) +# * correct segment IDs extracted from the files (was previously using an array, and using the index as a segID for output) +# * detailed output flag (-d) can now be used when 
running both BLEU and NIST +# +# version 12 +# * Text normalization changes: +# * convert entity references (only the entities declared in the DTD) +# * now uses unicode categories +# * tokenize punctuation unless followed AND preceded by digits +# * tokenize symbols +# * UTF-8 handling: +# * files are now read using utf8 mode +# * Added the '-e' command-line option to enclose non-ASCII characters between spaces +# +# version 11b -- text normalization modified: +# * take out the join digit line because it joins digits +# when it shouldn't have +# $norm_text =~ s/(\d)\s+(?=\d)/$1/g; #join digits +# +# version 11a -- corrected output of individual n-gram precision values +# +# version 11 -- bug fixes: +# * make filehandle operate in binary mode to prevent Perl from operating +# (by default in Red Hat 9) in UTF-8 +# * fix failure on joining digits +# version 10 -- updated output to include more details of n-gram scoring. +# Defaults to generate both NIST and BLEU scores. Use -b for BLEU +# only, use -n for NIST only +# +# version 09d -- bug fix (for BLEU scoring, ngrams were fixed at 4 +# being the max, regardless what was entered on the command line.) +# +# version 09c -- bug fix (During the calculation of ngram information, +# each ngram was being counted only once for each segment. This has +# been fixed so that each ngram is counted correctly in each segment.) +# +# version 09b -- text normalization modified: +# * option flag added to preserve upper case +# * non-ASCII characters left in place. +# +# version 09a -- text normalization modified: +# * " and & converted to "" and &, respectively +# * non-ASCII characters kept together (bug fix) +# +# version 09 -- modified to accommodate sgml tag and attribute +# names revised to conform to default SGML conventions. +# +# version 08 -- modifies the NIST metric in accordance with the +# findings on the 2001 Chinese-English dry run corpus. 
Also +# incorporates the BLEU metric as an option and supports the +# output of ngram detail. +# +# version 07 -- in response to the MT meeting on 28 Jan 2002 at ISI +# Keep strings of non-ASCII characters together as one word +# (rather than splitting them into one-character words). +# Change length penalty so that translations that are longer than +# the average reference translation are not penalized. +# +# version 06 +# Prevent divide-by-zero when a segment has no evaluation N-grams. +# Correct segment index for level 3 debug output. +# +# version 05 +# improve diagnostic error messages +# +# version 04 +# tag segments +# +# version 03 +# add detailed output option (intermediate document and segment scores) +# +# version 02 +# accommodation of modified sgml tags and attributes +# +# version 01 +# same as bleu version 15, but modified to provide formal score output. +# +# original IBM version +# Author: Kishore Papineni +# Date: 06/10/2001 +################################# + +###### +# Intro +my ($date, $time) = date_time_stamp(); +print "MT evaluation scorer began on $date at $time\n"; +print "\ncommand line: ", $0, " ", join(" ", @ARGV), "\n"; +my $usage = "\n\nUsage: $0 -r -s -t \n\n". + "Description: This Perl script evaluates MT system performance.\n". + "\n". + "Required arguments:\n". + " -r is a file containing the reference translations for\n". + " the documents to be evaluated.\n". + " -s is a file containing the source documents for which\n". + " translations are to be evaluated\n". + " -t is a file containing the translations to be evaluated\n". + "\n". + "Optional arguments:\n". + " -h prints this help message to STDOUT\n". + " -c preserves upper-case alphabetic characters\n". + " -b generate BLEU scores only\n". + " -n generate NIST scores only\n". + " -d detailed output flag:\n". + " 0 (default) for system-level score only\n". + " 1 to include document-level scores\n". + " 2 to include segment-level scores\n". 
+ " 3 to include ngram-level scores\n". + " -e enclose non-ASCII characters between spaces\n". + " --brevity-penalty ( closest | shortest )\n" . + " closest (default) : acts as IBM BLEU (takes the closest reference translation length)\n" . + " shortest : acts as previous versions of the script (takes the shortest reference translation length)\n" . + " --international-tokenization\n" . + " when specified, uses Unicode-based (only) tokenization rules\n" . + " when not specified (default), uses default tokenization (some language-dependant rules)\n" . + " --metricsMATR : create three files for both BLEU scores and NIST scores:\n" . + " BLEU-seg.scr and NIST-seg.scr : segment-level scores\n" . + " BLEU-doc.scr and NIST-doc.scr : document-level scores\n" . + " BLEU-sys.scr and NIST-sys.scr : system-level scores\n" . + " --no-smoothing : disable smoothing on BLEU scores\n" . + "\n"; + +use vars qw ($opt_r $opt_s $opt_t $opt_d $opt_h $opt_b $opt_n $opt_c $opt_x $opt_e); +use Getopt::Long; +my $ref_file = ''; +my $src_file = ''; +my $tst_file = ''; +my $detail = 0; +my $help = ''; +my $preserve_case = ''; +my $split_non_ASCII = ''; +my $brevity_penalty = 'closest'; +my $international_tokenization; +my $metricsMATR_output = ''; +my $no_smoothing = ''; +our $opt_x = ''; +our $opt_b = ''; +our $opt_n = ''; +GetOptions( + 'r=s' => \$ref_file, + 's=s' => \$src_file, + 't=s' => \$tst_file, + 'd:i' => \$detail, + 'h|help' => \$help, + 'b', + 'n', + 'c' => \$preserve_case, + 'x:s', + 'e' => \$split_non_ASCII, + 'brevity-penalty:s' => \$brevity_penalty, + 'international-tokenization' => \$international_tokenization, + 'metricsMATR-output' => \$metricsMATR_output, + 'no-smoothing' => \$no_smoothing +); +die $usage if $help; + +die "Error in command line: ref_file not defined$usage" unless ( $ref_file ); +die "Error in command line: src_file not defined$usage" unless ( $src_file ); +die "Error in command line: tst_file not defined$usage" unless ( $tst_file ); +my $BLEU_BP; +if ( !( 
$brevity_penalty cmp 'closest' ) ) +{ + $BLEU_BP = \&brevity_penalty_closest; +} +elsif ( !( $brevity_penalty cmp 'shortest' ) ) +{ + $BLEU_BP = \&brevity_penalty_shortest; +} +else +{ + die "Incorrect value supplied for 'brevity_penalty'$usage"; +} +my $TOKENIZATION = \&tokenization; +$TOKENIZATION = \&tokenization_international if ( $international_tokenization ); + +my $BLEU_SCORE = \&bleu_score; +$BLEU_SCORE = \&bleu_score_nosmoothing if ( $no_smoothing ); + +my $max_Ngram = 9; + +my $METHOD = "BOTH"; +if ( $opt_b ) { $METHOD = "BLEU"; } +if ( $opt_n ) { $METHOD = "NIST"; } +my $method; + +###### +# Global variables +my ($src_lang, $tgt_lang, @tst_sys, @ref_sys); # evaluation parameters +my (%tst_data, %ref_data); # the data -- with structure: {system}{document}{segments} +my ($src_id, $ref_id, $tst_id); # unique identifiers for ref and tst translation sets +my %eval_docs; # document information for the evaluation data set +my %ngram_info; # the information obtained from (the last word in) the ngram + +###### +# Get source document ID's +($src_id) = get_source_info ($src_file); + +###### +# Get reference translations +($ref_id) = get_MT_data (\%ref_data, "RefSet", $ref_file); + +compute_ngram_info (); + +###### +# Get translations to evaluate +($tst_id) = get_MT_data (\%tst_data, "TstSet", $tst_file); + +###### +# Check data for completeness and correctness +check_MT_data (); + +###### +# +my %NISTmt; +my %NISTOverall; +my %BLEUmt; +my %BLEUOverall; + +###### +# Evaluate +print "\nEvaluation of $src_lang-to-$tgt_lang translation using:\n"; +my $cum_seg = 0; +foreach my $doc (sort keys %eval_docs) +{ + $cum_seg += scalar( keys( %{$eval_docs{$doc}{SEGS}} ) ); +} +print " src set \"$src_id\" (", scalar keys %eval_docs, " docs, $cum_seg segs)\n"; +print " ref set \"$ref_id\" (", scalar keys %ref_data, " refs)\n"; +print " tst set \"$tst_id\" (", scalar keys %tst_data, " systems)\n\n"; + +foreach my $sys (sort @tst_sys) +{ + for (my $n=1; $n<=$max_Ngram; $n++) + { + 
$NISTmt{$n}{$sys}{cum} = 0; + $NISTmt{$n}{$sys}{ind} = 0; + $BLEUmt{$n}{$sys}{cum} = 0; + $BLEUmt{$n}{$sys}{ind} = 0; + } + if ( ($METHOD eq "BOTH") || ($METHOD eq "NIST") ) + { + $method="NIST"; + score_system ($sys, \%NISTmt, \%NISTOverall); + } + if ( ($METHOD eq "BOTH") || ($METHOD eq "BLEU") ) + { + $method="BLEU"; + score_system ($sys, \%BLEUmt, \%BLEUOverall); + } +} + +###### +printout_report (); +if ( $metricsMATR_output ) +{ + outputMetricsMATR( 'NIST', %NISTOverall ) if ( ( $METHOD eq 'BOTH' ) || ( $METHOD eq 'NIST' ) ); + outputMetricsMATR( 'BLEU', %BLEUOverall ) if ( ( $METHOD eq 'BOTH' ) || ( $METHOD eq 'BLEU' ) ); +} + +($date, $time) = date_time_stamp(); +print "\nMT evaluation scorer ended on $date at $time\n"; + +exit 0; + +################################# + +sub get_source_info +{ + my ($file) = @_; + my ($name, $id, $src, $doc, $seg); + my ($data, $tag, $span); + + # Extension of the file determines the parser used: + # .xml : XML::Twig + # otherwise : simple SGML parsing functions + if ( $file =~ /\.xml$/i ) + { + my $twig = XML::Twig->new(); + $twig->parsefile( $file ); + my $root = $twig->root; + my $currentSet = $root->first_child( 'srcset' ); + die "Source XML file '$file' does not contain the 'srcset' element" if ( not $currentSet ); + $id = $currentSet->{ 'att' }->{ 'setid' } or die "No 'setid' attribute value in '$file'"; + $src = $currentSet->{ 'att' }->{ 'srclang' } or die "No srcset 'srclang' attribute value in '$file'"; + die "Not the same srclang attribute values across sets" unless ( not defined $src_lang or $src eq $src_lang ); + $src_lang = $src; + foreach my $currentDoc ( $currentSet->get_xpath( './/doc' ) ) + { + my $docID = $currentDoc->{ 'att' }->{ 'docid' } or die "No document 'docid' attribute value in '$file'"; + foreach my $currentSeg ( $currentDoc->get_xpath( './/seg' ) ) + { + + my $segID = $currentSeg->{ 'att' }->{ 'id' }; + die "No segment 'id' attribute value in '$file'" if (! 
defined $segID); + my $segData = $currentSeg->text; + ($eval_docs{$docID}{SEGS}{$segID}) = &{ $TOKENIZATION }( $segData ); + } + } + } + else + { + #read data from file + open (FILE, $file) or die "\nUnable to open translation data file '$file'", $usage; + binmode FILE, ":utf8"; + $data .= $_ while <FILE>; + close (FILE); + + #get source set info + die "\n\nFATAL INPUT ERROR: no 'src_set' tag in src_file '$file'\n\n" + unless ($tag, $span, $data) = extract_sgml_tag_and_span ("SrcSet", $data); + die "\n\nFATAL INPUT ERROR: no tag attribute '$name' in file '$file'\n\n" + unless ($id) = extract_sgml_tag_attribute ($name="SetID", $tag); + die "\n\nFATAL INPUT ERROR: no tag attribute '$name' in file '$file'\n\n" + unless ($src) = extract_sgml_tag_attribute ($name="SrcLang", $tag); + die "\n\nFATAL INPUT ERROR: $name ('$src') in file '$file' inconsistent\n" + ." with $name in previous input data ('$src_lang')\n\n" + unless (not defined $src_lang or $src eq $src_lang); + $src_lang = $src; + + #get doc info -- ID and # of segs + $data = $span; + while (($tag, $span, $data) = extract_sgml_tag_and_span ("Doc", $data)) + { + die "\n\nFATAL INPUT ERROR: no tag attribute '$name' in file '$file'\n\n" + unless ($doc) = extract_sgml_tag_attribute ($name="DocID", $tag); + die "\n\nFATAL INPUT ERROR: duplicate '$name' in file '$file'\n\n" + if defined $eval_docs{$doc}; + $span =~ s/[\s\n\r]+/ /g; # concatenate records + my $nseg=0, my $seg_data = $span; + while (($tag, $span, $seg_data) = extract_sgml_tag_and_span ("Seg", $seg_data)) + { + die "\n\nFATAL INPUT ERROR: no attribute '$name' in file '$file'\n\n" + unless ($seg) = extract_sgml_tag_attribute( $name='id', $tag ); + ($eval_docs{$doc}{SEGS}{$seg}) = &{ $TOKENIZATION }( $span ); + $nseg++; + } + die "\n\nFATAL INPUT ERROR: no segments in document '$doc' in file '$file'\n\n" + if $nseg == 0; + } + die "\n\nFATAL INPUT ERROR: no documents in file '$file'\n\n" + unless keys %eval_docs > 0; + } + return $id; +} + 
+################################# + +sub get_MT_data +{ + my ($docs, $set_tag, $file) = @_; + my ($name, $id, $src, $tgt, $sys, $doc, $seg); + my ($tag, $span, $data); + + # Extension of the file determines the parser used: + # .xml : XML::Twig + # otherwise : simple SGML parsing functions + if ( $file =~ /\.xml$/i ) + { + my $twig = XML::Twig->new(); + $twig->parsefile( $file ); + my $root = $twig->root; + foreach my $currentSet ( $root->get_xpath( 'refset' ), $root->get_xpath( 'tstset' ) ) + { + $id = $currentSet->{ 'att' }->{ 'setid' } or die "No 'setid' attribute value in '$file'"; + $src = $currentSet->{ 'att' }->{ 'srclang' } or die "No 'srclang' attribute value in '$file'"; + $tgt = $currentSet->{ 'att' }->{ 'trglang' } or die "No 'trglang' attribute value in '$file'"; + die "Not the same 'srclang' attribute value across sets" unless ( $src eq $src_lang ); + die "Not the same 'trglang' attribute value across sets" unless ( ( not defined $tgt_lang ) or ( $tgt = $tgt_lang ) ); + $tgt_lang = $tgt; + my $sys; + if ( $currentSet->name eq 'tstset' ) + { + $sys = $currentSet->{ 'att' }->{ 'sysid' } or die "No 'sysid' attribute value in '$file'"; + } + else + { + $sys = $currentSet->{ 'att' }->{ 'refid' } or die "No 'refid' attribute value in '$file'"; + } + foreach my $currentDoc ( $currentSet->get_xpath( './/doc' ) ) + { + my $docID = $currentDoc->{ 'att' }->{ 'docid' } or die "No document 'docid' attribute value in '$file'"; + $docs->{ $sys }{ $docID }{ FILE } = $file; + foreach my $currentSeg ( $currentDoc->get_xpath( './/seg' ) ) + { + my $segID = $currentSeg->{ 'att' }->{ 'id' }; + die "No segment 'id' attribute value in '$file'" if (! 
defined $segID); + my $segData = $currentSeg->text; + ($docs->{$sys}{$docID}{SEGS}{$segID}) = &{ $TOKENIZATION }( $segData ); + } + } + } + } + else + { + #read data from file + open (FILE, $file) or die "\nUnable to open translation data file '$file'", $usage; + binmode FILE, ":utf8"; + $data .= $_ while <FILE>; + close (FILE); + + #get tag info + while (($tag, $span, $data) = extract_sgml_tag_and_span ($set_tag, $data)) + { + die "\n\nFATAL INPUT ERROR: no tag attribute '$name' in file '$file'\n\n" + unless ($id) = extract_sgml_tag_attribute ($name="SetID", $tag); + die "\n\nFATAL INPUT ERROR: no tag attribute '$name' in file '$file'\n\n" + unless ($src) = extract_sgml_tag_attribute ($name="SrcLang", $tag); + die "\n\nFATAL INPUT ERROR: $name ('$src') in file '$file' inconsistent\n" + ." with $name of source ('$src_lang')\n\n" + unless $src eq $src_lang; + die "\n\nFATAL INPUT ERROR: no tag attribute '$name' in file '$file'\n\n" + unless ($tgt) = extract_sgml_tag_attribute ($name="TrgLang", $tag); + die "\n\nFATAL INPUT ERROR: $name ('$tgt') in file '$file' inconsistent\n" + ." with $name of the evaluation ('$tgt_lang')\n\n" + unless (not defined $tgt_lang or $tgt eq $tgt_lang); + $tgt_lang = $tgt; + + my $mtdata = $span; + while (($tag, $span, $mtdata) = extract_sgml_tag_and_span ("Doc", $mtdata)) + { + die "\n\nFATAL INPUT ERROR: no tag attribute '$name' in file '$file'\n\n" + unless (my $sys) = extract_sgml_tag_attribute ($name="SysID", $tag); + die "\n\nFATAL INPUT ERROR: no tag attribute '$name' in file '$file'\n\n" + unless $doc = extract_sgml_tag_attribute ($name="DocID", $tag); + die "\n\nFATAL INPUT ERROR: document '$doc' for system '$sys' in file '$file'\n" + ." 
previously loaded from file '$docs->{$sys}{$doc}{FILE}'\n\n" + unless (not defined $docs->{$sys}{$doc}); + + $span =~ s/[\s\n\r]+/ /g; # concatenate records + my $nseg=0, my $seg_data = $span; + while (($tag, $span, $seg_data) = extract_sgml_tag_and_span ("Seg", $seg_data)) + { + die "\n\nFATAIL INPUT ERROR: no tag attribute '$name' in file '$file'\n\n" + unless $seg = extract_sgml_tag_attribute( $name="id", $tag ); + ($docs->{$sys}{$doc}{SEGS}{$seg}) = &{ $TOKENIZATION }( $span ); + $nseg++; + } + die "\n\nFATAL INPUT ERROR: no segments in document '$doc' in file '$file'\n\n" if $nseg == 0; + $docs->{$sys}{$doc}{FILE} = $file; + } + } + } + return $id; +} + +################################# + +sub check_MT_data +{ + @tst_sys = sort keys %tst_data; + @ref_sys = sort keys %ref_data; + + die "Not the same 'setid' attribute values across files" unless ( ( $src_id eq $tst_id ) && ( $src_id eq $ref_id ) ); + +#every evaluation document must be represented for every system and every reference + foreach my $doc (sort keys %eval_docs) + { + my $nseg_source = scalar( keys( %{$eval_docs{$doc}{SEGS}} ) ); + foreach my $sys (@tst_sys) + { + die "\n\nFATAL ERROR: no document '$doc' for system '$sys'\n\n" unless defined $tst_data{$sys}{$doc}; + my $nseg = scalar( keys( %{$tst_data{$sys}{$doc}{SEGS}} ) ); + die "\n\nFATAL ERROR: translated documents must contain the same # of segments as the source, but\n" + ." document '$doc' for system '$sys' contains $nseg segments, while\n" + ." the source document contains $nseg_source segments.\n\n" + unless $nseg == $nseg_source; + } + foreach my $sys (@ref_sys) + { + die "\n\nFATAL ERROR: no document '$doc' for reference '$sys'\n\n" unless defined $ref_data{$sys}{$doc}; + my $nseg = scalar( keys( %{$ref_data{$sys}{$doc}{SEGS}} ) ); + die "\n\nFATAL ERROR: translated documents must contain the same # of segments as the source, but\n" + ." document '$doc' for system '$sys' contains $nseg segments, while\n" + ." 
the source document contains $nseg_source segments.\n\n" + unless $nseg == $nseg_source; + } + } +} + +################################# + +sub compute_ngram_info +{ + my ($ref, $doc, $seg); + my (@wrds, $tot_wrds, %ngrams, $ngram, $mgram); + my (%ngram_count, @tot_ngrams); + + foreach $ref (keys %ref_data) + { + foreach $doc (keys %{$ref_data{$ref}}) + { + foreach $seg ( keys %{$ref_data{$ref}{$doc}{SEGS}}) + { + @wrds = split /\s+/, $ref_data{ $ref }{ $doc }{ SEGS }{ $seg }; + $tot_wrds += @wrds; + %ngrams = %{Words2Ngrams (@wrds)}; + foreach $ngram (keys %ngrams) + { + $ngram_count{$ngram} += $ngrams{$ngram}; + } + } + } + } + + foreach $ngram (keys %ngram_count) + { + @wrds = split / /, $ngram; + pop @wrds, $mgram = join " ", @wrds; + $ngram_info{$ngram} = - log ($mgram ? $ngram_count{$ngram}/$ngram_count{$mgram} : $ngram_count{$ngram}/$tot_wrds) / log 2; + if (defined $opt_x and $opt_x eq "ngram info") + { + @wrds = split / /, $ngram; + printf "ngram info:%9.4f%6d%6d%8d%3d %s\n", $ngram_info{$ngram}, $ngram_count{$ngram}, + $mgram ? 
$ngram_count{$mgram} : $tot_wrds, $tot_wrds, scalar @wrds, $ngram; + } + } +} + +################################# + +sub score_system +{ + my ($sys, $ref, $doc, $SCOREmt, $overallScore); + ($sys, $SCOREmt, $overallScore) = @_; + my ($ref_length, $match_cnt, $tst_cnt, $ref_cnt, $tst_info, $ref_info); + my ($cum_ref_length, @cum_match, @cum_tst_cnt, @cum_ref_cnt, @cum_tst_info, @cum_ref_info); + + $cum_ref_length = 0; + for (my $j=1; $j<=$max_Ngram; $j++) + { + $cum_match[$j] = $cum_tst_cnt[$j] = $cum_ref_cnt[$j] = $cum_tst_info[$j] = $cum_ref_info[$j] = 0; + } + foreach $doc (sort keys %eval_docs) + { + ($ref_length, $match_cnt, $tst_cnt, $ref_cnt, $tst_info, $ref_info) = score_document ($sys, $doc, $overallScore); + if ( $method eq "NIST" ) + { + my %DOCmt = (); + my $docScore = nist_score( scalar( @ref_sys ), $match_cnt, $tst_cnt, $ref_cnt, $tst_info, $ref_info, $sys, \%DOCmt ); + $overallScore->{ $sys }{ 'documents' }{ $doc }{ 'score' } = $docScore; + if ( $detail >= 1 ) + { + printf "$method score using 5-grams = %.4f for system \"$sys\" on document \"$doc\" (%d segments, %d words)\n", + $docScore, scalar keys %{$tst_data{$sys}{$doc}{SEGS}}, $tst_cnt->[1]; + } + } + + if ( $method eq "BLEU" ) + { + my %DOCmt = (); + my $docScore = &{$BLEU_SCORE}( $ref_length, $match_cnt, $tst_cnt, $sys, \%DOCmt ); + $overallScore->{ $sys }{ 'documents' }{ $doc }{ 'score' } = $docScore; + if ( $detail >= 1 ) + { + printf "$method score using 4-grams = %.4f for system \"$sys\" on document \"$doc\" (%d segments, %d words)\n", + $docScore, scalar keys %{$tst_data{$sys}{$doc}{SEGS}}, $tst_cnt->[1]; + } + } + + $cum_ref_length += $ref_length; + for (my $j=1; $j<=$max_Ngram; $j++) + { + $cum_match[$j] += $match_cnt->[$j]; + $cum_tst_cnt[$j] += $tst_cnt->[$j]; + $cum_ref_cnt[$j] += $ref_cnt->[$j]; + $cum_tst_info[$j] += $tst_info->[$j]; + $cum_ref_info[$j] += $ref_info->[$j]; + printf "document info: $sys $doc %d-gram %d %d %d %9.4f %9.4f\n", $j, $match_cnt->[$j], + $tst_cnt->[$j], 
$ref_cnt->[$j], $tst_info->[$j], $ref_info->[$j] + if (defined $opt_x and $opt_x eq "document info"); + } + } + + if ($method eq "BLEU") + { + $overallScore->{ $sys }{ 'score' } = &{$BLEU_SCORE}($cum_ref_length, \@cum_match, \@cum_tst_cnt, $sys, $SCOREmt); + } + if ($method eq "NIST") + { + $overallScore->{ $sys }{ 'score' } = nist_score (scalar @ref_sys, \@cum_match, \@cum_tst_cnt, \@cum_ref_cnt, \@cum_tst_info, \@cum_ref_info, $sys, $SCOREmt); + } +} + +################################# + +sub score_document +{ + my ($sys, $ref, $doc, $overallScore); + ($sys, $doc, $overallScore) = @_; + my ($ref_length, $match_cnt, $tst_cnt, $ref_cnt, $tst_info, $ref_info); + my ($cum_ref_length, @cum_match, @cum_tst_cnt, @cum_ref_cnt, @cum_tst_info, @cum_ref_info); + + $cum_ref_length = 0; + for (my $j=1; $j<=$max_Ngram; $j++) + { + $cum_match[$j] = $cum_tst_cnt[$j] = $cum_ref_cnt[$j] = $cum_tst_info[$j] = $cum_ref_info[$j] = 0; + } + + # score each segment + foreach my $seg ( nsort keys( %{$tst_data{$sys}{$doc}{SEGS}} ) ) + { + + my @ref_segments = (); + foreach $ref (@ref_sys) + { + push @ref_segments, $ref_data{$ref}{$doc}{SEGS}{$seg}; + if ( $detail >= 3 ) + { + printf "ref '$ref', seg $seg: %s\n", $ref_data{$ref}{$doc}{SEGS}{$seg} + } + + } + + printf "sys '$sys', seg $seg: %s\n", $tst_data{$sys}{$doc}{SEGS}{$seg} if ( $detail >= 3 ); + ($ref_length, $match_cnt, $tst_cnt, $ref_cnt, $tst_info, $ref_info) = score_segment ($tst_data{$sys}{$doc}{SEGS}{$seg}, @ref_segments); + + if ( $method eq "BLEU" ) + { + my %DOCmt = (); + my $segScore = &{$BLEU_SCORE}($ref_length, $match_cnt, $tst_cnt, $sys, %DOCmt); + $overallScore->{ $sys }{ 'documents' }{ $doc }{ 'segments' }{ $seg }{ 'score' } = $segScore; + if ( $detail >= 2 ) + { + printf " $method score using 4-grams = %.4f for system \"$sys\" on segment $seg of document \"$doc\" (%d words)\n", $segScore, $tst_cnt->[1] + } + } + if ( $method eq "NIST" ) + { + my %DOCmt = (); + my $segScore = nist_score (scalar @ref_sys, $match_cnt, 
$tst_cnt, $ref_cnt, $tst_info, $ref_info, $sys, %DOCmt); + $overallScore->{ $sys }{ 'documents' }{ $doc }{ 'segments' }{ $seg }{ 'score' } = $segScore; + if ( $detail >= 2 ) + { + printf " $method score using 5-grams = %.4f for system \"$sys\" on segment $seg of document \"$doc\" (%d words)\n", $segScore, $tst_cnt->[1]; + } + } + $cum_ref_length += $ref_length; + for (my $j=1; $j<=$max_Ngram; $j++) + { + $cum_match[$j] += $match_cnt->[$j]; + $cum_tst_cnt[$j] += $tst_cnt->[$j]; + $cum_ref_cnt[$j] += $ref_cnt->[$j]; + $cum_tst_info[$j] += $tst_info->[$j]; + $cum_ref_info[$j] += $ref_info->[$j]; + } + } + return ($cum_ref_length, [@cum_match], [@cum_tst_cnt], [@cum_ref_cnt], [@cum_tst_info], [@cum_ref_info]); +} + +############################################################################################################################### +# function returning the shortest reference length +# takes as input: +# - currentLength : the current (shortest) reference length +# - referenceSentenceLength : the current reference sentence length +# - candidateSentenceLength : the current candidate sentence length (unused) +############################################################################################################################### +sub brevity_penalty_shortest +{ + my ( $currentLength, $referenceSentenceLength, $candidateSentenceLength ) = @_; + return ( $referenceSentenceLength < $currentLength ? $referenceSentenceLength : $currentLength ); +} + +############################################################################################################################### +# function returning the closest reference length (to the candidate sentence length) +# takes as input: +# - currentLength: the current (closest) reference length. 
+# - referenceSentenceLength : the current reference sentence length
+# - candidateSentenceLength : the current candidate sentence length
+# when two reference sentences are at the same distance, it will return the shortest reference sentence length
+# example of 4 iterations, given:
+# - one candidate sentence containing 7 tokens
+# - one reference translation containing 11 tokens
+# - one reference translation containing 8 tokens
+# - one reference translation containing 6 tokens
+# - one reference translation containing 7 tokens
+# the multiple invocations will return:
+# - currentLength is set to 11 (outside of this function)
+# - brevity_penalty_closest( 11, 8, 7 ) returns 8, since abs( 8 - 7 ) < abs( 11 - 7 )
+# - brevity_penalty_closest( 8, 6, 7 ) returns 6, since abs( 8 - 7 ) == abs( 6 - 7 ) AND 6 < 8
+# - brevity_penalty_closest( 7, 6, 7 ) returns 7, since abs( 7 - 7 ) < abs( 6 - 7 )
+###############################################################################################################################
+sub brevity_penalty_closest
+{
+  my ( $currentLength, $referenceSentenceLength, $candidateSentenceLength ) = @_;
+  my $result = $currentLength;
+  if ( abs( $candidateSentenceLength - $referenceSentenceLength ) <= abs( $candidateSentenceLength - $currentLength ) )
+  {
+    if ( abs( $candidateSentenceLength - $referenceSentenceLength ) == abs( $candidateSentenceLength - $currentLength ) )
+    {
+      if ( $currentLength > $referenceSentenceLength )
+      {
+        $result = $referenceSentenceLength;
+      }
+    }
+    else
+    {
+      $result = $referenceSentenceLength;
+    }
+  }
+  return $result;
+}
+
+#################################
+
+sub score_segment
+{
+  my ($tst_seg, @ref_segs) = @_;
+  my (@tst_wrds, %tst_ngrams, @match_count, @tst_count, @tst_info);
+  my (@ref_wrds, $ref_seg, %ref_ngrams, %ref_ngrams_max, @ref_count, @ref_info);
+  my ($ngram);
+  my (@nwrds_ref);
+  my $ref_length;
+
+  for (my $j=1; $j<= $max_Ngram; $j++)
+  {
+    $match_count[$j] = $tst_count[$j] = $ref_count[$j]
= $tst_info[$j] = $ref_info[$j] = 0; + } + +# get the ngram counts for the test segment + @tst_wrds = split /\s+/, $tst_seg; + %tst_ngrams = %{Words2Ngrams (@tst_wrds)}; + for (my $j=1; $j<=$max_Ngram; $j++) + { + # compute ngram counts + $tst_count[$j] = $j<=@tst_wrds ? (@tst_wrds - $j + 1) : 0; + } + +# get the ngram counts for the reference segments + foreach $ref_seg (@ref_segs) + { + @ref_wrds = split /\s+/, $ref_seg; + %ref_ngrams = %{Words2Ngrams (@ref_wrds)}; + foreach $ngram (keys %ref_ngrams) + { + # find the maximum # of occurrences + my @wrds = split / /, $ngram; + $ref_info[@wrds] += $ngram_info{$ngram}; + $ref_ngrams_max{$ngram} = defined $ref_ngrams_max{$ngram} ? max ($ref_ngrams_max{$ngram}, $ref_ngrams{$ngram}) : $ref_ngrams{$ngram}; + } + for (my $j=1; $j<=$max_Ngram; $j++) + { + # update ngram counts + $ref_count[$j] += $j<=@ref_wrds ? (@ref_wrds - $j + 1) : 0; + } + if ( not defined( $ref_length ) ) + { + $ref_length = scalar( @ref_wrds ); + } + else + { + $ref_length = &{$BLEU_BP}( $ref_length, scalar( @ref_wrds ), scalar( @tst_wrds ) ); + } + } + +# accumulate scoring stats for tst_seg ngrams that match ref_seg ngrams + foreach $ngram (keys %tst_ngrams) + { + next unless defined $ref_ngrams_max{$ngram}; + my @wrds = split / /, $ngram; + $tst_info[@wrds] += $ngram_info{$ngram} * min($tst_ngrams{$ngram},$ref_ngrams_max{$ngram}); + $match_count[@wrds] += my $count = min($tst_ngrams{$ngram},$ref_ngrams_max{$ngram}); + printf "%.2f info for each of $count %d-grams = '%s'\n", $ngram_info{$ngram}, scalar @wrds, $ngram + if $detail >= 3; + } + + return ($ref_length, [@match_count], [@tst_count], [@ref_count], [@tst_info], [@ref_info]); +} + +################################# + +sub bleu_score_nosmoothing +{ + my ($ref_length, $matching_ngrams, $tst_ngrams, $sys, $SCOREmt) = @_; + my $score = 0; + my $iscore = 0; + + for ( my $j = 1; $j <= $max_Ngram; ++$j ) + { + if ($matching_ngrams->[ $j ] == 0) + { + $SCOREmt->{ $j }{ $sys }{ cum }=0; + } + else + 
{ + my $len_score = min (0, 1-$ref_length/$tst_ngrams->[1]); + # Cumulative N-Gram score + $score += log( $matching_ngrams->[ $j ] / $tst_ngrams->[ $j ] ); + $SCOREmt->{ $j }{ $sys }{ cum } = exp( $score / $j + $len_score ); + # Individual N-Gram score + $iscore = log( $matching_ngrams->[ $j ] / $tst_ngrams->[ $j ] ); + $SCOREmt->{ $j }{ $sys }{ ind } = exp( $iscore ); + } + } + return $SCOREmt->{ 4 }{ $sys }{ cum }; +} + +############################################################################################################################### +# Default method used to compute the BLEU score, using smoothing. +# Note that the method used can be overridden using the '--no-smoothing' command-line argument +# The smoothing is computed by taking 1 / ( 2^k ), instead of 0, for each precision score whose matching n-gram count is null +# k is 1 for the first 'n' value for which the n-gram match count is null +# For example, if the text contains: +# - one 2-gram match +# - and (consequently) two 1-gram matches +# the n-gram count for each individual precision score would be: +# - n=1 => prec_count = 2 (two unigrams) +# - n=2 => prec_count = 1 (one bigram) +# - n=3 => prec_count = 1/2 (no trigram, taking 'smoothed' value of 1 / ( 2^k ), with k=1) +# - n=4 => prec_count = 1/4 (no fourgram, taking 'smoothed' value of 1 / ( 2^k ), with k=2) +############################################################################################################################### +sub bleu_score +{ + my ($ref_length, $matching_ngrams, $tst_ngrams, $sys, $SCOREmt) = @_; + my $score = 0; + my $iscore = 0; + my $exp_len_score = 0; + $exp_len_score = exp( min (0, 1 - $ref_length / $tst_ngrams->[ 1 ] ) ) if ( $tst_ngrams->[ 1 ] > 0 ); + my $smooth = 1; + for ( my $j = 1; $j <= $max_Ngram; ++$j ) + { + if ( $tst_ngrams->[ $j ] == 0 ) + { + $iscore = 0; + } + elsif ( $matching_ngrams->[ $j ] == 0 ) + { + $smooth *= 2; + $iscore = log( 1 / ( $smooth * $tst_ngrams->[ $j ] ) ); + } + else + { + 
$iscore = log( $matching_ngrams->[ $j ] / $tst_ngrams->[ $j ] ); + } + $SCOREmt->{ $j }{ $sys }{ ind } = exp( $iscore ); + $score += $iscore; + $SCOREmt->{ $j }{ $sys }{ cum } = exp( $score / $j ) * $exp_len_score; + } + return $SCOREmt->{ 4 }{ $sys }{ cum }; +} + +################################# + +sub nist_score +{ + my ($nsys, $matching_ngrams, $tst_ngrams, $ref_ngrams, $tst_info, $ref_info, $sys, $SCOREmt) = @_; + my $score = 0; + my $iscore = 0; + + for (my $n=1; $n<=$max_Ngram; $n++) + { + $score += $tst_info->[$n]/max($tst_ngrams->[$n],1); + $SCOREmt->{$n}{$sys}{cum} = $score * nist_length_penalty($tst_ngrams->[1]/($ref_ngrams->[1]/$nsys)); + $iscore = $tst_info->[$n]/max($tst_ngrams->[$n],1); + $SCOREmt->{$n}{$sys}{ind} = $iscore * nist_length_penalty($tst_ngrams->[1]/($ref_ngrams->[1]/$nsys)); + } + return $SCOREmt->{5}{$sys}{cum}; +} + +################################# + +sub Words2Ngrams +{ + #convert a string of words to an Ngram count hash + my %count = (); + + for (; @_; shift) + { + my ($j, $ngram, $word); + for ($j=0; $j<$max_Ngram and defined($word=$_[$j]); $j++) + { + $ngram .= defined $ngram ? 
" $word" : $word; + $count{$ngram}++; + } + } + return {%count}; +} + +################################# + +sub tokenization +{ + my ($norm_text) = @_; + +# language-independent part: + $norm_text =~ s///g; # strip "skipped" tags + $norm_text =~ s/-\n//g; # strip end-of-line hyphenation and join lines + $norm_text =~ s/\n/ /g; # join lines + $norm_text =~ s/"/"/g; # convert SGML tag for quote to " + $norm_text =~ s/&/&/g; # convert SGML tag for ampersand to & + $norm_text =~ s/</ + $norm_text =~ s/>/>/g; # convert SGML tag for greater-than to < + +# language-dependent part (assuming Western languages): + $norm_text = " $norm_text "; + $norm_text =~ tr/[A-Z]/[a-z]/ unless $preserve_case; + $norm_text =~ s/([\{-\~\[-\` -\&\(-\+\:-\@\/])/ $1 /g; # tokenize punctuation + $norm_text =~ s/([^0-9])([\.,])/$1 $2 /g; # tokenize period and comma unless preceded by a digit + $norm_text =~ s/([\.,])([^0-9])/ $1 $2/g; # tokenize period and comma unless followed by a digit + $norm_text =~ s/([0-9])(-)/$1 $2 /g; # tokenize dash when preceded by a digit + $norm_text =~ s/\s+/ /g; # one space only between words + $norm_text =~ s/^\s+//; # no leading space + $norm_text =~ s/\s+$//; # no trailing space + + return $norm_text; +} + + +sub tokenization_international +{ + my ($norm_text) = @_; + + $norm_text =~ s///g; # strip "skipped" tags + #$norm_text =~ s/\p{Hyphen}\p{Zl}//g; # strip end-of-line hyphenation and join lines + $norm_text =~ s/\p{Zl}/ /g; # join lines + + # replace entities + $norm_text =~ s/"/\"/g; # quote to " + $norm_text =~ s/&/&/g; # ampersand to & + $norm_text =~ s/<//g; # greater-than to > + $norm_text =~ s/'/\'/g; # apostrophe to ' + + $norm_text = lc( $norm_text ) unless $preserve_case; # lowercasing if needed + $norm_text =~ s/([^[:ascii:]])/ $1 /g if ( $split_non_ASCII ); + + # punctuation: tokenize any punctuation unless followed AND preceded by a digit + $norm_text =~ s/(\P{N})(\p{P})/$1 $2 /g; + $norm_text =~ s/(\p{P})(\P{N})/ $1 $2/g; + + $norm_text =~ 
s/(\p{S})/ $1 /g; # tokenize symbols + + $norm_text =~ s/\p{Z}+/ /g; # one space only between words + $norm_text =~ s/^\p{Z}+//; # no leading space + $norm_text =~ s/\p{Z}+$//; # no trailing space + + return $norm_text; +} + +################################# + +sub nist_length_penalty +{ + my ($ratio) = @_; + return 1 if $ratio >= 1; + return 0 if $ratio <= 0; + my $ratio_x = 1.5; + my $score_x = 0.5; + my $beta = -log($score_x)/log($ratio_x)/log($ratio_x); + return exp (-$beta*log($ratio)*log($ratio)); +} + +################################# + +sub date_time_stamp +{ + my ($sec, $min, $hour, $mday, $mon, $year, $wday, $yday, $isdst) = localtime(); + my @months = qw(Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec); + my ($date, $time); + $time = sprintf "%2.2d:%2.2d:%2.2d", $hour, $min, $sec; + $date = sprintf "%4.4s %3.3s %s", 1900+$year, $months[$mon], $mday; + return ($date, $time); +} + +################################# + +sub extract_sgml_tag_and_span +{ + my ($name, $data) = @_; + ($data =~ m|<$name\s*([^>]*)>(.*?)(.*)|si) ? ($1, $2, $3) : (); +} + +################################# + +sub extract_sgml_tag_attribute +{ + my ($name, $data) = @_; + ($data =~ m|$name\s*=\s*\"([^\"]*)\"|si) ? 
($1) : (); +} + +################################# + +sub max +{ + my ($max, $next); + + return unless defined ($max=pop); + while (defined ($next=pop)) + { + $max = $next if $next > $max; + } + return $max; +} + +################################# + +sub min +{ + my ($min, $next); + + return unless defined ($min=pop); + while (defined ($next=pop)) + { + $min = $next if $next < $min; + } + return $min; +} + +################################# + +sub printout_report +{ + if ( $METHOD eq "BOTH" ) + { + foreach my $sys (sort @tst_sys) + { + printf "NIST score = %2.4f BLEU score = %.4f for system \"$sys\"\n",$NISTmt{5}{$sys}{cum},$BLEUmt{4}{$sys}{cum}; + } + } + elsif ($METHOD eq "NIST" ) + { + foreach my $sys (sort @tst_sys) + { + printf "NIST score = %2.4f for system \"$sys\"\n",$NISTmt{5}{$sys}{cum}; + } + } + elsif ($METHOD eq "BLEU" ) + { + foreach my $sys (sort @tst_sys) + { + printf "\nBLEU score = %.4f for system \"$sys\"\n",$BLEUmt{4}{$sys}{cum}; + } + } + printf "\n# ------------------------------------------------------------------------\n\n"; + printf "Individual N-gram scoring\n"; + printf " 1-gram 2-gram 3-gram 4-gram 5-gram 6-gram 7-gram 8-gram 9-gram\n"; + printf " ------ ------ ------ ------ ------ ------ ------ ------ ------\n"; + + if ( ( $METHOD eq "BOTH" ) || ($METHOD eq "NIST") ) + { + foreach my $sys (sort @tst_sys) + { + printf " NIST:"; + for (my $i=1; $i<=$max_Ngram; $i++) + { + printf " %2.4f ",$NISTmt{$i}{$sys}{ind} + } + printf " \"$sys\"\n"; + } + printf "\n"; + } + + if ( ( $METHOD eq "BOTH" ) || ($METHOD eq "BLEU") ) + { + foreach my $sys (sort @tst_sys) + { + printf " BLEU:"; + for (my $i=1; $i<=$max_Ngram; $i++) + { + printf " %2.4f ",$BLEUmt{$i}{$sys}{ind} + } + printf " \"$sys\"\n"; + } + } + + printf "\n# ------------------------------------------------------------------------\n"; + printf "\nCumulative N-gram scoring\n"; + printf " 1-gram 2-gram 3-gram 4-gram 5-gram 6-gram 7-gram 8-gram 9-gram\n"; + printf " ------ ------ ------ 
------ ------ ------ ------ ------ ------\n"; + + if (( $METHOD eq "BOTH" ) || ($METHOD eq "NIST")) + { + foreach my $sys (sort @tst_sys) + { + printf " NIST:"; + for (my $i=1; $i<=$max_Ngram; $i++) + { + printf " %2.4f ",$NISTmt{$i}{$sys}{cum} + } + printf " \"$sys\"\n"; + } + } + printf "\n"; + if ( ( $METHOD eq "BOTH" ) || ($METHOD eq "BLEU") ) + { + foreach my $sys (sort @tst_sys) + { + printf " BLEU:"; + for (my $i=1; $i<=$max_Ngram; $i++) + { + printf " %2.4f ",$BLEUmt{$i}{$sys}{cum} + } + printf " \"$sys\"\n"; + } + } +} + +############################################################################################################################### +# Create three files, by using: +# - $prefix : the prefix used for the output file names +# - %overall : a hash containing seg/doc/sys-level scores: +# - $overall{ $SYSTEM_ID }{ 'score' } => system-level score +# - $overall{ $SYSTEM_ID }{ 'documents' }{ $DOCUMENT_ID }{ 'score' } => document-level score +# - $overall{ $SYSTEM_ID }{ 'documents' }{ $DOCUMENT_ID }{ 'segments' }{ $SEGMENT_ID } => segment-level score +############################################################################################################################### +sub outputMetricsMATR +{ + my ( $prefix, %overall ) = @_; + my $fileNameSys = $prefix . '-sys.scr'; + my $fileNameDoc = $prefix . '-doc.scr'; + my $fileNameSeg = $prefix . 
'-seg.scr'; + open FILEOUT_SYS, '>', $fileNameSys or die "Could not open file: ${fileNameSys}"; + open FILEOUT_DOC, '>', $fileNameDoc or die "Could not open file: ${fileNameDoc}"; + open FILEOUT_SEG, '>', $fileNameSeg or die "Could not open file: ${fileNameSeg}"; + foreach my $sys ( sort( keys( %overall ) ) ) + { + my $scoreSys = $overall{ $sys }{ 'score' }; + print FILEOUT_SYS "${tst_id}\t${sys}\t${scoreSys}\n"; + foreach my $doc ( sort( keys( %{$overall{ $sys }{ 'documents' }} ) ) ) + { + my $scoreDoc = $overall{ $sys }{ 'documents' }{ $doc }{ 'score' }; + print FILEOUT_DOC "${tst_id}\t${sys}\t${doc}\t${scoreDoc}\n"; + foreach my $seg ( nsort keys( %{$overall{ $sys }{ 'documents' }{ $doc }{ 'segments' }} ) ) + { + my $scoreSeg = $overall{ $sys }{ 'documents' }{ $doc }{ 'segments' }{ $seg }{ 'score' }; + print FILEOUT_SEG "${tst_id}\t${sys}\t${doc}\t${seg}\t${scoreSeg}\n"; + } + } + } + close FILEOUT_SEG; + close FILEOUT_DOC; + close FILEOUT_SYS; +} + diff --git a/mosesdecoder/scripts/generic/multi-bleu-detok.perl b/mosesdecoder/scripts/generic/multi-bleu-detok.perl new file mode 100644 index 0000000000000000000000000000000000000000..e1f69501e3c96c70c1e124e44336a31054ca6b63 --- /dev/null +++ b/mosesdecoder/scripts/generic/multi-bleu-detok.perl @@ -0,0 +1,214 @@ +#!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. + +# This file uses the internal tokenization of mteval-v13a.pl, +# giving the exact same (case-sensitive) results on untokenized text. +# Using this script with detokenized output and untokenized references is +# preferrable over multi-bleu.perl, since scores aren't affected by tokenization differences. +# +# like multi-bleu.perl , it supports plain text input and multiple references. 
+
+# $Id$
+use warnings;
+use strict;
+
+binmode(STDIN, ":utf8");
+use open ':encoding(UTF-8)';
+
+my $lowercase = 0;
+if ($ARGV[0] eq "-lc") {
+  $lowercase = 1;
+  shift;
+}
+
+my $stem = $ARGV[0];
+if (!defined $stem) {
+  print STDERR "usage: multi-bleu-detok.pl [-lc] reference < hypothesis\n";
+  print STDERR "Reads the references from reference or reference0, reference1, ...\n";
+  exit(1);
+}
+
+$stem .= ".ref" if !-e $stem && !-e $stem."0" && -e $stem.".ref0";
+
+my @REF;
+my $ref=0;
+while(-e "$stem$ref") {
+  &add_to_ref("$stem$ref",\@REF);
+  $ref++;
+}
+&add_to_ref($stem,\@REF) if -e $stem;
+die("ERROR: could not find reference file $stem") unless scalar @REF;
+
+# add additional references explicitly specified on the command line
+shift;
+foreach my $stem (@ARGV) {
+  &add_to_ref($stem,\@REF) if -e $stem;
+}
+
+
+
+sub add_to_ref {
+  my ($file,$REF) = @_;
+  my $s=0;
+  if ($file =~ /.gz$/) {
+    open(REF,"gzip -dc $file|") or die "Can't read $file";
+  } else {
+    open(REF,$file) or die "Can't read $file";
+  }
+  while(<REF>) {
+    chop;
+    $_ = tokenization($_);
+    push @{$$REF[$s++]}, $_;
+  }
+  close(REF);
+}
+
+my(@CORRECT,@TOTAL,$length_translation,$length_reference);
+my $s=0;
+while(<STDIN>) {
+  chop;
+  $_ = lc if $lowercase;
+  $_ = tokenization($_);
+  my @WORD = split;
+  my %REF_NGRAM = ();
+  my $length_translation_this_sentence = scalar(@WORD);
+  my ($closest_diff,$closest_length) = (9999,9999);
+  foreach my $reference (@{$REF[$s]}) {
+# print "$s $_ <=> $reference\n";
+    $reference = lc($reference) if $lowercase;
+    my @WORD = split(' ',$reference);
+    my $length = scalar(@WORD);
+    my $diff = abs($length_translation_this_sentence-$length);
+    if ($diff < $closest_diff) {
+      $closest_diff = $diff;
+      $closest_length = $length;
+      # print STDERR "$s: closest diff ".abs($length_translation_this_sentence-$length)."
= abs($length_translation_this_sentence-$length), setting len: $closest_length\n"; + } elsif ($diff == $closest_diff) { + $closest_length = $length if $length < $closest_length; + # from two references with the same closeness to me + # take the *shorter* into account, not the "first" one. + } + for(my $n=1;$n<=4;$n++) { + my %REF_NGRAM_N = (); + for(my $start=0;$start<=$#WORD-($n-1);$start++) { + my $ngram = "$n"; + for(my $w=0;$w<$n;$w++) { + $ngram .= " ".$WORD[$start+$w]; + } + $REF_NGRAM_N{$ngram}++; + } + foreach my $ngram (keys %REF_NGRAM_N) { + if (!defined($REF_NGRAM{$ngram}) || + $REF_NGRAM{$ngram} < $REF_NGRAM_N{$ngram}) { + $REF_NGRAM{$ngram} = $REF_NGRAM_N{$ngram}; +# print "$i: REF_NGRAM{$ngram} = $REF_NGRAM{$ngram}
\n"; + } + } + } + } + $length_translation += $length_translation_this_sentence; + $length_reference += $closest_length; + for(my $n=1;$n<=4;$n++) { + my %T_NGRAM = (); + for(my $start=0;$start<=$#WORD-($n-1);$start++) { + my $ngram = "$n"; + for(my $w=0;$w<$n;$w++) { + $ngram .= " ".$WORD[$start+$w]; + } + $T_NGRAM{$ngram}++; + } + foreach my $ngram (keys %T_NGRAM) { + $ngram =~ /^(\d+) /; + my $n = $1; + # my $corr = 0; +# print "$i e $ngram $T_NGRAM{$ngram}
\n"; + $TOTAL[$n] += $T_NGRAM{$ngram}; + if (defined($REF_NGRAM{$ngram})) { + if ($REF_NGRAM{$ngram} >= $T_NGRAM{$ngram}) { + $CORRECT[$n] += $T_NGRAM{$ngram}; + # $corr = $T_NGRAM{$ngram}; +# print "$i e correct1 $T_NGRAM{$ngram}
\n"; + } + else { + $CORRECT[$n] += $REF_NGRAM{$ngram}; + # $corr = $REF_NGRAM{$ngram}; +# print "$i e correct2 $REF_NGRAM{$ngram}
\n"; + } + } + # $REF_NGRAM{$ngram} = 0 if !defined $REF_NGRAM{$ngram}; + # print STDERR "$ngram: {$s, $REF_NGRAM{$ngram}, $T_NGRAM{$ngram}, $corr}\n" + } + } + $s++; +} +my $brevity_penalty = 1; +my $bleu = 0; + +my @bleu=(); + +for(my $n=1;$n<=4;$n++) { + if (defined ($TOTAL[$n])){ + $bleu[$n]=($TOTAL[$n])?$CORRECT[$n]/$TOTAL[$n]:0; + # print STDERR "CORRECT[$n]:$CORRECT[$n] TOTAL[$n]:$TOTAL[$n]\n"; + }else{ + $bleu[$n]=0; + } +} + +if ($length_reference==0){ + printf "BLEU = 0, 0/0/0/0 (BP=0, ratio=0, hyp_len=0, ref_len=0)\n"; + exit(1); +} + +if ($length_translation<$length_reference) { + $brevity_penalty = exp(1-$length_reference/$length_translation); +} +$bleu = $brevity_penalty * exp((my_log( $bleu[1] ) + + my_log( $bleu[2] ) + + my_log( $bleu[3] ) + + my_log( $bleu[4] ) ) / 4) ; +printf "BLEU = %.2f, %.1f/%.1f/%.1f/%.1f (BP=%.3f, ratio=%.3f, hyp_len=%d, ref_len=%d)\n", + 100*$bleu, + 100*$bleu[1], + 100*$bleu[2], + 100*$bleu[3], + 100*$bleu[4], + $brevity_penalty, + $length_translation / $length_reference, + $length_translation, + $length_reference; + +sub my_log { + return -9999999999 unless $_[0]; + return log($_[0]); +} + + + +sub tokenization +{ + my ($norm_text) = @_; + +# language-independent part: + $norm_text =~ s///g; # strip "skipped" tags + $norm_text =~ s/-\n//g; # strip end-of-line hyphenation and join lines + $norm_text =~ s/\n/ /g; # join lines + $norm_text =~ s/"/"/g; # convert SGML tag for quote to " + $norm_text =~ s/&/&/g; # convert SGML tag for ampersand to & + $norm_text =~ s/</ + $norm_text =~ s/>/>/g; # convert SGML tag for greater-than to < + +# language-dependent part (assuming Western languages): + $norm_text = " $norm_text "; + $norm_text =~ s/([\{-\~\[-\` -\&\(-\+\:-\@\/])/ $1 /g; # tokenize punctuation + $norm_text =~ s/([^0-9])([\.,])/$1 $2 /g; # tokenize period and comma unless preceded by a digit + $norm_text =~ s/([\.,])([^0-9])/ $1 $2/g; # tokenize period and comma unless followed by a digit + $norm_text =~ s/([0-9])(-)/$1 
$2 /g; # tokenize dash when preceded by a digit + $norm_text =~ s/\s+/ /g; # one space only between words + $norm_text =~ s/^\s+//; # no leading space + $norm_text =~ s/\s+$//; # no trailing space + + return $norm_text; +} diff --git a/mosesdecoder/scripts/generic/multi-bleu.perl b/mosesdecoder/scripts/generic/multi-bleu.perl new file mode 100644 index 0000000000000000000000000000000000000000..4394def3534a249b0268da2bb1f690d488a10176 --- /dev/null +++ b/mosesdecoder/scripts/generic/multi-bleu.perl @@ -0,0 +1,177 @@ +#!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. + +# $Id$ +use warnings; +use strict; + +my $lowercase = 0; +if ($ARGV[0] eq "-lc") { + $lowercase = 1; + shift; +} + +my $stem = $ARGV[0]; +if (!defined $stem) { + print STDERR "usage: multi-bleu.pl [-lc] reference < hypothesis\n"; + print STDERR "Reads the references from reference or reference0, reference1, ...\n"; + exit(1); +} + +$stem .= ".ref" if !-e $stem && !-e $stem."0" && -e $stem.".ref0"; + +my @REF; +my $ref=0; +while(-e "$stem$ref") { + &add_to_ref("$stem$ref",\@REF); + $ref++; +} +&add_to_ref($stem,\@REF) if -e $stem; +die("ERROR: could not find reference file $stem") unless scalar @REF; + +# add additional references explicitly specified on the command line +shift; +foreach my $stem (@ARGV) { + &add_to_ref($stem,\@REF) if -e $stem; +} + + + +sub add_to_ref { + my ($file,$REF) = @_; + my $s=0; + if ($file =~ /.gz$/) { + open(REF,"gzip -dc $file|") or die "Can't read $file"; + } else { + open(REF,$file) or die "Can't read $file"; + } + while() { + chomp; + push @{$$REF[$s++]}, $_; + } + close(REF); +} + +my(@CORRECT,@TOTAL,$length_translation,$length_reference); +my $s=0; +while() { + chomp; + $_ = lc if $lowercase; + my @WORD = split; + my %REF_NGRAM = (); + my $length_translation_this_sentence = scalar(@WORD); + my ($closest_diff,$closest_length) = (9999,9999); + 
foreach my $reference (@{$REF[$s]}) { +# print "$s $_ <=> $reference\n"; + $reference = lc($reference) if $lowercase; + my @WORD = split(' ',$reference); + my $length = scalar(@WORD); + my $diff = abs($length_translation_this_sentence-$length); + if ($diff < $closest_diff) { + $closest_diff = $diff; + $closest_length = $length; + # print STDERR "$s: closest diff ".abs($length_translation_this_sentence-$length)." = abs($length_translation_this_sentence-$length), setting len: $closest_length\n"; + } elsif ($diff == $closest_diff) { + $closest_length = $length if $length < $closest_length; + # from two references with the same closeness to me + # take the *shorter* into account, not the "first" one. + } + for(my $n=1;$n<=4;$n++) { + my %REF_NGRAM_N = (); + for(my $start=0;$start<=$#WORD-($n-1);$start++) { + my $ngram = "$n"; + for(my $w=0;$w<$n;$w++) { + $ngram .= " ".$WORD[$start+$w]; + } + $REF_NGRAM_N{$ngram}++; + } + foreach my $ngram (keys %REF_NGRAM_N) { + if (!defined($REF_NGRAM{$ngram}) || + $REF_NGRAM{$ngram} < $REF_NGRAM_N{$ngram}) { + $REF_NGRAM{$ngram} = $REF_NGRAM_N{$ngram}; +# print "$i: REF_NGRAM{$ngram} = $REF_NGRAM{$ngram}
\n"; + } + } + } + } + $length_translation += $length_translation_this_sentence; + $length_reference += $closest_length; + for(my $n=1;$n<=4;$n++) { + my %T_NGRAM = (); + for(my $start=0;$start<=$#WORD-($n-1);$start++) { + my $ngram = "$n"; + for(my $w=0;$w<$n;$w++) { + $ngram .= " ".$WORD[$start+$w]; + } + $T_NGRAM{$ngram}++; + } + foreach my $ngram (keys %T_NGRAM) { + $ngram =~ /^(\d+) /; + my $n = $1; + # my $corr = 0; +# print "$i e $ngram $T_NGRAM{$ngram}
\n"; + $TOTAL[$n] += $T_NGRAM{$ngram}; + if (defined($REF_NGRAM{$ngram})) { + if ($REF_NGRAM{$ngram} >= $T_NGRAM{$ngram}) { + $CORRECT[$n] += $T_NGRAM{$ngram}; + # $corr = $T_NGRAM{$ngram}; +# print "$i e correct1 $T_NGRAM{$ngram}
\n"; + } + else { + $CORRECT[$n] += $REF_NGRAM{$ngram}; + # $corr = $REF_NGRAM{$ngram}; +# print "$i e correct2 $REF_NGRAM{$ngram}
\n"; + } + } + # $REF_NGRAM{$ngram} = 0 if !defined $REF_NGRAM{$ngram}; + # print STDERR "$ngram: {$s, $REF_NGRAM{$ngram}, $T_NGRAM{$ngram}, $corr}\n" + } + } + $s++; +} +my $brevity_penalty = 1; +my $bleu = 0; + +my @bleu=(); + +for(my $n=1;$n<=4;$n++) { + if (defined ($TOTAL[$n])){ + $bleu[$n]=($TOTAL[$n])?$CORRECT[$n]/$TOTAL[$n]:0; + # print STDERR "CORRECT[$n]:$CORRECT[$n] TOTAL[$n]:$TOTAL[$n]\n"; + }else{ + $bleu[$n]=0; + } +} + +if ($length_reference==0){ + printf "BLEU = 0, 0/0/0/0 (BP=0, ratio=0, hyp_len=0, ref_len=0)\n"; + exit(1); +} + +if ($length_translation<$length_reference) { + $brevity_penalty = exp(1-$length_reference/$length_translation); +} +$bleu = $brevity_penalty * exp((my_log( $bleu[1] ) + + my_log( $bleu[2] ) + + my_log( $bleu[3] ) + + my_log( $bleu[4] ) ) / 4) ; +printf "BLEU = %.2f, %.1f/%.1f/%.1f/%.1f (BP=%.3f, ratio=%.3f, hyp_len=%d, ref_len=%d)\n", + 100*$bleu, + 100*$bleu[1], + 100*$bleu[2], + 100*$bleu[3], + 100*$bleu[4], + $brevity_penalty, + $length_translation / $length_reference, + $length_translation, + $length_reference; + + +print STDERR "It is not advisable to publish scores from multi-bleu.perl. The scores depend on your tokenizer, which is unlikely to be reproducible from your paper or consistent across research groups. Instead you should detokenize then use mteval-v14.pl, which has a standard tokenization. Scores from multi-bleu.perl can still be used for internal purposes when you have a consistent tokenizer.\n"; + +sub my_log { + return -9999999999 unless $_[0]; + return log($_[0]); +} diff --git a/mosesdecoder/scripts/generic/multi_moses.py b/mosesdecoder/scripts/generic/multi_moses.py new file mode 100644 index 0000000000000000000000000000000000000000..97ffc2cdb32ee24a0032909471d29cc34ae57d18 --- /dev/null +++ b/mosesdecoder/scripts/generic/multi_moses.py @@ -0,0 +1,332 @@ +#!/usr/bin/env python + +# Written by Michael Denkowski +# +# This file is part of moses. 
Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. + +'''Parallelize decoding with multiple instances of moses on a local machine + +To use with mert-moses.pl, activate --multi-moses and set the number of moses +instances and threads per instance with --decoder-flags='--threads P:T:E' + +This script runs a specified number of moses instances, each using one or more +threads. The highest speed is generally seen with many single-threaded +instances while the lowest memory usage is seen with a single many-threaded +instance. It is recommended to use the maximum number of instances that will +fit into memory (up to the number of available CPUs) and distribute CPUs across +them equally. For example, a machine with 32 CPUs that can fit 3 copies of +moses into memory would use --threads 2:11:10 for 2 instances with 11 threads +each and an extra instance with 10 threads (3 instances total using all CPUs). + +Memory mapped models can be shared by multiple processes and increase the number +of instances that can fit into memory: + +Mmaped phrase tables (Ulrich Germann) +http://www.statmt.org/moses/?n=Advanced.Incremental#ntoc3 + +Mmaped mapped language models (Kenneth Heafield) +http://www.statmt.org/moses/?n=FactoredTraining.BuildingLanguageModel#ntoc19 +''' + +import collections +import os +import Queue +import signal +import subprocess +import sys +import threading +import time + +HELP = '''Multiple process decoding with Moses + +Usage: + {} moses --config moses.ini [options] [decoder flags] + +Options: + --threads P:T:E + P: Number of parallel instances to run + T: Number of threads per instance + E: Number of threads in optional extra instance + (default 1:1:0, overrides [threads] in moses.ini. Specifying T + and E is optional, e.g. 
--threads 16 starts 16 single-threaded + instances) + --n-best-list nbest.out N [distinct]: location and size of N-best list + --show-weights: for mert-moses.pl, just call moses and exit + +Other options (decoder flags) are passed through to moses instances +''' + +# Defaults +INPUT = sys.stdin +PROCS = 1 +THREADS = 1 +EXTRA = 0 +DONE = threading.Event() +PID = os.getpid() +# A very long time, used as Queue operation timeout even though we don't +# actually want a timeout but we do want interruptibility +# (https://bugs.python.org/issue1360) +NEVER = 60 * 60 * 24 * 365 * 1000 + +# Single unit of computation: decode a line, output result, signal done +Task = collections.namedtuple('Task', ['id', 'line', 'out', 'event']) + + +def kill_main(msg): + '''kill -9 the main thread to stop everything immediately''' + sys.stderr.write('{}\n'.format(msg)) + os.kill(PID, signal.SIGKILL) + + +def gzopen(f): + '''Open plain or gzipped text''' + return gzip.open(f, 'rb') if f.endswith('.gz') else open(f, 'r') + + +def run_instance(cmd_base, threads, tasks, cpu_affinity, cpu_offset, n_best=False): + '''Run an instance of moses that processes tasks (input lines) from a + queue using a specified number of threads''' + cmd = cmd_base[:] + cmd.append('--threads') + cmd.append(str(threads)) + + if cpu_affinity: + cmd.append('--cpu-affinity-offset') + cmd.append(str(cpu_offset)) + + #print 'BEFORE' + #print cmd + #print 'AFTER\n' + + try: + # Queue of tasks instance is currently working on, limited to the number + # of threads * 2 (minimal buffering). The queue should be kept full for + # optimal CPU usage. 
+ work = Queue.Queue(maxsize=(threads * 2)) + # Multi-threaded instance + moses = subprocess.Popen(cmd, stdin=subprocess.PIPE, stdout=subprocess.PIPE) + + # Read and handle instance output as available + def handle_output(): + while True: + # Output line triggers task completion + line = moses.stdout.readline() + # End of output (instance finished) + if not line: + break + task = work.get(timeout=NEVER) + if n_best: + # Read and copy lines until sentinel line, copy real line id + # id ||| hypothesis words ||| feature scores ||| total score + (first_i, rest) = line.split(' ||| ', 1) + task.out.append(' ||| '.join((task.id, rest))) + while True: + line = moses.stdout.readline() + (i, rest) = line.split(' ||| ', 1) + # Sentinel + if i != first_i: + break + task.out.append(' ||| '.join((task.id, rest))) + else: + task.out.append(line) + # Signal task done + task.event.set() + # Output thread + handler = threading.Thread(target=handle_output, args=()) + # Daemon: guaranteed to finish before non-daemons + handler.setDaemon(True) + handler.start() + + # Input thread: take tasks as they are available and add them to work + # queue. Stop when DONE encountered. 
+ while True: + task = tasks.get(timeout=NEVER) + work.put(task, timeout=NEVER) + if task.event == DONE: + break + if n_best: + # Input line followed by blank line (sentinel) + moses.stdin.write(task.line) + moses.stdin.write('\n') + else: + moses.stdin.write(task.line) + + # Cleanup + moses.stdin.close() + moses.wait() + handler.join() + + except: + kill_main('Error with moses instance: see stderr') + + +def write_results(results, n_best=False, n_best_out=None): + '''Write out results (output lines) from a queue as they are populated''' + while True: + task = results.get(timeout=NEVER) + if task.event == DONE: + break + task.event.wait() + if n_best: + # Write top-best and N-best + # id ||| hypothesis words ||| feature scores ||| total score + top_best = task.out[0].split(' ||| ', 2)[1] + # Except don't write top-best if writing N-best to stdout "-" + if n_best_out != sys.stdout: + sys.stdout.write('{}\n'.format(top_best)) + sys.stdout.flush() + for line in task.out: + n_best_out.write(line) + n_best_out.flush() + else: + sys.stdout.write(task.out[0]) + sys.stdout.flush() + + +def main(argv): + # Defaults + moses_ini = None + input = INPUT + procs = PROCS + threads = THREADS + extra = EXTRA + n_best = False + n_best_file = None + n_best_size = None + n_best_distinct = False + n_best_out = None + show_weights = False + cpu_affinity = False + + # Decoder command + cmd = argv[1:] + + # Parse special options and remove from cmd + i = 1 + while i < len(cmd): + if cmd[i] in ('-f', '-config', '--config'): + moses_ini = cmd[i + 1] + # Do not remove from cmd + i += 2 + elif cmd[i] in ('-i', '-input-file', '--input-file'): + input = gzopen(cmd[i + 1]) + cmd = cmd[:i] + cmd[i + 2:] + elif cmd[i] in ('-th', '-threads', '--threads'): + # P:T:E + args = cmd[i + 1].split(':') + procs = int(args[0]) + if len(args) > 1: + threads = int(args[1]) + if len(args) > 2: + extra = int(args[2]) + cmd = cmd[:i] + cmd[i + 2:] + elif cmd[i] in ('-n-best-list', '--n-best-list'): + n_best = 
True + n_best_file = cmd[i + 1] + n_best_size = cmd[i + 2] + # Optional "distinct" + if i + 3 < len(cmd) and cmd[i + 3] == 'distinct': + n_best_distinct = True + cmd = cmd[:i] + cmd[i + 4:] + else: + cmd = cmd[:i] + cmd[i + 3:] + # Handled specially for mert-moses.pl + elif cmd[i] in ('-show-weights', '--show-weights'): + show_weights = True + # Do not remove from cmd + i += 1 + elif cmd[i] in ('-cpu-affinity', '--cpu-affinity'): + cpu_affinity = True + cmd = cmd[:i] + cmd[i + 1:] + else: + i += 1 + + # If mert-moses.pl passes -show-weights, just call moses + if show_weights: + sys.stdout.write(subprocess.check_output(cmd)) + sys.stdout.flush() + return + + # Check inputs + if not (len(cmd) > 0 and moses_ini): + sys.stderr.write(HELP.format(os.path.basename(argv[0]))) + sys.exit(2) + if not (os.path.isfile(cmd[0]) and os.access(cmd[0], os.X_OK)): + raise Exception('moses "{}" is not executable\n'.format(cmd[0])) + + # Report settings + sys.stderr.write('Moses flags: {}\n'.format(' '.join('\'{}\''.format(s) if ' ' in s else s for s in cmd[1:]))) + sys.stderr.write('Instances: {}\n'.format(procs)) + sys.stderr.write('Threads per: {}\n'.format(threads)) + if extra: + sys.stderr.write('Extra: {}\n'.format(extra)) + if n_best: + sys.stderr.write('N-best list: {} ({}{})\n'.format(n_best_file, n_best_size, ', distinct' if n_best_distinct else '')) + + # Task and result queues (buffer 8 * total threads input lines) + tasks = Queue.Queue(maxsize=(8 * ((procs * threads) + extra))) + results = Queue.Queue() + + # N-best capture + if n_best: + cmd.append('--n-best-list') + cmd.append('-') + cmd.append(n_best_size) + if n_best_distinct: + cmd.append('distinct') + if n_best_file == '-': + n_best_out = sys.stdout + else: + n_best_out = open(n_best_file, 'w') + + # Start instances + cpu_offset = -threads + instances = [] + for i in range(procs + (1 if extra else 0)): + if cpu_affinity: + cpu_offset += threads + + t = threading.Thread(target=run_instance, args=(cmd, (threads if i < 
procs else extra), tasks, cpu_affinity, cpu_offset, n_best)) + instances.append(t) + # Daemon: guaranteed to finish before non-daemons + t.setDaemon(True) + t.start() + #time.sleep(1) + + # Start results writer + writer = threading.Thread(target=write_results, args=(results, n_best, n_best_out)) + writer.start() + + # Main loop: queue task for each input line + id = 0 + while True: + line = input.readline() + if not line: + break + # (input, out lines, err lines, "done" event) + task = Task(str(id), line, [], threading.Event()) + results.put(task, timeout=NEVER) + tasks.put(task, timeout=NEVER) + id += 1 + + # Tell instances to exit + for t in instances: + tasks.put(Task(None, None, None, DONE), timeout=NEVER) + for t in instances: + t.join() + + # Stop results writer + results.put(Task(None, None, None, DONE), timeout=NEVER) + writer.join() + + # Cleanup + if n_best: + n_best_out.close() + + +if __name__ == '__main__': + try: + main(sys.argv) + except: + kill_main('Error with main I/O: see stderr') diff --git a/mosesdecoder/scripts/generic/ph_numbers.perl b/mosesdecoder/scripts/generic/ph_numbers.perl new file mode 100644 index 0000000000000000000000000000000000000000..618e6fe15ffe759bde0a2587d279836955a798c8 --- /dev/null +++ b/mosesdecoder/scripts/generic/ph_numbers.perl @@ -0,0 +1,106 @@ +#!/usr/bin/env perl + +package ph_numbers; + +# Script to recognize and replace numbers in Moses training corpora +# and decoder input +# +# (c) 2013 TAUS +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. 
# Replace every number recognized in $input with $numberSymbol.
#
# Arguments:
#   $input        - the line of text to process
#   $corpusMode   - if true, emit the bare placeholder (corpus/training mode)
#   $legacyMode   - selects between two decoder-input markup styles
#   $numberSymbol - placeholder token, defaults to '@num@'
#
# Returns the rewritten line; text outside the recognized number spans is
# copied through unchanged.
#
# NOTE(review): the $legacyMode and non-legacy branches below emit exactly
# the same text, and $number is computed but never used.  The upstream
# script presumably wrapped the placeholder in XML markup (with the
# original number as an attribute) that appears to have been stripped
# from this copy -- confirm against upstream ph_numbers.perl.
sub mark_numbers {
  my $input = shift;
  my $corpusMode = shift;
  my $legacyMode = shift;
  my $numberSymbol = shift || '@num@';

  # Spans [start, end) of every number found in the input line.
  my $numref = recognize($input);
  my $input_length = length($input);
  my $output = "";
  # Cursor into $input: everything before it has been copied to $output.
  my $position = 0;
  for(my $i = 0 ; $i < scalar(@{$numref}) ; $i++) {
    my $numstart = $numref->[$i][0];
    my $numend = $numref->[$i][1];
    # Copy the text between the previous number and this one verbatim.
    if($position < $numstart) {
      $output .= substr($input,$position,$numstart-$position);
    }
    my $number = substr($input,$numstart,$numend-$numstart);
    if($corpusMode) {
      # Training data: emit only the placeholder symbol.
      $output .= $numberSymbol;
    }
    else {
      if($legacyMode) {
        $output .= "$numberSymbol";
      }
      else {
        $output .= "$numberSymbol";
      }
    }
    $position = $numend;
  }
  # Copy whatever trails the last number.
  $output .= substr($input,$position);
  return $output;
}
# Read word-alignment lines ("0-1 1-0 2-2 ...") from standard input and
# print each line with every alignment pair reversed ("1-0 0-1 2-2 ...").
# NOTE: the read filehandle <STDIN> was stripped to "( )" in this copy by
# angle-bracket-eating extraction; restored here.
my $line;
while ($line = <STDIN>)
{
  chomp($line);
  my @toks = split(/ /, $line);

  foreach (my $i = 0; $i < @toks; ++$i)
  {
    my $tok = $toks[$i];
    # Each token must be exactly "src-tgt".
    my @alignPair = split(/-/, $tok);
    (@alignPair == 2) or die("Something wrong");
    # Emit the pair swapped; a trailing space per token is intentional
    # (matches the original output format).
    print $alignPair[1]."-".$alignPair[0]." ";
  }
  print "\n";
}
+ +# example +# ./score-parallel.perl 8 "gsort --batch-size=253" ./score ./extract.2.sorted.gz ./lex.2.f2e ./phrase-table.2.half.f2e --GoodTuring ./phrase-table.2.coc 0 +# ./score-parallel.perl 8 "gsort --batch-size=253" ./score ./extract.2.inv.sorted.gz ./lex.2.e2f ./phrase-table.2.half.e2f --Inverse 1 + +use warnings; +use strict; +use File::Basename; + +sub RunFork($); +sub systemCheck($); +sub GetSourcePhrase($); +sub NumStr($); +sub CutContextFile($$$); + +my $GZIP_EXEC; +if(`which pigz`) { + $GZIP_EXEC = 'pigz'; +} +else { + $GZIP_EXEC = 'gzip'; +} +print STDERR "using $GZIP_EXEC \n"; + +#my $EXTRACT_SPLIT_LINES = 5000000; +my $EXTRACT_SPLIT_LINES = 50000000; + +print STDERR "Started ".localtime() ."\n"; + +my $numParallel = $ARGV[0]; +$numParallel = 1 if $numParallel < 1; + +my $sortCmd = $ARGV[1]; +my $scoreCmd = $ARGV[2]; + +my $extractFile = $ARGV[3]; # 1st arg of extract argument +my $lexFile = $ARGV[4]; +my $ptHalf = $ARGV[5]; # output +my $inverse = 0; +my $sourceLabelsFile; +my $partsOfSpeechFile; +my $targetSyntacticPreferencesLabelsFile; + +my $otherExtractArgs= ""; +for (my $i = 6; $i < $#ARGV; ++$i) +{ + if ($ARGV[$i] eq '--SourceLabels') { + $sourceLabelsFile = $ARGV[++$i]; + $otherExtractArgs .= "--SourceLabels --SourceLabelCountsLHS "; + next; + } + if ($ARGV[$i] eq '--PartsOfSpeech') { + $partsOfSpeechFile = $ARGV[++$i]; + $otherExtractArgs .= "--PartsOfSpeech "; + next; + } + if ($ARGV[$i] eq '--TargetSyntacticPreferences') { + $targetSyntacticPreferencesLabelsFile = $ARGV[++$i]; + $otherExtractArgs .= "--TargetSyntacticPreferences "; + next; + } + if ($ARGV[$i] eq '--Inverse') { + $inverse = 1; + $otherExtractArgs .= $ARGV[$i] ." "; + next; + } + $otherExtractArgs .= $ARGV[$i] ." 
"; +} +#$scoreCmd $extractFile $lexFile $ptHalf $otherExtractArgs + +my $FlexibilityScore = $otherExtractArgs =~ /--FlexibilityScore/; +my $FlexibilityCmd = $otherExtractArgs; +$otherExtractArgs =~ s/--FlexibilityScore=\S+//; # don't pass flexibility_score command to score program +if ($FlexibilityCmd =~ /--FlexibilityScore=(\S+)/) { + $FlexibilityCmd = $1; +} + +my $doSort = $ARGV[$#ARGV]; # last arg + +my $TMPDIR=dirname($ptHalf) ."/tmp.$$"; +mkdir $TMPDIR; + +my $cmd; + +my $extractFileContext; +if ($FlexibilityScore) { + $extractFileContext = $extractFile; + $extractFileContext =~ s/extract./extract.context./; +} + +my $fileCount = 0; +if ($numParallel <= 1) +{ # don't do parallel. Just link the extract file into place + $cmd = "ln -s $extractFile $TMPDIR/extract.0.gz"; + if ($FlexibilityScore) { + $cmd .= " && ln -s $extractFileContext $TMPDIR/extract.context.0.gz"; + } + print STDERR "$cmd \n"; + systemCheck($cmd); + + $fileCount = 1; +} +else +{ # cut up extract file into smaller mini-extract files. + if ($extractFile =~ /\.gz$/) { + open(IN, "gunzip -c $extractFile |") || die "can't open pipe to $extractFile"; + } + else { + open(IN, $extractFile) || die "can't open $extractFile"; + } + + my $lastlineContext; + if ($FlexibilityScore) { + $lastlineContext = ""; + if ($extractFileContext =~ /\.gz$/) { + open(IN_CONTEXT, "gunzip -c $extractFileContext |") || die "can't open pipe to $extractFileContext"; + } + else { + open(IN_CONTEXT, $extractFileContext) || die "can't open $extractFileContext"; + } + } + + my $filePath = "$TMPDIR/extract.$fileCount.gz"; + open (OUT, "| $GZIP_EXEC -c > $filePath") or die "error starting $GZIP_EXEC $!"; + + my $lineCount = 0; + my $line; + my $prevSourcePhrase = ""; + while ($line=) + { + chomp($line); + ++$lineCount; + + if ($lineCount > $EXTRACT_SPLIT_LINES) + { # over line limit. 
Cut off at next source phrase change + my $sourcePhrase = GetSourcePhrase($line); + + if ($prevSourcePhrase eq "") + { # start comparing + $prevSourcePhrase = $sourcePhrase; + } + elsif ($sourcePhrase eq $prevSourcePhrase) + { # can't cut off yet. Do nothing + } + else + { # cut off, open next min-extract file & write to that instead + close OUT; + + if ($FlexibilityScore) { + $lastlineContext = CutContextFile($prevSourcePhrase, $fileCount, $lastlineContext); + } + $prevSourcePhrase = ""; + $lineCount = 0; + ++$fileCount; + my $filePath = $fileCount; + $filePath = "$TMPDIR/extract.$filePath.gz"; + open (OUT, "| $GZIP_EXEC -c > $filePath") or die "error starting $GZIP_EXEC $!"; + } + } + else + { # keep on writing to current mini-extract file + } + + print OUT "$line\n"; + + } + close OUT; + if ($FlexibilityScore) { + $lastlineContext = CutContextFile($prevSourcePhrase, $fileCount, $lastlineContext); + } + ++$fileCount; +} + + +# create run scripts +my @runFiles = (0..($numParallel-1)); +for (my $i = 0; $i < $numParallel; ++$i) +{ + my $path = "$TMPDIR/run.$i.sh"; + open(my $fh, ">", $path) or die "cannot open $path: $!"; + $runFiles[$i] = $fh; +} + +# write scoring of mini-extracts to run scripts +for (my $i = 0; $i < $fileCount; ++$i) +{ + my $numStr = NumStr($i); + + my $fileInd = $i % $numParallel; + my $fh = $runFiles[$fileInd]; + + my $cmd = "$scoreCmd $TMPDIR/extract.$i.gz $lexFile $TMPDIR/phrase-table.half.$numStr.gz $otherExtractArgs 2>> /dev/stderr \n"; + print STDERR $cmd; + + if ($FlexibilityScore) { + $cmd .= "gzip -cd $TMPDIR/phrase-table.half.$numStr.gz | $FlexibilityCmd $TMPDIR/extract.context.$i.gz"; + $cmd .= " --Inverse" if ($otherExtractArgs =~ /--Inverse/); + $cmd .= " --Hierarchical" if ($otherExtractArgs =~ /--Hierarchical/); + $cmd .= " | $GZIP_EXEC -c > $TMPDIR/phrase-table.half.$numStr.flex.gz\n"; + $cmd .= "mv $TMPDIR/phrase-table.half.$numStr.flex.gz $TMPDIR/phrase-table.half.$numStr.gz\n"; + } + + print $fh $cmd; +} + +# close run script 
files +for (my $i = 0; $i < $numParallel; ++$i) +{ + close($runFiles[$i]); + my $path = "$TMPDIR/run.$i.sh"; + systemCheck("chmod +x $path"); +} + +# run each score script in parallel +my @children; +for (my $i = 0; $i < $numParallel; ++$i) +{ + my $cmd = "$TMPDIR/run.$i.sh"; + my $pid = RunFork($cmd); + push(@children, $pid); +} + +# wait for everything is finished +foreach (@children) { + waitpid($_, 0); +} + +# merge & sort +$cmd = "\n\nOH SHIT. This should have been filled in \n\n"; +if ($fileCount == 1 && !$doSort && !$FlexibilityScore) +{ + my $numStr = NumStr(0); + $cmd = "mv $TMPDIR/phrase-table.half.$numStr.gz $ptHalf"; +} +else +{ + $cmd = "gunzip -c $TMPDIR/phrase-table.half.*.gz 2>> /dev/stderr"; + + if ($doSort) { + $cmd .= "| LC_ALL=C $sortCmd -T $TMPDIR "; + } + + $cmd .= " | $GZIP_EXEC -c > $ptHalf 2>> /dev/stderr "; +} +print STDERR $cmd; +systemCheck($cmd); + +# merge coc +my $numStr = NumStr(0); +my $cocPath = "$TMPDIR/phrase-table.half.$numStr.gz.coc"; + +if (-e $cocPath) +{ + my @arrayCOC; + my $line; + + # 1st file + open(FHCOC, $cocPath) || die "can't open pipe to $cocPath"; + while ($line = ) + { + my $coc = int($line); + push(@arrayCOC, $coc); + } + close(FHCOC); + + # all other files + for (my $i = 1; $i < $fileCount; ++$i) + { + $numStr = NumStr($i); + $cocPath = "$TMPDIR/phrase-table.half.$numStr.gz.coc"; + open(FHCOC, $cocPath) || die "can't open pipe to $cocPath"; + my $arrayInd = 0; + while ($line = ) + { + my $coc = int($line); + $arrayCOC[$arrayInd] += $coc; + + ++$arrayInd; + } + + close(FHCOC); + } + + # output + $cocPath = "$ptHalf.coc"; + open(FHCOC, ">", $cocPath) or die "cannot open $cocPath: $!"; + for (my $i = 0; $i < @arrayCOC; ++$i) + { + print FHCOC $arrayCOC[$i]."\n"; + } + close(FHCOC); +} + +# merge source labels files +if (!$inverse && defined($sourceLabelsFile)) +{ + my $cmd = "(echo \"GlueTop 0\"; echo \"GlueX 1\"; echo \"SSTART 2\"; echo \"SEND 3\"; cat $TMPDIR/phrase-table.half.*.gz.syntaxLabels.src | LC_ALL=C 
# Zero-pad a chunk index to (at least) 7 digits, so that shell globs such
# as phrase-table.half.*.gz enumerate the parts in numeric order.
# Equivalent to the previous 7-branch if/else ladder: numbers of 7 or
# more digits are returned unpadded, exactly as before.
sub NumStr($)
{
  my $i = shift;
  return sprintf("%07d", $i);
}
$lastline) = @_; + my $line; + my $sourcePhrase; + + my $filePath = "$TMPDIR/extract.context.$fileCount.gz"; + open (OUT_CONTEXT, "| $GZIP_EXEC -c > $filePath") or die "error starting $GZIP_EXEC $!"; + + if ($lastline ne "") { + print OUT_CONTEXT "$lastline\n"; + } + + #write all lines in context file until we meet last source phrase in extract file + while ($line=) + { + chomp($line); + $sourcePhrase = GetSourcePhrase($line); + print OUT_CONTEXT "$line\n"; + if ($sourcePhrase eq $lastsourcePhrase) {last;} + } + + #write all lines in context file that correspond to last source phrase in extract file + while ($line=) + { + chomp($line); + $sourcePhrase = GetSourcePhrase($line); + if ($sourcePhrase ne $lastsourcePhrase) {last;} + print OUT_CONTEXT "$line\n"; + } + + close(OUT_CONTEXT); + + return $line; + +} diff --git a/mosesdecoder/scripts/generic/score_parallel.py b/mosesdecoder/scripts/generic/score_parallel.py new file mode 100644 index 0000000000000000000000000000000000000000..0fb0c6fbed88c54b4ec3fed02e05bd0afe1041a4 --- /dev/null +++ b/mosesdecoder/scripts/generic/score_parallel.py @@ -0,0 +1,776 @@ +#! /usr/bin/env python +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. +# +# Script contributed by Precision Translation Tools. + +"""Run Moses `score` jobs in parallel. + +This script is a replacement for `score-parallel.perl`. The two are similar, +but there are differences in usage. In addition, this script can be called +directly from Python code without the need to run it as a separate process. 
+""" + +from __future__ import ( + absolute_import, + print_function, + unicode_literals, + ) + +__metaclass__ = type + +from argparse import ArgumentParser +from contextlib import contextmanager +from datetime import datetime +import errno +import gzip +from multiprocessing import Pool +import os +import os.path +import pipes +from shutil import rmtree +from subprocess import check_call +import sys +import tempfile + + +def get_unicode_type(): + """Return the Unicode string type appropriate to this Python version.""" + if sys.version_info.major <= 2: + # Unicode string type. In Python 2 this is the "unicode" type, + # while "str" is a binary string type. + return unicode + else: + # Unicode string type. In Python 3 this is the default "str" type. + # The binary string type is now called "bytes". + return str + + +UNICODE_TYPE = get_unicode_type() + + +class CommandLineError(Exception): + """Invalid command line.""" + + +class ProgramFailure(Exception): + """Failure, not a bug, which is reported neatly to the user.""" + + +def parse_args(): + """Parse command line arguments, return as `Namespace`.""" + parser = ArgumentParser(description=__doc__) + parser.add_argument( + '--extract-file', '-e', metavar='PATH', required=True, + help=( + "Path to input file: extract file (e.g. 'extract.sorted.gz' or " + "'extract.inv.sorted.gz'). Required.")) + parser.add_argument( + '--lex-file', '-l', metavar='PATH', required=True, + help=( + "Path to input file: lex file (e.g. 'lex.f2e' or 'lex.e2f'). " + "Required.")) + parser.add_argument( + '--output', '-o', metavar='PATH', required=True, + help=( + "Write phrase table to file PATH (e.g. 'phrase-table.half.f2e' " + "or 'phrase-table.half.e2f'). Required.")) + parser.add_argument( + '--inverse', '-i', action='store_true', + help="Inverse scoring. 
def normalize_path(optional_path=None):
    """Return a cleaned-up version of a given filesystem path, or None.

    Converts the path to the operating system's native separator
    conventions and removes redundancies like `.`.

    The return value will be `None`, an absolute path, or a relative
    path, same as the argument -- but with redundant separators,
    unnecessary detours through parent directories, and uses of the
    current directory "." removed.
    """
    if optional_path is None:
        return None
    # Collapse redundant separators, "." components, and "a/../" detours.
    cleaned = os.path.normpath(optional_path)
    # Convert both separator conventions to the native one.
    for sep in ('/', '\\'):
        cleaned = cleaned.replace(sep, os.path.sep)
    return cleaned
def add_exe_suffix(program):
    """Return the full filename for an executable.

    On Windows this appends a `.exe` suffix to the name; on other
    systems the name is returned unchanged.
    """
    suffix = '.exe' if os.name == 'nt' else ''
    return program + suffix
+ """ + for program in candidates: + executable = find_executable(add_exe_suffix(program), extra_path) + if executable is not None: + return executable + raise ProgramFailure( + "Could not find any of these executables in path: %s." + % ', '.join(candidates)) + + +def execute_shell(command, verbose=False): + """Run `command` string through the shell. + + Inherits environment, but sets `LC_ALL` to `C` for predictable results, + especially from sort commands. + + This uses a full-featured shell, including pipes, substitution, etc. So + remember to quote/escape arguments where appropriate! + """ + assert isinstance(command, UNICODE_TYPE), ( + "Wrong argument for execute_shell.") + if verbose: + print("Executing: %s" % command) + env = os.environ.copy() + if os.name == 'posix': + env['LC_ALL'] = 'C' + check_call(command, shell=True, env=env) + + +@contextmanager +def tempdir(keep=False): + """Context manager: temporary directory.""" + directory = tempfile.mkdtemp() + yield directory + if not keep: + rmtree(directory) + + +def make_dirs(path): + """Equivalent to `mkdir -p -- path`.""" + try: + os.makedirs(path) + except OSError as error: + if error.errno != errno.EEXIST: + raise + + +def open_file(path, mode='r'): + """Open a file, which may be gzip-compressed.""" + if path.endswith('.gz'): + return gzip.open(path, mode) + else: + return open(path, mode) + + +def count_lines(filename): + """Count the number of lines in `filename` (may be gzip-compressed).""" + count = 0 + with open_file(filename) as stream: + for _ in stream: + count += 1 + return count + + +def set_temp_dir(): + """Set temporary directory to `$MOSES_TEMP_DIR`, if set. + + Create the directory if necessary. 
+ """ + temp_dir = os.getenv('MOSES_TEMP_DIR') + if temp_dir is not None: + make_dirs(temp_dir) + tempfile.tempdir = temp_dir + + +def strip_newline(line): + """Remove trailing carriage return and/or line feed, if present.""" + if line.endswith('\n'): + line = line[:-1] + if line.endswith('\r'): + line = line[:-1] + return line + + +def open_chunk_file(split_dir, chunk_number): + """Open a file to write one chunk of the extract file.""" + return open_file( + os.path.join(split_dir, 'extract.%d.gz' % chunk_number), 'w') + + +def name_context_chunk_file(split_dir, chunk_number): + """Compose file name for one chunk of the extract context file.""" + return os.path.join( + split_dir, 'extract.context.%d.gz' % chunk_number) + + +def extract_source_phrase(line): + """Extract the source phrase from an extract-file line.""" + return line.split(b'|||', 1)[0] + + +def cut_context_file(last_source_phrase, chunk_file, last_line, + context_stream): + """Write one chunk of extract context file into its own file. + + :param last_source_phrase: Last source phrase that should be in the + chunk. Stop processing after this source phrase. + :param chunk_file: Path to the extract context file for this chunk. + :param last_line: Previously read line that may still need writing. + :param context_stream: Extract context file, opened for reading. + :return: Last line read from `context_stream`. This line will still + need processing. + """ + # TODO: Use open_file. + with gzip.open(chunk_file, 'w') as chunk: + if last_line is not None: + chunk.write('%s\n' % last_line) + + # Are we processing our last source phrase yet? + on_last_source_phrase = False + + # Write all lines in context file until we meet last source phrase + # in extract file. + for line in context_stream: + # Reading from a gzip file returns lines *including the newline*. + # Either way, we want to ignore carriage returns as well. 
def split_extract_files(split_dir, extract_file, extract_context_file=None,
                        jobs=1):
    """Split extract file into chunks, so we can process them in parallel.

    :param split_dir: A temporary directory where this function can write
        temporary files.  The caller must ensure that this directory will
        be cleaned up after it's done with the files.
    :param extract_file: Path to the (possibly gzipped) extract file.
    :param extract_context_file: Optional path to the matching extract
        context file (used for flexibility scoring).
    :param jobs: Number of parallel jobs the chunks will feed.
    :return: A list of tuples.  Each tuple holds a partial extract file,
        and the corresponding context file (or None).  The files may be in
        `split_dir`, or there may just be the original extract file.
    """
    if jobs == 1:
        # No splitting needed.  Read the original file(s).
        return [(extract_file, extract_context_file)]

    # Otherwise: split files.
    files = []
    num_lines = count_lines(extract_file)
    # Ceiling division.  Must use integer division ("//"): plain "/"
    # produces a float under Python 3 and tripped the assertion below.
    chunk_size = (num_lines + jobs - 1) // jobs
    assert isinstance(chunk_size, int)

    line_count = 0
    chunk_number = 0
    prev_source_phrase = None
    last_line_context = None
    extract_stream = open_file(extract_file)
    chunk_file = open_chunk_file(split_dir, chunk_number)
    # Context chunk path for the tuple we emit; stays None when no
    # context file was given.
    chunk_context_file = None
    context_stream = None
    if extract_context_file is not None:
        context_stream = open_file(extract_context_file)

    for line in extract_stream:
        line_count += 1
        line = line.decode('utf-8')
        line = strip_newline(line)
        if line_count >= chunk_size:
            # At or over chunk size.  Cut off at next source phrase change.
            # The line is text at this point while extract_source_phrase
            # operates on bytes, so split the source phrase off directly
            # (passing text to it raised TypeError before).
            source_phrase = line.split('|||', 1)[0]
            if prev_source_phrase is None:
                # Start looking for a different source phrase.
                prev_source_phrase = source_phrase
            elif source_phrase == prev_source_phrase:
                # Can't cut yet.  Still working on the same source phrase.
                pass
            else:
                # Hit first new source phrase after chunk limit.  Cut new
                # file(s).
                chunk_file.close()
                if extract_context_file is not None:
                    chunk_context_file = name_context_chunk_file(
                        split_dir, chunk_number)
                    last_line_context = cut_context_file(
                        prev_source_phrase, chunk_context_file,
                        last_line_context, context_stream)
                files.append((chunk_file.name, chunk_context_file))

                # Start on new chunk.
                prev_source_phrase = None
                line_count = 0
                chunk_number += 1
                chunk_file = open_chunk_file(split_dir, chunk_number)
        chunk_file.write(('%s\n' % line).encode('utf-8'))

    chunk_file.close()
    extract_stream.close()
    if extract_context_file is not None:
        # Cut the final context chunk.  Pass the context chunk's *path*
        # here -- the previous code passed the bare chunk number, which
        # cut_context_file cannot open as a file.
        chunk_context_file = name_context_chunk_file(split_dir, chunk_number)
        last_line_context = cut_context_file(
            prev_source_phrase, chunk_context_file, last_line_context,
            context_stream)
        context_stream.close()
    files.append((chunk_file.name, chunk_context_file))
    return files
+ """ + command = [ + args.score_exe, + extract_file, + args.lex_file, + half_file, + ] + if args.args not in (None, ''): + command.append(args.args) + other_args = build_score_args(args) + if other_args != '': + command.append(other_args) + if context_file is not None: + command += [ + '&&', + find_first_executable(['bzcat']), + half_file, + '|', + quote(args.flexibility_score), + quote(context_file), + ] + if args.inverse: + command.append('--Inverse') + if args.hierarchical: + command.append('--Hierarchical') + command += [ + '|', + quote(args.gzip_command), + '-c', + '>%s' % quote(flex_half_file), + ] + return ' '.join(command) + + +def score_parallel(split_dir, file_pairs, args): + """Run the `score` command in parallel. + + :param split_dir: Temporary directory where we can create split files. + :param file_pairs: Sequence of tuples for the input files, one tuple + per chunk of the work. Each tuple consists of a partial extract + file, and optionally a partial extract context file. + :param args: Arguments namespace. + :return: A list of tuples. Each tuple contains two file paths. The first + is for a partial half-phrase-table file. The second is for the + corresponding partial flex file, if a context file is given; or + `None` otherwise. + """ + partial_files = [] + # Pool of worker processes for executing the partial "score" invocations + # concurrently. + pool = Pool(args.jobs) + try: + for chunk_num, file_pair in enumerate(file_pairs): + half_file = os.path.join( + split_dir, 'phrase-table.half.%06d.gz' % chunk_num) + extract_file, context_file = file_pair + if context_file is None: + flex_half_file = None + else: + flex_half_file = os.path.join( + split_dir, 'phrase-table.half.%06d.flex.gz' % chunk_num) + # Pickling of arguments for the pool is awkward on Windows, so + # keep them simple. Compose the command line in the parent + # process, then hand them to worker processes which execute them. 
+ command_line = compose_score_command( + extract_file, context_file, half_file, flex_half_file, args) + pool.apply_async( + execute_shell, (command_line, ), {'verbose': args.verbose}) + partial_files.append((half_file, flex_half_file)) + pool.close() + except BaseException: + pool.terminate() + raise + finally: + pool.join() + return partial_files + + +def merge_and_sort(files, output, sort_command=None, gzip_exe=None, + verbose=False): + """Merge partial files. + + :param files: List of partial half-phrase-table files. + :param output: Path for resulting combined phrase-table file. + """ +# TODO: The Perl code mentioned "sort" and "flexibility_score" here. +# What do we do with those? + + # Sort whether we're asked to or not, as a way of combining the input + # files. + if sort_command == 'neandersort': + # Neandersort transparently decompresses input and compresses output. + check_call([ + 'neandersort', + '-o', output, + ] + files) + else: + command = ( + "%(gzip)s -c -d %(files)s | " + "%(sort)s | " + "%(gzip)s -c >>%(output)s" + % { + 'gzip': quote(gzip_exe), + 'sort': sort_command, + 'files': ' '.join(map(quote, files)), + 'output': quote(output), + }) + execute_shell(command, verbose=verbose) + + +def build_score_args(args): + """Compose command line for the `score` program.""" + command_line = [] + if args.labels_file: + command_line += [ + '--SourceLabels', + '--SourceLabelCountsLHS', + '--SourceLabelSet', + ] + if args.parts_of_speech: + command_line.append('--PartsOfSpeech') + if args.inverse: + command_line.append('--Inverse') + if args.args is not None: + command_line.append(args.args) + return ' '.join(command_line) + + +def list_existing(paths): + """Return, in the same order, those of the given files which exist.""" + return filter(os.path.exists, paths) + + +def compose_coc_path_for(path): + """Compose COC-file path for the given file.""" + return '%s.coc' % path + + +def read_cocs(path): + """Read COC file at `path`, return contents as tuple of 
ints.""" + with open(path) as lines: + return tuple( + int(line.rstrip('\r\n')) + for line in lines + ) + + +def add_cocs(original, additional): + """Add two tuples of COCs. Extend as needed.""" + assert not (original is None and additional is None), "No COCs to add!" + if original is None: + return additional + elif additional is None: + return original + else: + common = tuple(lhs + rhs for lhs, rhs in zip(original, additional)) + return ( + common + + tuple(original[len(common):]) + + tuple(additional[len(common):])) + + +def merge_coc(files, output): + """Merge COC files for the given partial files. + + Each COC file is a series of integers, one per line. This reads them, and + adds them up line-wise into one file of the same format: the sum of the + numbers the respective files have at line 1, the sum of the numbers the + respective files have at line 2, and so on. + """ + assert len(files) > 0, "No partial files - no work to do." + extract_files = [extract_file for extract_file, _ in files] + if not os.path.exists(compose_coc_path_for(extract_files[0])): + # Nothing to merge. + return + totals = None +# TODO: Shouldn't we just fail if any of these files is missing? + for coc_path in list_existing(map(compose_coc_path_for, extract_files)): + totals = add_cocs(totals, read_cocs(coc_path)) + + # Write to output file. + with open(output, 'w') as output_stream: + for entry in totals: + output_stream.write('%d\n' % entry) + + +def suffix_line_numbers(infile, outfile): + """Rewrite `infile` to `outfile`; suffix line number to each line. + + The line number is zero-based, and separated from the rest of the line + by a single space. 
+ """ + temp_file = '%s.numbering' % outfile + with open(infile, 'r') as instream, open(outfile, 'w') as outstream: + line_no = 0 + for line in instream: + outstream.write(line) + outstream.write(' %d\n' % line_no) + line_no += 1 + os.rename(temp_file, outfile) + + +def compose_source_labels_path_for(path): + """Return source labels file path for given file.""" + return '%s.syntaxLabels.src' % path + + +def merge_numbered_files(inputs, output, header_lines, sort_command, + verbose=False): + """Sort and merge files `inputs`, add header and line numbers. + + :param inputs: Iterable of input files. + :param output: Output file. + :header_lines: Iterable of header lines. + :sort_command: Command line for sorting input files. + """ + sort_temp = '%s.sorting' % output + with open(sort_temp, 'w') as stream: + for line in header_lines: + stream.write(line) + stream.write('\n') + execute_shell( + "%s %s >>%s" % ( + sort_command, + ' '.join(map(quote, inputs)), + quote(sort_temp)), + verbose=verbose) + suffix_line_numbers(sort_temp, output) + + +def merge_source_labels(files, output, sort_command, verbose=False): + """Merge source labels files.""" +# TODO: Shouldn't we just fail if any of these files is missing? + labels_files = list_existing(map(compose_source_labels_path_for, files)) + header = [ + 'GlueTop', + 'GlueX', + 'SSTART', + 'SEND', + ] + merge_numbered_files( + labels_files, output, header, sort_command, verbose=verbose) + + +def compose_parts_of_speech_path_for(path): + """Return parts-of-speech file path for given file.""" + return '%s.partsOfSpeech' % path + + +def merge_parts_of_speech(files, output, sort_command, verbose=False): + """Merge parts-of-speech files into output.""" +# TODO: Shouldn't we just fail if any of these files is missing? 
+ parts_files = list_existing(map(compose_parts_of_speech_path_for, files)) + header = [ + 'SSTART', + 'SEND', + ] + merge_numbered_files( + parts_files, output, header, sort_command, verbose=verbose) + + +def main(): + """Command-line entry point. Marshals and forwards to `score_parallel`.""" + args = parse_args() + sanitize_args(args) + set_temp_dir() + + if args.flexibility_score is None: + extract_context_file = None + else: + extract_context_file = args.extract_file.replace( + 'extract.', 'extract.context.') + + if args.verbose: + print("Started %s." % datetime.now()) + print("Using '%s' for gzip." % args.gzip_command) + + with tempdir(args.debug) as split_dir: + extract_files = split_extract_files( + split_dir, args.extract_file, + extract_context_file=extract_context_file, jobs=args.jobs) + + scored_files = score_parallel(split_dir, extract_files, args) + + if args.verbose: + sys.stderr.write("Finished score %s.\n" % datetime.now()) + +# TODO: Pass on "sort" and "flexibility-score" arguments? 
+ merge_and_sort( + [phrase_chunk for phrase_chunk, _ in scored_files], args.output, + sort_command=args.sort_command, gzip_exe=args.gzip_command, + verbose=args.verbose) + merge_coc(extract_files, compose_coc_path_for(args.output)) + + if not args.inverse and args.labels_file is not None: + if args.verbose: + print("Merging source labels files.") + merge_source_labels( + extract_files, args.labels_file, + sort_command=args.sort_command, verbose=args.verbose) + + if not args.inverse and args.parts_of_speech is not None: + if args.verbose: + print("Merging parts-of-speech files.") + merge_parts_of_speech( + extract_files, args.parts_of_speech, + sort_command=args.sort_command, verbose=args.verbose) + + +if __name__ == '__main__': + try: + main() + except ProgramFailure as error: + sys.stderr.write('%s\n' % error) + sys.exit(1) + except CommandLineError as error: + sys.stderr.write("Command line error: %s\n" % error) + sys.exit(2) diff --git a/mosesdecoder/scripts/generic/strip-xml.perl b/mosesdecoder/scripts/generic/strip-xml.perl new file mode 100644 index 0000000000000000000000000000000000000000..a5dbbaa37e9a52a9b459c57b38b1529ef0580e87 --- /dev/null +++ b/mosesdecoder/scripts/generic/strip-xml.perl @@ -0,0 +1,48 @@ +#!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. + +use warnings; +use strict; + +while (my $line = ) { + chomp($line); + #print "$line\n"; + + my $len = length($line); + my $inXML = 0; + my $prevSpace = 1; + my $prevBar = 0; + + for (my $i = 0; $i < $len; ++$i) { + my $c = substr($line, $i, 1); + if ($c eq "<" && !$prevBar) { + ++$inXML; + } + elsif ($c eq ">" && $inXML>0) { + --$inXML; + } + elsif ($prevSpace == 1 && $c eq " ") + { # duplicate space. 
Do nothing + } + elsif ($inXML == 0) { + if ($c eq " ") { + $prevSpace = 1; + $prevBar = 0; + } + elsif ($c eq "|") { + $prevSpace = 0; + $prevBar = 1; + } + else { + $prevSpace = 0; + $prevBar = 0; + } + print $c; + } + } + + print "\n"; +} + diff --git a/mosesdecoder/scripts/generic/trainlm-irst2.perl b/mosesdecoder/scripts/generic/trainlm-irst2.perl new file mode 100644 index 0000000000000000000000000000000000000000..8af372fac556a60039a7d159f6ebbb2386db1957 --- /dev/null +++ b/mosesdecoder/scripts/generic/trainlm-irst2.perl @@ -0,0 +1,72 @@ +#!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. + +# Compatible with sri LM-creating script, eg. +# ngram-count -order 5 -interpolate -wbdiscount -unk -text corpus.txt -lm lm.txt +# To use it in the EMS, add this to the [LM] section +# lm-training = "$moses-script-dir/generic/trainlm-irst2.perl -cores $cores -irst-dir $irst-dir" +# settings = "" +# Also, make sure that $irst-dir is defined (in the [LM] or [GENERAL] section. 
+# It should point to the root of the LM toolkit, eg +# irst-dir = /Users/hieu/workspace/irstlm/trunk/bin +# Set smoothing method in settings, if different from modified Kneser-Ney + +use warnings; +use strict; +use FindBin qw($RealBin); +use Getopt::Long; + +my $order = 3; # order of language model (default trigram) +my $corpusPath; # input text data +my $lmPath; # generated language model +my $cores = 2; # number of CPUs used +my $irstPath; # bin directory of IRSTLM +my $tempPath = "tmp"; # temp dir +my $pruneSingletons = 1; # 1 = prune singletons, 0 = keep singletons +my $smoothing = "msb"; # smoothing method: wb = witten-bell, sb = kneser-ney, msb = modified-kneser-ney +my $dummy; + +GetOptions("order=s" => \$order, + "text=s" => \$corpusPath, + "lm=s" => \$lmPath, + "cores=s" => \$cores, + "irst-dir=s" => \$irstPath, + "temp-dir=s" => \$tempPath, + "p=i" => \$pruneSingletons, # irstlm parameter: prune singletons + "s=s" => \$smoothing, # irstlm parameter: smoothing method + "interpolate!" => \$dummy, #ignore + "kndiscount!" 
=> \$dummy #ignore + ) or exit 1; + +#die("ERROR: please set order") unless defined($order); +die("ERROR: please set text") unless defined($corpusPath); +die("ERROR: please set lm") unless defined($lmPath); +die("ERROR: please set irst-dir") unless defined($irstPath); + + +$tempPath .= "/irstlm-build-tmp.$$"; +`mkdir -p $tempPath`; + +# add and +my $cmd = "cat $corpusPath | $irstPath/add-start-end.sh > $tempPath/setagged"; +print STDERR "EXECUTING $cmd\n"; +`$cmd`; + +# collect n-gram counts +$cmd = "$irstPath/ngt -i=$tempPath/setagged -n=$order -b=yes -o=$tempPath/counts"; +print STDERR "EXECUTING $cmd\n"; +`$cmd`; + +# build lm +$cmd = "$irstPath/tlm -o=$lmPath -lm=$smoothing -bo=yes -n=$order -tr=$tempPath/counts"; +$cmd .= " -ps=no" unless $pruneSingletons; +print STDERR "EXECUTING $cmd\n"; +`$cmd`; + +$cmd = "rm -rf $tempPath"; +print STDERR "EXECUTING $cmd\n"; +`$cmd`; + +print STDERR "FINISH.\n"; diff --git a/mosesdecoder/scripts/share/nonbreaking_prefixes/README.txt b/mosesdecoder/scripts/share/nonbreaking_prefixes/README.txt new file mode 100644 index 0000000000000000000000000000000000000000..2276a11386681c8dfe227c3d2546967dfb509677 --- /dev/null +++ b/mosesdecoder/scripts/share/nonbreaking_prefixes/README.txt @@ -0,0 +1,8 @@ +The language suffix can be found here: + +http://www.loc.gov/standards/iso639-2/php/code_list.php + +This code includes data from Daniel Naber's Language Tools (czech abbreviations). +This code includes data from czech wiktionary (also czech abbreviations). 
+ + diff --git a/mosesdecoder/scripts/share/nonbreaking_prefixes/nonbreaking_prefix.as b/mosesdecoder/scripts/share/nonbreaking_prefixes/nonbreaking_prefix.as new file mode 100644 index 0000000000000000000000000000000000000000..866ee158ab1081140f0fcd861c1773d87128fb34 --- /dev/null +++ b/mosesdecoder/scripts/share/nonbreaking_prefixes/nonbreaking_prefix.as @@ -0,0 +1,65 @@ +#Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker. + +#common exceptions +# Dr +ড + +#others + + +#phonetics +# A +এ +# B +বি +# C +সি +# D +ডি +# E +ই +# F +এফ +# G +জি +# H +এইচ +# I +আম +# J +জে +# K +কে +# L +এল +# M +এম +# N +এন +# O +হে +# P +পি +# Q +কিউ +# R +আর +# S +এস +# T +টি +# U +ইউ +# V +ভি +# W +ডব্লু +# X +এক্স +# Y +ওয়াই +# Z +জেড + +#consonants + diff --git a/mosesdecoder/scripts/share/nonbreaking_prefixes/nonbreaking_prefix.ca b/mosesdecoder/scripts/share/nonbreaking_prefixes/nonbreaking_prefix.ca new file mode 100644 index 0000000000000000000000000000000000000000..d048d2479d4e40a5ea2e969f39454e183cfff370 --- /dev/null +++ b/mosesdecoder/scripts/share/nonbreaking_prefixes/nonbreaking_prefix.ca @@ -0,0 +1,75 @@ +Dr +Dra +pàg +p +c +av +Sr +Sra +adm +esq +Prof +S.A +S.L +p.e +ptes +Sta +St +pl +màx +cast +dir +nre +fra +admdora +Emm +Excma +espf +dc +admdor +tel +angl +aprox +ca +dept +dj +dl +dt +ds +dg +dv +ed +entl +al +i.e +maj +smin +n +núm +pta +A +B +C +D +E +F +G +H +I +J +K +L +M +N +O +P +Q +R +S +T +U +V +W +X +Y +Z diff --git a/mosesdecoder/scripts/share/nonbreaking_prefixes/nonbreaking_prefix.cs b/mosesdecoder/scripts/share/nonbreaking_prefixes/nonbreaking_prefix.cs new file mode 100644 index 0000000000000000000000000000000000000000..dce6167aebf39ceee3881b2dbdcc38524f05501b --- /dev/null +++ b/mosesdecoder/scripts/share/nonbreaking_prefixes/nonbreaking_prefix.cs @@ -0,0 +1,390 @@ +Bc +BcA +Ing +Ing.arch +MUDr +MVDr +MgA +Mgr +JUDr +PhDr +RNDr +PharmDr +ThLic +ThDr +Ph.D +Th.D +prof +doc +CSc +DrSc 
+dr. h. c +PaedDr +Dr +PhMr +DiS +abt +ad +a.i +aj +angl +anon +apod +atd +atp +aut +bd +biogr +b.m +b.p +b.r +cca +cit +cizojaz +c.k +col +čes +čín +čj +ed +facs +fasc +fol +fot +franc +h.c +hist +hl +hrsg +ibid +il +ind +inv.č +jap +jhdt +jv +koed +kol +korej +kl +krit +lat +lit +m.a +maď +mj +mp +násl +např +nepubl +něm +no +nr +n.s +okr +odd +odp +obr +opr +orig +phil +pl +pokrač +pol +port +pozn +př.kr +př.n.l +přel +přeprac +příl +pseud +pt +red +repr +resp +revid +rkp +roč +roz +rozš +samost +sect +sest +seš +sign +sl +srv +stol +sv +šk +šk.ro +špan +tab +t.č +tis +tj +tř +tzv +univ +uspoř +vol +vl.jm +vs +vyd +vyobr +zal +zejm +zkr +zprac +zvl +n.p +např +než +MUDr +abl +absol +adj +adv +ak +ak. sl +akt +alch +amer +anat +angl +anglosas +arab +arch +archit +arg +astr +astrol +att +bás +belg +bibl +biol +boh +bot +bulh +círk +csl +č +čas +čes +dat +děj +dep +dět +dial +dór +dopr +dosl +ekon +epic +etnonym +eufem +f +fam +fem +fil +film +form +fot +fr +fut +fyz +gen +geogr +geol +geom +germ +gram +hebr +herald +hist +hl +hovor +hud +hut +chcsl +chem +ie +imp +impf +ind +indoevr +inf +instr +interj +ión +iron +it +kanad +katalán +klas +kniž +komp +konj + +konkr +kř +kuch +lat +lék +les +lid +lit +liturg +lok +log +m +mat +meteor +metr +mod +ms +mysl +n +náb +námoř +neklas +něm +nesklon +nom +ob +obch +obyč +ojed +opt +part +pas +pejor +pers +pf +pl +plpf + +práv +prep +předl +přivl +r +rcsl +refl +reg +rkp +ř +řec +s +samohl +sg +sl +souhl +spec +srov +stfr +střv +stsl +subj +subst +superl +sv +sz +táz +tech +telev +teol +trans +typogr +var +vedl +verb +vl. jm +voj +vok +vůb +vulg +výtv +vztaž +zahr +zájm +zast +zejm + +zeměd +zkr +zř +mj +dl +atp +sport +Mgr +horn +MVDr +JUDr +RSDr +Bc +PhDr +ThDr +Ing +aj +apod +PharmDr +pomn +ev +slang +nprap +odp +dop +pol +st +stol +p. n. l +před n. l +n. l +př. Kr +po Kr +př. n. l +odd +RNDr +tzv +atd +tzn +resp +tj +p +br +č. j +čj +č. p +čp +a. s +s. r. o +spol. s r. o +p. o +s. p +v. o. s +k. s +o. p. s +o. s +v. 
r +v z +ml +vč +kr +mld +hod +popř +ap +event +rus +slov +rum +švýc +P. T +zvl +hor +dol +S.O.S \ No newline at end of file diff --git a/mosesdecoder/scripts/share/nonbreaking_prefixes/nonbreaking_prefix.en b/mosesdecoder/scripts/share/nonbreaking_prefixes/nonbreaking_prefix.en new file mode 100644 index 0000000000000000000000000000000000000000..7c8f44dad9471360580c727ea1cfcae259a18c10 --- /dev/null +++ b/mosesdecoder/scripts/share/nonbreaking_prefixes/nonbreaking_prefix.en @@ -0,0 +1,123 @@ +#Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker. +#Special cases are included for prefixes that ONLY appear before 0-9 numbers. + +#any single upper case letter followed by a period is not a sentence ender (excluding I occasionally, but we leave it in) +#usually upper case letters are initials in a name +A +B +C +D +E +F +G +H +I +J +K +L +M +N +O +P +Q +R +S +T +U +V +W +X +Y +Z + +#List of titles. These are often followed by upper-case names, but do not indicate sentence breaks +Adj +Adm +Adv +Asst +Bart +Bldg +Brig +Bros +Capt +Cmdr +Col +Comdr +Con +Corp +Cpl +DR +Dr +Drs +Ens +Gen +Gov +Hon +Hr +Hosp +Insp +Lt +MM +MR +MRS +MS +Maj +Messrs +Mlle +Mme +Mr +Mrs +Ms +Msgr +Op +Ord +Pfc +Ph +Prof +Pvt +Rep +Reps +Res +Rev +Rt +Sen +Sens +Sfc +Sgt +Sr +St +Supt +Surg + +#misc - odd period-ending items that NEVER indicate breaks (p.m. does NOT fall into this category - it sometimes ends a sentence) +v +vs +i.e +rev +e.g +# rupees +Rs + +#Numbers only. These should only induce breaks when followed by a numeric sequence +# add NUMERIC_ONLY after the word for this function +#This case is mostly for the english "No." 
which can either be a sentence of its own, or +#if followed by a number, a non-breaking prefix +No #NUMERIC_ONLY# +Nos +Art #NUMERIC_ONLY# +Nr +pp #NUMERIC_ONLY# + +#month abbreviations +Jan +Feb +Mar +Apr +#May is a full word +Jun +Jul +Aug +Sep +Oct +Nov +Dec diff --git a/mosesdecoder/scripts/share/nonbreaking_prefixes/nonbreaking_prefix.es b/mosesdecoder/scripts/share/nonbreaking_prefixes/nonbreaking_prefix.es new file mode 100644 index 0000000000000000000000000000000000000000..d8b275518512e1a4bc05529fb4db4ba3bd4f8984 --- /dev/null +++ b/mosesdecoder/scripts/share/nonbreaking_prefixes/nonbreaking_prefix.es @@ -0,0 +1,118 @@ +#Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker. +#Special cases are included for prefixes that ONLY appear before 0-9 numbers. + +#any single upper case letter followed by a period is not a sentence ender +#usually upper case letters are initials in a name +A +B +C +D +E +F +G +H +I +J +K +L +M +N +O +P +Q +R +S +T +U +V +W +X +Y +Z + +# Period-final abbreviation list from http://www.ctspanish.com/words/abbreviations.htm + +A.C +Apdo +Av +Bco +CC.AA +Da +Dep +Dn +Dr +Dra +EE.UU +Excmo +FF.CC +Fil +Gral +J.C +Let +Lic +N.B +P.D +P.V.P +Prof +Pts +Rte +S.A +S.A.R +S.E +S.L +S.R.C +Sr +Sra +Srta +Sta +Sto +T.V.E +Tel +Ud +Uds +V.B +V.E +Vd +Vds +a/c +adj +admón +afmo +apdo +av +c +c.f +c.g +cap +cm +cta +dcha +doc +ej +entlo +esq +etc +f.c +gr +grs +izq +kg +km +mg +mm +núm +núm +p +p.a +p.ej +ptas +pág +págs +pág +págs +q.e.g.e +q.e.s.m +s +s.s.s +vid +vol diff --git a/mosesdecoder/scripts/share/nonbreaking_prefixes/nonbreaking_prefix.et b/mosesdecoder/scripts/share/nonbreaking_prefixes/nonbreaking_prefix.et new file mode 100644 index 0000000000000000000000000000000000000000..466c6a837618e47742be569ba3922fffcc0d9ef2 --- /dev/null +++ b/mosesdecoder/scripts/share/nonbreaking_prefixes/nonbreaking_prefix.et @@ -0,0 +1,138 @@ +#Anything in this file, followed by a period (and an 
upper-case word), does NOT +#indicate an end-of-sentence marker. Special cases are included for prefixes +#that ONLY appear before 0-9 numbers. + +#This list is compiled from omorfi database +#by Tommi A Pirinen. + + +#any single upper case letter followed by a period is not a sentence ender +A +B +C +D +E +F +G +H +I +J +K +L +M +N +O +P +Q +R +S +T +U +V +W +X +Y +Z +Å +Ä +Ö + +#List of titles. These are often followed by upper-case names, but do not indicate sentence breaks +alik +alil +amir +apul +apul.prof +arkkit +ass +assist +dipl +dipl.arkkit +dipl.ekon +dipl.ins +dipl.kielenk +dipl.kirjeenv +dipl.kosm +dipl.urk +dos +erikoiseläinl +erikoishammasl +erikoisl +erikoist +ev.luutn +evp +fil +ft +hallinton +hallintot +hammaslääket +jatk +jääk +kansaned +kapt +kapt.luutn +kenr +kenr.luutn +kenr.maj +kers +kirjeenv +kom +kom.kapt +komm +konst +korpr +luutn +maist +maj +Mr +Mrs +Ms +M.Sc +neuv +nimim +Ph.D +prof +puh.joht +pääll +res +san +siht +suom +sähköp +säv +toht +toim +toim.apul +toim.joht +toim.siht +tuom +ups +vänr +vääp +ye.ups +ylik +ylil +ylim +ylimatr +yliop +yliopp +ylip +yliv + +#misc - odd period-ending items that NEVER indicate breaks (p.m. does NOT fall +#into this category - it sometimes ends a sentence) +e.g +ent +esim +huom +i.e +ilm +l +mm +myöh +nk +nyk +par +po +t +v diff --git a/mosesdecoder/scripts/share/nonbreaking_prefixes/nonbreaking_prefix.fi b/mosesdecoder/scripts/share/nonbreaking_prefixes/nonbreaking_prefix.fi new file mode 100644 index 0000000000000000000000000000000000000000..466c6a837618e47742be569ba3922fffcc0d9ef2 --- /dev/null +++ b/mosesdecoder/scripts/share/nonbreaking_prefixes/nonbreaking_prefix.fi @@ -0,0 +1,138 @@ +#Anything in this file, followed by a period (and an upper-case word), does NOT +#indicate an end-of-sentence marker. Special cases are included for prefixes +#that ONLY appear before 0-9 numbers. + +#This list is compiled from omorfi database +#by Tommi A Pirinen. 
+ + +#any single upper case letter followed by a period is not a sentence ender +A +B +C +D +E +F +G +H +I +J +K +L +M +N +O +P +Q +R +S +T +U +V +W +X +Y +Z +Å +Ä +Ö + +#List of titles. These are often followed by upper-case names, but do not indicate sentence breaks +alik +alil +amir +apul +apul.prof +arkkit +ass +assist +dipl +dipl.arkkit +dipl.ekon +dipl.ins +dipl.kielenk +dipl.kirjeenv +dipl.kosm +dipl.urk +dos +erikoiseläinl +erikoishammasl +erikoisl +erikoist +ev.luutn +evp +fil +ft +hallinton +hallintot +hammaslääket +jatk +jääk +kansaned +kapt +kapt.luutn +kenr +kenr.luutn +kenr.maj +kers +kirjeenv +kom +kom.kapt +komm +konst +korpr +luutn +maist +maj +Mr +Mrs +Ms +M.Sc +neuv +nimim +Ph.D +prof +puh.joht +pääll +res +san +siht +suom +sähköp +säv +toht +toim +toim.apul +toim.joht +toim.siht +tuom +ups +vänr +vääp +ye.ups +ylik +ylil +ylim +ylimatr +yliop +yliopp +ylip +yliv + +#misc - odd period-ending items that NEVER indicate breaks (p.m. does NOT fall +#into this category - it sometimes ends a sentence) +e.g +ent +esim +huom +i.e +ilm +l +mm +myöh +nk +nyk +par +po +t +v diff --git a/mosesdecoder/scripts/share/nonbreaking_prefixes/nonbreaking_prefix.fr b/mosesdecoder/scripts/share/nonbreaking_prefixes/nonbreaking_prefix.fr new file mode 100644 index 0000000000000000000000000000000000000000..ccb61a7541c69964a1a21ae427b98d46113b6631 --- /dev/null +++ b/mosesdecoder/scripts/share/nonbreaking_prefixes/nonbreaking_prefix.fr @@ -0,0 +1,153 @@ +#Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker. +#Special cases are included for prefixes that ONLY appear before 0-9 numbers. +# +#any single upper case letter followed by a period is not a sentence ender +#usually upper case letters are initials in a name +#no French words end in single lower-case letters, so we throw those in too? 
+A +B +C +D +E +F +G +H +I +J +K +L +M +N +O +P +Q +R +S +T +U +V +W +X +Y +Z +#a +b +c +d +e +f +g +h +i +j +k +l +m +n +o +p +q +r +s +t +u +v +w +x +y +z + +# Period-final abbreviation list for French +A.C.N +A.M +art +ann +apr +av +auj +lib +B.P +boul +ca +c.-à-d +cf +ch.-l +chap +contr +C.P.I +C.Q.F.D +C.N +C.N.S +C.S +dir +éd +e.g +env +al +etc +E.V +ex +fasc +fém +fig +fr +hab +ibid +id +i.e +inf +LL.AA +LL.AA.II +LL.AA.RR +LL.AA.SS +L.D +LL.EE +LL.MM +LL.MM.II.RR +loc.cit +masc +MM +ms +N.B +N.D.A +N.D.L.R +N.D.T +n/réf +NN.SS +N.S +N.D +N.P.A.I +p.c.c +pl +pp +p.ex +p.j +P.S +R.A.S +R.-V +R.P +R.I.P +SS +S.S +S.A +S.A.I +S.A.R +S.A.S +S.E +sec +sect +sing +S.M +S.M.I.R +sq +sqq +suiv +sup +suppl +tél +T.S.V.P +vb +vol +vs +X.O +Z.I diff --git a/mosesdecoder/scripts/share/nonbreaking_prefixes/nonbreaking_prefix.ga b/mosesdecoder/scripts/share/nonbreaking_prefixes/nonbreaking_prefix.ga new file mode 100644 index 0000000000000000000000000000000000000000..d6c94694687ac79426c5c45154e69b3bb84b9e60 --- /dev/null +++ b/mosesdecoder/scripts/share/nonbreaking_prefixes/nonbreaking_prefix.ga @@ -0,0 +1,48 @@ + +A +B +C +D +E +F +G +H +I +J +K +L +M +N +O +P +Q +R +S +T +U +V +W +X +Y +Z +Á +É +Í +Ó +Ú + +Uacht +Dr +B.Arch + +m.sh +.i +Co +Cf +cf +i.e +r +Chr +lch #NUMERIC_ONLY# +lgh #NUMERIC_ONLY# +uimh #NUMERIC_ONLY# diff --git a/mosesdecoder/scripts/share/nonbreaking_prefixes/nonbreaking_prefix.gu b/mosesdecoder/scripts/share/nonbreaking_prefixes/nonbreaking_prefix.gu new file mode 100644 index 0000000000000000000000000000000000000000..856cdb9abb604ff16bafa795c7ad3038febbaa65 --- /dev/null +++ b/mosesdecoder/scripts/share/nonbreaking_prefixes/nonbreaking_prefix.gu @@ -0,0 +1,105 @@ +#Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker. 
+ +#common exceptions +# Rs +રૂ +# Dr +ડો +# Dr +ડૉ +# Mr +શ્રી + +#others + + +#phonetics +# A +એ +# B +બી +# C +સી +# D +ડી +# E +ઇ +# F +એફ +# G +જી +# H +એચ +# I +આઈ +# J +જે +# K +કે +# L +એલ +# M +એમ +# N +એન +# O +ઓ +# P +પી +# Q +ક્યૂ +# R +આર +# S +એસ +# T +ટી +# U +યુ +# V +વી +# W +ડબલ્યુ +# X +એક્સ +# Y +વાય +# Z +ઝેડ + +#consonants +ક +ખ +ગ +ઘ +ઙ +ચ +છ +જ +ઝ +ઞ +ટ +ઠ +ડ +ઢ +ણ +ત +થ +દ +ધ +ન +પ +ફ +બ +ભ +મ +ય +ર +લ +ળ +વ +શ +ષ +સ +હ + diff --git a/mosesdecoder/scripts/share/nonbreaking_prefixes/nonbreaking_prefix.hu b/mosesdecoder/scripts/share/nonbreaking_prefixes/nonbreaking_prefix.hu new file mode 100644 index 0000000000000000000000000000000000000000..78f8909378190411fb1c0248060bd679f609a2e8 --- /dev/null +++ b/mosesdecoder/scripts/share/nonbreaking_prefixes/nonbreaking_prefix.hu @@ -0,0 +1,103 @@ +#Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker. +#Special cases are included for prefixes that ONLY appear before 0-9 numbers. + +#any single upper case letter followed by a period is not a sentence ender (excluding I occasionally, but we leave it in) +#usually upper case letters are initials in a name +A +B +C +D +E +F +G +H +I +J +K +L +M +N +O +P +Q +R +S +T +U +V +W +X +Y +Z +Á +É +Í +Ó +Ö +Ő +Ú +Ü +Ű + +#List of titles. These are often followed by upper-case names, but do not indicate sentence breaks +Dr +dr +kb +Kb +vö +Vö +pl +Pl +ca +Ca +min +Min +max +Max +ún +Ún +prof +Prof +de +De +du +Du +Szt +St + +#Numbers only. These should only induce breaks when followed by a numeric sequence +# add NUMERIC_ONLY after the word for this function +#This case is mostly for the english "No." 
which can either be a sentence of its own, or +#if followed by a number, a non-breaking prefix + +# Month name abbreviations +jan #NUMERIC_ONLY# +Jan #NUMERIC_ONLY# +Feb #NUMERIC_ONLY# +feb #NUMERIC_ONLY# +márc #NUMERIC_ONLY# +Márc #NUMERIC_ONLY# +ápr #NUMERIC_ONLY# +Ápr #NUMERIC_ONLY# +máj #NUMERIC_ONLY# +Máj #NUMERIC_ONLY# +jún #NUMERIC_ONLY# +Jún #NUMERIC_ONLY# +Júl #NUMERIC_ONLY# +júl #NUMERIC_ONLY# +aug #NUMERIC_ONLY# +Aug #NUMERIC_ONLY# +Szept #NUMERIC_ONLY# +szept #NUMERIC_ONLY# +okt #NUMERIC_ONLY# +Okt #NUMERIC_ONLY# +nov #NUMERIC_ONLY# +Nov #NUMERIC_ONLY# +dec #NUMERIC_ONLY# +Dec #NUMERIC_ONLY# + +# Other abbreviations +tel #NUMERIC_ONLY# +Tel #NUMERIC_ONLY# +Fax #NUMERIC_ONLY# +fax #NUMERIC_ONLY# diff --git a/mosesdecoder/scripts/share/nonbreaking_prefixes/nonbreaking_prefix.is b/mosesdecoder/scripts/share/nonbreaking_prefixes/nonbreaking_prefix.is new file mode 100644 index 0000000000000000000000000000000000000000..5b8a71086e616dae17e0185decb22c7694d6ca19 --- /dev/null +++ b/mosesdecoder/scripts/share/nonbreaking_prefixes/nonbreaking_prefix.is @@ -0,0 +1,251 @@ +no #NUMERIC_ONLY# +No #NUMERIC_ONLY# +nr #NUMERIC_ONLY# +Nr #NUMERIC_ONLY# +nR #NUMERIC_ONLY# +NR #NUMERIC_ONLY# +a +b +c +d +e +f +g +h +i +j +k +l +m +n +o +p +q +r +s +t +u +v +w +x +y +z +^ +í +á +ó +æ +A +B +C +D +E +F +G +H +I +J +K +L +M +N +O +P +Q +R +S +T +U +V +W +X +Y +Z +ab.fn +a.fn +afs +al +alm +alg +andh +ath +aths +atr +ao +au +aukaf +áfn +áhrl.s +áhrs +ákv.gr +ákv +bh +bls +dr +e.Kr +et +ef +efn +ennfr +eink +end +e.st +erl +fél +fskj +fh +f.hl +físl +fl +fn +fo +forl +frb +frl +frh +frt +fsl +fsh +fs +fsk +fst +f.Kr +ft +fv +fyrrn +fyrrv +germ +gm +gr +hdl +hdr +hf +hl +hlsk +hljsk +hljv +hljóðv +hr +hv +hvk +holl +Hos +höf +hk +hrl +ísl +kaf +kap +Khöfn +kk +kg +kk +km +kl +klst +kr +kt +kgúrsk +kvk +leturbr +lh +lh.nt +lh.þt +lo +ltr +mlja +mljó +millj +mm +mms +m.fl +miðm +mgr +mst +mín +nf +nh +nhm +nl +nk +nmgr +no +núv +nt +o.áfr +o.m.fl +ohf +o.fl +o.s.frv +ófn +ób 
+óákv.gr +óákv +pfn +PR +pr +Ritstj +Rvík +Rvk +samb +samhlj +samn +samn +sbr +sek +sérn +sf +sfn +sh +sfn +sh +s.hl +sk +skv +sl +sn +so +ss.us +s.st +samþ +sbr +shlj +sign +skál +st +st.s +stk +sþ +teg +tbl +tfn +tl +tvíhlj +tvt +till +to +umr +uh +us +uppl +útg +vb +Vf +vh +vkf +Vl +vl +vlf +vmf +8vo +vsk +vth +þt +þf +þjs +þgf +þlt +þolm +þm +þml +þýð diff --git a/mosesdecoder/scripts/share/nonbreaking_prefixes/nonbreaking_prefix.kn b/mosesdecoder/scripts/share/nonbreaking_prefixes/nonbreaking_prefix.kn new file mode 100644 index 0000000000000000000000000000000000000000..1c20f61c229532031e538bf8f9419e594f581439 --- /dev/null +++ b/mosesdecoder/scripts/share/nonbreaking_prefixes/nonbreaking_prefix.kn @@ -0,0 +1,70 @@ +#Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker. + +#common exceptions +# Rs +ರೂ +# Dr +ಡಾ +# Mr +ಶ್ರೀ + +#others + + +#phonetics +# A +ಎ +# B +ಬಿ +# C +ಸಿ +# D +ಡಿ +# E +ಇ +# F +ಎಫ್ +# G +ಜಿ +# H +ಹೆಚ್ +ಎಚ್‌ +# I +ಐ +# J +ಜೆ +# K +ಕೆ +# L +ಎಲ್ +# M +ಎಂ +# N +ಎನ್ +# O +ಒ +# P +ಪಿ +# Q +ಕ್ಯೂ +# R +ಆರ್ +# S +ಎಸ್ +# T +ಟಿ +# U +ಯು +# V +ವಿ +# W +ಡಬ್ಲ್ಯೂ +# X +ಎಕ್ಸ್ +# Y +ವೈ +# Z +ಜೆಡ್ + +#consonants + diff --git a/mosesdecoder/scripts/share/nonbreaking_prefixes/nonbreaking_prefix.lt b/mosesdecoder/scripts/share/nonbreaking_prefixes/nonbreaking_prefix.lt new file mode 100644 index 0000000000000000000000000000000000000000..fa72196d9730bc8c614c3d72abe8decdc4efd580 --- /dev/null +++ b/mosesdecoder/scripts/share/nonbreaking_prefixes/nonbreaking_prefix.lt @@ -0,0 +1,698 @@ +# Anything in this file, followed by a period (and an upper-case word), +# does NOT indicate an end-of-sentence marker. +# Special cases are included for prefixes that ONLY appear before 0-9 numbers. 
+ +# Any single upper case letter followed by a period is not a sentence ender +# (excluding I occasionally, but we leave it in) +# usually upper case letters are initials in a name +A +Ā +B +C +Č +D +E +Ē +F +G +Ģ +H +I +Ī +J +K +Ķ +L +Ļ +M +N +Ņ +O +P +Q +R +S +Š +T +U +Ū +V +W +X +Y +Z +Ž + +# Initialis -- Džonas +Dz +Dž +Just + +# Day and month abbreviations +# m. menesis d. diena g. gimes +m +mėn +d +g +gim +# Pirmadienis Penktadienis +Pr +Pn +Pirm +Antr +Treč +Ketv +Penkt +Šešt +Sekm +Saus +Vas +Kov +Bal +Geg +Birž +Liep +Rugpj +Rugs +Spal +Lapkr +Gruod + +# Business, governmental, geographical terms +a +# aikštė +adv +# advokatas +akad +# akademikas +aklg +# akligatvis +akt +# aktorius +al +# alėja +A.V +# antspaudo vieta +aps +apskr +# apskritis +apyg +# apygarda +aps +apskr +# apskritis +asist +# asistentas +asmv +avd +# asmenvardis +a.k +asm +asm.k +# asmens kodas +atsak +# atsakingasis +atsisk +sąsk +# atsiskaitomoji sąskaita +aut +# autorius +b +k +b.k +# banko kodas +bkl +# bakalauras +bt +# butas +buv +# buvęs, -usi +dail +# dailininkas +dek +# dekanas +dėst +# dėstytojas +dir +# direktorius +dirig +# dirigentas +doc +# docentas +drp +# durpynas +dš +# dešinysis +egz +# egzempliorius +eil +# eilutė +ekon +# ekonomika +el +# elektroninis +etc +ež +# ežeras +faks +# faksas +fak +# fakultetas +gen +# generolas +gyd +# gydytojas +gv +# gyvenvietė +įl +# įlanka +Įn +# įnagininkas +insp +# inspektorius +pan +# ir panašiai +t.t +# ir taip toliau +k.a +# kaip antai +kand +# kandidatas +kat +# katedra +kyš +# kyšulys +kl +# klasė +kln +# kalnas +kn +# knyga +koresp +# korespondentas +kpt +# kapitonas +kr +# kairysis +kt +# kitas +kun +# kunigas +l +e +p +l.e.p +# laikinai einantis pareigas +ltn +# leitenantas +m +mst +# miestas +m.e +# mūsų eros +m.m +# mokslo metai +mot +# moteris +mstl +# miestelis +mgr +# magistras +mgnt +# magistrantas +mjr +# majoras +mln +# milijonas +mlrd +# milijardas +mok +# mokinys +mokyt +# mokytojas +moksl +# mokslinis +nkt +# 
nekaitomas +ntk +# neteiktinas +Nr +nr +# numeris +p +# ponas +p.d +a.d +# pašto dėžutė, abonentinė dėžutė +p.m.e +# prieš mūsų erą +pan +# ir panašiai +pav +# paveikslas +pavad +# pavaduotojas +pirm +# pirmininkas +pl +# plentas +plg +# palygink +plk +# pulkininkas; pelkė +pr +# prospektas +Kr +pr.Kr +# prieš Kristų +prok +# prokuroras +prot +# protokolas +pss +# pusiasalis +pšt +# paštas +pvz +# pavyzdžiui +r +# rajonas +red +# redaktorius +rš +# raštų kalbos +sąs +# sąsiuvinis +saviv +sav +# savivaldybė +sekr +# sekretorius +sen +# seniūnija, seniūnas +sk +# skaityk; skyrius +skg +# skersgatvis +skyr +sk +# skyrius +skv +# skveras +sp +# spauda; spaustuvė +spec +# specialistas +sr +# sritis +st +# stotis +str +# straipsnis +stud +# studentas +š +š.m +# šių metų +šnek +# šnekamosios +tir +# tiražas +tūkst +# tūkstantis +up +# upė +upl +# upelis +vad +# vadinamasis, -oji +vlsč +# valsčius +ved +# vedėjas +vet +# veterinarija +virš +# viršininkas, viršaitis +vyr +# vyriausiasis, -ioji; vyras +vyresn +# vyresnysis +vlsč +# valsčius +vs +# viensėdis +Vt +vt +# vietininkas +vtv +vv +# vietovardis +žml +# žemėlapis + +# Technical terms, abbreviations used in guidebooks, advertisments, etc. +# Generally lower-case. +air +# airiškai +amer +# amerikanizmas +anat +# anatomija +angl +# angl. angliskai +arab +# arabų +archeol +archit +asm +# asmuo +astr +# astronomija +austral +# australiškai +aut +# automobilis +av +# aviacija +bažn +bdv +# būdvardis +bibl +# Biblija +biol +# biologija +bot +# botanika +brt +# burtai, burtažodis. +brus +# baltarusių +buh +# buhalterija +chem +# chemija +col +# collectivum +con +conj +# conjunctivus, jungtukas +dab +# dab. 
dabartine +dgs +# daugiskaita +dial +# dialektizmas +dipl +dktv +# daiktavardis +džn +# dažnai +ekon +el +# elektra +esam +# esamasis laikas +euf +# eufemizmas +fam +# familiariai +farm +# farmacija +filol +# filologija +filos +# filosofija +fin +# finansai +fiz +# fizika +fiziol +# fiziologija +flk +# folkloras +fon +# fonetika +fot +# fotografija +geod +# geodezija +geogr +geol +# geologija +geom +# geometrija +glžk +gr +# graikų +gram +her +# heraldika +hidr +# hidrotechnika +ind +# Indų +iron +# ironiškai +isp +# ispanų +ist +istor +# istorija +it +# italų +įv +reikšm +įv.reikšm +# įvairiomis reikšmėmis +jap +# japonų +juok +# juokaujamai +jūr +# jūrininkystė +kalb +# kalbotyra +kar +# karyba +kas +# kasyba +kin +# kinematografija +klaus +# klausiamasis +knyg +# knyginis +kom +# komercija +komp +# kompiuteris +kosm +# kosmonautika +kt +# kitas +kul +# kulinarija +kuop +# kuopine +l +# laikas +lit +# literatūrinis +lingv +# lingvistika +log +# logika +lot +# lotynų +mat +# matematika +maž +# mažybinis +med +# medicina +medž +# medžioklė +men +# menas +menk +# menkinamai +metal +# metalurgija +meteor +min +# mineralogija +mit +# mitologija +mok +# mokyklinis +ms +# mįslė +muz +# muzikinis +n +# naujasis +neig +# neigiamasis +neol +# neologizmas +niek +# niekinamai +ofic +# oficialus +opt +# optika +orig +# original +p +# pietūs +pan +# panašiai +parl +# parlamentas +pat +# patarlė +paž +# pažodžiui +plg +# palygink +poet +# poetizmas +poez +# poezija +poligr +# poligrafija +polit +# politika +ppr +# paprastai +pranc +pr +# prancūzų, prūsų +priet +# prietaras +prek +# prekyba +prk +# perkeltine +prs +# persona, asmuo +psn +# pasenęs žodis +psich +# psichologija +pvz +# pavyzdžiui +r +# rytai +rad +# radiotechnika +rel +# religija +ret +# retai +rus +# rusų +sen +# senasis +sl +# slengas, slavų +sov +# sovietinis +spec +# specialus +sport +stat +# statyba +sudurt +# sudurtinis +sutr +# sutrumpintas +suv +# suvalkiečių +š +# šiaurė +šach +# šachmatai +šiaur +škot +# 
škotiškai +šnek +# šnekamoji +teatr +tech +techn +# technika +teig +# teigiamas +teis +# teisė +tekst +# tekstilė +tel +# telefonas +teol +# teologija +v +# tik vyriškosios, vakarai +t.p +t +p +# ir taip pat +t.t +# ir taip toliau +t.y +# tai yra +vaik +# vaikų +vart +# vartojama +vet +# veterinarija +vid +# vidurinis +vksm +# veiksmažodis +vns +# vienaskaita +vok +# vokiečių +vulg +# vulgariai +zool +# zoologija +žr +# žiūrėk +ž.ū +ž +ū +# žemės ūkis + +# List of titles. These are often followed by upper-case names, but do +# not indicate sentence breaks +# +# Jo Eminencija +Em. +# Gerbiamasis +Gerb +gerb +# malonus +malon +# profesorius +Prof +prof +# daktaras (mokslų) +Dr +dr +habil +med +# inž inžinierius +inž +Inž + + +#Numbers only. These should only induce breaks when followed by a numeric sequence +# add NUMERIC_ONLY after the word for this function +#This case is mostly for the english "No." which can either be a sentence of its own, or +#if followed by a number, a non-breaking prefix +No #NUMERIC_ONLY# diff --git a/mosesdecoder/scripts/share/nonbreaking_prefixes/nonbreaking_prefix.lv b/mosesdecoder/scripts/share/nonbreaking_prefixes/nonbreaking_prefix.lv new file mode 100644 index 0000000000000000000000000000000000000000..81754a17ae92e78e706295caaff274a3eddbc99e --- /dev/null +++ b/mosesdecoder/scripts/share/nonbreaking_prefixes/nonbreaking_prefix.lv @@ -0,0 +1,100 @@ +#Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker. +#Special cases are included for prefixes that ONLY appear before 0-9 numbers. + +#any single upper case letter followed by a period is not a sentence ender (excluding I occasionally, but we leave it in) +#usually upper case letters are initials in a name +A +Ā +B +C +Č +D +E +Ē +F +G +Ģ +H +I +Ī +J +K +Ķ +L +Ļ +M +N +Ņ +O +P +Q +R +S +Š +T +U +Ū +V +W +X +Y +Z +Ž + +#List of titles. 
These are often followed by upper-case names, but do not indicate sentence breaks +dr +Dr +med +prof +Prof +inž +Inž +ist.loc +Ist.loc +kor.loc +Kor.loc +v.i +vietn +Vietn + +#misc - odd period-ending items that NEVER indicate breaks (p.m. does NOT fall into this category - it sometimes ends a sentence) +a.l +t.p +pārb +Pārb +vec +Vec +inv +Inv +sk +Sk +spec +Spec +vienk +Vienk +virz +Virz +māksl +Māksl +mūz +Mūz +akad +Akad +soc +Soc +galv +Galv +vad +Vad +sertif +Sertif +folkl +Folkl +hum +Hum + +#Numbers only. These should only induce breaks when followed by a numeric sequence +# add NUMERIC_ONLY after the word for this function +#This case is mostly for the english "No." which can either be a sentence of its own, or +#if followed by a number, a non-breaking prefix +Nr #NUMERIC_ONLY# diff --git a/mosesdecoder/scripts/share/nonbreaking_prefixes/nonbreaking_prefix.ml b/mosesdecoder/scripts/share/nonbreaking_prefixes/nonbreaking_prefix.ml new file mode 100644 index 0000000000000000000000000000000000000000..35ffc8f97501515a2629c69066a88c82d65fb0ca --- /dev/null +++ b/mosesdecoder/scripts/share/nonbreaking_prefixes/nonbreaking_prefix.ml @@ -0,0 +1,67 @@ +#Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker. 
+ +#common exceptions +# Dr +ഡോ +# Mr +ശ്രീ + +#others + + +#phonetics +# A +എ +# B +ബി +# C +സി +# D +ഡി +# E +ഇ +# F +എഫ് +# G +ജി +# H +എച്ച് +# I +ഐ +# J +ജെ +# K +കെ +# L +എൽ +# M +എം +# N +എൻ +# O +ഒ +# P +പി +# Q +ക്യൂ +# R +ആർ +# S +എസ് +# T +ടി +# U +യു +# V +വി +# W +ഡബ്ല്യു +# X +എക്സ് +# Y +വൈ +# Z +സെഡ് + +#consonants + diff --git a/mosesdecoder/scripts/share/nonbreaking_prefixes/nonbreaking_prefix.mni b/mosesdecoder/scripts/share/nonbreaking_prefixes/nonbreaking_prefix.mni new file mode 100644 index 0000000000000000000000000000000000000000..22ecbae705712d988505a780af9811dd597ed6df --- /dev/null +++ b/mosesdecoder/scripts/share/nonbreaking_prefixes/nonbreaking_prefix.mni @@ -0,0 +1,65 @@ +#Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker. + +#common exceptions +# Dr +দা + +#others + + +#phonetics +# A +এ +# B +বি +# C +সি +# D +ডি +# E +ই +# F +এফ +# G +জি +# H +এইচ +# I +আম +# J +জে +# K +কে +# L +এল +# M +এম +# N +এন +# O +হে +# P +পি +# Q +কিউ +# R +আর +# S +এস +# T +টি +# U +ইউ +# V +ভি +# W +ডব্লু +# X +এক্স +# Y +ওয়াই +# Z +জেড + +#consonants + diff --git a/mosesdecoder/scripts/share/nonbreaking_prefixes/nonbreaking_prefix.mr b/mosesdecoder/scripts/share/nonbreaking_prefixes/nonbreaking_prefix.mr new file mode 100644 index 0000000000000000000000000000000000000000..1ece23c1289bab2624ed4ac6dd03c63b4a3e93a1 --- /dev/null +++ b/mosesdecoder/scripts/share/nonbreaking_prefixes/nonbreaking_prefix.mr @@ -0,0 +1,113 @@ +#Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker. 
+ +#common exceptions +# Rs +रु +# Dr +डॉ +# Dr +डा +# Mr +श्री + +#others + + +#phonetics +# A +ए +ऐ +# B +बी +# C +सी +# D +डी +# E +ई +# F +ऐफ +एफ +# G +जी +# H +ऐच +एच +# I +आइ +# J +जे +# K +के +# L +ऐल +एल +# M +ऐम +एम +# N +ऐन +एन +# O +ओ +# P +पी +# Q +क्यू +# R +आर +# S +ऐस +एस +# T +टी +# U +यू +# V +वी +# W +डब्ल्यू +# X +ऐक्स +एक्स +# Y +वाय +वाई +# Z +ज़ैड + +#consonants +क +ख +ग +घ +ङ +च +छ +ज +झ +ञ +ट +ठ +ड +ढ +ण +त +थ +द +ध +न +प +फ +ब +भ +म +य +र +ल +व +श +ष +स +ह + diff --git a/mosesdecoder/scripts/share/nonbreaking_prefixes/nonbreaking_prefix.nl b/mosesdecoder/scripts/share/nonbreaking_prefixes/nonbreaking_prefix.nl new file mode 100644 index 0000000000000000000000000000000000000000..c80c417722a7685170d7fc67cb18fa2eccb1bfaf --- /dev/null +++ b/mosesdecoder/scripts/share/nonbreaking_prefixes/nonbreaking_prefix.nl @@ -0,0 +1,115 @@ +#Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker. +#Special cases are included for prefixes that ONLY appear before 0-9 numbers. +#Sources: http://nl.wikipedia.org/wiki/Lijst_van_afkortingen +# http://nl.wikipedia.org/wiki/Aanspreekvorm +# http://nl.wikipedia.org/wiki/Titulatuur_in_het_Nederlands_hoger_onderwijs +#any single upper case letter followed by a period is not a sentence ender (excluding I occasionally, but we leave it in) +#usually upper case letters are initials in a name +A +B +C +D +E +F +G +H +I +J +K +L +M +N +O +P +Q +R +S +T +U +V +W +X +Y +Z + +#List of titles. These are often followed by upper-case names, but do not indicate sentence breaks +bacc +bc +bgen +c.i +dhr +dr +dr.h.c +drs +drs +ds +eint +fa +Fa +fam +gen +genm +ing +ir +jhr +jkvr +jr +kand +kol +lgen +lkol +Lt +maj +Mej +mevr +Mme +mr +mr +Mw +o.b.s +plv +prof +ritm +tint +Vz +Z.D +Z.D.H +Z.E +Z.Em +Z.H +Z.K.H +Z.K.M +Z.M +z.v + +#misc - odd period-ending items that NEVER indicate breaks (p.m. 
does NOT fall into this category - it sometimes ends a sentence) +#we seem to have a lot of these in dutch i.e.: i.p.v - in plaats van (in stead of) never ends a sentence +a.g.v +bijv +bijz +bv +d.w.z +e.c +e.g +e.k +ev +i.p.v +i.s.m +i.t.t +i.v.m +m.a.w +m.b.t +m.b.v +m.h.o +m.i +m.i.v +v.w.t + +#Numbers only. These should only induce breaks when followed by a numeric sequence +# add NUMERIC_ONLY after the word for this function +#This case is mostly for the english "No." which can either be a sentence of its own, or +#if followed by a number, a non-breaking prefix +Nr #NUMERIC_ONLY# +Nrs +nrs +nr #NUMERIC_ONLY# diff --git a/mosesdecoder/scripts/share/nonbreaking_prefixes/nonbreaking_prefix.or b/mosesdecoder/scripts/share/nonbreaking_prefixes/nonbreaking_prefix.or new file mode 100644 index 0000000000000000000000000000000000000000..8442c0b77470b3b2c9904f6d3b0b48846964a710 --- /dev/null +++ b/mosesdecoder/scripts/share/nonbreaking_prefixes/nonbreaking_prefix.or @@ -0,0 +1,101 @@ +#Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker. 
+ +#common exceptions +# Mr +ରୀ + +#others + + +#phonetics +# A + +# B + +# C + +# D + +# E + +# F + +# G + +# H + +# I + +# J + +# K + +# L + +# M + +# N + +# O + +# P + +# Q + +# R + +# S + +# T + +# U + +# V + +# W + +# X + +# Y + +# Z + + +#consonants +କ +ଖ +ଗ +ଘ +ଙ +ଚ +ଛ +ଜ +ଝ +ଞ +ଟ +ଠ +ଡ +ଢ +ଣ +ତ +ଥ +ଦ +ଧ +ନ +ପ +ଫ +ବ +ଵ +ଭ +ମ +ଯ +ୟ +ର +ଲ +ଳ +ୱ +ଶ +ଷ +ସ +ହ + diff --git a/mosesdecoder/scripts/share/nonbreaking_prefixes/nonbreaking_prefix.ru b/mosesdecoder/scripts/share/nonbreaking_prefixes/nonbreaking_prefix.ru new file mode 100644 index 0000000000000000000000000000000000000000..8a9e873133961448d6a460ed52260c15f756e342 --- /dev/null +++ b/mosesdecoder/scripts/share/nonbreaking_prefixes/nonbreaking_prefix.ru @@ -0,0 +1,293 @@ +# added Cyrillic uppercase letters [А-Я] +# removed 000D carriage return (this is not removed by chomp in tokenizer.perl, and prevents recognition of the prefixes) +# edited by Kate Young (nspaceanalysis@earthlink.net) 21 May 2013 +А +Б +В +Г +Д +Е +Ж +З +И +Й +К +Л +М +Н +О +П +Р +С +Т +У +Ф +Х +Ц +Ч +Ш +Щ +Ъ +Ы +Ь +Э +Ю +Я +A +B +C +D +E +F +G +H +I +J +K +L +M +N +O +P +Q +R +S +T +U +V +W +X +Y +Z +0гг +1гг +2гг +3гг +4гг +5гг +6гг +7гг +8гг +9гг +0г +1г +2г +3г +4г +5г +6г +7г +8г +9г +Xвв +Vвв +Iвв +Lвв +Mвв +Cвв +Xв +Vв +Iв +Lв +Mв +Cв +0м +1м +2м +3м +4м +5м +6м +7м +8м +9м +0мм +1мм +2мм +3мм +4мм +5мм +6мм +7мм +8мм +9мм +0см +1см +2см +3см +4см +5см +6см +7см +8см +9см +0дм +1дм +2дм +3дм +4дм +5дм +6дм +7дм +8дм +9дм +0л +1л +2л +3л +4л +5л +6л +7л +8л +9л +0км +1км +2км +3км +4км +5км +6км +7км +8км +9км +0га +1га +2га +3га +4га +5га +6га +7га +8га +9га +0кг +1кг +2кг +3кг +4кг +5кг +6кг +7кг +8кг +9кг +0т +1т +2т +3т +4т +5т +6т +7т +8т +9т +0г +1г +2г +3г +4г +5г +6г +7г +8г +9г +0мг +1мг +2мг +3мг +4мг +5мг +6мг +7мг +8мг +9мг +бульв +в +вв +г +га +гг +гл +гос +д +дм +доп +др +е +ед +ед +зам +и +инд +исп +Исп +к +кап +кг +кв +кл +км +кол +комн +коп +куб +л +лиц +лл +м +макс +мг +мин +мл +млн +млрд +мм +н +наб +нач +неуд 
+ном +о +обл +обр +общ +ок +ост +отл +п +пер +перераб +пл +пос +пр +просп +проф +р +ред +руб +с +сб +св +см +соч +ср +ст +стр +т +тел +Тел +тех +тт +туп +тыс +уд +ул +уч +физ +х +хор +ч +чел +шт +экз +э diff --git a/mosesdecoder/scripts/share/nonbreaking_prefixes/nonbreaking_prefix.sk b/mosesdecoder/scripts/share/nonbreaking_prefixes/nonbreaking_prefix.sk new file mode 100644 index 0000000000000000000000000000000000000000..1198d482964ef7f5b24b7031df02775c1649bd86 --- /dev/null +++ b/mosesdecoder/scripts/share/nonbreaking_prefixes/nonbreaking_prefix.sk @@ -0,0 +1,474 @@ +Bc +Mgr +RNDr +PharmDr +PhDr +JUDr +PaedDr +ThDr +Ing +MUDr +MDDr +MVDr +Dr +ThLic +PhD +ArtD +ThDr +Dr +DrSc +CSs +prof +obr +Obr +Č +č +absol +adj +admin +adr +Adr +adv +advok +afr +ak +akad +akc +akuz +et +al +alch +amer +anat +angl +Angl +anglosas +anorg +ap +apod +arch +archeol +archit +arg +art +astr +astrol +astron +atp +atď +austr +Austr +aut +belg +Belg +bibl +Bibl +biol +bot +bud +bás +býv +cest +chem +cirk +csl +čs +Čs +dat +dep +det +dial +diaľ +dipl +distrib +dokl +dosl +dopr +dram +duš +dv +dvojčl +dór +ekol +ekon +el +elektr +elektrotech +energet +epic +est +etc +etonym +eufem +európ +Európ +ev +evid +expr +fa +fam +farm +fem +feud +fil +filat +filoz +fi +fon +form +fot +fr +Fr +franc +Franc +fraz +fut +fyz +fyziol +garb +gen +genet +genpor +geod +geogr +geol +geom +germ +gr +Gr +gréc +Gréc +gréckokat +hebr +herald +hist +hlav +hosp +hromad +hud +hypok +ident +i.e +ident +imp +impf +indoeur +inf +inform +instr +int +interj +inšt +inštr +iron +jap +Jap +jaz +jedn +juhoamer +juhových +juhozáp +juž +kanad +Kanad +kanc +kapit +kpt +kart +katastr +knih +kniž +komp +konj +konkr +kozmet +krajč +kresť +kt +kuch +lat +latinskoamer +lek +lex +lingv +lit +litur +log +lok +max +Max +maď +Maď +medzinár +mest +metr +mil +Mil +min +Min +miner +ml +mld +mn +mod +mytol +napr +nar +Nar +nasl +nedok +neg +negat +neklas +nem +Nem +neodb +neos +neskl +nesklon +nespis +nespráv +neved +než +niekt +niž +nom 
+náb +nákl +námor +nár +obch +obj +obv +obyč +obč +občian +odb +odd +ods +ojed +okr +Okr +opt +opyt +org +os +osob +ot +ovoc +par +part +pejor +pers +pf +Pf +P.f +p.f +pl +Plk +pod +podst +pokl +polit +politol +polygr +pomn +popl +por +porad +porov +posch +potrav +použ +poz +pozit +poľ +poľno +poľnohosp +poľov +pošt +pož +prac +predl +pren +prep +preuk +priezv +Priezv +privl +prof +práv +príd +príj +prík +príp +prír +prísl +príslov +príč +psych +publ +pís +písm +pôv +refl +reg +rep +resp +rozk +rozlič +rozpráv +roč +Roč +ryb +rádiotech +rím +samohl +semest +sev +severoamer +severových +severozáp +sg +skr +skup +sl +Sloven +soc +soch +sociol +sp +spol +Spol +spoloč +spoluhl +správ +spôs +st +star +starogréc +starorím +s.r.o +stol +stor +str +stredoamer +stredoškol +subj +subst +superl +sv +sz +súkr +súp +súvzť +tal +Tal +tech +tel +Tel +telef +teles +telev +teol +trans +turist +tuzem +typogr +tzn +tzv +ukaz +ul +Ul +umel +univ +ust +ved +vedľ +verb +veter +vin +viď +vl +vod +vodohosp +pnl +vulg +vyj +vys +vysokoškol +vzťaž +vôb +vých +výd +výrob +výsk +výsl +výtv +výtvar +význ +včel +vš +všeob +zahr +zar +zariad +zast +zastar +zastaráv +zb +zdravot +združ +zjemn +zlat +zn +Zn +zool +zr +zried +zv +záhr +zák +zákl +zám +záp +západoeur +zázn +územ +účt +čast +čes +Čes +čl +čísl +živ +pr +fak +Kr +p.n.l +A +B +C +D +E +F +G +H +I +J +K +L +M +N +O +P +Q +R +S +T +U +V +W +X +Y +Z diff --git a/mosesdecoder/scripts/share/nonbreaking_prefixes/nonbreaking_prefix.sl b/mosesdecoder/scripts/share/nonbreaking_prefixes/nonbreaking_prefix.sl new file mode 100644 index 0000000000000000000000000000000000000000..4c759e7360607087041fbe0b31021debe21b3b5b --- /dev/null +++ b/mosesdecoder/scripts/share/nonbreaking_prefixes/nonbreaking_prefix.sl @@ -0,0 +1,78 @@ +dr +Dr +itd +itn +št #NUMERIC_ONLY# +Št #NUMERIC_ONLY# +d +jan +Jan +feb +Feb +mar +Mar +apr +Apr +jun +Jun +jul +Jul +avg +Avg +sept +Sept +sep +Sep +okt +Okt +nov +Nov +dec +Dec +tj +Tj +npr +Npr +sl +Sl +op +Op +gl +Gl +oz 
+Oz +prev +dipl +ing +prim +Prim +cf +Cf +gl +Gl +A +B +C +D +E +F +G +H +I +J +K +L +M +N +O +P +Q +R +S +T +U +V +W +X +Y +Z diff --git a/mosesdecoder/scripts/share/nonbreaking_prefixes/nonbreaking_prefix.ta b/mosesdecoder/scripts/share/nonbreaking_prefixes/nonbreaking_prefix.ta new file mode 100644 index 0000000000000000000000000000000000000000..8e8bbcd3e74d34cdf326e0eebcc4687685948e37 --- /dev/null +++ b/mosesdecoder/scripts/share/nonbreaking_prefixes/nonbreaking_prefix.ta @@ -0,0 +1,71 @@ +#Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker. + +#common exceptions +# Rs +ர +# Rs +ூ +# Mr +திரு + +#others + + +#phonetics +# A +ஏ +# B +பீ +# C +சீ +# D +டீ +# E +ஈ +# F +எஃப் +# G +ஜீ +# H +எச் +ஹெச் +# I +ஐ +# J +ஜே +ஜை +# K +கே +# L +எல் +# M +எம் +# N +என் +# O +ஓ +# P +ப்பீ +# Q +கியூ +# R +ஆர் +# S +எஸ் +# T +ட்டீ +# U +யூ +# V +வீ +# W +டபிள்-யூ +# X +எக்ஸ் +# Y +வை +# Z +செட் + +#consonants + diff --git a/mosesdecoder/scripts/share/nonbreaking_prefixes/nonbreaking_prefix.tdt b/mosesdecoder/scripts/share/nonbreaking_prefixes/nonbreaking_prefix.tdt new file mode 100644 index 0000000000000000000000000000000000000000..07cbdb3e197000519c22a38eba1bb1e992cb84c6 --- /dev/null +++ b/mosesdecoder/scripts/share/nonbreaking_prefixes/nonbreaking_prefix.tdt @@ -0,0 +1,211 @@ +#File adapted for TDT from PT by Raphael Merx. Last update: 10.11.2009. +#Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker. +#Special cases are included for prefixes that ONLY appear before 0-9 numbers. + +#any single upper case letter followed by a period is not a sentence ender (excluding I occasionally, but we leave it in) +#usually upper case letters are initials in a name +A +B +C +D +E +F +G +H +I +J +K +L +M +N +O +P +Q +R +S +T +U +V +W +X +Y +Z +a +b +c +d +e +f +g +h +i +j +k +l +m +n +o +p +q +r +s +t +u +v +w +x +y +z + + +#Roman Numerals. 
A dot after one of these is not a sentence break in Tetun (list inherited from the Portuguese prefix file).
+ +#common exceptions +# Rs +ర +# Rs +ూ +# Mr +శ్రీ + +#others + + +#phonetics +# A +ఎ +# B +బి +# C +సి +# D +డి +# E +ఇ +# F +ఎఫ్ +# G +జి +# H +హెచ్‌ +# I +ఐ +# J +జె +# K +కె +# L +ఎల్ +# M +ఎం +ఎమ్ +# N +ఎన్ +# O +ఓ +# P +పి +# Q +క్యూ +# R +ఆర్ +# S +ఎస్ +# T +టి +# U +యు +# V +వి +# W +డబ్ల్యూ +# X +ఎక్స్ +# Y +వై +# Z +జెడ్ + +#consonants + diff --git a/mosesdecoder/scripts/share/nonbreaking_prefixes/nonbreaking_prefix.yue b/mosesdecoder/scripts/share/nonbreaking_prefixes/nonbreaking_prefix.yue new file mode 100644 index 0000000000000000000000000000000000000000..37942ade9e3d1ca60df9d66c2baec01c89d7a75c --- /dev/null +++ b/mosesdecoder/scripts/share/nonbreaking_prefixes/nonbreaking_prefix.yue @@ -0,0 +1,53 @@ +# +# Cantonese (Chinese) +# +# Anything in this file, followed by a period, +# does NOT indicate an end-of-sentence marker. +# +# English/Euro-language given-name initials (appearing in +# news, periodicals, etc.) +A +Ā +B +C +Č +D +E +Ē +F +G +Ģ +H +I +Ī +J +K +Ķ +L +Ļ +M +N +Ņ +O +P +Q +R +S +Š +T +U +Ū +V +W +X +Y +Z +Ž + +# Numbers only. These should only induce breaks when followed by +# a numeric sequence. +# Add NUMERIC_ONLY after the word for this function. This case is +# mostly for the english "No." which can either be a sentence of its +# own, or if followed by a number, a non-breaking prefix. +No #NUMERIC_ONLY# +Nr #NUMERIC_ONLY# diff --git a/mosesdecoder/scripts/share/nonbreaking_prefixes/nonbreaking_prefix.zh b/mosesdecoder/scripts/share/nonbreaking_prefixes/nonbreaking_prefix.zh new file mode 100644 index 0000000000000000000000000000000000000000..df4c2ff8895988c7545ac3f2fe8cf91fbcb5d600 --- /dev/null +++ b/mosesdecoder/scripts/share/nonbreaking_prefixes/nonbreaking_prefix.zh @@ -0,0 +1,53 @@ +# +# Mandarin (Chinese) +# +# Anything in this file, followed by a period, +# does NOT indicate an end-of-sentence marker. +# +# English/Euro-language given-name initials (appearing in +# news, periodicals, etc.) 
+A +Ā +B +C +Č +D +E +Ē +F +G +Ģ +H +I +Ī +J +K +Ķ +L +Ļ +M +N +Ņ +O +P +Q +R +S +Š +T +U +Ū +V +W +X +Y +Z +Ž + +# Numbers only. These should only induce breaks when followed by +# a numeric sequence. +# Add NUMERIC_ONLY after the word for this function. This case is +# mostly for the english "No." which can either be a sentence of its +# own, or if followed by a number, a non-breaking prefix. +No #NUMERIC_ONLY# +Nr #NUMERIC_ONLY# diff --git a/mosesdecoder/scripts/training/corpus-sizes.perl b/mosesdecoder/scripts/training/corpus-sizes.perl new file mode 100644 index 0000000000000000000000000000000000000000..1a6db669b5f919ab1bec1ff1df182b92312bca58 --- /dev/null +++ b/mosesdecoder/scripts/training/corpus-sizes.perl @@ -0,0 +1,20 @@ +#!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. + +# $Id: consolidate-training-data.perl 928 2009-09-02 02:58:01Z philipp $ + +use warnings; +use strict; + +my ($in,$out,@PART) = @ARGV; + +foreach my $part (@PART) { + die("ERROR: no part $part.$in or $part.$out") if (! -e "$part.$in" || ! -e "$part.$out"); + my $in_size = `cat $part.$in | wc -l`; + my $out_size = `cat $part.$out | wc -l`; + die("number of lines don't match: '$part.$in' ($in_size) != '$part.$out' ($out_size)") + if $in_size != $out_size; + print "$in_size"; +} diff --git a/mosesdecoder/scripts/training/giza2bal.pl b/mosesdecoder/scripts/training/giza2bal.pl new file mode 100644 index 0000000000000000000000000000000000000000..ad9edb584755d99821dbbf49ac7a3eead0a1f5d7 --- /dev/null +++ b/mosesdecoder/scripts/training/giza2bal.pl @@ -0,0 +1,117 @@ +#!/usr/bin/env perl + +# $Id$ +#Converts direct and inverted alignments into a more compact +#bi-alignment format. It optionally reads the counting file +#produced by giza containing the frequency of each traning sentence. + +#Copyright Marcello Federico, November 2004 +# +# This file is part of moses. 
# Its use is licensed under the GNU Lesser General
# Public License version 2.1 or, at your option, any later version.

#use warnings;

($cnt,$dir,$inv)=();

# Parse command-line flags:
#   -d <file|cmd>  direct alignment (GIZA++ A3 format)
#   -i <file|cmd>  inverse alignment (GIZA++ A3 format)
#   -c <file|cmd>  optional per-sentence count file produced by giza
while ($w=shift @ARGV){
 $dir=shift(@ARGV),next if $w eq "-d";
 $inv=shift(@ARGV),next if $w eq "-i";
 $cnt=shift(@ARGV),next if $w eq "-c";
}

my $lc = 0;   # sentence-pair counter, used in mismatch error messages

if (!$dir || !$inv){
 print "usage: giza2bal.pl [-c <count file>] -d <direct alignment file> -i <inverse alignment file>\n";
 print "input files can be also commands, e.g. -d \"gunzip -c file.gz\"\n";
 exit(0);
}

$|=1;   # autoflush stdout so output can be piped incrementally

# Each input may be a plain file ("<$f") or a command producing it ("$f|").
# NOTE: the original code reported $dir in all three die messages; each
# message now names the file that actually failed to open.
open(DIR,"<$dir") || open(DIR,"$dir|") || die "cannot open $dir\n";
open(INV,"<$inv") || open(INV,"$inv|") || die "cannot open $inv\n";

if ($cnt){
open(CNT,"<$cnt") || open(CNT,"$cnt|") || die "cannot open $cnt\n";
}


# Read one aligned sentence pair from the three streams.
#   $fd0: count stream (may be an unopened handle when -c is absent; reads
#         then yield undef and the count defaults to 1)
#   $fd1/$fd2: direct / inverse alignment streams (3 lines per sentence:
#         header, source sentence, target sentence with "w ({ i j })" links)
# Output via typeglobs: *s1/*s2 sentences, *a/*b position maps, *c count.
# On a length mismatch the pair is replaced by ALIGN_ERR dummies so the
# output stays synchronized with the input.  Always returns 1.
sub ReadBiAlign{
 local($fd0,$fd1,$fd2,*s1,*s2,*a,*b,*c)=@_;
 local($dummy,$n);

 chop($c=<$fd0>); ## count
 $dummy=<$fd0>; ## header
 $dummy=<$fd0>; ## header
 $c=1 if !$c;   # default count when no counting file was supplied

 $dummy=<$fd1>; ## header
 chop($s1=<$fd1>);
 chop($t1=<$fd1>);

 $dummy=<$fd2>; ## header
 chop($s2=<$fd2>);
 chop($t2=<$fd2>);

 @a=@b=();
 $lc++;

 #get target statistics: strip the NULL word, then record, for every
 #target position listed in "word ({ i j ... })", the source position $n
 $n=1;
 $t1=~s/NULL \(\{((\s+\d+)*)\s+\}\)//;
 while ($t1=~s/(\S+)\s+\(\{((\s+\d+)*)\s+\}\)//){
 grep($a[$_]=$n,split(/\s+/,$2));
 $n++;
 }

 $m=1;
 $t2=~s/NULL \(\{((\s+\d+)*)\s+\}\)//;
 while ($t2=~s/(\S+)\s+\(\{((\s+\d+)*)\s+\}\)//){
 grep($b[$_]=$m,split(/\s+/,$2));
 $m++;
 }

 # sentence lengths in words (split in scalar context)
 $M=split(/\s+/,$s1);
 $N=split(/\s+/,$s2);

 if ($m != ($M+1) || $n != ($N+1)) {
 print STDERR "Sentence mismatch error! Line #$lc\n";
 $s1 = "ALIGN_ERR";
 $s2 = "ALIGN_ERR";
 @a=(); @b=();
 for ($j=1;$j<2;$j++){ $a[$j]=1; }
 for ($i=1;$i<2;$i++){ $b[$i]=1; }
 return 1;
 }

 # unaligned positions map to 0 (linked to the implicit NULL word)
 for ($j=1;$j<$m;$j++){
 $a[$j]=0 if !$a[$j];
 }

 for ($i=1;$i<$n;$i++){
 $b[$i]=0 if !$b[$i];
 }


 return 1;
}

$skip=0;  # malformed pairs skipped
$ccc=0;   # pairs successfully converted

# Main loop: emit, for every sentence pair, the count followed by
# "<len> <sentence> # <alignment positions>" for each direction.
while(!eof(DIR)){

 if (ReadBiAlign(CNT,DIR,INV,*src,*tgt,*a,*b,*c))
 {
 $ccc++;
 print "$c\n";
 print $#a," $src \# @a[1..$#a]\n";
 print $#b," $tgt \# @b[1..$#b]\n";
 }
 else{
 print "\n";
 print STDERR "." if !(++$skip % 1000);
 }
};
print STDERR "skip=<$skip> counts=<$ccc>\n";