sleepyhead111 committed on
Commit
7221d50
·
verified ·
1 Parent(s): 4b25173

Add files using upload-large-folder tool

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .gitattributes +1 -0
  2. mosesdecoder/defer/Joint.h +139 -0
  3. mosesdecoder/defer/PhraseDictionaryInterpolated.cpp +186 -0
  4. mosesdecoder/defer/PhraseLengthFeatureTest.cpp +104 -0
  5. mosesdecoder/lm/builder/corpus_count.hh +53 -0
  6. mosesdecoder/lm/builder/dump_counts_main.cc +36 -0
  7. mosesdecoder/lm/builder/lmplz_main.cc +220 -0
  8. mosesdecoder/lm/common/CMakeLists.txt +40 -0
  9. mosesdecoder/lm/common/Jamfile +2 -0
  10. mosesdecoder/lm/common/joint_order.hh +71 -0
  11. mosesdecoder/lm/common/ngram.hh +77 -0
  12. mosesdecoder/lm/common/print.cc +62 -0
  13. mosesdecoder/lm/common/renumber.cc +17 -0
  14. mosesdecoder/lm/common/renumber.hh +30 -0
  15. mosesdecoder/mert/ReferenceTest.cpp +123 -0
  16. mosesdecoder/mert/ScoreArray.cpp +169 -0
  17. mosesdecoder/mert/ScoreArray.h +113 -0
  18. mosesdecoder/mert/Util.h +149 -0
  19. mosesdecoder/moses/TranslationModel/UG/util/ibm1-align +3 -0
  20. mosesdecoder/scripts/Jamfile +23 -0
  21. mosesdecoder/scripts/README +15 -0
  22. mosesdecoder/scripts/generic/binarize4moses2.perl +88 -0
  23. mosesdecoder/scripts/generic/bsbleu.py +179 -0
  24. mosesdecoder/scripts/generic/compound-splitter.perl +295 -0
  25. mosesdecoder/scripts/generic/extract-factors.pl +24 -0
  26. mosesdecoder/scripts/generic/extract-parallel.perl +385 -0
  27. mosesdecoder/scripts/generic/fsa-sample.fsa +10 -0
  28. mosesdecoder/scripts/generic/fsa2fsal.pl +53 -0
  29. mosesdecoder/scripts/generic/fsa2plf.pl +182 -0
  30. mosesdecoder/scripts/generic/generic-parallel.perl +119 -0
  31. mosesdecoder/scripts/generic/giza-parallel.perl +134 -0
  32. mosesdecoder/scripts/generic/lopar2pos.pl +20 -0
  33. mosesdecoder/scripts/generic/moses_sim_pe.py +452 -0
  34. mosesdecoder/scripts/generic/mteval-v11b.pl +761 -0
  35. mosesdecoder/scripts/generic/mteval-v12.pl +784 -0
  36. mosesdecoder/scripts/generic/mteval-v13a.pl +1170 -0
  37. mosesdecoder/scripts/generic/mteval-v14.pl +1179 -0
  38. mosesdecoder/scripts/generic/multi-bleu-detok.perl +214 -0
  39. mosesdecoder/scripts/generic/multi-bleu.perl +177 -0
  40. mosesdecoder/scripts/generic/multi_moses.py +332 -0
  41. mosesdecoder/scripts/generic/ph_numbers.perl +106 -0
  42. mosesdecoder/scripts/generic/reverse-alignment.perl +24 -0
  43. mosesdecoder/scripts/generic/score-parallel.perl +428 -0
  44. mosesdecoder/scripts/generic/score_parallel.py +776 -0
  45. mosesdecoder/scripts/generic/strip-xml.perl +48 -0
  46. mosesdecoder/scripts/generic/trainlm-irst2.perl +72 -0
  47. mosesdecoder/scripts/share/nonbreaking_prefixes/README.txt +8 -0
  48. mosesdecoder/scripts/share/nonbreaking_prefixes/nonbreaking_prefix.as +65 -0
  49. mosesdecoder/scripts/share/nonbreaking_prefixes/nonbreaking_prefix.ca +75 -0
  50. mosesdecoder/scripts/share/nonbreaking_prefixes/nonbreaking_prefix.cs +390 -0
.gitattributes CHANGED
@@ -37,3 +37,4 @@ fairseq-0.10.2/fairseq/libbleu.cpython-310-x86_64-linux-gnu.so filter=lfs diff=l
37
  fairseq-0.10.2/fairseq/data/data_utils_fast.cpython-310-x86_64-linux-gnu.so filter=lfs diff=lfs merge=lfs -text
38
  fairseq-0.10.2/fairseq/data/token_block_utils_fast.cpython-310-x86_64-linux-gnu.so filter=lfs diff=lfs merge=lfs -text
39
  fairseq-0.10.2/docs/fairseq.gif filter=lfs diff=lfs merge=lfs -text
 
 
37
  fairseq-0.10.2/fairseq/data/data_utils_fast.cpython-310-x86_64-linux-gnu.so filter=lfs diff=lfs merge=lfs -text
38
  fairseq-0.10.2/fairseq/data/token_block_utils_fast.cpython-310-x86_64-linux-gnu.so filter=lfs diff=lfs merge=lfs -text
39
  fairseq-0.10.2/docs/fairseq.gif filter=lfs diff=lfs merge=lfs -text
40
+ mosesdecoder/moses/TranslationModel/UG/util/ibm1-align filter=lfs diff=lfs merge=lfs -text
mosesdecoder/defer/Joint.h ADDED
@@ -0,0 +1,139 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ // $Id$
2
+
3
+ /***********************************************************************
4
+ Moses - factored phrase-based language decoder
5
+ Copyright (C) 2006 University of Edinburgh
6
+
7
+ This library is free software; you can redistribute it and/or
8
+ modify it under the terms of the GNU Lesser General Public
9
+ License as published by the Free Software Foundation; either
10
+ version 2.1 of the License, or (at your option) any later version.
11
+
12
+ This library is distributed in the hope that it will be useful,
13
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15
+ Lesser General Public License for more details.
16
+
17
+ You should have received a copy of the GNU Lesser General Public
18
+ License along with this library; if not, write to the Free Software
19
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
20
+ ***********************************************************************/
21
+
22
+ #ifndef moses_LanguageModelJoint_h
23
+ #define moses_LanguageModelJoint_h
24
+
25
+ #include <vector>
26
+ #include <string>
27
+ #include <sstream>
28
+ #include "SingleFactor.h"
29
+ #include "MultiFactor.h"
30
+ #include "moses/Word.h"
31
+ #include "moses/FactorTypeSet.h"
32
+ #include "moses/FactorCollection.h"
33
+
34
+ namespace Moses
35
+ {
36
+
37
+ class Phrase;
38
+ class FactorCollection;
39
+
40
+ /** LM of multiple factors. A simple extension of single factor LM - factors backoff together.
41
+ * Rather slow as this uses string concatenation/split.
42
+ * Not used for a long time
43
+ */
44
+ class LanguageModelJoint : public LanguageModelMultiFactor
45
+ {
46
+ protected:
47
+ LanguageModelSingleFactor *m_lmImpl;
48
+ std::vector<FactorType> m_factorTypesOrdered;
49
+
50
+ size_t m_implFactor;
51
+ public:
52
+ LanguageModelJoint(const std::string &line, LanguageModelSingleFactor *lmImpl)
53
+ :LanguageModelMultiFactor(line) {
54
+ m_lmImpl = lmImpl;
55
+ }
56
+
57
+ ~LanguageModelJoint() {
58
+ delete m_lmImpl;
59
+ }
60
+
61
+ bool Load(AllOptions const& opts, const std::string &filePath
62
+ , const std::vector<FactorType> &factorTypes
63
+ , size_t nGramOrder) {
64
+ m_factorTypes = FactorMask(factorTypes);
65
+ m_filePath = filePath;
66
+ m_nGramOrder = nGramOrder;
67
+
68
+ m_factorTypesOrdered= factorTypes;
69
+ m_implFactor = 0;
70
+
71
+ FactorCollection &factorCollection = FactorCollection::Instance();
72
+
73
+ // sentence markers
74
+ for (size_t index = 0 ; index < factorTypes.size() ; ++index) {
75
+ FactorType factorType = factorTypes[index];
76
+ m_sentenceStartWord[factorType] = factorCollection.AddFactor(Output, factorType, BOS_);
77
+ m_sentenceEndWord[factorType] = factorCollection.AddFactor(Output, factorType, EOS_);
78
+ }
79
+
80
+ m_lmImpl->Load(AllOptions const& opts);
81
+ }
82
+
83
+ LMResult GetValueForgotState(const std::vector<const Word*> &contextFactor, FFState &outState) const {
84
+ if (contextFactor.size() == 0) {
85
+ LMResult ret;
86
+ ret.score = 0.0;
87
+ ret.unknown = false;
88
+ return ret;
89
+ }
90
+
91
+ // joint context for internal LM
92
+ std::vector<const Word*> jointContext;
93
+
94
+ for (size_t currPos = 0 ; currPos < m_nGramOrder ; ++currPos ) {
95
+ const Word &word = *contextFactor[currPos];
96
+
97
+ // add word to chunked context
98
+ std::stringstream stream("");
99
+
100
+ const Factor *factor = word[ m_factorTypesOrdered[0] ];
101
+ stream << factor->GetString();
102
+
103
+ for (size_t index = 1 ; index < m_factorTypesOrdered.size() ; ++index) {
104
+ FactorType factorType = m_factorTypesOrdered[index];
105
+ const Factor *factor = word[factorType];
106
+ stream << "|" << factor->GetString();
107
+ }
108
+
109
+ factor = FactorCollection::Instance().AddFactor(Output, m_implFactor, stream.str());
110
+
111
+ Word* jointWord = new Word;
112
+ jointWord->SetFactor(m_implFactor, factor);
113
+ jointContext.push_back(jointWord);
114
+ }
115
+
116
+ // calc score on chunked phrase
117
+ LMResult ret = m_lmImpl->GetValueForgotState(jointContext, outState);
118
+
119
+ RemoveAllInColl(jointContext);
120
+
121
+ return ret;
122
+ }
123
+
124
+ const FFState *GetNullContextState() const {
125
+ return m_lmImpl->GetNullContextState();
126
+ }
127
+
128
+ const FFState *GetBeginSentenceState() const {
129
+ return m_lmImpl->GetBeginSentenceState();
130
+ }
131
+
132
+ FFState *NewState(const FFState *from) const {
133
+ return m_lmImpl->NewState(from);
134
+ }
135
+
136
+ };
137
+
138
+ }
139
+ #endif
mosesdecoder/defer/PhraseDictionaryInterpolated.cpp ADDED
@@ -0,0 +1,186 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /***********************************************************************
2
+ Moses - factored phrase-based language decoder
3
+ Copyright (C) 2013- University of Edinburgh
4
+
5
+ This library is free software; you can redistribute it and/or
6
+ modify it under the terms of the GNU Lesser General Public
7
+ License as published by the Free Software Foundation; either
8
+ version 2.1 of the License, or (at your option) any later version.
9
+
10
+ This library is distributed in the hope that it will be useful,
11
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
12
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13
+ Lesser General Public License for more details.
14
+
15
+ You should have received a copy of the GNU Lesser General Public
16
+ License along with this library; if not, write to the Free Software
17
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
18
+ ***********************************************************************/
19
+
20
+ #include <boost/lexical_cast.hpp>
21
+ #include <boost/unordered_set.hpp>
22
+
23
+ #include "util/exception.hh"
24
+ #include "util/tokenize_piece.hh"
25
+ #include "moses/TranslationModel/PhraseDictionaryInterpolated.h"
26
+
27
+ using namespace std;
28
+
29
+ namespace Moses
30
+ {
31
+
32
+ PhraseDictionaryInterpolated::PhraseDictionaryInterpolated
33
+ (size_t numScoreComponent,size_t numInputScores,const PhraseDictionaryFeature* feature):
34
+ PhraseDictionary(numScoreComponent,feature),
35
+ m_targetPhrases(NULL),
36
+ m_languageModels(NULL) {}
37
+
38
+ bool PhraseDictionaryInterpolated::Load(
39
+ const std::vector<FactorType> &input
40
+ , const std::vector<FactorType> &output
41
+ , const std::vector<std::string>& config
42
+ , const std::vector<float> &weightT
43
+ , size_t tableLimit
44
+ , const LMList &languageModels
45
+ , float weightWP)
46
+ {
47
+
48
+ m_languageModels = &languageModels;
49
+ m_weightT = weightT;
50
+ m_tableLimit = tableLimit;
51
+ m_weightWP = weightWP;
52
+
53
+ //The config should be as follows:
54
+ //0-3: type factor factor num-components (as usual)
55
+ //4: combination mode (e.g. naive)
56
+ //5-(length-2): List of phrase-table files
57
+ //length-1: Weight string, in the same format as used for tmcombine
58
+
59
+ UTIL_THROW_IF(config.size() < 7, util::Exception, "Missing fields from phrase table configuration: expected at least 7");
60
+ UTIL_THROW_IF(config[4] != "naive", util::Exception, "Unsupported combination mode: '" << config[4] << "'");
61
+
62
+ // Create the dictionaries
63
+ for (size_t i = 5; i < config.size()-1; ++i) {
64
+ m_dictionaries.push_back(DictionaryHandle(new PhraseDictionaryTreeAdaptor(
65
+ GetFeature()->GetNumScoreComponents(),
66
+ GetFeature()->GetNumInputScores(),
67
+ GetFeature())));
68
+ bool ret = m_dictionaries.back()->Load(
69
+ input,
70
+ output,
71
+ config[i],
72
+ weightT,
73
+ 0,
74
+ languageModels,
75
+ weightWP);
76
+ if (!ret) return ret;
77
+ }
78
+
79
+ //Parse the weight strings
80
+ for (util::TokenIter<util::SingleCharacter, false> featureWeights(config.back(), util::SingleCharacter(';')); featureWeights; ++featureWeights) {
81
+ m_weights.push_back(vector<float>());
82
+ float sum = 0;
83
+ for (util::TokenIter<util::SingleCharacter, false> tableWeights(*featureWeights, util::SingleCharacter(',')); tableWeights; ++tableWeights) {
84
+ const float weight = boost::lexical_cast<float>(*tableWeights);
85
+ m_weights.back().push_back(weight);
86
+ sum += weight;
87
+ }
88
+ UTIL_THROW_IF(m_weights.back().size() != m_dictionaries.size(), util::Exception,
89
+ "Number of weights (" << m_weights.back().size() <<
90
+ ") does not match number of dictionaries to combine (" << m_dictionaries.size() << ")");
91
+ UTIL_THROW_IF(abs(sum - 1) > 0.01, util::Exception, "Weights not normalised");
92
+
93
+ }
94
+
95
+ //check number of weight sets. Make sure there is a weight for every score component
96
+ //except for the last - which is assumed to be the phrase penalty.
97
+ UTIL_THROW_IF(m_weights.size() != 1 && m_weights.size() != GetFeature()->GetNumScoreComponents()-1, util::Exception, "Unexpected number of weight sets");
98
+ //if 1 weight set, then repeat
99
+ if (m_weights.size() == 1) {
100
+ while(m_weights.size() < GetFeature()->GetNumScoreComponents()-1) {
101
+ m_weights.push_back(m_weights[0]);
102
+ }
103
+ }
104
+
105
+ return true;
106
+ }
107
+
108
+ void PhraseDictionaryInterpolated::InitializeForInput(ttasksptr const& ttask)
109
+ {
110
+ for (size_t i = 0; i < m_dictionaries.size(); ++i) {
111
+ m_dictionaries[i]->InitializeForInput(ttask);
112
+ }
113
+ }
114
+
115
+ typedef
116
+ boost::unordered_set<TargetPhrase*,PhrasePtrHasher,PhrasePtrComparator> PhraseSet;
117
+
118
+
119
+ TargetPhraseCollection::shared_ptr
120
+ PhraseDictionaryInterpolated::GetTargetPhraseCollection(const Phrase& src) const
121
+ {
122
+
123
+ delete m_targetPhrases;
124
+ m_targetPhrases = new TargetPhraseCollection();
125
+ PhraseSet allPhrases;
126
+ vector<PhraseSet> phrasesByTable(m_dictionaries.size());
127
+ for (size_t i = 0; i < m_dictionaries.size(); ++i) {
128
+ TargetPhraseCollection::shared_ptr phrases = m_dictionaries[i]->GetTargetPhraseCollection(src);
129
+ if (phrases) {
130
+ for (TargetPhraseCollection::const_iterator j = phrases->begin();
131
+ j != phrases->end(); ++j) {
132
+ allPhrases.insert(*j);
133
+ phrasesByTable[i].insert(*j);
134
+ }
135
+ }
136
+ }
137
+ ScoreComponentCollection sparseVector;
138
+ for (PhraseSet::const_iterator i = allPhrases.begin(); i != allPhrases.end(); ++i) {
139
+ TargetPhrase* combinedPhrase = new TargetPhrase((Phrase)**i);
140
+ //combinedPhrase->ResetScore();
141
+ //cerr << *combinedPhrase << " " << combinedPhrase->GetScoreBreakdown() << endl;
142
+ combinedPhrase->SetSourcePhrase((*i)->GetSourcePhrase());
143
+ combinedPhrase->SetAlignTerm(&((*i)->GetAlignTerm()));
144
+ combinedPhrase->SetAlignNonTerm(&((*i)->GetAlignTerm()));
145
+ Scores combinedScores(GetFeature()->GetNumScoreComponents());
146
+ for (size_t j = 0; j < phrasesByTable.size(); ++j) {
147
+ PhraseSet::const_iterator tablePhrase = phrasesByTable[j].find(combinedPhrase);
148
+ if (tablePhrase != phrasesByTable[j].end()) {
149
+ Scores tableScores = (*tablePhrase)->GetScoreBreakdown()
150
+ .GetScoresForProducer(GetFeature());
151
+ //cerr << "Scores from " << j << " table: ";
152
+ for (size_t k = 0; k < tableScores.size()-1; ++k) {
153
+ //cerr << tableScores[k] << "(" << exp(tableScores[k]) << ") ";
154
+ combinedScores[k] += m_weights[k][j] * exp(tableScores[k]);
155
+ //cerr << m_weights[k][j] * exp(tableScores[k]) << " ";
156
+ }
157
+ //cerr << endl;
158
+ }
159
+ }
160
+ //map back to log space
161
+ //cerr << "Combined ";
162
+ for (size_t k = 0; k < combinedScores.size()-1; ++k) {
163
+ //cerr << combinedScores[k] << " ";
164
+ combinedScores[k] = log(combinedScores[k]);
165
+ //cerr << combinedScores[k] << " ";
166
+ }
167
+ //cerr << endl;
168
+ combinedScores.back() = 1; //assume last is penalty
169
+ combinedPhrase->SetScore(
170
+ GetFeature(),
171
+ combinedScores,
172
+ sparseVector,
173
+ m_weightT,
174
+ m_weightWP,
175
+ *m_languageModels);
176
+ //cerr << *combinedPhrase << " " << combinedPhrase->GetScoreBreakdown() << endl;
177
+ m_targetPhrases->Add(combinedPhrase);
178
+ }
179
+
180
+ m_targetPhrases->Prune(true,m_tableLimit);
181
+
182
+
183
+ return m_targetPhrases;
184
+ }
185
+
186
+ }
mosesdecoder/defer/PhraseLengthFeatureTest.cpp ADDED
@@ -0,0 +1,104 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
/***********************************************************************
Moses - factored phrase-based language decoder
Copyright (C) 2010 University of Edinburgh

This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.

This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.

You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
***********************************************************************/
// Unit test for PhraseLengthFeature: checks that evaluating a translation
// option fires exactly the expected sparse features for source length,
// target length, and the (source,target) length pair.
#include <boost/test/unit_test.hpp>

#include "moses/FF/PhraseLengthFeature.h"
#include "moses/FactorCollection.h"
#include "moses/Sentence.h"
#include "moses/TargetPhrase.h"
#include "moses/TranslationOption.h"

using namespace Moses;
using namespace std;

BOOST_AUTO_TEST_SUITE(phrase_length_feature)

//TODO: Factor out setup code so that it can be reused

// Build a single-factor Word whose factor 0 (Input direction) is `text`.
static Word MakeWord(string text)
{
  FactorCollection &factorCollection = FactorCollection::Instance();
  const Factor* f = factorCollection.AddFactor(Input,0,text);
  Word w;
  w.SetFactor(0,f);
  return w;
}


BOOST_AUTO_TEST_CASE(evaluate)
{
  Word w1 = MakeWord("w1");
  Word w2 = MakeWord("y2");
  Word w3 = MakeWord("x3");
  Word w4 = MakeWord("w4");

  // Target phrases of lengths 3, 2, and 4 respectively.
  Phrase p1;
  p1.AddWord(w1);
  p1.AddWord(w3);
  p1.AddWord(w4);

  Phrase p2;
  p2.AddWord(w1);
  p2.AddWord(w2);

  Phrase p3;
  p3.AddWord(w2);
  p3.AddWord(w1);
  p3.AddWord(w4);
  p3.AddWord(w4);

  TargetPhrase tp1(p1);
  TargetPhrase tp2(p2);
  TargetPhrase tp3(p3);

  // 6-word input sentence read with a single (surface) factor.
  Sentence sentence;
  vector<FactorType> order;
  order.push_back(0);
  stringstream in("the input sentence has 6 words");
  sentence.Read(in, order);

  // Source spans of lengths 1, 3, and 2 respectively.
  TranslationOption topt1(WordsRange(0,0), tp1);
  TranslationOption topt2(WordsRange(1,3), tp2);
  TranslationOption topt3(WordsRange(2,3), tp3);

  PhraseBasedFeatureContext context1(topt1,sentence);
  PhraseBasedFeatureContext context2(topt2,sentence);
  PhraseBasedFeatureContext context3(topt3,sentence);

  PhraseLengthFeature plf;

  ScoreComponentCollection acc1,acc2,acc3;

  // Expected feature names: "s<srcLen>", "t<tgtLen>", "<srcLen>,<tgtLen>"
  // (assumption from the asserted names — confirm against
  // PhraseLengthFeature's implementation).
  plf.Evaluate(context1, &acc1);
  BOOST_CHECK_EQUAL(acc1.GetScoreForProducer(&plf, "s1"),1);
  BOOST_CHECK_EQUAL(acc1.GetScoreForProducer(&plf, "t3"),1);
  BOOST_CHECK_EQUAL(acc1.GetScoreForProducer(&plf, "1,3"),1);

  plf.Evaluate(context2, &acc2);
  BOOST_CHECK_EQUAL(acc2.GetScoreForProducer(&plf, "s3"),1);
  BOOST_CHECK_EQUAL(acc2.GetScoreForProducer(&plf, "t2"),1);
  BOOST_CHECK_EQUAL(acc2.GetScoreForProducer(&plf, "3,2"),1);

  plf.Evaluate(context3, &acc3);
  BOOST_CHECK_EQUAL(acc3.GetScoreForProducer(&plf, "s2"),1);
  BOOST_CHECK_EQUAL(acc3.GetScoreForProducer(&plf, "t4"),1);
  BOOST_CHECK_EQUAL(acc3.GetScoreForProducer(&plf, "2,4"),1);
}

BOOST_AUTO_TEST_SUITE_END()
mosesdecoder/lm/builder/corpus_count.hh ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#ifndef LM_BUILDER_CORPUS_COUNT_H
#define LM_BUILDER_CORPUS_COUNT_H

#include "lm/lm_exception.hh"
#include "lm/word_index.hh"
#include "util/scoped.hh"

#include <cstddef>
#include <string>
#include <stdint.h>
#include <vector>

namespace util {
class FilePiece;
namespace stream {
class ChainPosition;
} // namespace stream
} // namespace util

namespace lm {
namespace builder {

// First pipeline step of lmplz: reads a tokenized corpus, builds the
// vocabulary, and emits deduplicated n-gram counts into a stream chain.
class CorpusCount {
public:
  // Memory usage will be DedupeMultipler(order) * block_size + total_chain_size + unknown vocab_hash_size
  static float DedupeMultiplier(std::size_t order);

  // How much memory vocabulary will use based on estimated size of the vocab.
  static std::size_t VocabUsage(std::size_t vocab_estimate);

  // token_count: out.
  // type_count aka vocabulary size. Initialize to an estimate. It is set to the exact value.
  // vocab_write: fd the vocabulary strings are written to.
  // disallowed_symbol: what to do when <s>, </s>, or <unk> appear in the text.
  CorpusCount(util::FilePiece &from, int vocab_write, uint64_t &token_count, WordIndex &type_count, std::vector<bool> &prune_words, const std::string& prune_vocab_filename, std::size_t entries_per_block, WarningAction disallowed_symbol);

  // Stream worker entry point: consumes the corpus and fills `position`.
  void Run(const util::stream::ChainPosition &position);

private:
  // References below alias caller-owned state; the caller must keep them
  // alive for the lifetime of this object.
  util::FilePiece &from_;
  int vocab_write_;
  uint64_t &token_count_;
  WordIndex &type_count_;
  std::vector<bool>& prune_words_;
  const std::string& prune_vocab_filename_;

  // Scratch buffer for the dedupe hash table, sized by entries_per_block.
  std::size_t dedupe_mem_size_;
  util::scoped_malloc dedupe_mem_;

  WarningAction disallowed_symbol_action_;
};

} // namespace builder
} // namespace lm
#endif // LM_BUILDER_CORPUS_COUNT_H
mosesdecoder/lm/builder/dump_counts_main.cc ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#include "lm/common/print.hh"
#include "lm/word_index.hh"
#include "util/file.hh"
#include "util/read_compressed.hh"

#include <boost/lexical_cast.hpp>

#include <iostream>
#include <vector>

// Debug utility: print a binary n-gram count file as text, resolving
// vocabulary ids to words via the NULL-delimited vocab file.
int main(int argc, char *argv[]) {
  if (argc != 4) {
    std::cerr << "Usage: " << argv[0] << " counts vocabulary order\n"
              "The counts file contains records with 4-byte vocabulary ids followed by 8-byte\n"
              "counts. Each record has order many vocabulary ids.\n"
              "The vocabulary file contains the words delimited by NULL in order of id.\n"
              "The vocabulary file may not be compressed because it is mmapped but the counts\n"
              "file can be compressed.\n";
    return 1;
  }
  util::ReadCompressed counts(util::OpenReadOrThrow(argv[1]));
  util::scoped_fd vocab_file(util::OpenReadOrThrow(argv[2]));
  lm::VocabReconstitute vocab(vocab_file.get());
  unsigned int order = boost::lexical_cast<unsigned int>(argv[3]);
  // One record = `order` 4-byte word ids followed by one 8-byte count.
  std::vector<char> record(sizeof(uint32_t) * order + sizeof(uint64_t));
  while (std::size_t got = counts.ReadOrEOF(&*record.begin(), record.size())) {
    // A short read means the file was truncated mid-record.
    UTIL_THROW_IF(got != record.size(), util::Exception, "Read " << got << " bytes at the end of file, which is not a complete record of length " << record.size());
    const lm::WordIndex *words = reinterpret_cast<const lm::WordIndex*>(&*record.begin());
    for (const lm::WordIndex *i = words; i != words + order; ++i) {
      UTIL_THROW_IF(*i >= vocab.Size(), util::Exception, "Vocab ID " << *i << " is larger than the vocab file's maximum of " << vocab.Size() << ". Are you sure you have the right order and vocab file for these counts?");
      std::cout << vocab.Lookup(*i) << ' ';
    }
    // TODO don't use std::cout because it is slow. Add fast uint64_t printing support to FileStream.
    std::cout << *reinterpret_cast<const uint64_t*>(words + order) << '\n';
  }
}
mosesdecoder/lm/builder/lmplz_main.cc ADDED
@@ -0,0 +1,220 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #include "lm/builder/output.hh"
2
+ #include "lm/builder/pipeline.hh"
3
+ #include "lm/common/size_option.hh"
4
+ #include "lm/lm_exception.hh"
5
+ #include "util/file.hh"
6
+ #include "util/file_piece.hh"
7
+ #include "util/usage.hh"
8
+
9
+ #include <iostream>
10
+
11
+ #include <boost/program_options.hpp>
12
+ #include <boost/version.hpp>
13
+ #include <vector>
14
+
15
+ namespace {
16
+
17
+ // Parse and validate pruning thresholds then return vector of threshold counts
18
+ // for each n-grams order.
19
// Parse and validate pruning thresholds then return vector of threshold counts
// for each n-grams order.
// @param param  user-supplied threshold strings, one per order (may be shorter
//               than `order`; the last value is repeated for missing orders).
// @param order  model order; the result always has exactly this many entries.
// @throws util::Exception on non-numeric input, too many thresholds, or
//         thresholds that decrease with order.
std::vector<uint64_t> ParsePruning(const std::vector<std::string> &param, std::size_t order) {
  // convert to vector of integers
  std::vector<uint64_t> prune_thresholds;
  prune_thresholds.reserve(order);
  for (std::vector<std::string>::const_iterator it(param.begin()); it != param.end(); ++it) {
    try {
      prune_thresholds.push_back(boost::lexical_cast<uint64_t>(*it));
    } catch(const boost::bad_lexical_cast &) {
      UTIL_THROW(util::Exception, "Bad pruning threshold " << *it);
    }
  }

  // Fill with zeros by default (threshold 0 means no pruning).
  if (prune_thresholds.empty()) {
    prune_thresholds.resize(order, 0);
    return prune_thresholds;
  }

  // validate pruning threshold if specified
  // throw if each n-gram order has not threshold specified
  UTIL_THROW_IF(prune_thresholds.size() > order, util::Exception, "You specified pruning thresholds for orders 1 through " << prune_thresholds.size() << " but the model only has order " << order);
  // threshold for unigram can only be 0 (no pruning)

  // check if threshold are not in decreasing order
  uint64_t lower_threshold = 0;
  for (std::vector<uint64_t>::iterator it = prune_thresholds.begin(); it != prune_thresholds.end(); ++it) {
    UTIL_THROW_IF(lower_threshold > *it, util::Exception, "Pruning thresholds should be in non-decreasing order.  Otherwise substrings would be removed, which is bad for query-time data structures.");
    lower_threshold = *it;
  }

  // Pad to all orders using the last value.
  prune_thresholds.resize(order, prune_thresholds.back());
  return prune_thresholds;
}
53
+
54
// Build the fallback Kneser-Ney discounts from up to three user-supplied
// values (for adjusted counts 1, 2, and 3+). If fewer than three are given,
// the last value is repeated. amount[0] (count 0) is always 0.
// @throws util::Exception if param is empty, has more than three entries, or
//         any discount is outside [0, count].
lm::builder::Discount ParseDiscountFallback(const std::vector<std::string> &param) {
  lm::builder::Discount ret;
  UTIL_THROW_IF(param.size() > 3, util::Exception, "Specify at most three fallback discounts: 1, 2, and 3+");
  UTIL_THROW_IF(param.empty(), util::Exception, "Fallback discounting enabled, but no discount specified");
  ret.amount[0] = 0.0;
  for (unsigned i = 0; i < 3; ++i) {
    // Reuse the last provided value when fewer than three were given.
    float discount = boost::lexical_cast<float>(param[i < param.size() ? i : (param.size() - 1)]);
    UTIL_THROW_IF(discount < 0.0 || discount > static_cast<float>(i+1), util::Exception, "The discount for count " << (i+1) << " was parsed as " << discount << " which is not in the range [0, " << (i+1) << "].");
    ret.amount[i + 1] = discount;
  }
  return ret;
}
66
+
67
+ } // namespace
68
+
69
+ int main(int argc, char *argv[]) {
70
+ try {
71
+ namespace po = boost::program_options;
72
+ po::options_description options("Language model building options");
73
+ lm::builder::PipelineConfig pipeline;
74
+
75
+ std::string text, intermediate, arpa;
76
+ std::vector<std::string> pruning;
77
+ std::vector<std::string> discount_fallback;
78
+ std::vector<std::string> discount_fallback_default;
79
+ discount_fallback_default.push_back("0.5");
80
+ discount_fallback_default.push_back("1");
81
+ discount_fallback_default.push_back("1.5");
82
+ bool verbose_header;
83
+
84
+ options.add_options()
85
+ ("help,h", po::bool_switch(), "Show this help message")
86
+ ("order,o", po::value<std::size_t>(&pipeline.order)
87
+ #if BOOST_VERSION >= 104200
88
+ ->required()
89
+ #endif
90
+ , "Order of the model")
91
+ ("interpolate_unigrams", po::value<bool>(&pipeline.initial_probs.interpolate_unigrams)->default_value(true)->implicit_value(true), "Interpolate the unigrams (default) as opposed to giving lots of mass to <unk> like SRI. If you want SRI's behavior with a large <unk> and the old lmplz default, use --interpolate_unigrams 0.")
92
+ ("skip_symbols", po::bool_switch(), "Treat <s>, </s>, and <unk> as whitespace instead of throwing an exception")
93
+ ("temp_prefix,T", po::value<std::string>(&pipeline.sort.temp_prefix)->default_value("/tmp/lm"), "Temporary file prefix")
94
+ ("memory,S", lm:: SizeOption(pipeline.sort.total_memory, util::GuessPhysicalMemory() ? "80%" : "1G"), "Sorting memory")
95
+ ("minimum_block", lm::SizeOption(pipeline.minimum_block, "8K"), "Minimum block size to allow")
96
+ ("sort_block", lm::SizeOption(pipeline.sort.buffer_size, "64M"), "Size of IO operations for sort (determines arity)")
97
+ ("block_count", po::value<std::size_t>(&pipeline.block_count)->default_value(2), "Block count (per order)")
98
+ ("vocab_estimate", po::value<lm::WordIndex>(&pipeline.vocab_estimate)->default_value(1000000), "Assume this vocabulary size for purposes of calculating memory in step 1 (corpus count) and pre-sizing the hash table")
99
+ ("vocab_pad", po::value<uint64_t>(&pipeline.vocab_size_for_unk)->default_value(0), "If the vocabulary is smaller than this value, pad with <unk> to reach this size. Requires --interpolate_unigrams")
100
+ ("verbose_header", po::bool_switch(&verbose_header), "Add a verbose header to the ARPA file that includes information such as token count, smoothing type, etc.")
101
+ ("text", po::value<std::string>(&text), "Read text from a file instead of stdin")
102
+ ("arpa", po::value<std::string>(&arpa), "Write ARPA to a file instead of stdout")
103
+ ("intermediate", po::value<std::string>(&intermediate), "Write ngrams to intermediate files. Turns off ARPA output (which can be reactivated by --arpa file). Forces --renumber on.")
104
+ ("renumber", po::bool_switch(&pipeline.renumber_vocabulary), "Rrenumber the vocabulary identifiers so that they are monotone with the hash of each string. This is consistent with the ordering used by the trie data structure.")
105
+ ("collapse_values", po::bool_switch(&pipeline.output_q), "Collapse probability and backoff into a single value, q that yields the same sentence-level probabilities. See http://kheafield.com/professional/edinburgh/rest_paper.pdf for more details, including a proof.")
106
+ ("prune", po::value<std::vector<std::string> >(&pruning)->multitoken(), "Prune n-grams with count less than or equal to the given threshold. Specify one value for each order i.e. 0 0 1 to prune singleton trigrams and above. The sequence of values must be non-decreasing and the last value applies to any remaining orders. Default is to not prune, which is equivalent to --prune 0.")
107
+ ("limit_vocab_file", po::value<std::string>(&pipeline.prune_vocab_file)->default_value(""), "Read allowed vocabulary separated by whitespace. N-grams that contain vocabulary items not in this list will be pruned. Can be combined with --prune arg")
108
+ ("discount_fallback", po::value<std::vector<std::string> >(&discount_fallback)->multitoken()->implicit_value(discount_fallback_default, "0.5 1 1.5"), "The closed-form estimate for Kneser-Ney discounts does not work without singletons or doubletons. It can also fail if these values are out of range. This option falls back to user-specified discounts when the closed-form estimate fails. Note that this option is generally a bad idea: you should deduplicate your corpus instead. However, class-based models need custom discounts because they lack singleton unigrams. Provide up to three discounts (for adjusted counts 1, 2, and 3+), which will be applied to all orders where the closed-form estimates fail.");
109
+ po::variables_map vm;
110
+ po::store(po::parse_command_line(argc, argv, options), vm);
111
+
112
+ if (argc == 1 || vm["help"].as<bool>()) {
113
+ std::cerr <<
114
+ "Builds unpruned language models with modified Kneser-Ney smoothing.\n\n"
115
+ "Please cite:\n"
116
+ "@inproceedings{Heafield-estimate,\n"
117
+ " author = {Kenneth Heafield and Ivan Pouzyrevsky and Jonathan H. Clark and Philipp Koehn},\n"
118
+ " title = {Scalable Modified {Kneser-Ney} Language Model Estimation},\n"
119
+ " year = {2013},\n"
120
+ " month = {8},\n"
121
+ " booktitle = {Proceedings of the 51st Annual Meeting of the Association for Computational Linguistics},\n"
122
+ " address = {Sofia, Bulgaria},\n"
123
+ " url = {http://kheafield.com/professional/edinburgh/estimate\\_paper.pdf},\n"
124
+ "}\n\n"
125
+ "Provide the corpus on stdin. The ARPA file will be written to stdout. Order of\n"
126
+ "the model (-o) is the only mandatory option. As this is an on-disk program,\n"
127
+ "setting the temporary file location (-T) and sorting memory (-S) is recommended.\n\n"
128
+ "Memory sizes are specified like GNU sort: a number followed by a unit character.\n"
129
+ "Valid units are \% for percentage of memory (supported platforms only) and (in\n"
130
+ "increasing powers of 1024): b, K, M, G, T, P, E, Z, Y. Default is K (*1024).\n";
131
+ uint64_t mem = util::GuessPhysicalMemory();
132
+ if (mem) {
133
+ std::cerr << "This machine has " << mem << " bytes of memory.\n\n";
134
+ } else {
135
+ std::cerr << "Unable to determine the amount of memory on this machine.\n\n";
136
+ }
137
+ std::cerr << options << std::endl;
138
+ return 1;
139
+ }
140
+
141
+ po::notify(vm);
142
+
143
+ // required() appeared in Boost 1.42.0.
144
+ #if BOOST_VERSION < 104200
145
+ if (!vm.count("order")) {
146
+ std::cerr << "the option '--order' is required but missing" << std::endl;
147
+ return 1;
148
+ }
149
+ #endif
150
+
151
+ if (pipeline.vocab_size_for_unk && !pipeline.initial_probs.interpolate_unigrams) {
152
+ std::cerr << "--vocab_pad requires --interpolate_unigrams be on" << std::endl;
153
+ return 1;
154
+ }
155
+
156
+ if (vm["skip_symbols"].as<bool>()) {
157
+ pipeline.disallowed_symbol_action = lm::COMPLAIN;
158
+ } else {
159
+ pipeline.disallowed_symbol_action = lm::THROW_UP;
160
+ }
161
+
162
+ if (vm.count("discount_fallback")) {
163
+ pipeline.discount.fallback = ParseDiscountFallback(discount_fallback);
164
+ pipeline.discount.bad_action = lm::COMPLAIN;
165
+ } else {
166
+ // Unused, just here to prevent the compiler from complaining about uninitialized.
167
+ pipeline.discount.fallback = lm::builder::Discount();
168
+ pipeline.discount.bad_action = lm::THROW_UP;
169
+ }
170
+
171
+ // parse pruning thresholds. These depend on order, so it is not done as a notifier.
172
+ pipeline.prune_thresholds = ParsePruning(pruning, pipeline.order);
173
+
174
+ if (!vm["limit_vocab_file"].as<std::string>().empty()) {
175
+ pipeline.prune_vocab = true;
176
+ }
177
+ else {
178
+ pipeline.prune_vocab = false;
179
+ }
180
+
181
+ util::NormalizeTempPrefix(pipeline.sort.temp_prefix);
182
+
183
+ lm::builder::InitialProbabilitiesConfig &initial = pipeline.initial_probs;
184
+ // TODO: evaluate options for these.
185
+ initial.adder_in.total_memory = 32768;
186
+ initial.adder_in.block_count = 2;
187
+ initial.adder_out.total_memory = 32768;
188
+ initial.adder_out.block_count = 2;
189
+ pipeline.read_backoffs = initial.adder_out;
190
+
191
+ // Read from stdin, write to stdout by default
192
+ util::scoped_fd in(0), out(1);
193
+ if (vm.count("text")) {
194
+ in.reset(util::OpenReadOrThrow(text.c_str()));
195
+ }
196
+ if (vm.count("arpa")) {
197
+ out.reset(util::CreateOrThrow(arpa.c_str()));
198
+ }
199
+
200
+ try {
201
+ bool writing_intermediate = vm.count("intermediate");
202
+ if (writing_intermediate) {
203
+ pipeline.renumber_vocabulary = true;
204
+ }
205
+ lm::builder::Output output(writing_intermediate ? intermediate : pipeline.sort.temp_prefix, writing_intermediate, pipeline.output_q);
206
+ if (!writing_intermediate || vm.count("arpa")) {
207
+ output.Add(new lm::builder::PrintHook(out.release(), verbose_header));
208
+ }
209
+ lm::builder::Pipeline(pipeline, in.release(), output);
210
+ } catch (const util::MallocException &e) {
211
+ std::cerr << e.what() << std::endl;
212
+ std::cerr << "Try rerunning with a more conservative -S setting than " << vm["memory"].as<std::string>() << std::endl;
213
+ return 1;
214
+ }
215
+ util::PrintUsage(std::cerr);
216
+ } catch (const std::exception &e) {
217
+ std::cerr << e.what() << std::endl;
218
+ return 1;
219
+ }
220
+ }
mosesdecoder/lm/common/CMakeLists.txt ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# This CMake file was created by Lane Schwartz <dowobeha@gmail.com>
#
# add_library(... OBJECT ...), used at the bottom of this file to group the
# common KenLM sources into a "fake" library, was only introduced in
# CMake 2.8.8.  See http://www.cmake.org/Wiki/CMake/Tutorials/Object_Library
cmake_minimum_required(VERSION 2.8.8)

# Source files of this subdirectory that belong in the kenlm library
# (unit test files are deliberately excluded).  Add any new non-test
# source file to this list.
#
# Every path is prefixed with ${CMAKE_CURRENT_SOURCE_DIR} so that the
# variable remains valid if it is referenced by CMake files in the
# parent directory.
set(KENLM_COMMON_SOURCE
    ${CMAKE_CURRENT_SOURCE_DIR}/model_buffer.cc
    ${CMAKE_CURRENT_SOURCE_DIR}/print.cc
    ${CMAKE_CURRENT_SOURCE_DIR}/renumber.cc
    ${CMAKE_CURRENT_SOURCE_DIR}/size_option.cc
)

# Group these objects together for later use: given
# add_library(foo OBJECT ${my_foo_sources}), refer to the resulting
# objects as $<TARGET_OBJECTS:foo>.
add_library(kenlm_common OBJECT ${KENLM_COMMON_SOURCE})
40
+
mosesdecoder/lm/common/Jamfile ADDED
@@ -0,0 +1,2 @@
 
 
 
1
# Build the shared KenLM support code in this directory as a static
# convenience library, excluding unit tests (*test.cc) and standalone
# programs (*main.cc); link against the util/stream/kenlm dependencies.
fakelib common : [ glob *.cc : *test.cc *main.cc ]
  ../../util//kenutil ../../util/stream//stream ../../util/double-conversion//double-conversion ..//kenlm /top//boost_program_options ;
mosesdecoder/lm/common/joint_order.hh ADDED
@@ -0,0 +1,71 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#ifndef LM_COMMON_JOINT_ORDER_H
#define LM_COMMON_JOINT_ORDER_H

#include "lm/common/ngram_stream.hh"
#include "lm/lm_exception.hh"

#ifdef DEBUG
#include "util/fixed_array.hh"
#include <iostream>
#endif

#include <cstring>

namespace lm {

/* Walk several sorted n-gram streams (positions[i] holds the (i+1)-grams)
 * "jointly": whenever the current lower-order entry matches the context of
 * the current higher-order entry, descend one order.  The callback receives
 * Enter(order_minus_1, data) on the way down and Exit(order_minus_1, data)
 * on the way back up, i.e. a depth-first traversal of the implicit
 * context tree.
 *
 * Compare provides the sort order of the streams and kMatchOffset, the word
 * offset at which an (n-1)-gram lines up with an n-gram's context.
 */
template <class Callback, class Compare> void JointOrder(const util::stream::ChainPositions &positions, Callback &callback) {
  // Allow matching to reference streams[-1].
  util::FixedArray<ProxyStream<NGramHeader> > streams_with_dummy(positions.size() + 1);
  // A bogus stream for [-1]; its NULL base makes the memcmp below trivially
  // succeed for the zero-length compare at current == 0.
  streams_with_dummy.push_back();
  for (std::size_t i = 0; i < positions.size(); ++i) {
    streams_with_dummy.push_back(positions[i], NGramHeader(NULL, i + 1));
  }
  ProxyStream<NGramHeader> *streams = streams_with_dummy.begin() + 1;

  // Highest order that actually contains data.
  std::size_t order;
  for (order = 0; order < positions.size() && streams[order]; ++order) {}
  assert(order); // should always have <unk>.

  // Debugging only: call comparison function to sanity check order.
#ifdef DEBUG
  util::FixedArray<Compare> less_compare(order);
  for (unsigned i = 0; i < order; ++i)
    less_compare.push_back(i + 1);
#endif // DEBUG

  std::size_t current = 0;
  while (true) {
    // Does the context match the lower one?  (current words compared; a
    // zero-length compare at current == 0 always matches.)
    if (!memcmp(streams[static_cast<int>(current) - 1]->begin(), streams[current]->begin() + Compare::kMatchOffset, sizeof(WordIndex) * current)) {
      callback.Enter(current, streams[current].Get());
      // Transition to looking for extensions.
      if (++current < order) continue;
    }
#ifdef DEBUG
    // match_check[current - 1] matches current-grams
    // The lower-order stream (which skips fewer current-grams) should always be <= the higher order-stream (which can skip current-grams).
    else if (!less_compare[current - 1](streams[static_cast<int>(current) - 1]->begin(), streams[current]->begin() + Compare::kMatchOffset)) {
      std::cerr << "Stream out of order detected" << std::endl;
      abort();
    }
#endif // DEBUG
    // No extension left: unwind, emitting Exit() and advancing streams.
    while(true) {
      assert(current > 0);
      --current;
      callback.Exit(current, streams[current].Get());

      if (++streams[current]) break;

      // A stream may only run dry from the highest active order downward.
      UTIL_THROW_IF(order != current + 1, FormatLoadException, "Detected n-gram without matching suffix");

      order = current;
      if (!order) return;
    }
  }
}

} // namespaces

#endif // LM_COMMON_JOINT_ORDER_H
mosesdecoder/lm/common/ngram.hh ADDED
@@ -0,0 +1,77 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#ifndef LM_COMMON_NGRAM_H
#define LM_COMMON_NGRAM_H

#include "lm/weights.hh"
#include "lm/word_index.hh"

#include <cstddef>
#include <cassert>
#include <stdint.h>
#include <cstring>

namespace lm {

// Non-owning view over the word-id portion of an n-gram record stored in a
// flat buffer: Order() consecutive WordIndex values.  Derived classes may
// place a payload immediately after the words.
class NGramHeader {
  public:
    // View over `order` WordIndex values starting at `begin`.
    NGramHeader(void *begin, std::size_t order)
      : begin_(static_cast<WordIndex*>(begin)), end_(begin_ + order) {}

    // Empty view (order 0, NULL base); point it somewhere with ReBase.
    NGramHeader() : begin_(NULL), end_(NULL) {}

    // Raw byte access to the underlying record.
    const uint8_t *Base() const { return reinterpret_cast<const uint8_t*>(begin_); }
    uint8_t *Base() { return reinterpret_cast<uint8_t*>(begin_); }

    // Retarget the view at a different record; the order is preserved.
    void ReBase(void *to) {
      std::size_t difference = end_ - begin_;
      begin_ = reinterpret_cast<WordIndex*>(to);
      end_ = begin_ + difference;
    }

    // These are for the vocab index.
    // Lower-case in deference to STL.
    const WordIndex *begin() const { return begin_; }
    WordIndex *begin() { return begin_; }
    const WordIndex *end() const { return end_; }
    WordIndex *end() { return end_; }

    std::size_t size() const { return end_ - begin_; }
    std::size_t Order() const { return end_ - begin_; }

  private:
    WordIndex *begin_, *end_;
};

// N-gram record view with a typed payload (e.g. Prob or ProbBackoff) stored
// immediately after the word ids.
template <class PayloadT> class NGram : public NGramHeader {
  public:
    typedef PayloadT Payload;

    NGram() : NGramHeader(NULL, 0) {}

    NGram(void *begin, std::size_t order) : NGramHeader(begin, order) {}

    // Advance the view to the next packed record in the buffer.
    // Would do operator++ but that can get confusing for a stream.
    void NextInMemory() {
      ReBase(&Value() + 1);
    }

    // Bytes occupied by one record of the given order: words plus payload.
    static std::size_t TotalSize(std::size_t order) {
      return order * sizeof(WordIndex) + sizeof(Payload);
    }
    std::size_t TotalSize() const {
      // Compiler should optimize this.
      return TotalSize(Order());
    }

    // Inverse of TotalSize: recover the order from a record's byte size.
    static std::size_t OrderFromSize(std::size_t size) {
      std::size_t ret = (size - sizeof(Payload)) / sizeof(WordIndex);
      assert(size == TotalSize(ret));
      return ret;
    }

    // The payload lives directly after the last word id.
    const Payload &Value() const { return *reinterpret_cast<const Payload *>(end()); }
    Payload &Value() { return *reinterpret_cast<Payload *>(end()); }
};

} // namespace lm

#endif // LM_COMMON_NGRAM_H
mosesdecoder/lm/common/print.cc ADDED
@@ -0,0 +1,62 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#include "lm/common/print.hh"

#include "lm/common/ngram_stream.hh"
#include "util/file_stream.hh"
#include "util/file.hh"
#include "util/mmap.hh"
#include "util/scoped.hh"

#include <sstream>
#include <cstring>

namespace lm {

// Memory-map the vocabulary file (a sequence of NUL-terminated strings) and
// build an id -> string table over the mapped region.
VocabReconstitute::VocabReconstitute(int fd) {
  uint64_t size = util::SizeOrThrow(fd);
  util::MapRead(util::POPULATE_OR_READ, fd, 0, size, memory_);
  const char *const start = static_cast<const char*>(memory_.get());
  const char *i;
  for (i = start; i != start + size; i += strlen(i) + 1) {
    map_.push_back(i);
  }
  // Last one for LookupPiece.
  map_.push_back(i);
}

namespace {
// Emit the leading part of one ARPA line: probability, tab, then the words
// separated by single spaces (no trailing newline).
template <class Payload> void PrintLead(const VocabReconstitute &vocab, ProxyStream<Payload> &stream, util::FileStream &out) {
  out << stream->Value().prob << '\t' << vocab.Lookup(*stream->begin());
  for (const WordIndex *i = stream->begin() + 1; i != stream->end(); ++i) {
    out << ' ' << vocab.Lookup(*i);
  }
}
} // namespace

// Write the model to out_fd_ in ARPA format: the \data\ header with per-order
// counts, one section per order, then \end\.  Orders below the maximum carry
// a backoff weight after the n-gram; the highest order has none.
void PrintARPA::Run(const util::stream::ChainPositions &positions) {
  VocabReconstitute vocab(vocab_fd_);
  util::FileStream out(out_fd_);
  out << "\\data\\\n";
  for (size_t i = 0; i < positions.size(); ++i) {
    out << "ngram " << (i+1) << '=' << counts_[i] << '\n';
  }
  out << '\n';

  // Orders 1 .. N-1: prob, n-gram, backoff (ProbBackoff payload).
  for (unsigned order = 1; order < positions.size(); ++order) {
    out << "\\" << order << "-grams:" << '\n';
    for (ProxyStream<NGram<ProbBackoff> > stream(positions[order - 1], NGram<ProbBackoff>(NULL, order)); stream; ++stream) {
      PrintLead(vocab, stream, out);
      out << '\t' << stream->Value().backoff << '\n';
    }
    out << '\n';
  }

  // Highest order: prob and n-gram only (Prob payload).
  out << "\\" << positions.size() << "-grams:" << '\n';
  for (ProxyStream<NGram<Prob> > stream(positions.back(), NGram<Prob>(NULL, positions.size())); stream; ++stream) {
    PrintLead(vocab, stream, out);
    out << '\n';
  }
  out << '\n';
  out << "\\end\\\n";
}

} // namespace lm
mosesdecoder/lm/common/renumber.cc ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #include "lm/common/renumber.hh"
2
+ #include "lm/common/ngram.hh"
3
+
4
+ #include "util/stream/stream.hh"
5
+
6
+ namespace lm {
7
+
8
+ void Renumber::Run(const util::stream::ChainPosition &position) {
9
+ for (util::stream::Stream stream(position); stream; ++stream) {
10
+ NGramHeader gram(stream.Get(), order_);
11
+ for (WordIndex *w = gram.begin(); w != gram.end(); ++w) {
12
+ *w = new_numbers_[*w];
13
+ }
14
+ }
15
+ }
16
+
17
+ } // namespace lm
mosesdecoder/lm/common/renumber.hh ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
/* Map vocab ids. This is useful to merge independently collected counts or
 * change the vocab ids to the order used by the trie.
 */
#ifndef LM_COMMON_RENUMBER_H
#define LM_COMMON_RENUMBER_H

#include "lm/word_index.hh"

#include <cstddef>

namespace util { namespace stream { class ChainPosition; }}

namespace lm {

// Stream worker that rewrites every word id of the n-gram records passing
// through a chain, using new_numbers as an old-id -> new-id lookup table.
class Renumber {
  public:
    // Assumes the array is large enough to map all words and stays alive while
    // the thread is active.
    Renumber(const WordIndex *new_numbers, std::size_t order)
      : new_numbers_(new_numbers), order_(order) {}

    // Process every record at `position`, remapping ids in place.
    void Run(const util::stream::ChainPosition &position);

  private:
    const WordIndex *new_numbers_;  // lookup table; not owned
    std::size_t order_;             // word ids per record
};

} // namespace lm
#endif // LM_COMMON_RENUMBER_H
mosesdecoder/mert/ReferenceTest.cpp ADDED
@@ -0,0 +1,123 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #include "Reference.h"
2
+
3
+ #define BOOST_TEST_MODULE MertReference
4
+ #include <boost/test/unit_test.hpp>
5
+
6
+ using namespace MosesTuning;
7
+
8
+ BOOST_AUTO_TEST_CASE(refernece_count)
9
+ {
10
+ Reference ref;
11
+ BOOST_CHECK(ref.get_counts() != NULL);
12
+ }
13
+
14
+ BOOST_AUTO_TEST_CASE(refernece_length_iterator)
15
+ {
16
+ Reference ref;
17
+ ref.push_back(4);
18
+ ref.push_back(2);
19
+ BOOST_REQUIRE(ref.num_references() == 2);
20
+
21
+ Reference::iterator it = ref.begin();
22
+ BOOST_CHECK_EQUAL(*it, 4);
23
+ ++it;
24
+ BOOST_CHECK_EQUAL(*it, 2);
25
+ ++it;
26
+ BOOST_CHECK(it == ref.end());
27
+ }
28
+
29
+ BOOST_AUTO_TEST_CASE(refernece_length_average)
30
+ {
31
+ {
32
+ Reference ref;
33
+ ref.push_back(4);
34
+ ref.push_back(1);
35
+ BOOST_CHECK_EQUAL(2, ref.CalcAverage());
36
+ }
37
+
38
+ {
39
+ Reference ref;
40
+ ref.push_back(4);
41
+ ref.push_back(3);
42
+ BOOST_CHECK_EQUAL(3, ref.CalcAverage());
43
+ }
44
+
45
+ {
46
+ Reference ref;
47
+ ref.push_back(4);
48
+ ref.push_back(3);
49
+ ref.push_back(4);
50
+ ref.push_back(5);
51
+ BOOST_CHECK_EQUAL(4, ref.CalcAverage());
52
+ }
53
+ }
54
+
55
+ BOOST_AUTO_TEST_CASE(refernece_length_closest)
56
+ {
57
+ {
58
+ Reference ref;
59
+ ref.push_back(4);
60
+ ref.push_back(1);
61
+ BOOST_REQUIRE(ref.num_references() == 2);
62
+
63
+ BOOST_CHECK_EQUAL(1, ref.CalcClosest(2));
64
+ BOOST_CHECK_EQUAL(1, ref.CalcClosest(1));
65
+ BOOST_CHECK_EQUAL(4, ref.CalcClosest(3));
66
+ BOOST_CHECK_EQUAL(4, ref.CalcClosest(4));
67
+ BOOST_CHECK_EQUAL(4, ref.CalcClosest(5));
68
+ }
69
+
70
+ {
71
+ Reference ref;
72
+ ref.push_back(4);
73
+ ref.push_back(3);
74
+ BOOST_REQUIRE(ref.num_references() == 2);
75
+
76
+ BOOST_CHECK_EQUAL(3, ref.CalcClosest(1));
77
+ BOOST_CHECK_EQUAL(3, ref.CalcClosest(2));
78
+ BOOST_CHECK_EQUAL(3, ref.CalcClosest(3));
79
+ BOOST_CHECK_EQUAL(4, ref.CalcClosest(4));
80
+ BOOST_CHECK_EQUAL(4, ref.CalcClosest(5));
81
+ }
82
+
83
+ {
84
+ Reference ref;
85
+ ref.push_back(4);
86
+ ref.push_back(3);
87
+ ref.push_back(4);
88
+ ref.push_back(5);
89
+ BOOST_REQUIRE(ref.num_references() == 4);
90
+
91
+ BOOST_CHECK_EQUAL(3, ref.CalcClosest(1));
92
+ BOOST_CHECK_EQUAL(3, ref.CalcClosest(2));
93
+ BOOST_CHECK_EQUAL(3, ref.CalcClosest(3));
94
+ BOOST_CHECK_EQUAL(4, ref.CalcClosest(4));
95
+ BOOST_CHECK_EQUAL(5, ref.CalcClosest(5));
96
+ }
97
+ }
98
+
99
+ BOOST_AUTO_TEST_CASE(refernece_length_shortest)
100
+ {
101
+ {
102
+ Reference ref;
103
+ ref.push_back(4);
104
+ ref.push_back(1);
105
+ BOOST_CHECK_EQUAL(1, ref.CalcShortest());
106
+ }
107
+
108
+ {
109
+ Reference ref;
110
+ ref.push_back(4);
111
+ ref.push_back(3);
112
+ BOOST_CHECK_EQUAL(3, ref.CalcShortest());
113
+ }
114
+
115
+ {
116
+ Reference ref;
117
+ ref.push_back(4);
118
+ ref.push_back(3);
119
+ ref.push_back(4);
120
+ ref.push_back(5);
121
+ BOOST_CHECK_EQUAL(3, ref.CalcShortest());
122
+ }
123
+ }
mosesdecoder/mert/ScoreArray.cpp ADDED
@@ -0,0 +1,169 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
/*
 * ScoreArray.cpp
 * mert - Minimum Error Rate Training
 *
 * Created by Nicola Bertoldi on 13/05/08.
 *
 */

#include "ScoreArray.h"
#include "Util.h"
#include "FileStream.h"

using namespace std;

namespace MosesTuning
{


ScoreArray::ScoreArray()
  : m_num_scores(0), m_index(0) {}

// Text serialization: header line "<BEGIN> index count num_scores type",
// one line per ScoreStats entry, then a footer marker line.
void ScoreArray::savetxt(ostream* os, const string& sctype)
{
  *os << SCORES_TXT_BEGIN << " " << m_index << " " << m_array.size()
      << " " << m_num_scores << " " << sctype << endl;
  for (scorearray_t::iterator i = m_array.begin(); i !=m_array.end(); i++) {
    i->savetxt(os);
    *os << endl;
  }
  *os << SCORES_TXT_END << endl;
}

// Binary serialization: same textual header/footer markers, but each entry
// is written by ScoreStats::savebin (no per-entry newline).
void ScoreArray::savebin(ostream* os, const string& score_type)
{
  *os << SCORES_BIN_BEGIN << " " << m_index << " " << m_array.size()
      << " " << m_num_scores << " " << score_type << endl;
  for (scorearray_t::iterator i = m_array.begin();
       i != m_array.end(); i++) {
    i->savebin(os);
  }
  *os << SCORES_BIN_END << endl;
}

// Dispatch to the binary or text writer; empty arrays are not written.
void ScoreArray::save(ostream* os, const string& score_type, bool bin)
{
  if (size() <= 0) return;
  if (bin) {
    savebin(os, score_type);
  } else {
    savetxt(os, score_type);
  }
}

// Save to a named file; exits the process if the file cannot be opened.
void ScoreArray::save(const string &file, const string& score_type, bool bin)
{
  ofstream ofs(file.c_str(), ios::out);
  if (!ofs) {
    cerr << "Failed to open " << file << endl;
    exit(1);
  }
  ostream* os = &ofs;
  save(os, score_type, bin);
  ofs.close();
}

// Save to standard output.
void ScoreArray::save(const string& score_type, bool bin)
{
  save(&cout, score_type, bin);
}

// Read n binary entries (each with m_num_scores statistics) and append them.
void ScoreArray::loadbin(istream* is, size_t n)
{
  ScoreStats entry(m_num_scores);
  for (size_t i = 0; i < n; i++) {
    entry.loadbin(is);
    add(entry);
  }
}

// Read n text entries (each with m_num_scores statistics) and append them.
void ScoreArray::loadtxt(istream* is, size_t n)
{
  ScoreStats entry(m_num_scores);
  for (size_t i = 0; i < n; i++) {
    entry.loadtxt(is);
    add(entry);
  }
}

// Parse a serialized array: the header line determines text vs binary mode
// and supplies index, entry count, score count and score type; then the
// entries are read and the footer marker is verified.
void ScoreArray::load(istream* is)
{
  size_t number_of_entries = 0;
  bool binmode = false;

  string substring, stringBuf;
  string::size_type loc;

  getline(*is, stringBuf);
  if (!is->good()) {
    return;
  }

  if (!stringBuf.empty()) {
    if ((loc = stringBuf.find(SCORES_TXT_BEGIN)) == 0) {
      binmode=false;
    } else if ((loc = stringBuf.find(SCORES_BIN_BEGIN)) == 0) {
      binmode=true;
    } else {
      TRACE_ERR("ERROR: ScoreArray::load(): Wrong header");
      return;
    }
    // First call consumes the begin marker; subsequent calls pull the
    // space-separated header fields.
    getNextPound(stringBuf, substring);
    getNextPound(stringBuf, substring);
    m_index = atoi(substring.c_str());
    getNextPound(stringBuf, substring);
    number_of_entries = atoi(substring.c_str());
    getNextPound(stringBuf, substring);
    m_num_scores = atoi(substring.c_str());
    getNextPound(stringBuf, substring);
    m_score_type = substring;
  }

  if (binmode) {
    loadbin(is, number_of_entries);
  } else {
    loadtxt(is, number_of_entries);
  }

  // Verify the footer marker (either variant is accepted here).
  getline(*is, stringBuf);
  if (!stringBuf.empty()) {
    if ((loc = stringBuf.find(SCORES_TXT_END)) != 0 &&
        (loc = stringBuf.find(SCORES_BIN_END)) != 0) {
      TRACE_ERR("ERROR: ScoreArray::load(): Wrong footer");
      return;
    }
  }
}

// Load from a named file.
void ScoreArray::load(const string &file)
{
  TRACE_ERR("loading data from " << file << endl);
  inputfilestream input_stream(file); // matches a stream with a file. Opens the file
  istream* is = &input_stream;
  load(is);
  input_stream.close();
}


// Append all entries of e to this array.
void ScoreArray::merge(ScoreArray& e)
{
  //dummy implementation
  for (size_t i=0; i<e.size(); i++)
    add(e.get(i));
}

// True iff every entry carries exactly NumberOfScores() statistics
// (vacuously true when NumberOfScores() is zero).
bool ScoreArray::check_consistency() const
{
  const size_t sz = NumberOfScores();
  if (sz == 0)
    return true;

  for (scorearray_t::const_iterator i = m_array.begin();
       i != m_array.end(); ++i) {
    if (i->size() != sz)
      return false;
  }
  return true;
}

}
mosesdecoder/mert/ScoreArray.h ADDED
@@ -0,0 +1,113 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+ * ScoreArray.h
3
+ * mert - Minimum Error Rate Training
4
+ *
5
+ * Created by Nicola Bertoldi on 13/05/08.
6
+ *
7
+ */
8
+
9
+ #ifndef MERT_SCORE_ARRAY_H_
10
+ #define MERT_SCORE_ARRAY_H_
11
+
12
+ #include <vector>
13
+ #include <iostream>
14
+ #include <string>
15
+
16
+ #include "ScoreStats.h"
17
+
18
+ namespace MosesTuning
19
+ {
20
+
21
+ const char SCORES_TXT_BEGIN[] = "SCORES_TXT_BEGIN_0";
22
+ const char SCORES_TXT_END[] = "SCORES_TXT_END_0";
23
+ const char SCORES_BIN_BEGIN[] = "SCORES_BIN_BEGIN_0";
24
+ const char SCORES_BIN_END[] = "SCORES_BIN_END_0";
25
+
26
+ class ScoreArray
27
+ {
28
+ private:
29
+ scorearray_t m_array;
30
+ std::string m_score_type;
31
+ std::size_t m_num_scores;
32
+
33
+ // indexx to identify the utterance.
34
+ // It can differ from the index inside the vector.
35
+ int m_index;
36
+
37
+ public:
38
+ ScoreArray();
39
+ ~ScoreArray() {}
40
+
41
+ void clear() {
42
+ m_array.clear();
43
+ }
44
+
45
+ int getIndex() const {
46
+ return m_index;
47
+ }
48
+
49
+ void setIndex(int value) {
50
+ m_index = value;
51
+ }
52
+
53
+ ScoreStats& get(std::size_t i) {
54
+ return m_array.at(i);
55
+ }
56
+
57
+ const ScoreStats& get(std::size_t i) const {
58
+ return m_array.at(i);
59
+ }
60
+
61
+ void add(const ScoreStats& e) {
62
+ m_array.push_back(e);
63
+ }
64
+
65
+ //ADDED BY TS
66
+ void swap(std::size_t i, std::size_t j) {
67
+ std::swap(m_array[i], m_array[j]);
68
+ }
69
+
70
+ void resize(std::size_t new_size) {
71
+ m_array.resize(std::min(new_size, m_array.size()));
72
+ }
73
+ //END_ADDED
74
+
75
+ void merge(ScoreArray& e);
76
+
77
+ std::string name() const {
78
+ return m_score_type;
79
+ }
80
+
81
+ void name(std::string &score_type) {
82
+ m_score_type = score_type;
83
+ }
84
+
85
+ std::size_t size() const {
86
+ return m_array.size();
87
+ }
88
+
89
+ std::size_t NumberOfScores() const {
90
+ return m_num_scores;
91
+ }
92
+
93
+ void NumberOfScores(std::size_t v) {
94
+ m_num_scores = v;
95
+ }
96
+
97
+ void savetxt(std::ostream* os, const std::string& score_type);
98
+ void savebin(std::ostream* os, const std::string& score_type);
99
+ void save(std::ostream* os, const std::string& score_type, bool bin=false);
100
+ void save(const std::string &file, const std::string& score_type, bool bin=false);
101
+ void save(const std::string& score_type, bool bin=false);
102
+
103
+ void loadtxt(std::istream* is, std::size_t n);
104
+ void loadbin(std::istream* is, std::size_t n);
105
+ void load(std::istream* is);
106
+ void load(const std::string &file);
107
+
108
+ bool check_consistency() const;
109
+ };
110
+
111
+ }
112
+
113
+ #endif // MERT_SCORE_ARRAY_H_
mosesdecoder/mert/Util.h ADDED
@@ -0,0 +1,149 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+ * Util.h
3
+ * mert - Minimum Error Rate Training
4
+ *
5
+ * Created by Nicola Bertoldi on 13/05/08.
6
+ *
7
+ */
8
+
9
+ #ifndef MERT_UTIL_H_
10
+ #define MERT_UTIL_H_
11
+
12
+ #include <cmath>
13
+ #include <cstdlib>
14
+ #include <stdexcept>
15
+ #include <limits>
16
+ #include <vector>
17
+ #include <map>
18
+ #include <iostream>
19
+ #include <sstream>
20
+ #include <string>
21
+ #include <cstring>
22
+
23
+ #include "Types.h"
24
+
25
+ namespace MosesTuning
26
+ {
27
+
28
+ #ifdef TRACE_ENABLE
29
+ #define TRACE_ERR(str) { std::cerr << str; }
30
+ #else
31
+ #define TRACE_ERR(str) { }
32
+ #endif
33
+
34
+ #if __GNUC__ == 4 && __GNUC_MINOR__ == 8 && (__GNUC_PATCHLEVEL__ == 1 || __GNUC_PATCHLEVEL__ == 2)
35
+ // gcc nth_element() bug
36
+ #define NTH_ELEMENT3(begin, middle, end) std::sort(begin, end)
37
+ #define NTH_ELEMENT4(begin, middle, end, orderer) std::sort(begin, end, orderer)
38
+ #else
39
+ #define NTH_ELEMENT3(begin, middle, end) std::nth_element(begin, middle, end)
40
+ #define NTH_ELEMENT4(begin, middle, end, orderer) std::nth_element(begin, middle, end, orderer)
41
+ #endif
42
+
43
+ const char kDefaultDelimiterSymbol[] = " ";
44
+
45
+ int verboselevel();
46
+ int setverboselevel(int v);
47
+
48
+
49
+ const float kEPS = 0.0001f;
50
+
51
+ template <typename T>
52
+ bool IsAlmostEqual(T expected, T actual, float round=kEPS)
53
+ {
54
+ if (std::abs(expected - actual) < round) {
55
+ return true;
56
+ } else {
57
+ std::cerr << "Fail: expected = " << expected
58
+ << " (actual = " << actual << ")" << std::endl;
59
+ return false;
60
+ }
61
+ }
62
+
63
+ /**
64
+ * Find the specified delimiter for the string 'str', and 'str' is assigned
65
+ * to a substring object that starts at the position of first occurrence of
66
+ * the delimiter in 'str'. 'substr' is copied from 'str' ranging from
67
+ * the start position of 'str' to the position of first occurrence of
68
+ * the delimiter.
69
+ *
70
+ * It returns the position of first occurrence in the queried string.
71
+ * If the content is not found, std::string::npos is returned.
72
+ */
73
+ size_t getNextPound(std::string &str, std::string &substr,
74
+ const std::string &delimiter = kDefaultDelimiterSymbol);
75
+
76
+ void split(const std::string &s, char delim, std::vector<std::string> &elems);
77
+
78
+ /**
79
+ * Split the string 'str' with specified delimitter 'delim' into tokens.
80
+ * The resulting tokens are set to 'res'.
81
+ *
82
+ * ex. "a,b,c" => {"a", "b", "c"}.
83
+ */
84
+ void Tokenize(const char *str, const char delim, std::vector<std::string> *res);
85
+
86
// Parses `input` into a value of type T via stream extraction.
template<typename T>
inline T Scan(const std::string &input)
{
  T parsed;
  std::stringstream reader(input);
  reader >> parsed;
  return parsed;
}
94
+
95
/**
 * Returns true iff "str" ends with "suffix".
 * e.g., Given str = "abc:" and suffix = ":", this function returns true.
 *
 * Fix: the previous implementation used find_last_of(), which matches any
 * single character drawn from "suffix" — it misreported multi-character
 * suffixes (e.g. str = "xbc", suffix = "abc" returned true) and returned
 * true for an empty "str" because npos == str.size() - 1 when size() == 0.
 * This version performs a genuine suffix comparison.
 */
inline bool EndsWith(const std::string& str, const char* suffix)
{
  const std::size_t suffix_len = std::strlen(suffix);
  return str.size() >= suffix_len &&
         str.compare(str.size() - suffix_len, suffix_len, suffix) == 0;
}
103
+
104
// Renders x as a string via stream insertion; throws std::runtime_error if
// the value cannot be formatted.
template<typename T>
inline std::string stringify(T x)
{
  std::ostringstream buffer;
  if (!(buffer << x))
    throw std::runtime_error("stringify(template<typename T>)");
  return buffer.str();
}
112
+
113
// Parse a score statistic from a C string.
// NOTE(review): ScoreStatsType comes from Types.h; the use of std::atoi
// implies it is an integral type -- confirm against Types.h.
inline ScoreStatsType ConvertCharToScoreStatsType(const char *str)
{
  return std::atoi(str);
}

// Convenience overload of the above for std::string input.
inline ScoreStatsType ConvertStringToScoreStatsType(const std::string& str)
{
  return ConvertCharToScoreStatsType(str.c_str());
}

// Parse a feature statistic (floating point, per std::atof) from a C string.
inline FeatureStatsType ConvertCharToFeatureStatsType(const char *str)
{
  return static_cast<FeatureStatsType>(std::atof(str));
}

// Convenience overload of the above for std::string input.
inline FeatureStatsType ConvertStringToFeatureStatsType(const std::string &str)
{
  return ConvertCharToFeatureStatsType(str.c_str());
}
132
+
133
// Strips leading and trailing characters from `c` (default: space, CR, LF)
// off `Src`; returns the empty string when nothing else remains.
inline std::string trimStr(const std::string& Src, const std::string& c = " \r\n")
{
  const std::string::size_type first = Src.find_first_not_of(c);
  if (first == std::string::npos) return std::string();
  const std::string::size_type last = Src.find_last_not_of(c);
  return Src.substr(first, last - first + 1);
}
141
+
142
+ // Utilities to measure decoding time
143
+ void ResetUserTime();
144
+ void PrintUserTime(const std::string &message);
145
+ double GetUserTime();
146
+
147
+ }
148
+
149
+ #endif // MERT_UTIL_H_
mosesdecoder/moses/TranslationModel/UG/util/ibm1-align ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:67f9b51b84f1b18fefcfe58feba9a9879648529fed29fbfb90ec0cec4f42a80e
3
+ size 1062799
mosesdecoder/scripts/Jamfile ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#See ../Jamroot for options.
import option path ;

build-project training ;

# Work out where scripts should be installed: --prefix gives a root under
# which a (possibly git-tagged) scripts directory is created, while
# --install-scripts overrides the location directly.
prefix = [ option.get "prefix" ] ;
if $(prefix) {
  prefix = [ path.root $(prefix) [ path.pwd ] ] ;
  location = [ option.get "install-scripts" : : $(prefix)$(GITTAG)/scripts ] ;
} else {
  location = [ option.get "install-scripts" ] ;
}

# Copy the script tree and selected data files into the install location,
# excluding tests, regression data, binaries and web javascripts.
if $(location) {
  location = [ path.root $(location) [ path.pwd ] ] ;
  install scripts :
    [ glob-tree README *.js *.pl *.perl *.pm *.py *.sh *.php : tests regression-testing other bin ]
    [ glob share/nonbreaking_prefixes/* ems/example/*.* ems/example/data/* ems/web/* analysis/smtgui/* : ems/web/javascripts ]
    generic/fsa-sample.fsa
    ems/experiment.machines
    ems/experiment.meta
    : <install-source-root>. <location>$(location) ;
}
mosesdecoder/scripts/README ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 2006-07-29
2
+
3
+ This directory should contain all multi-purpose scripts for:
4
+
5
+ - training ... training moses (including BLEU evaluation needed for MERT)
6
+ - analysis ... analyzing MT output (for human analysis)
7
+ - generic ... script for handling generic issues (parallelization)
8
+ - lib ... perl modules used by various scripts
9
+
10
+
11
+ The Jamfile then takes care of proper 'release' from your git directory to
12
+ the shared directories.
13
+
14
+ The released scripts should remain in the *same directory structure*.
15
+
mosesdecoder/scripts/generic/binarize4moses2.perl ADDED
@@ -0,0 +1,88 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env perl
2
+
3
+ use strict;
4
+
5
+ use Getopt::Long;
6
+ use File::Basename;
7
+ use FindBin qw($RealBin);
8
+
9
+ sub systemCheck($);
10
+
11
+ my $mosesDir = "$RealBin/../..";
12
+ my $ptPath;
13
+ my $lexRoPath;
14
+ my $outPath;
15
+ my $numScores = 4;
16
+ my $numLexScores;
17
+ my $pruneNum = 100;
18
+ my $scfg = 0;
19
+
20
+ GetOptions("phrase-table=s" => \$ptPath,
21
+ "lex-ro=s" => \$lexRoPath,
22
+ "output-dir=s" => \$outPath,
23
+ "num-scores=s" => \$numScores,
24
+ "num-lex-scores=i" => \$numLexScores,
25
+ "prune=i" => \$pruneNum,
26
+ "scfg" => \$scfg
27
+ ) or exit 1;
28
+
29
+ #print STDERR "scfg=$scfg \n";
30
+ die("ERROR: please set --phrase-table") unless defined($ptPath);
31
+ #die("ERROR: please set --lex-ro") unless defined($lexRoPath);
32
+ die("ERROR: please set --output-dir") unless defined($outPath);
33
+ #die("ERROR: please set --num-lex-scores") unless defined($numLexScores);
34
+ die("ERROR: compile contrib/sigtest-filter") if (!-X "$mosesDir/contrib/sigtest-filter/filter-pt");
35
+ die("ERROR: compile with bjam --with-cmph") if (!-X "$mosesDir/bin/processLexicalTableMin");
36
+ die("ERROR: compile with bjam --with-xmlrpc-c") if (!-X "$mosesDir/bin/CreateProbingPT");
37
+
38
+ my $cmd;
39
+
40
+ my $tempPath = dirname($outPath) ."/tmp.$$";
41
+ `mkdir -p $tempPath`;
42
+
43
+ $cmd = "gzip -dc $ptPath | $mosesDir/contrib/sigtest-filter/filter-pt -n $pruneNum | gzip -c > $tempPath/pt.gz";
44
+ systemCheck($cmd);
45
+
46
+ if (defined($lexRoPath)) {
47
+ die("ERROR: please set --num-lex-scores") unless defined($numLexScores);
48
+
49
+ $cmd = "$mosesDir/bin/processLexicalTableMin -in $lexRoPath -out $tempPath/lex-ro -T . -threads all";
50
+ systemCheck($cmd);
51
+
52
+ $cmd = "$mosesDir/bin/addLexROtoPT $tempPath/pt.gz $tempPath/lex-ro.minlexr | gzip -c > $tempPath/pt.withLexRO.gz";
53
+ systemCheck($cmd);
54
+
55
+ $cmd = "ln -s pt.withLexRO.gz $tempPath/pt.txt.gz";
56
+ systemCheck($cmd);
57
+ }
58
+ else {
59
+ $cmd = "ln -s pt.gz $tempPath/pt.txt.gz";
60
+ systemCheck($cmd);
61
+ }
62
+
63
+ $cmd = "$mosesDir/bin/CreateProbingPT2 --num-scores $numScores --log-prob --input-pt $tempPath/pt.txt.gz --output-dir $outPath";
64
+
65
+ if (defined($lexRoPath)) {
66
+ $cmd .= " --num-lex-scores $numLexScores";
67
+ }
68
+
69
+ if ($scfg) {
70
+ $cmd .= " --scfg";
71
+ }
72
+
73
+ systemCheck($cmd);
74
+
75
+ exit(0);
76
+
77
+ #####################################################
78
+ sub systemCheck($)
79
+ {
80
+ my $cmd = shift;
81
+ print STDERR "Executing: $cmd\n";
82
+
83
+ my $retVal = system($cmd);
84
+ if ($retVal != 0)
85
+ {
86
+ exit(1);
87
+ }
88
+ }
mosesdecoder/scripts/generic/bsbleu.py ADDED
@@ -0,0 +1,179 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python
2
+ # compute Bleu scores with confidence intervals via boostrap resampling
3
+ # written by Ulrich Germann
4
+ #
5
+ # This file is part of moses. Its use is licensed under the GNU Lesser General
6
+ # Public License version 2.1 or, at your option, any later version.
7
+
8
+ from argparse import ArgumentParser
9
+ import math
10
+ import os
11
+ from random import randint
12
+ import sys, gzip
13
+
14
+
15
+ def count_ngrams(snt, max_n):
16
+ """
17
+ Return a dictionary of ngram counts (up to length /max_n/)
18
+ for sentence (list of words) /snt/.
19
+ """
20
+ ret = {}
21
+ for i in xrange(len(snt)):
22
+ for k in xrange(i + 1, min(i + max_n + 1, len(snt) + 1)):
23
+ key = tuple(snt[i:k])
24
+ ret[key] = ret.get(key, 0) + 1
25
+ return ret
26
+
27
+
28
+ def max_counts(ng1, ng2):
29
+ """
30
+ Return a dicitonary of ngram counts such that
31
+ each count is the greater of the two individual counts
32
+ for each ngram in the input ngram count dictionaries
33
+ /ng1/ and /ng2/.
34
+ """
35
+ ret = ng1.copy()
36
+ for k, v in ng2.items():
37
+ ret[k] = max(ret.get(k, 0), v)
38
+ return ret
39
+
40
+
41
+ def ng_hits(hyp, ref, max_n):
42
+ """
43
+ Return a list of ngram counts such that each ngram count
44
+ is the minimum of the counts in hyp and ref, up to ngram
45
+ length /max_n/.
46
+ """
47
+ ret = [0 for i in xrange(max_n)]
48
+ for ng, cnt in hyp.items():
49
+ k = ng
50
+ if len(k) <= max_n:
51
+ ret[len(k) - 1] += min(cnt, ref.get(ng, 0))
52
+ return ret
53
+
54
+
55
+ class BleuScore:
56
+ def __init__(self, hyp, ref, max_n=4, bootstrap=1000):
57
+ # print len(hyp.ngrams), len(ref.ngrams), "X"
58
+ self.hits = [
59
+ ng_hits(hyp.ngrams[i], ref.ngrams[i], max_n)
60
+ for i in xrange(len(hyp.ngrams))]
61
+ self.max_n = max_n
62
+ self.hyp = hyp
63
+ self.ref = ref
64
+ self.lower = None
65
+ self.upper = None
66
+ self.median = None
67
+ self.actual = self.score([i for i in xrange(len(hyp.snt))])
68
+ if bootstrap:
69
+ self.bootstrap = [self.score([randint(0, len(hyp.snt) - 1)
70
+ for s in hyp.snt])
71
+ for i in xrange(bootstrap)]
72
+ self.bootstrap.sort()
73
+ else:
74
+ self.bootstrap = [self.actual]
75
+ pass
76
+
77
+ def score(self, sample):
78
+ hits = [0 for i in xrange(self.max_n)]
79
+ self.hyplen = 0
80
+ self.reflen = 0
81
+ self.total = [0 for i in hits]
82
+ for i in sample:
83
+ self.hyplen += len(self.hyp.snt[i])
84
+ self.reflen += len(self.ref.snt[i])
85
+ for n in xrange(self.max_n):
86
+ hits[n] += self.hits[i][n]
87
+ self.total[n] += max(len(self.hyp.snt[i]) - n, 0)
88
+ pass
89
+ self.prec = [float(hits[n]) / self.total[n]
90
+ for n in xrange(self.max_n)]
91
+ ret = sum([math.log(x) for x in self.prec]) / self.max_n
92
+ self.BP = min(
93
+ 1, math.exp(1. - float(self.reflen) / float(self.hyplen)))
94
+ ret += math.log(self.BP)
95
+ return math.exp(ret)
96
+
97
+
98
+ class Document:
99
+ def __init__(self, fname=None):
100
+ self.fname = fname
101
+ if fname:
102
+ if fname[-3:] == ".gz":
103
+ self.snt = [line.strip().split() for line in gzip.open(fname).readlines()]
104
+ else:
105
+ self.snt = [line.strip().split() for line in open(fname)]
106
+ pass
107
+ self.ngrams = [count_ngrams(snt, 4) for snt in self.snt]
108
+ # print self.snt
109
+ else:
110
+ self.snt = None
111
+ self.ngrams = None
112
+
113
+ def merge(self, R):
114
+ self.fname = "multi-ref"
115
+ self.ngrams = [x for x in R[0].ngrams]
116
+ self.snt = [x for x in R[0].snt]
117
+ for i in xrange(len(R[0].ngrams)):
118
+ for k in xrange(1, len(R)):
119
+ self.ngrams[i] = max_counts(self.ngrams[i], R[k].ngrams[i])
120
+
121
+ def update(self, hyp, R):
122
+ for i, hyp_snt in enumerate(hyp.snt):
123
+ clen = len(hyp_snt)
124
+ K = 0
125
+ for k in xrange(1, len(R)):
126
+ k_snt = R[k].snt[i]
127
+ assert len(R[k].snt) == len(hyp.snt), (
128
+ "Mismatch in number of sentences " +
129
+ "between reference and candidate")
130
+ if abs(len(k_snt) - clen) == abs(len(R[K].snt[i]) - clen):
131
+ if len(k_snt) < len(R[K].snt[i]):
132
+ K = k
133
+ elif abs(len(k_snt) - clen) < abs(len(R[K].snt[i]) - clen):
134
+ K = k
135
+ self.snt[i] = R[K].snt[i]
136
+
137
+
138
+ if __name__ == "__main__":
139
+ argparser = ArgumentParser()
140
+ argparser.add_argument(
141
+ "-r", "--ref", nargs='+', help="Reference translation(s).")
142
+ argparser.add_argument(
143
+ "-c", "--cand", nargs='+', help="Candidate translations.")
144
+ argparser.add_argument(
145
+ "-i", "--individual", action='store_true',
146
+ help="Compute BLEU scores for individual references.")
147
+ argparser.add_argument(
148
+ "-b", "--bootstrap", type=int, default=1000,
149
+ help="Sample size for bootstrap resampling.")
150
+ argparser.add_argument(
151
+ "-a", "--alpha", type=float, default=.05,
152
+ help="1-alpha = confidence interval.")
153
+ args = argparser.parse_args(sys.argv[1:])
154
+ R = [Document(fname) for fname in args.ref]
155
+ C = [Document(fname) for fname in args.cand]
156
+ Rx = Document() # for multi-reference BLEU
157
+ Rx.merge(R)
158
+ for c in C:
159
+ # compute multi-reference BLEU
160
+ Rx.update(c, R)
161
+ bleu = BleuScore(c, Rx, bootstrap=args.bootstrap)
162
+ print "%5.2f %s [%5.2f-%5.2f; %5.2f] %s" % (
163
+ 100 * bleu.actual,
164
+ os.path.basename(Rx.fname),
165
+ 100 * bleu.bootstrap[int((args.alpha / 2) * args.bootstrap)],
166
+ 100 * bleu.bootstrap[int((1 - (args.alpha / 2)) * args.bootstrap)],
167
+ 100 * bleu.bootstrap[int(.5 * args.bootstrap)],
168
+ c.fname) # os.path.basename(c.fname))
169
+
170
+ if args.individual:
171
+ for r in R:
172
+ bleu = BleuScore(c, r, bootstrap=args.bootstrap)
173
+ print " %5.2f %s" % (
174
+ 100 * bleu.actual, os.path.basename(r.fname))
175
+ # print bleu.prec, bleu.hyplen, bleu.reflen, bleu.BP
176
+
177
+ # print [
178
+ # sum([bleu.hits[i][n] for i in xrange(len(bleu.hits))])
179
+ # for n in xrange(4)]
mosesdecoder/scripts/generic/compound-splitter.perl ADDED
@@ -0,0 +1,295 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env perl
2
+ #
3
+ # This file is part of moses. Its use is licensed under the GNU Lesser General
4
+ # Public License version 2.1 or, at your option, any later version.
5
+
6
+ use warnings;
7
+ use strict;
8
+ use Getopt::Long "GetOptions";
9
+
10
+ my ($CORPUS,$MODEL,$TRAIN,$HELP,$VERBOSE);
11
+ my $FILLER = ":s:es";
12
+ my $MIN_SIZE = 3;
13
+ my $MIN_COUNT = 5;
14
+ my $MAX_COUNT = 5;
15
+ my $FACTORED = 0;
16
+ my $SYNTAX = 0;
17
+ my $MARK_SPLIT = 0;
18
+ my $BINARIZE = 0;
19
+ $HELP = 1
20
+ unless &GetOptions('corpus=s' => \$CORPUS,
21
+ 'model=s' => \$MODEL,
22
+ 'filler=s' => \$FILLER,
23
+ 'factored' => \$FACTORED,
24
+ 'min-size=i' => \$MIN_SIZE,
25
+ 'min-count=i' => \$MIN_COUNT,
26
+ 'max-count=i' => \$MAX_COUNT,
27
+ 'help' => \$HELP,
28
+ 'verbose' => \$VERBOSE,
29
+ 'syntax' => \$SYNTAX,
30
+ 'binarize' => \$BINARIZE,
31
+ 'mark-split' => \$MARK_SPLIT,
32
+ 'train' => \$TRAIN);
33
+
34
+ if ($HELP ||
35
+ ( $TRAIN && !$CORPUS) ||
36
+ (!$TRAIN && !$MODEL)) {
37
+ print "Compound splitter\n";
38
+ print "-----------------\n\n";
39
+ print "train: compound-splitter -train -corpus txt-file -model new-model\n";
40
+ print "apply: compound-splitter -model trained-model < in > out\n";
41
+ print "options: -min-size: minimum word size (default $MIN_SIZE)\n";
42
+ print " -min-count: minimum word count (default $MIN_COUNT)\n";
43
+ print " -filler: filler letters between words (default $FILLER)\n";
44
+ print " -factor: factored data, assuming factor 0 as surface (default $FACTORED)\n";
45
+ print " -syntax: syntactically parsed data (default $SYNTAX)\n";
46
+ print " -mark-split: mark non-terminal label of split words (default $MARK_SPLIT)\n";
47
+ print " -binarize: binarize subtree for split word (default $BINARIZE)\n";
48
+ exit;
49
+ }
50
+
51
+ if ($TRAIN) {
52
+ if ($SYNTAX) { &train_syntax(); }
53
+ elsif ($FACTORED) { &train_factored(); }
54
+ else { &train(); }
55
+ }
56
+ else {
57
+ &apply();
58
+ }
59
+
60
+ sub train {
61
+ my %COUNT;
62
+ open(CORPUS,$CORPUS) || die("ERROR: could not open corpus '$CORPUS'");
63
+ while(<CORPUS>) {
64
+ chop; s/\s+/ /g; s/^ //; s/ $//;
65
+ foreach (split) {
66
+ $COUNT{$_}++;
67
+ }
68
+ }
69
+ close(CORPUS);
70
+ &save_trained_model(\%COUNT);
71
+ }
72
+
73
+ sub save_trained_model {
74
+ my ($COUNT) = @_;
75
+ my $id = 0;
76
+ open(MODEL,">".$MODEL);
77
+ foreach my $word (keys %$COUNT) {
78
+ print MODEL "".(++$id)."\t".$word."\t".$$COUNT{$word}."\n";
79
+ }
80
+ close(MODEL);
81
+ print STDERR "written model file with ".(scalar keys %$COUNT)." words.\n";
82
+ }
83
+
84
+ sub train_factored {
85
+ my (%COUNT,%FACTORED_COUNT);
86
+ # collect counts for interpretations for each surface word
87
+ open(CORPUS,$CORPUS) || die("ERROR: could not open corpus '$CORPUS'");
88
+ while(<CORPUS>) {
89
+ chop; s/\s+/ /g; s/^ //; s/ $//;
90
+ foreach my $factored_word (split) {
91
+ my $word = $factored_word;
92
+ $word =~ s/\|.+//g; # just first factor
93
+ $FACTORED_COUNT{$word}{$factored_word}++;
94
+ }
95
+ }
96
+ close(CORPUS);
97
+ # only preserve most frequent interpretation, assign sum of counts
98
+ foreach my $word (keys %FACTORED_COUNT) {
99
+ my ($max,$best,$total) = (0,"",0);
100
+ foreach my $factored_word (keys %{$FACTORED_COUNT{$word}}) {
101
+ my $count = $FACTORED_COUNT{$word}{$factored_word};
102
+ $total += $count;
103
+ if ($count > $max) {
104
+ $max = $count;
105
+ $best = $factored_word;
106
+ }
107
+ }
108
+ $COUNT{$best} = $total;
109
+ }
110
+ &save_trained_model(\%COUNT);
111
+ }
112
+
113
+ sub train_syntax {
114
+ my (%COUNT,%LABELED_COUNT);
115
+ # collect counts for interpretations for each surface word
116
+ open(CORPUS,$CORPUS) || die("ERROR: could not open corpus '$CORPUS'");
117
+ while(<CORPUS>) {
118
+ chop; s/\s+/ /g; s/^ //; s/ $//;
119
+ my $label;
120
+ foreach (split) {
121
+ if (/^label="([^\"]+)"/) {
122
+ $label = $1;
123
+ }
124
+ elsif (! /^</) {
125
+ $LABELED_COUNT{$_}{$label}++;
126
+ }
127
+ }
128
+ }
129
+ close(CORPUS);
130
+
131
+ # only preserve most frequent label, assign sum of counts
132
+ foreach my $word (keys %LABELED_COUNT) {
133
+ my ($max,$best,$total) = (0,"",0);
134
+ foreach my $label (keys %{$LABELED_COUNT{$word}}) {
135
+ my $count = $LABELED_COUNT{$word}{$label};
136
+ $total += $count;
137
+ if ($count > $max) {
138
+ $max = $count;
139
+ $best = "$word $label";
140
+ }
141
+ }
142
+ $COUNT{$best} = $total;
143
+ }
144
+ &save_trained_model(\%COUNT);
145
+ }
146
+
147
+ sub apply {
148
+ my (%COUNT,%TRUECASE,%LABEL);
149
+ open(MODEL,$MODEL) || die("ERROR: could not open model '$MODEL'");
150
+ while(<MODEL>) {
151
+ chomp;
152
+ my ($id,$factored_word,$count) = split(/\t/);
153
+ my $label;
154
+ ($factored_word,$label) = split(/ /,$factored_word);
155
+ my $word = $factored_word;
156
+ $word =~ s/\|.+//g; # just first factor
157
+ my $lc = lc($word);
158
+ # if word exists with multipe casings, only record most frequent
159
+ next if defined($COUNT{$lc}) && $COUNT{$lc} > $count;
160
+ $COUNT{$lc} = $count;
161
+ $TRUECASE{$lc} = $factored_word;
162
+ $LABEL{$lc} = $label if $SYNTAX;
163
+ }
164
+ close(MODEL);
165
+
166
+ while(<STDIN>) {
167
+ my $first = 1;
168
+ chop; s/\s+/ /g; s/^ //; s/ $//;
169
+ my @BUFFER; # for xml tags
170
+ foreach my $factored_word (split) {
171
+ print " " unless $first;
172
+ $first = 0;
173
+
174
+ # syntax: don't split xml
175
+ if ($SYNTAX && ($factored_word =~ /^</ || $factored_word =~ />$/)) {
176
+ push @BUFFER,$factored_word;
177
+ $first = 1;
178
+ next;
179
+ }
180
+
181
+ # get case class
182
+ my $word = $factored_word;
183
+ $word =~ s/\|.+//g; # just first factor
184
+ my $lc = lc($word);
185
+
186
+ print STDERR "considering $word ($lc)...\n" if $VERBOSE;
187
+ # don't split frequent words
188
+ if ((defined($COUNT{$lc}) && $COUNT{$lc}>=$MAX_COUNT) ||
189
+ $lc !~ /[a-zA-Z]/) {; # has to have at least one letter
190
+ print join(" ",@BUFFER)." " if scalar(@BUFFER); @BUFFER = (); # clear buffer
191
+ print $factored_word;
192
+ print STDERR "\tfrequent word ($COUNT{$lc}>=$MAX_COUNT), skipping\n" if $VERBOSE;
193
+ next;
194
+ }
195
+
196
+ # consider possible splits
197
+ my $final = length($word)-1;
198
+ my %REACHABLE;
199
+ for(my $i=0;$i<=$final;$i++) { $REACHABLE{$i} = (); }
200
+
201
+ print STDERR "splitting $word:\n" if $VERBOSE;
202
+ for(my $end=$MIN_SIZE;$end<length($word);$end++) {
203
+ for(my $start=0;$start<=$end-$MIN_SIZE;$start++) {
204
+ next unless $start == 0 || defined($REACHABLE{$start-1});
205
+ foreach my $filler (split(/:/,$FILLER)) {
206
+ next if $start == 0 && $filler ne "";
207
+ next if lc(substr($word,$start,length($filler))) ne $filler;
208
+ my $subword = lc(substr($word,
209
+ $start+length($filler),
210
+ $end-$start+1-length($filler)));
211
+ next unless defined($COUNT{$subword});
212
+ next unless $COUNT{$subword} >= $MIN_COUNT;
213
+ print STDERR "\tmatching word $start .. $end ($filler)$subword $COUNT{$subword}\n" if $VERBOSE;
214
+ push @{$REACHABLE{$end}},"$start $TRUECASE{$subword} $COUNT{$subword}";
215
+ }
216
+ }
217
+ }
218
+
219
+ # no matches at all?
220
+ if (!defined($REACHABLE{$final})) {
221
+ print join(" ",@BUFFER)." " if scalar(@BUFFER); @BUFFER = (); # clear buffer
222
+ print $factored_word;
223
+ next;
224
+ }
225
+
226
+ my ($best_split,$best_score) = ("",0);
227
+
228
+ my %ITERATOR;
229
+ for(my $i=0;$i<=$final;$i++) { $ITERATOR{$i}=0; }
230
+ my $done = 0;
231
+ while(1) {
232
+ # read off word
233
+ my ($pos,$decomp,$score,$num,@INDEX) = ($final,"",1,0);
234
+ while($pos>0) {
235
+ last unless scalar @{$REACHABLE{$pos}} > $ITERATOR{$pos}; # dead end?
236
+ my ($nextpos,$subword,$count)
237
+ = split(/ /,$REACHABLE{$pos}[ $ITERATOR{$pos} ]);
238
+ $decomp = $subword." ".$decomp;
239
+ $score *= $count;
240
+ $num++;
241
+ push @INDEX,$pos;
242
+ # print STDERR "($nextpos-$pos,$decomp,$score,$num)\n";
243
+ $pos = $nextpos-1;
244
+ }
245
+
246
+ chop($decomp);
247
+ print STDERR "\tsplit: $decomp ($score ** 1/$num) = ".($score ** (1/$num))."\n" if $VERBOSE;
248
+ $score **= 1/$num;
249
+ if ($score>$best_score) {
250
+ $best_score = $score;
251
+ $best_split = $decomp;
252
+ }
253
+
254
+ # increase iterator
255
+ my $increase = -1;
256
+ while($increase<$final) {
257
+ $increase = pop @INDEX;
258
+ $ITERATOR{$increase}++;
259
+ last if scalar @{$REACHABLE{$increase}} > $ITERATOR{$increase};
260
+ }
261
+ last unless scalar @{$REACHABLE{$final}} > $ITERATOR{$final};
262
+ for(my $i=0;$i<$increase;$i++) { $ITERATOR{$i}=0; }
263
+ }
264
+ if ($best_split !~ / /) {
265
+ print join(" ",@BUFFER)." " if scalar(@BUFFER); @BUFFER = (); # clear buffer
266
+ print $factored_word; # do not change case for unsplit words
267
+ next;
268
+ }
269
+ if (!$SYNTAX) {
270
+ print $best_split;
271
+ }
272
+ else {
273
+ $BUFFER[$#BUFFER] =~ s/label=\"/label=\"SPLIT-/ if $MARK_SPLIT;
274
+ $BUFFER[$#BUFFER] =~ /label=\"([^\"]+)\"/ || die("ERROR: $BUFFER[$#BUFFER]\n");
275
+ my $pos = $1;
276
+ print join(" ",@BUFFER)." " if scalar(@BUFFER); @BUFFER = (); # clear buffer
277
+
278
+ my @SPLIT = split(/ /,$best_split);
279
+ my @OUT = ();
280
+ if ($BINARIZE) {
281
+ for(my $w=0;$w<scalar(@SPLIT)-2;$w++) {
282
+ push @OUT,"<tree label=\"\@$pos\">";
283
+ }
284
+ }
285
+ for(my $w=0;$w<scalar(@SPLIT);$w++) {
286
+ if ($BINARIZE && $w>=2) { push @OUT, "</tree>"; }
287
+ push @OUT,"<tree label=\"".$LABEL{lc($SPLIT[$w])}."\"> $SPLIT[$w] </tree>";
288
+ }
289
+ print join(" ",@OUT);
290
+ }
291
+ }
292
+ print " ".join(" ",@BUFFER) if scalar(@BUFFER); @BUFFER = (); # clear buffer
293
+ print "\n";
294
+ }
295
+ }
mosesdecoder/scripts/generic/extract-factors.pl ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env perl
2
+ #
3
+ # This file is part of moses. Its use is licensed under the GNU Lesser General
4
+ # Public License version 2.1 or, at your option, any later version.
5
+
6
+ # $Id$
7
+ #extract-factors.pl: extract only the desired factors from a factored corpus
8
+ #usage: extract-factors corpusfile factor-index factor-index ... > outfile
9
+ #factor indices start at 0
10
+ #factor indices too large ought to be ignored
11
+
12
+ use warnings;
13
+ use strict;
14
+
15
+ my ($filename, @factors) = @ARGV;
16
+ my %indices = map {$_ => 1} @factors;
17
+
18
+ open(INFILE, "<$filename") or die "couldn't open '$filename' for read: $!\n";
19
+ while(my $line = <INFILE>)
20
+ {
21
+ chop $line;
22
+ print join(' ', map {my $i = 0; join('|', grep($indices{$i++}, split(/\|/, $_)))} split(/\s+/, $line)) . "\n";
23
+ }
24
+ close(INFILE);
mosesdecoder/scripts/generic/extract-parallel.perl ADDED
@@ -0,0 +1,385 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env perl
2
+ #
3
+ # This file is part of moses. Its use is licensed under the GNU Lesser General
4
+ # Public License version 2.1 or, at your option, any later version.
5
+
6
+ # example
7
+ # ./extract-parallel.perl 8 ./coreutils-8.9/src/split "./coreutils-8.9/src/sort --batch-size=253" ./extract ./corpus.5.en ./corpus.5.ar ./align.ar-en.grow-diag-final-and ./extracted 7 --NoFileLimit orientation --GZOutput
8
+
9
+ use warnings;
10
+ use strict;
11
+ use File::Basename;
12
+
13
+ sub RunFork($);
14
+ sub systemCheck($);
15
+ sub NumStr($);
16
+ sub DigitStr($);
17
+ sub CharStr($);
18
+ sub GetSplitVersion($);
19
+
20
+ my $alph = "abcdefghijklmnopqrstuvwxyz";
21
+ my @alph = (split(//,$alph));
22
+
23
+ print "Started ".localtime() ."\n";
24
+
25
+ my $numParallel= $ARGV[0];
26
+ $numParallel = 1 if $numParallel < 1;
27
+
28
+ my $splitCmd= $ARGV[1];
29
+ my $sortCmd= $ARGV[2];
30
+ my $extractCmd= $ARGV[3];
31
+
32
+ my $target = $ARGV[4]; # 1st arg of extract argument
33
+ my $source = $ARGV[5]; # 2nd arg of extract argument
34
+ my $align = $ARGV[6]; # 3rd arg of extract argument
35
+ my $extract = $ARGV[7]; # 4th arg of extract argument
36
+
37
+ my $makeTTable = 1; # whether to build the ttable extract files
38
+ my $otherExtractArgs= "";
39
+ my $weights = "";
40
+ my $baselineExtract;
41
+ my $glueFile;
42
+ my $phraseOrientation = 0;
43
+ my $phraseOrientationPriorsFile;
44
+ my $splitCmdOption = "";
45
+
46
+ my $GZIP_EXEC;
47
+ if(`which pigz 2> /dev/null`) {
48
+ $GZIP_EXEC = 'pigz';
49
+ }
50
+ else {
51
+ $GZIP_EXEC = 'gzip';
52
+ }
53
+ print STDERR "using $GZIP_EXEC \n";
54
+
55
+ my $isBSDSplit = GetSplitVersion($splitCmd);
56
+ print STDERR "isBSDSplit=$isBSDSplit \n";
57
+
58
+ if ($isBSDSplit == 0) {
59
+ $splitCmdOption .= "-d";
60
+ }
61
+
62
+ my $gzOut = 0;
63
+
64
+ for (my $i = 8; $i < $#ARGV + 1; ++$i)
65
+ {
66
+ $makeTTable = 0 if $ARGV[$i] eq "--NoTTable";
67
+ if ($ARGV[$i] eq '--BaselineExtract') {
68
+ $baselineExtract = $ARGV[++$i];
69
+ next;
70
+ }
71
+ if ($ARGV[$i] eq '--InstanceWeights') {
72
+ $weights = $ARGV[++$i];
73
+ next;
74
+ }
75
+ if ($ARGV[$i] eq '--GlueGrammar') {
76
+ $glueFile = $ARGV[++$i];
77
+ next;
78
+ }
79
+ $phraseOrientation = 1 if $ARGV[$i] eq "--PhraseOrientation";
80
+ if ($ARGV[$i] eq '--PhraseOrientationPriors') {
81
+ $phraseOrientationPriorsFile = $ARGV[++$i];
82
+ next;
83
+ }
84
+ if ($ARGV[$i] eq '--GZOutput') {
85
+ $gzOut = 1;
86
+ }
87
+
88
+ $otherExtractArgs .= $ARGV[$i] ." ";
89
+ }
90
+
91
+ die("Need to specify --GZOutput for parallel extract") if ($gzOut == 0);
92
+
93
+ my $cmd;
94
+ my $TMPDIR=dirname($extract) ."/tmp.$$";
95
+ $cmd = "mkdir -p $TMPDIR; ls -l $TMPDIR";
96
+ print STDERR "Executing: $cmd \n";
97
+ `$cmd`;
98
+
99
+ my $totalLines = int(`cat $align | wc -l`);
100
+ my $linesPerSplit = int($totalLines / $numParallel) + 1;
101
+
102
+ print "total=$totalLines line-per-split=$linesPerSplit \n";
103
+
104
+ my @children;
105
+ my $pid;
106
+
107
+ if ($numParallel > 1)
108
+ {
109
+ $cmd = "$splitCmd $splitCmdOption -l $linesPerSplit -a 7 $target $TMPDIR/target.";
110
+ $pid = RunFork($cmd);
111
+ push(@children, $pid);
112
+
113
+ $cmd = "$splitCmd $splitCmdOption -l $linesPerSplit -a 7 $source $TMPDIR/source.";
114
+ $pid = RunFork($cmd);
115
+ push(@children, $pid);
116
+
117
+ $cmd = "$splitCmd $splitCmdOption -l $linesPerSplit -a 7 $align $TMPDIR/align.";
118
+ $pid = RunFork($cmd);
119
+ push(@children, $pid);
120
+
121
+ if ($weights) {
122
+ $cmd = "$splitCmd $splitCmdOption -l $linesPerSplit -a 7 $weights $TMPDIR/weights.";
123
+ $pid = RunFork($cmd);
124
+ push(@children, $pid);
125
+ }
126
+
127
+ # wait for everything is finished
128
+ foreach (@children) {
129
+ waitpid($_, 0);
130
+ }
131
+
132
+ }
133
+ else
134
+ {
135
+ my $numStr = NumStr(0);
136
+
137
+ $cmd = "ln -s $target $TMPDIR/target.$numStr";
138
+ `$cmd`;
139
+
140
+ $cmd = "ln -s $source $TMPDIR/source.$numStr";
141
+ `$cmd`;
142
+
143
+ $cmd = "ln -s $align $TMPDIR/align.$numStr";
144
+ `$cmd`;
145
+
146
+ if ($weights) {
147
+ $cmd = "ln -s $weights $TMPDIR/weights.$numStr";
148
+ `$cmd`;
149
+ }
150
+ }
151
+
152
+ # run extract
153
+ @children = ();
154
+ for (my $i = 0; $i < $numParallel; ++$i)
155
+ {
156
+ my $pid = fork();
157
+
158
+ if ($pid == 0)
159
+ { # child
160
+ my $numStr = NumStr($i);
161
+ my $weightsCmd = "";
162
+ if ($weights) {
163
+ $weightsCmd = "--InstanceWeights $TMPDIR/weights.$numStr";
164
+ }
165
+
166
+ my $glueArg = "";
167
+ if (defined($glueFile)) {
168
+ $glueArg = "--GlueGrammar $TMPDIR/glue.$numStr";
169
+ }
170
+ #print STDERR "glueArg=$glueArg \n";
171
+
172
+ my $cmd = "$extractCmd $TMPDIR/target.$numStr $TMPDIR/source.$numStr $TMPDIR/align.$numStr $TMPDIR/extract.$numStr $glueArg $otherExtractArgs $weightsCmd --SentenceOffset ".($i*$linesPerSplit)." 2>> /dev/stderr \n";
173
+ `$cmd`;
174
+
175
+ exit();
176
+ }
177
+ else
178
+ { # parent
179
+ push(@children, $pid);
180
+ }
181
+ }
182
+
183
+ # wait for everything is finished
184
+ foreach (@children) {
185
+ waitpid($_, 0);
186
+ }
187
+
188
+ # merge
189
+ my $catCmd = "gunzip -c ";
190
+ my $catInvCmd = $catCmd;
191
+ my $catOCmd = $catCmd;
192
+ my $catContextCmd = $catCmd;
193
+ my $catContextInvCmd = $catCmd;
194
+
195
+ for (my $i = 0; $i < $numParallel; ++$i)
196
+ {
197
+ my $numStr = NumStr($i);
198
+ $catCmd .= "$TMPDIR/extract.$numStr.gz ";
199
+ $catInvCmd .= "$TMPDIR/extract.$numStr.inv.gz ";
200
+ $catOCmd .= "$TMPDIR/extract.$numStr.o.gz ";
201
+ $catContextCmd .= "$TMPDIR/extract.$numStr.context ";
202
+ $catContextInvCmd .= "$TMPDIR/extract.$numStr.context.inv ";
203
+ }
204
+ if (defined($baselineExtract)) {
205
+ my $sorted = -e "$baselineExtract.sorted.gz" ? ".sorted" : "";
206
+ $catCmd .= "$baselineExtract$sorted.gz ";
207
+ $catInvCmd .= "$baselineExtract.inv$sorted.gz ";
208
+ $catOCmd .= "$baselineExtract.o$sorted.gz ";
209
+ }
210
+
211
+ $catCmd .= " | LC_ALL=C $sortCmd -T $TMPDIR 2>> /dev/stderr | $GZIP_EXEC -c > $extract.sorted.gz 2>> /dev/stderr \n";
212
+ $catInvCmd .= " | LC_ALL=C $sortCmd -T $TMPDIR 2>> /dev/stderr | $GZIP_EXEC -c > $extract.inv.sorted.gz 2>> /dev/stderr \n";
213
+ $catOCmd .= " | LC_ALL=C $sortCmd -T $TMPDIR 2>> /dev/stderr | $GZIP_EXEC -c > $extract.o.sorted.gz 2>> /dev/stderr \n";
214
+ $catContextCmd .= " | LC_ALL=C $sortCmd -T $TMPDIR 2>> /dev/stderr | uniq | $GZIP_EXEC -c > $extract.context.sorted.gz 2>> /dev/stderr \n";
215
+ $catContextInvCmd .= " | LC_ALL=C $sortCmd -T $TMPDIR 2>> /dev/stderr | uniq | $GZIP_EXEC -c > $extract.context.inv.sorted.gz 2>> /dev/stderr \n";
216
+
217
+
218
+ @children = ();
219
+ if ($makeTTable)
220
+ {
221
+ print STDERR "merging extract / extract.inv\n";
222
+ $pid = RunFork($catCmd);
223
+ push(@children, $pid);
224
+
225
+ $pid = RunFork($catInvCmd);
226
+ push(@children, $pid);
227
+ }
228
+ else {
229
+ print STDERR "skipping extract, doing only extract.o\n";
230
+ }
231
+
232
+ if ($otherExtractArgs =~ /--FlexibilityScore/) {
233
+ $pid = RunFork($catContextCmd);
234
+ push(@children, $pid);
235
+
236
+ $pid = RunFork($catContextInvCmd);
237
+ push(@children, $pid);
238
+ }
239
+
240
+ my $numStr = NumStr(0);
241
+ if (-e "$TMPDIR/extract.$numStr.o.gz")
242
+ {
243
+ $pid = RunFork($catOCmd);
244
+ push(@children, $pid);
245
+ }
246
+
247
+ # wait for all sorting to finish
248
+ foreach (@children) {
249
+ waitpid($_, 0);
250
+ }
251
+
252
+ # merge glue rules
253
+ if (defined($glueFile)) {
254
+ my $cmd = "cat $TMPDIR/glue.* | LC_ALL=C sort | uniq > $glueFile";
255
+ print STDERR "Merging glue rules: $cmd \n";
256
+ print STDERR `$cmd`;
257
+ }
258
+
259
+ # merge phrase orientation priors (GHKM extraction)
260
+ if ($phraseOrientation && defined($phraseOrientationPriorsFile)) {
261
+ print STDERR "Merging phrase orientation priors\n";
262
+
263
+ my @orientationPriorsCountFiles = glob("$TMPDIR/*.phraseOrientationPriors");
264
+ my %priorCounts;
265
+
266
+ foreach my $filenamePhraseOrientationPriors (@orientationPriorsCountFiles) {
267
+ if (-f $filenamePhraseOrientationPriors) {
268
+ open my $infilePhraseOrientationPriors, '<', $filenamePhraseOrientationPriors or die "cannot open $filenamePhraseOrientationPriors: $!";
269
+ while (my $line = <$infilePhraseOrientationPriors>) {
270
+ print $line;
271
+ my ($key, $value) = split / /, $line;
272
+ $priorCounts{$key} += $value;
273
+ }
274
+ close $infilePhraseOrientationPriors;
275
+ }
276
+ }
277
+
278
+ open my $outPhraseOrientationPriors, '>', $phraseOrientationPriorsFile or die "cannot open $phraseOrientationPriorsFile: $!";
279
+ foreach my $key (sort keys %priorCounts) {
280
+ print $outPhraseOrientationPriors $key." ".$priorCounts{$key}."\n";
281
+ }
282
+ close($outPhraseOrientationPriors);
283
+ }
284
+
285
+ # delete temporary files
286
+ $cmd = "rm -rf $TMPDIR \n";
287
+ systemCheck($cmd);
288
+
289
+ print STDERR "Finished ".localtime() ."\n";
290
+
291
+ # -----------------------------------------
292
+ # -----------------------------------------
293
+
294
+ sub RunFork($)
295
+ {
296
+ my $cmd = shift;
297
+
298
+ my $pid = fork();
299
+
300
+ if ($pid == 0)
301
+ { # child
302
+ print STDERR $cmd;
303
+ systemCheck($cmd);
304
+ exit();
305
+ }
306
+ return $pid;
307
+ }
308
+
309
+ sub systemCheck($)
310
+ {
311
+ my $cmd = shift;
312
+ my $retVal = system($cmd);
313
+ if ($retVal != 0)
314
+ {
315
+ exit(1);
316
+ }
317
+ }
318
+
319
+ sub DigitStr($)
320
+ {
321
+ my $i = shift;
322
+ my $numStr;
323
+ if ($i < 10) {
324
+ $numStr = "000000$i";
325
+ }
326
+ elsif ($i < 100) {
327
+ $numStr = "00000$i";
328
+ }
329
+ elsif ($i < 1000) {
330
+ $numStr = "0000$i";
331
+ }
332
+ elsif ($i < 10000) {
333
+ $numStr = "000$i";
334
+ }
335
+ elsif ($i < 100000) {
336
+ $numStr = "00$i";
337
+ }
338
+ elsif ($i < 1000000) {
339
+ $numStr = "0$i";
340
+ }
341
+ else {
342
+ $numStr = $i;
343
+ }
344
+ return $numStr;
345
+ }
346
+
347
+ sub CharStr($)
348
+ {
349
+ my $i = shift;
350
+ my $charStr;
351
+ my @bit=();
352
+
353
+ while ($i>0){
354
+ push @bit, $i%26;
355
+ $i=int($i/26);
356
+ }
357
+ my $offset=scalar(@bit);
358
+ my $h;
359
+ for ($h=6;$h>=$offset;--$h) { $charStr.="a"; }
360
+ for ($h=$offset-1;$h>=0;--$h) { $charStr.="$alph[$bit[$h]]"; }
361
+ return $charStr;
362
+ }
363
+
364
+ sub NumStr($)
365
+ {
366
+ my $i = shift;
367
+ if ($isBSDSplit){
368
+ return CharStr($i);
369
+ }else{
370
+ return DigitStr($i);
371
+ }
372
+ }
373
+
374
+ sub GetSplitVersion($)
375
+ {
376
+ my $splitCmd = shift;
377
+ my $retVal = system("$splitCmd --help > /dev/null");
378
+ if ($retVal != 0) {
379
+ return 1;
380
+ }
381
+ else {
382
+ return 0;
383
+ }
384
+ }
385
+
mosesdecoder/scripts/generic/fsa-sample.fsa ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ 0 1 Prague 0.5
2
+ 1 2 Stock 1
3
+ 2 6 Market 1
4
+ 0 3 New 0.5
5
+ 3 4 York 1
6
+ 4 5 Stock 1
7
+ 5 6 Exchange 1
8
+ 6 7 falls 0.5
9
+ 6 7 drops 0.5
10
+ 7 8 . 1
mosesdecoder/scripts/generic/fsa2fsal.pl ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env perl
2
+ # A very simple script that converts fsa format (openfst lattices) to the same
3
+ # thing represented one sentence per line. It uses '|||' to delimit columns and
4
+ # ' ' to delimit nodes (i.e. original lines).
5
+ # Some rudimentary sanity checks are done on the fly.
6
+ # Ondrej Bojar, bojar@ufal.mff.cuni.cz
7
+ #
8
+ # This file is part of moses. Its use is licensed under the GNU Lesser General
9
+ # Public License version 2.1 or, at your option, any later version.
10
+
11
+ use warnings;
12
+ use strict;
13
+
14
+ my $errs = 0;
15
+ sub err {
16
+ my $nr = shift;
17
+ my $msg = shift;
18
+ print STDERR "$nr:$msg\n";
19
+ $errs++;
20
+ }
21
+
22
+ my $onr = 0;
23
+ my @lines = ();
24
+ sub flush {
25
+ return if 0 == scalar @lines;
26
+ print join(" ", @lines);
27
+ print "\n";
28
+ $onr++;
29
+ @lines = ();
30
+ }
31
+
32
+ my $nr = 0;
33
+ my $numscores = undef;
34
+ while (<>) {
35
+ chomp;
36
+ if ($_ eq "") {
37
+ flush();
38
+ next;
39
+ }
40
+ my ($a, $b, $label, $scores, $rest) = split /\s+/, $_, 5;
41
+ err($nr, "The delimiter '|||' can't appear in the input!") if /\|\|\|/;
42
+ err($nr, "Node id not numeric: $a") if $a !~ /^\d+$/;
43
+ err($nr, "Node id not numeric: $b") if $b !~ /^\d+$/;
44
+ err($nr, "Unexpected tail: '$rest'") if defined $rest && $rest !~ /^\s*$/;
45
+ my $thisnumscores = ($scores =~ tr/,/,/);
46
+ $numscores = $thisnumscores if !defined $numscores;
47
+ err($nr, "Incompatible number of arc scores, previous lines had ".($numscores+1).", now ".($thisnumscores+1))
48
+ if $numscores != $thisnumscores;
49
+ push @lines, join("|||", ($a,$b,$label,$scores));
50
+ }
51
+ flush();
52
+
53
+ exit 1 if $errs;
mosesdecoder/scripts/generic/fsa2plf.pl ADDED
@@ -0,0 +1,182 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env perl
2
+ # Converts AT&T FSA format to 'python lattice format'.
3
+ # Note that the input FSA needs to be epsilon-free and topologically sorted.
4
+ # This script checks for topological sortedness.
5
+ # The start node has to have the index 0.
6
+ # All path ends are assumed to be final nodes, not just the explicitly stated
7
+ # final nodes.
8
+ # Note that the output format may not contain any spaces.
9
+ # Ondrej Bojar, bojar@ufal.mff.cuni.cz
10
+ #
11
+ # This file is part of moses. Its use is licensed under the GNU Lesser General
12
+ # Public License version 2.1 or, at your option, any later version.
13
+
14
+ use warnings;
15
+ use strict;
16
+ use Getopt::Long;
17
+
18
+ binmode(STDIN, ":utf8");
19
+ binmode(STDOUT, ":utf8");
20
+ binmode(STDERR, ":utf8");
21
+
22
+ my $filelist;
23
+ my $ignore_final_state_cost = 0;
24
+ my $mangle_weights = undef;
25
+ GetOptions(
26
+ "ignore-final-state-cost" => \$ignore_final_state_cost,
27
+ # sometimes, final states have a cost (e.g. "45 0.05\n")
28
+ # instead of dying there, ignore the problem
29
+ "filelist|fl=s" => \$filelist,
30
+ "mangle-weights=s" => \$mangle_weights,
31
+ ) or exit 1;
32
+
33
+ my @infiles;
34
+ if (defined $filelist) {
35
+ my $fh = my_open($filelist);
36
+ while (<$fh>) {
37
+ chomp;
38
+ push @infiles, $_;
39
+ }
40
+ close $fh;
41
+ }
42
+ push @infiles, @ARGV;
43
+ @ARGV = ();
44
+ if (0 == scalar(@infiles)) {
45
+ print STDERR "Reading input from stdin\n";
46
+ push @infiles, "-";
47
+ }
48
+
49
+ my $err = 0;
50
+ foreach my $inf (@infiles) {
51
+ my $nr = 0;
52
+ NEXTLATTICE:
53
+ my %usedids = (); # collect all used ids for densification
54
+ my %usedtgtids = (); # collect all used ids for densification
55
+ my @outnodes = ();
56
+ my $fh = my_open($inf);
57
+ my %is_final; # remember which nodes were final
58
+ while (<$fh>) {
59
+ chomp;
60
+ $nr++;
61
+ last if $_ eq ""; # assume a blank line delimits lattices
62
+ my ($src, $tgt, $label, $weight) = split /\s+/;
63
+ die "$inf:$nr:Bad src node index: $src" if $src !~ /^[0-9]+$/;
64
+
65
+ if (!defined $label && !defined $weight) {
66
+ # explicit final node, warn at the end if there are any intermed. final
67
+ # nodes
68
+ $is_final{$src};
69
+ # final nodes can have a cost
70
+ die "$inf:$nr:Final state $src has cost $tgt. Unsupported, use --ignore-final-state-cost"
71
+ if defined $tgt && !$ignore_final_state_cost;
72
+
73
+ next;
74
+ }
75
+ $weight = 0 if !defined $weight;
76
+
77
+ $usedids{$src} = 1;
78
+ $usedtgtids{$tgt} = 1;
79
+
80
+ # process the weight
81
+ # when reading RWTH FSA output, the weights are negated natural logarithms
82
+ # we need to negate them back
83
+ if (defined $mangle_weights) {
84
+ if ($mangle_weights eq "expneg") {
85
+ $weight = join(",", map {exp(-$_)} split /,/, $weight);
86
+ } else {
87
+ die "Bad weights mangling: $mangle_weights";
88
+ }
89
+ }
90
+ # remember the node
91
+ my $targetnode = $tgt-$src;
92
+ die "$inf:$nr:Not topologically sorted, got arc from $src to $tgt"
93
+ if $targetnode <= 0;
94
+ push @{$outnodes[$src]}, [ $label, $weight, $tgt ];
95
+ }
96
+ if (eof($fh)) {
97
+ close $fh;
98
+ $fh = undef;
99
+ }
100
+
101
+ # Assign our dense IDs: source node ids are assigned first
102
+ my %denseids = (); # maps node ids from the file to dense ids
103
+ my $nextid = 0;
104
+ foreach my $id (sort {$a<=>$b} keys %usedids) {
105
+ $denseids{$id} = $nextid;
106
+ $nextid++;
107
+ }
108
+ # All unseen target nodes then get the same next id, the final node id
109
+ foreach my $id (keys %usedtgtids) {
110
+ next if defined $denseids{$id};
111
+ $denseids{$id} = $nextid;
112
+ }
113
+
114
+ foreach my $f (keys %is_final) {
115
+ if (defined $outnodes[$f]) {
116
+ print STDERR "$inf:Node $f is final but it has outgoing edges!\n";
117
+ $err = 1;
118
+ }
119
+ }
120
+ # # Verbose: print original to dense IDs mapping
121
+ # foreach my $src (sort {$a<=>$b} keys %denseids) {
122
+ # print STDERR "$src ...> $denseids{$src}\n";
123
+ # }
124
+
125
+ print "(";
126
+ for(my $origsrc = 0; $origsrc < @outnodes; $origsrc++) {
127
+ my $src = $denseids{$origsrc};
128
+ next if !defined $src; # this original node ID is not used at all
129
+ next if $src == $nextid; # this is the ultimate merged final node
130
+ my $outnode = $outnodes[$origsrc];
131
+ print "(";
132
+ foreach my $arc (@$outnode) {
133
+ my $origtgt = $arc->[2];
134
+ my $tgt = $denseids{$origtgt};
135
+ if (!defined $tgt) {
136
+ # this was a final node only
137
+ $tgt = $denseids{$origtgt} = $nextid;
138
+ $nextid++;
139
+ }
140
+ my $step_to_target = $tgt - $src;
141
+ die "$inf:Bug, I damaged top-sortedness (orig $origsrc .. $origtgt; curr $src .. $tgt)." if $step_to_target <= 0;
142
+ print "('".apo($arc->[0])."',$arc->[1],$step_to_target),";
143
+ }
144
+ print "),";
145
+ }
146
+ print ")\n";
147
+ goto NEXTLATTICE if defined $fh && ! eof($fh);
148
+ }
149
+ die "There were errors." if $err;
150
+
151
+ sub apo {
152
+ my $s = shift;
153
+ # protects apostrophy and backslash
154
+ $s =~ s/\\/\\\\/g;
155
+ $s =~ s/(['])/\\$1/g;
156
+ return $s;
157
+ }
158
+
159
+ sub my_open {
160
+ my $f = shift;
161
+ if ($f eq "-") {
162
+ binmode(STDIN, ":utf8");
163
+ return *STDIN;
164
+ }
165
+
166
+ die "Not found: $f" if ! -e $f;
167
+
168
+ my $opn;
169
+ my $hdl;
170
+ my $ft = `file '$f'`;
171
+ # file might not recognize some files!
172
+ if ($f =~ /\.gz$/ || $ft =~ /gzip compressed data/) {
173
+ $opn = "zcat '$f' |";
174
+ } elsif ($f =~ /\.bz2$/ || $ft =~ /bzip2 compressed data/) {
175
+ $opn = "bzcat '$f' |";
176
+ } else {
177
+ $opn = "$f";
178
+ }
179
+ open $hdl, $opn or die "Can't open '$opn': $!";
180
+ binmode $hdl, ":utf8";
181
+ return $hdl;
182
+ }
mosesdecoder/scripts/generic/generic-parallel.perl ADDED
@@ -0,0 +1,119 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env perl
2
+ #
3
+ # This file is part of moses. Its use is licensed under the GNU Lesser General
4
+ # Public License version 2.1 or, at your option, any later version.
5
+
6
+ use warnings;
7
+ use strict;
8
+ use utf8;
9
+
10
+ binmode STDIN, ":utf8";
11
+ binmode STDOUT, ":utf8";
12
+ binmode STDERR, ":utf8";
13
+
14
+ sub NumStr($);
15
+
16
+ my $NUM_SPLIT_LINES = $ARGV[0];
17
+
18
+ my $TMPDIR = $ARGV[1];
19
+ $TMPDIR = "$TMPDIR/tmp.$$";
20
+ mkdir $TMPDIR;
21
+ print STDERR "TMPDIR=$TMPDIR \n";
22
+
23
+ my $cmd = "";
24
+ for (my $i = 2; $i < scalar(@ARGV); ++$i)
25
+ {
26
+ $cmd .= $ARGV[$i] ." ";
27
+ }
28
+
29
+ # split input file
30
+ open (INPUT_ALL, "> $TMPDIR/input.all");
31
+ binmode INPUT_ALL, ":utf8";
32
+ while (my $line = <STDIN>)
33
+ {
34
+ chomp($line);
35
+ print INPUT_ALL $line."\n";
36
+ }
37
+ close(INPUT_ALL);
38
+
39
+ my $cmd2 = "split -l $NUM_SPLIT_LINES -a 5 -d $TMPDIR/input.all $TMPDIR/x";
40
+ `$cmd2`;
41
+
42
+ # create exec file
43
+ open (EXEC, "> $TMPDIR/exec");
44
+ binmode EXEC, ":utf8";
45
+
46
+ # execute in parallel
47
+ print STDERR "executing\n";
48
+
49
+ my $i = 0;
50
+ my $filePath = "$TMPDIR/x" .NumStr($i);
51
+ while (-f $filePath)
52
+ {
53
+ print EXEC "$cmd < $filePath > $filePath.out\n";
54
+
55
+ ++$i;
56
+ $filePath = "$TMPDIR/x" .NumStr($i);
57
+ }
58
+ close (EXEC);
59
+
60
+ $cmd2 = "parallel < $TMPDIR/exec";
61
+ `$cmd2`;
62
+
63
+ # concatenate
64
+ print STDERR "concatenating\n";
65
+
66
+ $i = 1;
67
+ my $firstPath = "$TMPDIR/x" .NumStr(0) .".out";
68
+ $filePath = "$TMPDIR/x" .NumStr($i) .".out";
69
+ while (-f $filePath)
70
+ {
71
+ $cmd = "cat $filePath >> $firstPath";
72
+ `$cmd`;
73
+
74
+ ++$i;
75
+ $filePath = "$TMPDIR/x" .NumStr($i) .".out";
76
+ }
77
+
78
+ # output
79
+ open (OUTPUT_ALL, "$firstPath");
80
+ binmode OUTPUT_ALL, ":utf8";
81
+ while (my $line = <OUTPUT_ALL>)
82
+ {
83
+ chomp($line);
84
+ print "$line\n";
85
+ }
86
+ close(OUTPUT_ALL);
87
+
88
+ $cmd = "rm -rf $TMPDIR/";
89
+ `$cmd`;
90
+
91
+ ###########################################
92
+ sub NumStr($)
93
+ {
94
+ my $i = shift;
95
+ my $numStr;
96
+ if ($i < 10) {
97
+ $numStr = "000000$i";
98
+ }
99
+ elsif ($i < 100) {
100
+ $numStr = "00000$i";
101
+ }
102
+ elsif ($i < 1000) {
103
+ $numStr = "0000$i";
104
+ }
105
+ elsif ($i < 10000) {
106
+ $numStr = "000$i";
107
+ }
108
+ elsif ($i < 100000) {
109
+ $numStr = "00$i";
110
+ }
111
+ elsif ($i < 1000000) {
112
+ $numStr = "0$i";
113
+ }
114
+ else {
115
+ $numStr = $i;
116
+ }
117
+ return $numStr;
118
+ }
119
+
mosesdecoder/scripts/generic/giza-parallel.perl ADDED
@@ -0,0 +1,134 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env perl
2
+ #
3
+ # This file is part of moses. Its use is licensed under the GNU Lesser General
4
+ # Public License version 2.1 or, at your option, any later version.
5
+
6
+ # example
7
+ # ~/giza-parallel.perl 10 split ~/workspace/sourceforge/trunk/scripts/training/train-model.perl ar en train align
8
+
9
+ use warnings;
10
+ use strict;
11
+ use File::Basename;
12
+
13
+ sub NumStr($);
14
+
15
+ print "Started ".localtime() ."\n";
16
+
17
+ my $numParallel = $ARGV[0];
18
+ my $splitCmd = $ARGV[1];
19
+ my $trainCmd = $ARGV[2];
20
+ my $inputExt = $ARGV[3];
21
+ my $outputExt = $ARGV[4];
22
+ my $corpus = $ARGV[5];
23
+ my $align = $ARGV[6];
24
+
25
+ my $TMPDIR=dirname($align) ."/tmp.$$";
26
+ mkdir $TMPDIR;
27
+
28
+ my $scriptDir=dirname($trainCmd) ."/..";
29
+
30
+ # split corpus file
31
+ my $totalLines = int(`wc -l $corpus.$inputExt`);
32
+ my $linesPerSplit = int($totalLines / $numParallel) + 1;
33
+
34
+ my $cmd = "$splitCmd -d -l $linesPerSplit -a 5 $corpus.$inputExt $TMPDIR/source.";
35
+ `$cmd`;
36
+
37
+ $cmd = "$splitCmd -d -l $linesPerSplit -a 5 $corpus.$outputExt $TMPDIR/target.";
38
+ `$cmd`;
39
+
40
+ for (my $i = 0; $i < $numParallel; ++$i)
41
+ {
42
+ my $numStr = NumStr($i);
43
+ rename("$TMPDIR/source.$numStr", "$TMPDIR/$numStr.source");
44
+ rename("$TMPDIR/target.$numStr", "$TMPDIR/$numStr.target");
45
+ }
46
+
47
+ #fork & run giza & friends
48
+ my $isParent = 1;
49
+ my @childs;
50
+ for (my $i = 0; $i < $numParallel; ++$i)
51
+ {
52
+ my $pid = fork();
53
+
54
+ if ($pid == 0)
55
+ { # child
56
+ $isParent = 0;
57
+
58
+ my $numStr = NumStr($i);
59
+ my $cmd = "$trainCmd -dont-zip -last-step 1 -scripts-root-dir $scriptDir -f source -e target -alignment grow-diag-final-and -parts 3 -reordering msd-bidirectional-fe -corpus $TMPDIR/$numStr -corpus-dir $TMPDIR/prepared.$numStr \n";
60
+ print $cmd;
61
+ `$cmd`;
62
+
63
+ $cmd = "$trainCmd -dont-zip -first-step 2 -last-step 2 -scripts-root-dir $scriptDir -f source -e target -alignment grow-diag-final-and -parts 3 -reordering msd-bidirectional-fe -corpus-dir $TMPDIR/prepared.$numStr -giza-e2f $TMPDIR/giza.$numStr -direction 2 \n";
64
+ print $cmd;
65
+ `$cmd`;
66
+
67
+ $cmd = "$trainCmd -dont-zip -first-step 2 -last-step 2 -scripts-root-dir $scriptDir -f source -e target -alignment grow-diag-final-and -parts 3 -reordering msd-bidirectional-fe -corpus-dir $TMPDIR/prepared.$numStr -giza-f2e $TMPDIR/giza-inverse.$numStr -direction 1 \n";
68
+ print $cmd;
69
+ `$cmd`;
70
+
71
+ $cmd = "$trainCmd -dont-zip -first-step 3 -last-step 3 -scripts-root-dir $scriptDir -f source -e target -alignment grow-diag-final-and -parts 3 -reordering msd-bidirectional-fe -giza-e2f $TMPDIR/giza.$numStr -giza-f2e $TMPDIR/giza-inverse.$numStr -alignment-file $TMPDIR/aligned.$numStr -alignment grow-diag-final-and \n";
72
+ print $cmd;
73
+ `$cmd`;
74
+
75
+ exit();
76
+ }
77
+ else
78
+ { # parent
79
+ push(@childs, $pid);
80
+ }
81
+
82
+ }
83
+
84
+ # wait for everything is finished
85
+ if ($isParent)
86
+ {
87
+ foreach (@childs) {
88
+ waitpid($_, 0);
89
+ }
90
+ }
91
+ else
92
+ {
93
+ die "shouldn't be here";
94
+ }
95
+
96
+ # cat all aligned files together. Voila
97
+ my $cmd = "cat ";
98
+ for (my $i = 0; $i < $numParallel; ++$i)
99
+ {
100
+ my $numStr = NumStr($i);
101
+ $cmd .= "$TMPDIR/aligned.$numStr.grow-diag-final-and ";
102
+ }
103
+ $cmd .= " > $align \n";
104
+ print $cmd;
105
+ `$cmd`;
106
+
107
+ sub NumStr($)
108
+ {
109
+ my $i = shift;
110
+ my $numStr;
111
+ if ($i < 10) {
112
+ $numStr = "000000$i";
113
+ }
114
+ elsif ($i < 100) {
115
+ $numStr = "00000$i";
116
+ }
117
+ elsif ($i < 1000) {
118
+ $numStr = "0000$i";
119
+ }
120
+ elsif ($i < 10000) {
121
+ $numStr = "000$i";
122
+ }
123
+ elsif ($i < 100000) {
124
+ $numStr = "00$i";
125
+ }
126
+ elsif ($i < 1000000) {
127
+ $numStr = "0$i";
128
+ }
129
+ else {
130
+ $numStr = $i;
131
+ }
132
+ return $numStr;
133
+ }
134
+
mosesdecoder/scripts/generic/lopar2pos.pl ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env perl
2
+ #
3
+ # This file is part of moses. Its use is licensed under the GNU Lesser General
4
+ # Public License version 2.1 or, at your option, any later version.
5
+
6
+ # $Id$
7
+ #lopar2pos: extract POSs from LOPAR output
8
+ #usage: lopar2pos.pl CORPUS.lopar > CORPUS.pos
9
+
10
+ use warnings;
11
+
12
+ my $infilename = shift @ARGV;
13
+ open(INFILE, "<$infilename") or die "couldn't open '$infilename' for read: $!\n";
14
+ while(my $line = <INFILE>)
15
+ {
16
+ my @words = split(/\s+/, $line);
17
+ my @tags = map {$_ =~ /^[^_]*_([A-Z]+)/; $1} @words;
18
+ print join(' ', @tags) . "\n";
19
+ }
20
+ close(INFILE);
mosesdecoder/scripts/generic/moses_sim_pe.py ADDED
@@ -0,0 +1,452 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python
2
+
3
+ # Written by Michael Denkowski
4
+ #
5
+ # This file is part of moses. Its use is licensed under the GNU Lesser General
6
+ # Public License version 2.1 or, at your option, any later version.
7
+
8
+ """Parallelize decoding with simulated post-editing via moses XML input.
9
+
10
+ (XML entities need to be escaped in tokenization). Memory mapped
11
+ dynamic phrase tables (Ulrich Germann,
12
+ www.statmt.org/moses/?n=Moses.AdvancedFeatures#ntoc40) and language models
13
+ (Kenneth Heafield,
14
+ http://www.statmt.org/moses/?n=FactoredTraining.BuildingLanguageModel#ntoc19)
15
+ facilitate memory efficient multi process decoding. Input is divided into
16
+ batches, each of which is decoded sequentially. Each batch pre-loads the
17
+ data from previous batches.
18
+
19
+ To use in tuning, run mert-moses.pl with --sim-pe=SYMAL where SYMAL is the
20
+ alignment from input to references. Specify the number of jobs with
21
+ --decoder-flags="-threads N".
22
+ """
23
+
24
+ import gzip
25
+ import itertools
26
+ import math
27
+ import os
28
+ import shutil
29
+ import subprocess
30
+ import sys
31
+ import tempfile
32
+ import threading
33
+
34
+ HELP = '''Moses with simulated post-editing
35
+
36
+ Usage:
37
+ {} moses-cmd -config moses.ini -input-file text.src -ref text.tgt \
38
+ -symal text.src-tgt.symal [options] [decoder flags]
39
+
40
+ Options:
41
+ -threads N: number of decoders to run in parallel \
42
+ (default read from moses.ini, 1 if not present)
43
+ -n-best-list nbest.out N [distinct]: location and size of N-best list
44
+ -show-weights: for mert-moses.pl, just call moses and exit
45
+ -tmp: location of temp directory (default /tmp)
46
+
47
+ Other options (decoder flags) are passed through to moses-cmd\n'''
48
+
49
+
50
+ class ProgramFailure(Exception):
51
+ """Known kind of failure, with a known presentation to the user.
52
+
53
+ Error message will be printed, and the program will return an error,
54
+ but no traceback will be shown to the user.
55
+ """
56
+
57
+
58
+ class Progress:
59
+ """Provides progress bar."""
60
+
61
+ def __init__(self):
62
+ self.i = 0
63
+ self.lock = threading.Lock()
64
+
65
+ def inc(self):
66
+ self.lock.acquire()
67
+ self.i += 1
68
+ if self.i % 100 == 0:
69
+ sys.stderr.write('.')
70
+ if self.i % 1000 == 0:
71
+ sys.stderr.write(' [{}]\n'.format(self.i))
72
+ sys.stderr.flush()
73
+ self.lock.release()
74
+
75
+ def done(self):
76
+ self.lock.acquire()
77
+ if self.i % 1000 != 0:
78
+ sys.stderr.write('\n')
79
+ self.lock.release()
80
+
81
+
82
+ def atomic_io(cmd, in_file, out_file, err_file, prog=None):
83
+ """Run with atomic (synchronous) I/O."""
84
+ with open(in_file, 'r') as inp, open(out_file, 'w') as out, open(err_file, 'w') as err:
85
+ p = subprocess.Popen(
86
+ cmd, stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=err)
87
+ while True:
88
+ line = inp.readline()
89
+ if not line:
90
+ break
91
+ p.stdin.write(line)
92
+ out.write(p.stdout.readline())
93
+ out.flush()
94
+ if prog:
95
+ prog.inc()
96
+ p.stdin.close()
97
+ p.wait()
98
+
99
+
100
+ def gzopen(f):
101
+ """Open plain or gzipped text."""
102
+ return gzip.open(f, 'rb') if f.endswith('.gz') else open(f, 'r')
103
+
104
+
105
+ def wc(f):
106
+ """Word count."""
107
+ i = 0
108
+ for line in gzopen(f):
109
+ i += 1
110
+ return i
111
+
112
+
113
+ def write_gzfile(lines, f):
114
+ """Write lines to gzipped file."""
115
+ out = gzip.open(f, 'wb')
116
+ for line in lines:
117
+ out.write('{}\n'.format(line))
118
+ out.close()
119
+
120
+
121
+ def main(argv):
122
+ # Defaults
123
+ moses_ini = None
124
+ moses_ini_lines = None
125
+ text_src = None
126
+ text_tgt = None
127
+ text_symal = None
128
+ text_len = None
129
+ threads_found = False
130
+ threads = 1
131
+ n_best_out = None
132
+ n_best_size = None
133
+ n_best_distinct = False
134
+ hg_ext = None
135
+ hg_dir = None
136
+ tmp_dir = '/tmp'
137
+ xml_found = False
138
+ xml_input = 'exclusive'
139
+ show_weights = False
140
+ mmsapt_dynamic = []
141
+ mmsapt_static = []
142
+ mmsapt_l1 = None
143
+ mmsapt_l2 = None
144
+
145
+ # Decoder command
146
+ cmd = argv[1:]
147
+
148
+ # Parse special options and remove from cmd
149
+ i = 1
150
+ while i < len(cmd):
151
+ if cmd[i] in ('-f', '-config'):
152
+ moses_ini = cmd[i + 1]
153
+ cmd = cmd[:i] + cmd[i + 2:]
154
+ elif cmd[i] in ('-i', '-input-file'):
155
+ text_src = cmd[i + 1]
156
+ cmd = cmd[:i] + cmd[i + 2:]
157
+ elif cmd[i] == '-ref':
158
+ text_tgt = cmd[i + 1]
159
+ cmd = cmd[:i] + cmd[i + 2:]
160
+ elif cmd[i] == '-symal':
161
+ text_symal = cmd[i + 1]
162
+ cmd = cmd[:i] + cmd[i + 2:]
163
+ elif cmd[i] in ('-th', '-threads'):
164
+ threads_found = True
165
+ threads = int(cmd[i + 1])
166
+ cmd = cmd[:i] + cmd[i + 2:]
167
+ elif cmd[i] == '-n-best-list':
168
+ n_best_out = cmd[i + 1]
169
+ n_best_size = cmd[i + 2]
170
+ # Optional "distinct"
171
+ if i + 3 < len(cmd) and cmd[i + 3] == 'distinct':
172
+ n_best_distinct = True
173
+ cmd = cmd[:i] + cmd[i + 4:]
174
+ else:
175
+ cmd = cmd[:i] + cmd[i + 3:]
176
+ elif cmd[i] == '-output-search-graph-hypergraph':
177
+ # cmd[i + 1] == true
178
+ hg_ext = cmd[i + 2]
179
+ if i + 3 < len(cmd) and cmd[i + 3][0] != '-':
180
+ hg_dir = cmd[i + 3]
181
+ cmd = cmd[:i] + cmd[i + 4:]
182
+ else:
183
+ hg_dir = 'hypergraph'
184
+ cmd = cmd[:i] + cmd[i + 3:]
185
+ elif cmd[i] == '-tmp':
186
+ tmp_dir = cmd[i + 1]
187
+ cmd = cmd[:i] + cmd[i + 2:]
188
+ # Handled specially to make sure XML input is turned on somewhere
189
+ elif cmd[i] in ('-xi', '-xml-input'):
190
+ xml_found = True
191
+ xml_input = cmd[i + 1]
192
+ cmd = cmd[:i] + cmd[i + 2:]
193
+ # Handled specially for mert-moses.pl
194
+ elif cmd[i] == '-show-weights':
195
+ show_weights = True
196
+ # Do not remove from cmd
197
+ i += 1
198
+ else:
199
+ i += 1
200
+
201
+ # Read moses.ini
202
+ if moses_ini:
203
+ moses_ini_lines = [line.strip() for line in open(moses_ini, 'r')]
204
+ i = 0
205
+ while i < len(moses_ini_lines):
206
+ # PhraseDictionaryBitextSampling name=TranslationModel0
207
+ # output-factor=0 num-features=7 path=corpus. L1=src L2=tgt
208
+ # pfwd=g pbwd=g smooth=0 sample=1000 workers=1
209
+ if moses_ini_lines[i].startswith('PhraseDictionaryBitextSampling'):
210
+ for (k, v) in (pair.split('=') for pair in moses_ini_lines[i].split()[1:]):
211
+ if k == 'name':
212
+ # Dynamic means update this model
213
+ if v.startswith('Dynamic'):
214
+ mmsapt_dynamic.append(v)
215
+ moses_ini_lines[i] += '{mmsapt_extra}'
216
+ else:
217
+ mmsapt_static.append(v)
218
+ elif k == 'L1':
219
+ if mmsapt_l1 and v != mmsapt_l1:
220
+ raise ProgramFailure(
221
+ 'Error: All PhraseDictionaryBitextSampling '
222
+ 'entries should have same L1: '
223
+ '{} != {}\n'.format(v, mmsapt_l1))
224
+ mmsapt_l1 = v
225
+ elif k == 'L2':
226
+ if mmsapt_l2 and v != mmsapt_l2:
227
+ raise ProgramFailure(
228
+ 'Error: All PhraseDictionaryBitextSampling '
229
+ 'entries should have same L2: '
230
+ '{} != {}\n'.format(v, mmsapt_l2))
231
+ mmsapt_l2 = v
232
+ # [threads]
233
+ # 8
234
+ elif moses_ini_lines[i] == '[threads]':
235
+ # Prefer command line over moses.ini
236
+ if not threads_found:
237
+ threads = int(moses_ini_lines[i + 1])
238
+ i += 1
239
+ # [xml-input]
240
+ # exclusive
241
+ elif moses_ini_lines[i] == '[xml-input]':
242
+ # Prefer command line over moses.ini
243
+ if not xml_found:
244
+ xml_found = True
245
+ xml_input = moses_ini_lines[i + 1]
246
+ i += 1
247
+ i += 1
248
+
249
+ # If mert-moses.pl passes -show-weights, just call moses
250
+ if show_weights:
251
+ # re-append original moses.ini
252
+ cmd.append('-config')
253
+ cmd.append(moses_ini)
254
+ sys.stdout.write(subprocess.check_output(cmd))
255
+ sys.stdout.flush()
256
+ sys.exit(0)
257
+
258
+ # Input length
259
+ if text_src:
260
+ text_len = wc(text_src)
261
+
262
+ # Check inputs
263
+ if not (len(cmd) > 0 and all((moses_ini, text_src, text_tgt, text_symal))):
264
+ sys.stderr.write(HELP.format(argv[0]))
265
+ sys.exit(2)
266
+ if not (os.path.isfile(cmd[0]) and os.access(cmd[0], os.X_OK)):
267
+ raise ProgramFailure(
268
+ 'Error: moses-cmd "{}" is not executable\n'.format(cmd[0]))
269
+ if not mmsapt_dynamic:
270
+ raise ProgramFailure((
271
+ 'Error: no PhraseDictionaryBitextSampling entries named '
272
+ '"Dynamic..." found in {}. See '
273
+ 'http://www.statmt.org/moses/?n=Moses.AdvancedFeatures#ntoc40\n'
274
+ ).format(moses_ini))
275
+ if wc(text_tgt) != text_len or wc(text_symal) != text_len:
276
+ raise ProgramFailure(
277
+ 'Error: length mismatch between "{}", "{}", and "{}"\n'.format(
278
+ text_src, text_tgt, text_symal))
279
+
280
+ # Setup
281
+ work_dir = tempfile.mkdtemp(prefix='moses.', dir=os.path.abspath(tmp_dir))
282
+ threads = min(threads, text_len)
283
+ batch_size = int(math.ceil(float(text_len) / threads))
284
+
285
+ # Report settings
286
+ sys.stderr.write(
287
+ 'Moses flags: {}\n'.format(
288
+ ' '.join('\'{}\''.format(s) if ' ' in s else s for s in cmd[1:])))
289
+ for (i, n) in enumerate(mmsapt_dynamic):
290
+ sys.stderr.write(
291
+ 'Dynamic mmsapt {}: {} {} {}\n'.format(
292
+ i, n, mmsapt_l1, mmsapt_l2))
293
+ for (i, n) in enumerate(mmsapt_static):
294
+ sys.stderr.write(
295
+ 'Static mmsapt {}: {} {} {}\n'.format(i, n, mmsapt_l1, mmsapt_l2))
296
+ sys.stderr.write('XML mode: {}\n'.format(xml_input))
297
+ sys.stderr.write(
298
+ 'Inputs: {} {} {} ({})\n'.format(
299
+ text_src, text_tgt, text_symal, text_len))
300
+ sys.stderr.write('Jobs: {}\n'.format(threads))
301
+ sys.stderr.write('Batch size: {}\n'.format(batch_size))
302
+ if n_best_out:
303
+ sys.stderr.write(
304
+ 'N-best list: {} ({}{})\n'.format(
305
+ n_best_out, n_best_size,
306
+ ', distinct' if n_best_distinct else ''))
307
+ if hg_dir:
308
+ sys.stderr.write('Hypergraph dir: {} ({})\n'.format(hg_dir, hg_ext))
309
+ sys.stderr.write('Temp dir: {}\n'.format(work_dir))
310
+
311
+ # Accumulate seen lines
312
+ src_lines = []
313
+ tgt_lines = []
314
+ symal_lines = []
315
+
316
+ # Current XML source file
317
+ xml_out = None
318
+
319
+ # Split into batches. Each batch after 0 gets extra files with data from
320
+ # previous batches.
321
+ # Data from previous lines in the current batch is added using XML input.
322
+ job = -1
323
+ lc = -1
324
+ lines = itertools.izip(
325
+ gzopen(text_src), gzopen(text_tgt), gzopen(text_symal))
326
+ for (src, tgt, symal) in lines:
327
+ (src, tgt, symal) = (src.strip(), tgt.strip(), symal.strip())
328
+ lc += 1
329
+ if lc % batch_size == 0:
330
+ job += 1
331
+ xml_file = os.path.join(work_dir, 'input.{}.xml'.format(job))
332
+ extra_src_file = os.path.join(
333
+ work_dir, 'extra.{}.{}.txt.gz'.format(job, mmsapt_l1))
334
+ extra_tgt_file = os.path.join(
335
+ work_dir, 'extra.{}.{}.txt.gz'.format(job, mmsapt_l2))
336
+ extra_symal_file = os.path.join(
337
+ work_dir, 'extra.{}.{}-{}.symal.gz'.format(
338
+ job, mmsapt_l1, mmsapt_l2))
339
+ if job > 0:
340
+ xml_out.close()
341
+ write_gzfile(src_lines, extra_src_file)
342
+ write_gzfile(tgt_lines, extra_tgt_file)
343
+ write_gzfile(symal_lines, extra_symal_file)
344
+ xml_out = open(xml_file, 'w')
345
+ ini_file = os.path.join(work_dir, 'moses.{}.ini'.format(job))
346
+ with open(ini_file, 'w') as moses_ini_out:
347
+ if job == 0:
348
+ extra = ''
349
+ else:
350
+ extra = ' extra={}'.format(
351
+ os.path.join(work_dir, 'extra.{}.'.format(job)))
352
+ moses_ini_out.write(
353
+ '{}\n'.format(
354
+ '\n'.join(moses_ini_lines).format(mmsapt_extra=extra)))
355
+ src_lines.append(src)
356
+ tgt_lines.append(tgt)
357
+ symal_lines.append(symal)
358
+ # Lines after first start with update tag including previous
359
+ # translation.
360
+ # Translation of last line of each batch is included in extra for
361
+ # next batch.
362
+ xml_tags = []
363
+ if lc % batch_size != 0:
364
+ tag_template = (
365
+ '<update '
366
+ 'name="{}" source="{}" target="{}" alignment="{}" /> ')
367
+ for n in mmsapt_dynamic:
368
+ # Note: space after tag.
369
+ xml_tags.append(
370
+ tag_template.format(
371
+ n, src_lines[-2], tgt_lines[-2], symal_lines[-2]))
372
+ xml_out.write('{}{}\n'.format(''.join(xml_tags), src))
373
+ xml_out.close()
374
+
375
+ # Run decoders in parallel
376
+ workers = []
377
+ prog = Progress()
378
+ for i in range(threads):
379
+ work_cmd = cmd[:]
380
+ work_cmd.append('-config')
381
+ work_cmd.append(os.path.join(work_dir, 'moses.{}.ini'.format(i)))
382
+ # Workers use 1 CPU each
383
+ work_cmd.append('-threads')
384
+ work_cmd.append('1')
385
+ if not xml_found:
386
+ work_cmd.append('-xml-input')
387
+ work_cmd.append(xml_input)
388
+ if n_best_out:
389
+ work_cmd.append('-n-best-list')
390
+ work_cmd.append(os.path.join(work_dir, 'nbest.{}'.format(i)))
391
+ work_cmd.append(str(n_best_size))
392
+ if n_best_distinct:
393
+ work_cmd.append('distinct')
394
+ if hg_dir:
395
+ work_cmd.append('-output-search-graph-hypergraph')
396
+ work_cmd.append('true')
397
+ work_cmd.append(hg_ext)
398
+ work_cmd.append(os.path.join(work_dir, 'hg.{}'.format(i)))
399
+ in_file = os.path.join(work_dir, 'input.{}.xml'.format(i))
400
+ out_file = os.path.join(work_dir, 'out.{}'.format(i))
401
+ err_file = os.path.join(work_dir, 'err.{}'.format(i))
402
+ t = threading.Thread(
403
+ target=atomic_io,
404
+ args=(work_cmd, in_file, out_file, err_file, prog))
405
+ workers.append(t)
406
+ t.start()
407
+ # Wait for all to finish
408
+ for t in workers:
409
+ t.join()
410
+ prog.done()
411
+
412
+ # Gather N-best lists
413
+ if n_best_out:
414
+ with open(n_best_out, 'w') as out:
415
+ for i in range(threads):
416
+ path = os.path.join(work_dir, 'nbest.{}'.format(i))
417
+ for line in open(path, 'r'):
418
+ entry = line.partition(' ')
419
+ out.write(
420
+ '{} {}'.format(
421
+ int(entry[0]) + (i * batch_size), entry[2]))
422
+
423
+ # Gather hypergraphs
424
+ if hg_dir:
425
+ if not os.path.exists(hg_dir):
426
+ os.mkdir(hg_dir)
427
+ shutil.copy(
428
+ os.path.join(work_dir, 'hg.0', 'weights'),
429
+ os.path.join(hg_dir, 'weights'))
430
+ for i in range(threads):
431
+ for j in range(batch_size):
432
+ shutil.copy(
433
+ os.path.join(
434
+ work_dir, 'hg.{}'.format(i),
435
+ '{}.{}'.format(j, hg_ext)),
436
+ os.path.join(
437
+ hg_dir, '{}.{}'.format((i * batch_size) + j, hg_ext)))
438
+
439
+ # Gather stdout
440
+ for i in range(threads):
441
+ for line in open(os.path.join(work_dir, 'out.{}'.format(i)), 'r'):
442
+ sys.stdout.write(line)
443
+
444
+ # Cleanup
445
+ shutil.rmtree(work_dir)
446
+
447
+ if __name__ == '__main__':
448
+ try:
449
+ main(sys.argv)
450
+ except ProgramFailure as error:
451
+ sys.stderr.write("%s\n" % error)
452
+ sys.exit(1)
mosesdecoder/scripts/generic/mteval-v11b.pl ADDED
@@ -0,0 +1,761 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/usr/bin/perl -w

use strict;

#################################
# History:
#
# version 11b -- text normalization modified:
#    * take out the join digit line because it joins digits
#      when it shouldn't have
#      $norm_text =~ s/(\d)\s+(?=\d)/$1/g; #join digits
#
# version 11a -- corrected output of individual n-gram precision values
#
# version 11 -- bug fixes:
#    * make filehandle operate in binary mode to prevent Perl from operating
#      (by default in Red Hat 9) in UTF-8
#    * fix failure on joining digits
# version 10 -- updated output to include more details of n-gram scoring.
#    Defaults to generate both NIST and BLEU scores. Use -b for BLEU
#    only, use -n for NIST only
#
# version 09d -- bug fix (for BLEU scoring, ngrams were fixed at 4
#    being the max, regardless what was entered on the command line.)
#
# version 09c -- bug fix (During the calculation of ngram information,
#    each ngram was being counted only once for each segment. This has
#    been fixed so that each ngram is counted correctly in each segment.)
#
# version 09b -- text normalization modified:
#    * option flag added to preserve upper case
#    * non-ASCII characters left in place.
#
# version 09a -- text normalization modified:
#    * &quot; and &amp; converted to "" and &, respectively
#    * non-ASCII characters kept together (bug fix)
#
# version 09 -- modified to accommodate sgml tag and attribute
#    names revised to conform to default SGML conventions.
#
# version 08 -- modifies the NIST metric in accordance with the
#    findings on the 2001 Chinese-English dry run corpus. Also
#    incorporates the BLEU metric as an option and supports the
#    output of ngram detail.
#
# version 07 -- in response to the MT meeting on 28 Jan 2002 at ISI
#    Keep strings of non-ASCII characters together as one word
#    (rather than splitting them into one-character words).
#    Change length penalty so that translations that are longer than
#    the average reference translation are not penalized.
#
# version 06
#    Prevent divide-by-zero when a segment has no evaluation N-grams.
#    Correct segment index for level 3 debug output.
#
# version 05
#    improve diagnostic error messages
#
# version 04
#    tag segments
#
# version 03
#    add detailed output option (intermediate document and segment scores)
#
# version 02
#    accommodation of modified sgml tags and attributes
#
# version 01
#    same as bleu version 15, but modified to provide formal score output.
#
# original IBM version
#    Author: Kishore Papineni
#    Date: 06/10/2001
#################################

######
# Intro
my ($date, $time) = date_time_stamp();
print "MT evaluation scorer began on $date at $time\n";
print "command line: ", $0, " ", join(" ", @ARGV), "\n";
my $usage = "\n\nUsage: $0 [-h] -r <ref_file> -s src_file -t <tst_file>\n\n".
    "Description: This Perl script evaluates MT system performance.\n".
    "\n".
    "Required arguments:\n".
    " -r <ref_file> is a file containing the reference translations for\n".
    " the documents to be evaluated.\n".
    " -s <src_file> is a file containing the source documents for which\n".
    " translations are to be evaluated\n".
    " -t <tst_file> is a file containing the translations to be evaluated\n".
    "\n".
    "Optional arguments:\n".
    " -c preserves upper-case alphabetic characters\n".
    " -b generate BLEU scores only\n".
    " -n generate NIST scores only\n".
    " -d detailed output flag used in conjunction with \"-b\" or \"-n\" flags:\n".
    " 0 (default) for system-level score only\n".
    " 1 to include document-level scores\n".
    " 2 to include segment-level scores\n".
    " 3 to include ngram-level scores\n".
    " -h prints this help message to STDOUT\n".
    "\n";

# Command-line options (package globals populated by getopts).
use vars qw ($opt_r $opt_s $opt_t $opt_d $opt_h $opt_b $opt_n $opt_c $opt_x);
use Getopt::Std;
getopts ('r:s:t:d:hbncx:');
die $usage if defined($opt_h);
die "Error in command line: ref_file not defined$usage" unless defined $opt_r;
die "Error in command line: src_file not defined$usage" unless defined $opt_s;
die "Error in command line: tst_file not defined$usage" unless defined $opt_t;
my $max_Ngram = 9;                           # score n-grams of order 1..9
my $detail = defined $opt_d ? $opt_d : 0;    # verbosity level 0..3
my $preserve_case = defined $opt_c ? 1 : 0;  # -c: skip lower-casing

# Which metric(s) to compute: -b => BLEU only, -n => NIST only.
my $METHOD = "BOTH";
if (defined $opt_b) { $METHOD = "BLEU"; }
if (defined $opt_n) { $METHOD = "NIST"; }
my $method;   # metric currently being scored (set inside the loop below)

my ($ref_file) = $opt_r;
my ($src_file) = $opt_s;
my ($tst_file) = $opt_t;

######
# Global variables
my ($src_lang, $tgt_lang, @tst_sys, @ref_sys); # evaluation parameters
my (%tst_data, %ref_data); # the data -- with structure: {system}{document}[segments]
my ($src_id, $ref_id, $tst_id); # unique identifiers for ref and tst translation sets
my %eval_docs; # document information for the evaluation data set
my %ngram_info; # the information obtained from (the last word in) the ngram

######
# Get source document ID's
($src_id) = get_source_info ($src_file);

######
# Get reference translations
($ref_id) = get_MT_data (\%ref_data, "RefSet", $ref_file);

# N-gram information values must be computed from the references before
# any system is scored.
compute_ngram_info ();

######
# Get translations to evaluate
($tst_id) = get_MT_data (\%tst_data, "TstSet", $tst_file);

######
# Check data for completeness and correctness
check_MT_data ();

######
# Score accumulators, keyed {ngram_order}{system}{cum|ind}.
my %NISTmt = ();
my %BLEUmt = ();

######
# Evaluate
print " Evaluation of $src_lang-to-$tgt_lang translation using:\n";
my $cum_seg = 0;
foreach my $doc (sort keys %eval_docs) {
    $cum_seg += @{$eval_docs{$doc}{SEGS}};
}
print " src set \"$src_id\" (", scalar keys %eval_docs, " docs, $cum_seg segs)\n";
print " ref set \"$ref_id\" (", scalar keys %ref_data, " refs)\n";
print " tst set \"$tst_id\" (", scalar keys %tst_data, " systems)\n\n";

foreach my $sys (sort @tst_sys) {
    # Pre-create the nested hashrefs; score_system receives the hash
    # flattened but writes through these shared inner references.
    for (my $n=1; $n<=$max_Ngram; $n++) {
        $NISTmt{$n}{$sys}{cum} = 0;
        $NISTmt{$n}{$sys}{ind} = 0;
        $BLEUmt{$n}{$sys}{cum} = 0;
        $BLEUmt{$n}{$sys}{ind} = 0;
    }

    if (($METHOD eq "BOTH") || ($METHOD eq "NIST")) {
        $method="NIST";
        score_system ($sys, %NISTmt);
    }
    if (($METHOD eq "BOTH") || ($METHOD eq "BLEU")) {
        $method="BLEU";
        score_system ($sys, %BLEUmt);
    }
}

######
printout_report ();

($date, $time) = date_time_stamp();
print "MT evaluation scorer ended on $date at $time\n";

exit 0;

#################################
193
# Parse the source SGML file: record the set ID, the source language
# (checked for consistency with any previously seen value), and the
# normalized segments of every document into the global %eval_docs.
# Returns the source set ID.  Dies with a diagnostic on malformed input.
sub get_source_info {

    my ($file) = @_;
    my ($name, $id, $src, $doc);
    my ($data, $tag, $span);

    #read data from file
    open (FILE, $file) or die "\nUnable to open translation data file '$file'", $usage;
    binmode FILE;   # avoid implicit UTF-8 layers (see version 11 history note)
    $data .= $_ while <FILE>;
    close (FILE);

    #get source set info
    die "\n\nFATAL INPUT ERROR: no 'src_set' tag in src_file '$file'\n\n"
        unless ($tag, $span, $data) = extract_sgml_tag_and_span ("SrcSet", $data);

    die "\n\nFATAL INPUT ERROR: no tag attribute '$name' in file '$file'\n\n"
        unless ($id) = extract_sgml_tag_attribute ($name="SetID", $tag);

    die "\n\nFATAL INPUT ERROR: no tag attribute '$name' in file '$file'\n\n"
        unless ($src) = extract_sgml_tag_attribute ($name="SrcLang", $tag);
    die "\n\nFATAL INPUT ERROR: $name ('$src') in file '$file' inconsistent\n"
        ." with $name in previous input data ('$src_lang')\n\n"
        unless (not defined $src_lang or $src eq $src_lang);
    $src_lang = $src;

    #get doc info -- ID and # of segs
    $data = $span;
    while (($tag, $span, $data) = extract_sgml_tag_and_span ("Doc", $data)) {
        die "\n\nFATAL INPUT ERROR: no tag attribute '$name' in file '$file'\n\n"
            unless ($doc) = extract_sgml_tag_attribute ($name="DocID", $tag);
        die "\n\nFATAL INPUT ERROR: duplicate '$name' in file '$file'\n\n"
            if defined $eval_docs{$doc};

        $span =~ s/[\s\n\r]+/ /g; # concatenate records
        # Comma operator: two `my` declarations on one line.
        my $jseg=0, my $seg_data = $span;
        while (($tag, $span, $seg_data) = extract_sgml_tag_and_span ("Seg", $seg_data)) {
            ($eval_docs{$doc}{SEGS}[$jseg++]) = NormalizeText ($span);
        }
        die "\n\nFATAL INPUT ERROR: no segments in document '$doc' in file '$file'\n\n"
            if $jseg == 0;
    }
    die "\n\nFATAL INPUT ERROR: no documents in file '$file'\n\n"
        unless keys %eval_docs > 0;
    return $id;
}
239
+
240
+ #################################
241
+
242
# Parse a translation SGML file (reference or test set, selected by
# $set_tag) into $docs with structure {system}{document}{SEGS}[segments].
# Checks language attributes against the source/evaluation languages and
# rejects duplicate documents.  Returns the (last seen) set ID.
sub get_MT_data {

    my ($docs, $set_tag, $file) = @_;
    my ($name, $id, $src, $tgt, $sys, $doc);
    my ($tag, $span, $data);

    #read data from file
    open (FILE, $file) or die "\nUnable to open translation data file '$file'", $usage;
    binmode FILE;   # avoid implicit UTF-8 layers (see version 11 history note)
    $data .= $_ while <FILE>;
    close (FILE);

    #get tag info
    while (($tag, $span, $data) = extract_sgml_tag_and_span ($set_tag, $data)) {
        die "\n\nFATAL INPUT ERROR: no tag attribute '$name' in file '$file'\n\n" unless
            ($id) = extract_sgml_tag_attribute ($name="SetID", $tag);

        die "\n\nFATAL INPUT ERROR: no tag attribute '$name' in file '$file'\n\n" unless
            ($src) = extract_sgml_tag_attribute ($name="SrcLang", $tag);
        die "\n\nFATAL INPUT ERROR: $name ('$src') in file '$file' inconsistent\n"
            ." with $name of source ('$src_lang')\n\n"
            unless $src eq $src_lang;

        die "\n\nFATAL INPUT ERROR: no tag attribute '$name' in file '$file'\n\n" unless
            ($tgt) = extract_sgml_tag_attribute ($name="TrgLang", $tag);
        die "\n\nFATAL INPUT ERROR: $name ('$tgt') in file '$file' inconsistent\n"
            ." with $name of the evaluation ('$tgt_lang')\n\n"
            unless (not defined $tgt_lang or $tgt eq $tgt_lang);
        $tgt_lang = $tgt;

        my $mtdata = $span;
        while (($tag, $span, $mtdata) = extract_sgml_tag_and_span ("Doc", $mtdata)) {
            # Note: `my $sys` here shadows the $sys declared at the top
            # of this sub; the inner lexical is the one used below.
            die "\n\nFATAL INPUT ERROR: no tag attribute '$name' in file '$file'\n\n" unless
                (my $sys) = extract_sgml_tag_attribute ($name="SysID", $tag);

            die "\n\nFATAL INPUT ERROR: no tag attribute '$name' in file '$file'\n\n" unless
                $doc = extract_sgml_tag_attribute ($name="DocID", $tag);

            die "\n\nFATAL INPUT ERROR: document '$doc' for system '$sys' in file '$file'\n"
                ." previously loaded from file '$docs->{$sys}{$doc}{FILE}'\n\n"
                unless (not defined $docs->{$sys}{$doc});

            $span =~ s/[\s\n\r]+/ /g; # concatenate records
            # Comma operator: two `my` declarations on one line.
            my $jseg=0, my $seg_data = $span;
            while (($tag, $span, $seg_data) = extract_sgml_tag_and_span ("Seg", $seg_data)) {
                ($docs->{$sys}{$doc}{SEGS}[$jseg++]) = NormalizeText ($span);
            }
            die "\n\nFATAL INPUT ERROR: no segments in document '$doc' in file '$file'\n\n"
                if $jseg == 0;
            $docs->{$sys}{$doc}{FILE} = $file;
        }
    }
    return $id;
}
296
+
297
+ #################################
298
+
299
# Verify that every source document appears, with a matching segment
# count, in every system output and in every reference translation.
# Also populates the globals @tst_sys and @ref_sys as a side effect.
sub check_MT_data {

    @tst_sys = sort keys %tst_data;
    @ref_sys = sort keys %ref_data;

    # Every evaluation document must be represented for every system
    # and for every reference.
    foreach my $doc (sort keys %eval_docs) {
        my $nseg_source = @{$eval_docs{$doc}{SEGS}};

        foreach my $sys (@tst_sys) {
            defined $tst_data{$sys}{$doc}
                or die "\n\nFATAL ERROR: no document '$doc' for system '$sys'\n\n";
            my $nseg = @{$tst_data{$sys}{$doc}{SEGS}};
            $nseg == $nseg_source
                or die "\n\nFATAL ERROR: translated documents must contain the same # of segments as the source, but\n"
                     ." document '$doc' for system '$sys' contains $nseg segments, while\n"
                     ." the source document contains $nseg_source segments.\n\n";
        }

        foreach my $sys (@ref_sys) {
            defined $ref_data{$sys}{$doc}
                or die "\n\nFATAL ERROR: no document '$doc' for reference '$sys'\n\n";
            my $nseg = @{$ref_data{$sys}{$doc}{SEGS}};
            $nseg == $nseg_source
                or die "\n\nFATAL ERROR: translated documents must contain the same # of segments as the source, but\n"
                     ." document '$doc' for system '$sys' contains $nseg segments, while\n"
                     ." the source document contains $nseg_source segments.\n\n";
        }
    }
}
328
+
329
+ #################################
330
+
331
# Build the global %ngram_info table from all reference translations.
# The information value of an n-gram is -log2(count(ngram)/count(prefix))
# where "prefix" is its (n-1)-gram; unigrams use the total word count as
# the denominator.  Used by the NIST metric.
sub compute_ngram_info {

    my ($ref, $doc, $seg);
    my (@wrds, $tot_wrds, %ngrams, $ngram, $mgram);
    my (%ngram_count, @tot_ngrams);

    # Tally every n-gram (orders 1..$max_Ngram) over all references.
    foreach $ref (keys %ref_data) {
        foreach $doc (keys %{$ref_data{$ref}}) {
            foreach $seg (@{$ref_data{$ref}{$doc}{SEGS}}) {
                @wrds = split /\s+/, $seg;
                $tot_wrds += @wrds;
                %ngrams = %{Words2Ngrams (@wrds)};
                foreach $ngram (keys %ngrams) {
                    $ngram_count{$ngram} += $ngrams{$ngram};
                }
            }
        }
    }

    foreach $ngram (keys %ngram_count) {
        @wrds = split / /, $ngram;
        # Comma operator: drop the last word, then join what remains
        # into the (n-1)-gram prefix ("" for unigrams, which is falsy).
        pop @wrds, $mgram = join " ", @wrds;
        $ngram_info{$ngram} = - log
            ($mgram ? $ngram_count{$ngram}/$ngram_count{$mgram}
                    : $ngram_count{$ngram}/$tot_wrds) / log 2;
        if (defined $opt_x and $opt_x eq "ngram info") {
            @wrds = split / /, $ngram;
            printf "ngram info:%9.4f%6d%6d%8d%3d %s\n", $ngram_info{$ngram}, $ngram_count{$ngram},
                $mgram ? $ngram_count{$mgram} : $tot_wrds, $tot_wrds, scalar @wrds, $ngram;
        }
    }
}
363
+
364
+ #################################
365
+
366
# Score one system over all evaluation documents, accumulating per-order
# n-gram statistics and printing per-document scores when -d >= 1.
# NOTE(review): %SCOREmt arrives flattened, so this sub works on a
# shallow copy; the per-order hashrefs were pre-created by the caller,
# however, so writes to $SCOREmt{$j}{$sys}{...} go through those shared
# references and are visible to the caller.
sub score_system {

    my ($sys, $ref, $doc, %SCOREmt);
    ($sys, %SCOREmt) = @_;
    my ($shortest_ref_length, $match_cnt, $tst_cnt, $ref_cnt, $tst_info, $ref_info);
    my ($cum_ref_length, @cum_match, @cum_tst_cnt, @cum_ref_cnt, @cum_tst_info, @cum_ref_info);

    $cum_ref_length = 0;
    for (my $j=1; $j<=$max_Ngram; $j++) {
        $cum_match[$j] = $cum_tst_cnt[$j] = $cum_ref_cnt[$j] = $cum_tst_info[$j] = $cum_ref_info[$j] = 0;
    }

    foreach $doc (sort keys %eval_docs) {
        ($shortest_ref_length, $match_cnt, $tst_cnt, $ref_cnt, $tst_info, $ref_info) = score_document ($sys, $doc);

        #output document summary score
        if (($detail >= 1 ) && ($METHOD eq "NIST")) {
            my %DOCmt = ();
            printf "$method score using 5-grams = %.4f for system \"$sys\" on document \"$doc\" (%d segments, %d words)\n",
                nist_score (scalar @ref_sys, $match_cnt, $tst_cnt, $ref_cnt, $tst_info, $ref_info, $sys, %DOCmt),
                scalar @{$tst_data{$sys}{$doc}{SEGS}}, $tst_cnt->[1];
        }
        if (($detail >= 1 ) && ($METHOD eq "BLEU")) {
            my %DOCmt = ();
            printf "$method score using 4-grams = %.4f for system \"$sys\" on document \"$doc\" (%d segments, %d words)\n",
                bleu_score($shortest_ref_length, $match_cnt, $tst_cnt, $sys, %DOCmt),
                scalar @{$tst_data{$sys}{$doc}{SEGS}}, $tst_cnt->[1];
        }

        # Fold this document's statistics into the system totals.
        $cum_ref_length += $shortest_ref_length;
        for (my $j=1; $j<=$max_Ngram; $j++) {
            $cum_match[$j] += $match_cnt->[$j];
            $cum_tst_cnt[$j] += $tst_cnt->[$j];
            $cum_ref_cnt[$j] += $ref_cnt->[$j];
            $cum_tst_info[$j] += $tst_info->[$j];
            $cum_ref_info[$j] += $ref_info->[$j];
            printf "document info: $sys $doc %d-gram %d %d %d %9.4f %9.4f\n", $j, $match_cnt->[$j],
                $tst_cnt->[$j], $ref_cnt->[$j], $tst_info->[$j], $ref_info->[$j]
                if (defined $opt_x and $opt_x eq "document info");
        }
    }

    #x #output system summary score
    #x printf "$method score = %.4f for system \"$sys\"\n",
    #x $method eq "BLEU" ? bleu_score($cum_ref_length, \@cum_match, \@cum_tst_cnt) :
    #x nist_score (scalar @ref_sys, \@cum_match, \@cum_tst_cnt, \@cum_ref_cnt, \@cum_tst_info, \@cum_ref_info, $sys, %SCOREmt);
    if ($method eq "BLEU") {
        bleu_score($cum_ref_length, \@cum_match, \@cum_tst_cnt, $sys, %SCOREmt);
    }
    if ($method eq "NIST") {
        nist_score (scalar @ref_sys, \@cum_match, \@cum_tst_cnt, \@cum_ref_cnt, \@cum_tst_info, \@cum_ref_info, $sys, %SCOREmt);
    }
}
419
+
420
+ #################################
421
+
422
# Score every segment of one document for one system, summing segment
# statistics.  Returns (cumulative shortest-reference length, then five
# arrayrefs indexed by n-gram order: matches, test counts, reference
# counts, test info, reference info).
sub score_document {

    my ($sys, $ref, $doc);
    ($sys, $doc) = @_;
    my ($shortest_ref_length, $match_cnt, $tst_cnt, $ref_cnt, $tst_info, $ref_info);
    my ($cum_ref_length, @cum_match, @cum_tst_cnt, @cum_ref_cnt, @cum_tst_info, @cum_ref_info);

    $cum_ref_length = 0;
    for (my $j=1; $j<=$max_Ngram; $j++) {
        $cum_match[$j] = $cum_tst_cnt[$j] = $cum_ref_cnt[$j] = $cum_tst_info[$j] = $cum_ref_info[$j] = 0;
    }

    #score each segment
    for (my $jseg=0; $jseg<@{$tst_data{$sys}{$doc}{SEGS}}; $jseg++) {
        my @ref_segments = ();
        foreach $ref (@ref_sys) {
            push @ref_segments, $ref_data{$ref}{$doc}{SEGS}[$jseg];
            printf "ref '$ref', seg %d: %s\n", $jseg+1, $ref_data{$ref}{$doc}{SEGS}[$jseg]
                if $detail >= 3;
        }
        printf "sys '$sys', seg %d: %s\n", $jseg+1, $tst_data{$sys}{$doc}{SEGS}[$jseg]
            if $detail >= 3;
        ($shortest_ref_length, $match_cnt, $tst_cnt, $ref_cnt, $tst_info, $ref_info) =
            score_segment ($tst_data{$sys}{$doc}{SEGS}[$jseg], @ref_segments);

        #output segment summary score
        #x printf "$method score = %.4f for system \"$sys\" on segment %d of document \"$doc\" (%d words)\n",
        #x $method eq "BLEU" ? bleu_score($shortest_ref_length, $match_cnt, $tst_cnt) :
        #x nist_score (scalar @ref_sys, $match_cnt, $tst_cnt, $ref_cnt, $tst_info, $ref_info),
        #x $jseg+1, $tst_cnt->[1]
        #x if $detail >= 2;
        if (($detail >=2) && ($METHOD eq "BLEU")) {
            my %DOCmt = ();
            printf " $method score using 4-grams = %.4f for system \"$sys\" on segment %d of document \"$doc\" (%d words)\n",
                bleu_score($shortest_ref_length, $match_cnt, $tst_cnt, $sys, %DOCmt), $jseg+1, $tst_cnt->[1];
        }
        if (($detail >=2) && ($METHOD eq "NIST")) {
            my %DOCmt = ();
            printf " $method score using 5-grams = %.4f for system \"$sys\" on segment %d of document \"$doc\" (%d words)\n",
                nist_score (scalar @ref_sys, $match_cnt, $tst_cnt, $ref_cnt, $tst_info, $ref_info, $sys, %DOCmt), $jseg+1, $tst_cnt->[1];
        }


        # Fold this segment's statistics into the document totals.
        $cum_ref_length += $shortest_ref_length;
        for (my $j=1; $j<=$max_Ngram; $j++) {
            $cum_match[$j] += $match_cnt->[$j];
            $cum_tst_cnt[$j] += $tst_cnt->[$j];
            $cum_ref_cnt[$j] += $ref_cnt->[$j];
            $cum_tst_info[$j] += $tst_info->[$j];
            $cum_ref_info[$j] += $ref_info->[$j];
        }
    }
    return ($cum_ref_length, [@cum_match], [@cum_tst_cnt], [@cum_ref_cnt], [@cum_tst_info], [@cum_ref_info]);
}
476
+
477
+ #################################
478
+
479
# Score a single test segment against its reference segments.  Returns
# (shortest reference length, then five arrayrefs indexed by n-gram
# order: clipped match counts, test n-gram counts, reference n-gram
# counts, matched info sum, reference info sum).
sub score_segment {

    my ($tst_seg, @ref_segs) = @_;
    my (@tst_wrds, %tst_ngrams, @match_count, @tst_count, @tst_info);
    my (@ref_wrds, $ref_seg, %ref_ngrams, %ref_ngrams_max, @ref_count, @ref_info);
    my ($ngram);
    my (@nwrds_ref);
    my $shortest_ref_length;

    for (my $j=1; $j<= $max_Ngram; $j++) {
        $match_count[$j] = $tst_count[$j] = $ref_count[$j] = $tst_info[$j] = $ref_info[$j] = 0;
    }

    # get the ngram counts for the test segment
    @tst_wrds = split /\s+/, $tst_seg;
    %tst_ngrams = %{Words2Ngrams (@tst_wrds)};
    for (my $j=1; $j<=$max_Ngram; $j++) { # compute ngram counts
        $tst_count[$j] = $j<=@tst_wrds ? (@tst_wrds - $j + 1) : 0;
    }

    # get the ngram counts for the reference segments
    foreach $ref_seg (@ref_segs) {
        @ref_wrds = split /\s+/, $ref_seg;
        %ref_ngrams = %{Words2Ngrams (@ref_wrds)};
        foreach $ngram (keys %ref_ngrams) { # find the maximum # of occurrences
            # @wrds in numeric context is the n-gram's order, used as
            # the array index below.
            my @wrds = split / /, $ngram;
            $ref_info[@wrds] += $ngram_info{$ngram};
            $ref_ngrams_max{$ngram} = defined $ref_ngrams_max{$ngram} ?
                max ($ref_ngrams_max{$ngram}, $ref_ngrams{$ngram}) :
                $ref_ngrams{$ngram};
        }
        for (my $j=1; $j<=$max_Ngram; $j++) { # update ngram counts
            $ref_count[$j] += $j<=@ref_wrds ? (@ref_wrds - $j + 1) : 0;
        }
        $shortest_ref_length = scalar @ref_wrds # find the shortest reference segment
            if (not defined $shortest_ref_length) or @ref_wrds < $shortest_ref_length;
    }

    # accumulate scoring stats for tst_seg ngrams that match ref_seg ngrams
    foreach $ngram (keys %tst_ngrams) {
        next unless defined $ref_ngrams_max{$ngram};
        my @wrds = split / /, $ngram;
        # Clip each match at the maximum reference occurrence count.
        $tst_info[@wrds] += $ngram_info{$ngram} * min($tst_ngrams{$ngram},$ref_ngrams_max{$ngram});
        $match_count[@wrds] += my $count = min($tst_ngrams{$ngram},$ref_ngrams_max{$ngram});
        printf "%.2f info for each of $count %d-grams = '%s'\n", $ngram_info{$ngram}, scalar @wrds, $ngram
            if $detail >= 3;
    }

    return ($shortest_ref_length, [@match_count], [@tst_count], [@ref_count], [@tst_info], [@ref_info]);
}
529
+
530
+ #################################
531
+
532
# Compute BLEU from clipped-match and test n-gram counts, writing both
# cumulative and individual n-gram precisions into %SCOREmt (whose inner
# hashrefs are shared with the caller).  Returns the cumulative 4-gram
# BLEU score.
# NOTE(review): divides by $tst_ngrams->[1]; a zero-length test segment
# would trigger a divide-by-zero here.
sub bleu_score {

    my ($shortest_ref_length, $matching_ngrams, $tst_ngrams, $sys, %SCOREmt) = @_;

    my $score = 0;
    my $iscore = 0;
    # Brevity penalty exponent (<= 0): only translations shorter than
    # the shortest reference are penalized.
    my $len_score = min (0, 1-$shortest_ref_length/$tst_ngrams->[1]);

    for (my $j=1; $j<=$max_Ngram; $j++) {
        if ($matching_ngrams->[$j] == 0) {
            $SCOREmt{$j}{$sys}{cum}=0;
        } else {
            # Cumulative N-Gram score
            $score += log ($matching_ngrams->[$j]/$tst_ngrams->[$j]);
            $SCOREmt{$j}{$sys}{cum} = exp($score/$j + $len_score);
            # Individual N-Gram score
            $iscore = log ($matching_ngrams->[$j]/$tst_ngrams->[$j]);
            $SCOREmt{$j}{$sys}{ind} = exp($iscore);
        }
    }
    return $SCOREmt{4}{$sys}{cum};
}
554
+
555
+ #################################
556
+
557
# Compute the NIST score: cumulative information gain per test n-gram,
# scaled by the NIST length penalty, for each order up to $max_Ngram.
# Writes cumulative and per-order scores into %SCOREmt (shared inner
# hashrefs) and returns the cumulative 5-gram score.
sub nist_score {

    my ($nsys, $matching_ngrams, $tst_ngrams, $ref_ngrams, $tst_info, $ref_info, $sys, %SCOREmt) = @_;

    my $score = 0;
    my $iscore = 0;


    for (my $n=1; $n<=$max_Ngram; $n++) {
        # max(...,1) guards against dividing by zero for empty counts.
        $score += $tst_info->[$n]/max($tst_ngrams->[$n],1);
        # Length ratio compares test length with the average reference
        # length ($ref_ngrams->[1]/$nsys references).
        $SCOREmt{$n}{$sys}{cum} = $score * nist_length_penalty($tst_ngrams->[1]/($ref_ngrams->[1]/$nsys));

        $iscore = $tst_info->[$n]/max($tst_ngrams->[$n],1);
        $SCOREmt{$n}{$sys}{ind} = $iscore * nist_length_penalty($tst_ngrams->[1]/($ref_ngrams->[1]/$nsys));
    }
    return $SCOREmt{5}{$sys}{cum};
}
574
+
575
+ #################################
576
+
577
# Convert a word list into a hashref mapping every n-gram (orders 1 up
# to $max_Ngram, words joined by single spaces) to its occurrence count.
sub Words2Ngrams {

    my @words = @_;
    my %tally = ();

    for (my $start = 0; $start < @words; $start++) {
        my $stop = $start + $max_Ngram - 1;
        $stop = $#words if $stop > $#words;
        my $gram;
        for (my $end = $start; $end <= $stop; $end++) {
            # Extend the current n-gram by one word and count it.
            $gram = defined $gram ? "$gram $words[$end]" : $words[$end];
            $tally{$gram}++;
        }
    }
    return {%tally};
}
590
+
591
+ #################################
592
+
593
# Normalize and tokenize one segment for scoring: strip layout tags,
# unescape a few SGML entities, lower-case (unless -c), and split
# punctuation into separate tokens.
sub NormalizeText {
    my ($norm_text) = @_;

# language-independent part:
    $norm_text =~ s/<skipped>//g; # strip "skipped" tags
    $norm_text =~ s/-\n//g; # strip end-of-line hyphenation and join lines
    $norm_text =~ s/\n/ /g; # join lines
    $norm_text =~ s/&quot;/"/g; # convert SGML entity for quote to "
    $norm_text =~ s/&amp;/&/g; # convert SGML entity for ampersand to &
    $norm_text =~ s/&lt;/</g; # convert SGML entity for less-than to <
    $norm_text =~ s/&gt;/>/g; # convert SGML entity for greater-than to >

# language-dependent part (assuming Western languages):
    $norm_text = " $norm_text ";
    # NB: the brackets are part of the tr/// sets; they map to themselves,
    # so this is effectively tr/A-Z/a-z/.
    $norm_text =~ tr/[A-Z]/[a-z]/ unless $preserve_case;
    $norm_text =~ s/([\{-\~\[-\` -\&\(-\+\:-\@\/])/ $1 /g; # tokenize punctuation
    $norm_text =~ s/([^0-9])([\.,])/$1 $2 /g; # tokenize period and comma unless preceded by a digit
    $norm_text =~ s/([\.,])([^0-9])/ $1 $2/g; # tokenize period and comma unless followed by a digit
    $norm_text =~ s/([0-9])(-)/$1 $2 /g; # tokenize dash when preceded by a digit
    $norm_text =~ s/\s+/ /g; # one space only between words
    $norm_text =~ s/^\s+//; # no leading space
    $norm_text =~ s/\s+$//; # no trailing space

    return $norm_text;
}
618
+
619
+ #################################
620
+
621
# NIST length penalty for a test/reference length ratio: 1 when the
# translation is at least as long as the reference average, 0 at ratio
# zero, and exp(-beta * ln(ratio)^2) in between.  beta is calibrated so
# that a ratio of 1/1.5 scores 0.5.
sub nist_length_penalty {

    my ($ratio) = @_;
    return 1 if $ratio >= 1;
    return 0 if $ratio <= 0;
    my $anchor_ratio = 1.5;
    my $anchor_score = 0.5;
    my $beta = -log($anchor_score) / (log($anchor_ratio) ** 2);
    my $log_ratio = log($ratio);
    return exp(-$beta * $log_ratio * $log_ratio);
}
631
+
632
+ #################################
633
+
634
# Return the current local date ("YYYY Mon D") and time ("HH:MM:SS")
# as a two-element list.
sub date_time_stamp {

    my @now = localtime();
    my @month_names = qw(Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec);

    # localtime order: sec, min, hour, mday, mon, year(-1900), ...
    my $time = sprintf "%2.2d:%2.2d:%2.2d", @now[2, 1, 0];
    my $date = sprintf "%4.4s %3.3s %s", $now[5] + 1900, $month_names[$now[4]], $now[3];
    return ($date, $time);
}
644
+
645
+ #################################
646
+
647
# Find the first <$name ...>...</$name> element in $data (case-
# insensitive, non-greedy, '.' matches newlines).  Returns the tag's
# attribute string, the element content, and the remaining text after
# the close tag -- or the empty list when no such element exists.
sub extract_sgml_tag_and_span {

    my ($name, $data) = @_;

    ($data =~ m|<$name\s*([^>]*)>(.*?)</$name\s*>(.*)|si) ? ($1, $2, $3) : ();
}
653
+
654
+ #################################
655
+
656
# Extract the double-quoted value of attribute $name from a tag's
# attribute string (case-insensitive).  Returns the value as a
# one-element list, or the empty list when the attribute is absent.
sub extract_sgml_tag_attribute {

    my ($name, $data) = @_;

    ($data =~ m|$name\s*=\s*\"([^\"]*)\"|si) ? ($1) : ();
}
662
+
663
+ #################################
664
+
665
# Numeric maximum of the argument list; undef for an empty list.
sub max {

    return unless @_;
    my $best = shift;
    foreach my $candidate (@_) {
        $best = $candidate if $candidate > $best;
    }
    return $best;
}
675
+
676
+ #################################
677
+
678
# Numeric minimum of the argument list; undef for an empty list.
sub min {

    return unless @_;
    my $best = shift;
    foreach my $candidate (@_) {
        $best = $candidate if $candidate < $best;
    }
    return $best;
}
688
+
689
+ #################################
690
+
691
# Print the final report: summary NIST/BLEU system scores followed by
# tables of individual and cumulative n-gram scores, according to the
# metric(s) selected in $METHOD.
sub printout_report
{

    if ( $METHOD eq "BOTH" ) {
        foreach my $sys (sort @tst_sys) {
            printf "NIST score = %2.4f BLEU score = %.4f for system \"$sys\"\n",$NISTmt{5}{$sys}{cum},$BLEUmt{4}{$sys}{cum};
        }
    } elsif ($METHOD eq "NIST" ) {
        foreach my $sys (sort @tst_sys) {
            printf "NIST score = %2.4f for system \"$sys\"\n",$NISTmt{5}{$sys}{cum};
        }
    } elsif ($METHOD eq "BLEU" ) {
        foreach my $sys (sort @tst_sys) {
            printf "\nBLEU score = %.4f for system \"$sys\"\n",$BLEUmt{4}{$sys}{cum};
        }
    }


    # Individual n-gram score table (one row per system and metric).
    printf "\n# ------------------------------------------------------------------------\n\n";
    printf "Individual N-gram scoring\n";
    printf " 1-gram 2-gram 3-gram 4-gram 5-gram 6-gram 7-gram 8-gram 9-gram\n";
    printf " ------ ------ ------ ------ ------ ------ ------ ------ ------\n";

    if (( $METHOD eq "BOTH" ) || ($METHOD eq "NIST")) {
        foreach my $sys (sort @tst_sys) {
            printf " NIST:";
            for (my $i=1; $i<=$max_Ngram; $i++) {
                printf " %2.4f ",$NISTmt{$i}{$sys}{ind}
            }
            printf " \"$sys\"\n";
        }
        printf "\n";
    }

    if (( $METHOD eq "BOTH" ) || ($METHOD eq "BLEU")) {
        foreach my $sys (sort @tst_sys) {
            printf " BLEU:";
            for (my $i=1; $i<=$max_Ngram; $i++) {
                printf " %2.4f ",$BLEUmt{$i}{$sys}{ind}
            }
            printf " \"$sys\"\n";
        }
    }

    # Cumulative n-gram score table.
    printf "\n# ------------------------------------------------------------------------\n";
    printf "Cumulative N-gram scoring\n";
    printf " 1-gram 2-gram 3-gram 4-gram 5-gram 6-gram 7-gram 8-gram 9-gram\n";
    printf " ------ ------ ------ ------ ------ ------ ------ ------ ------\n";

    if (( $METHOD eq "BOTH" ) || ($METHOD eq "NIST")) {
        foreach my $sys (sort @tst_sys) {
            printf " NIST:";
            for (my $i=1; $i<=$max_Ngram; $i++) {
                printf " %2.4f ",$NISTmt{$i}{$sys}{cum}
            }
            printf " \"$sys\"\n";
        }
    }
    printf "\n";


    if (( $METHOD eq "BOTH" ) || ($METHOD eq "BLEU")) {
        foreach my $sys (sort @tst_sys) {
            printf " BLEU:";
            for (my $i=1; $i<=$max_Ngram; $i++) {
                printf " %2.4f ",$BLEUmt{$i}{$sys}{cum}
            }
            printf " \"$sys\"\n";
        }
    }
}
mosesdecoder/scripts/generic/mteval-v12.pl ADDED
@@ -0,0 +1,784 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env perl
2
+
3
+ use warnings;
4
+ use strict;
5
+ use utf8;
6
+ use Encode;
7
+
8
+ binmode STDOUT, ":utf8";
9
+ binmode STDERR, ":utf8";
10
+
11
+ #################################
12
+ # History:
13
+ #
14
+ # version 12
15
+ # * Text normalization changes:
16
+ # * convert entity references (only the entities declared in the DTD)
17
+ # * now uses unicode categories
18
+ # * tokenize punctuation unless followed AND preceded by digits
19
+ # * tokenize symbols
20
+ # * UTF-8 handling:
21
+ # * files are now read using utf8 mode
22
+ # * Added the '-e' command-line option to enclose non-ASCII characters between spaces
23
+ #
24
+ # version 11b -- text normalization modified:
25
+ # * take out the join digit line because it joins digits
26
+ # when it shouldn't have
27
+ # $norm_text =~ s/(\d)\s+(?=\d)/$1/g; #join digits
28
+ #
29
+ # version 11a -- corrected output of individual n-gram precision values
30
+ #
31
+ # version 11 -- bug fixes:
32
+ # * make filehandle operate in binary mode to prevent Perl from operating
33
+ # (by default in Red Hat 9) in UTF-8
34
+ # * fix failure on joining digits
35
+ # version 10 -- updated output to include more details of n-gram scoring.
36
+ # Defaults to generate both NIST and BLEU scores. Use -b for BLEU
37
+ # only, use -n for NIST only
38
+ #
39
+ # version 09d -- bug fix (for BLEU scoring, ngrams were fixed at 4
40
+ # being the max, regardless what was entered on the command line.)
41
+ #
42
+ # version 09c -- bug fix (During the calculation of ngram information,
43
+ # each ngram was being counted only once for each segment. This has
44
+ # been fixed so that each ngram is counted correctly in each segment.)
45
+ #
46
+ # version 09b -- text normalization modified:
47
+ # * option flag added to preserve upper case
48
+ # * non-ASCII characters left in place.
49
+ #
50
+ # version 09a -- text normalization modified:
51
+ # * &quot; and &amp; converted to "" and &, respectively
52
+ # * non-ASCII characters kept together (bug fix)
53
+ #
54
+ # version 09 -- modified to accommodate sgml tag and attribute
55
+ # names revised to conform to default SGML conventions.
56
+ #
57
+ # version 08 -- modifies the NIST metric in accordance with the
58
+ # findings on the 2001 Chinese-English dry run corpus. Also
59
+ # incorporates the BLEU metric as an option and supports the
60
+ # output of ngram detail.
61
+ #
62
+ # version 07 -- in response to the MT meeting on 28 Jan 2002 at ISI
63
+ # Keep strings of non-ASCII characters together as one word
64
+ # (rather than splitting them into one-character words).
65
+ # Change length penalty so that translations that are longer than
66
+ # the average reference translation are not penalized.
67
+ #
68
+ # version 06
69
+ # Prevent divide-by-zero when a segment has no evaluation N-grams.
70
+ # Correct segment index for level 3 debug output.
71
+ #
72
+ # version 05
73
+ # improve diagnostic error messages
74
+ #
75
+ # version 04
76
+ # tag segments
77
+ #
78
+ # version 03
79
+ # add detailed output option (intermediate document and segment scores)
80
+ #
81
+ # version 02
82
+ # accommodation of modified sgml tags and attributes
83
+ #
84
+ # version 01
85
+ # same as bleu version 15, but modified to provide formal score output.
86
+ #
87
+ # original IBM version
88
+ # Author: Kishore Papineni
89
+ # Date: 06/10/2001
90
+ #################################
91
+
92
+ ######
93
+ # Intro
94
+ my ($date, $time) = date_time_stamp();
95
+ print "MT evaluation scorer began on $date at $time\n";
96
+ print "command line: ", $0, " ", join(" ", @ARGV), "\n";
97
+ my $usage = "\n\nUsage: $0 [-h] -r <ref_file> -s <src_file> -t <tst_file>\n\n".
98
+ "Description: This Perl script evaluates MT system performance.\n".
99
+ "\n".
100
+ "Required arguments:\n".
101
+ " -r <ref_file> is a file containing the reference translations for\n".
102
+ " the documents to be evaluated.\n".
103
+ " -s <src_file> is a file containing the source documents for which\n".
104
+ " translations are to be evaluated\n".
105
+ " -t <tst_file> is a file containing the translations to be evaluated\n".
106
+ "\n".
107
+ "Optional arguments:\n".
108
+ " -c preserves upper-case alphabetic characters\n".
109
+ " -b generate BLEU scores only\n".
110
+ " -n generate NIST scores only\n".
111
+ " -d detailed output flag used in conjunction with \"-b\" or \"-n\" flags:\n".
112
+ " 0 (default) for system-level score only\n".
113
+ " 1 to include document-level scores\n".
114
+ " 2 to include segment-level scores\n".
115
+ " 3 to include ngram-level scores\n".
116
+ " -e enclose non-ASCII characters between spaces\n".
117
+ " -h prints this help message to STDOUT\n".
118
+ "\n";
119
+
120
+ use vars qw ($opt_r $opt_s $opt_t $opt_d $opt_h $opt_b $opt_n $opt_c $opt_x $opt_e);
121
+ use Getopt::Std;
122
+ getopts ('r:s:t:d:hbncx:e');
123
+ die $usage if defined($opt_h);
124
+ die "Error in command line: ref_file not defined$usage" unless defined $opt_r;
125
+ die "Error in command line: src_file not defined$usage" unless defined $opt_s;
126
+ die "Error in command line: tst_file not defined$usage" unless defined $opt_t;
127
+ my $max_Ngram = 9;
128
+ my $detail = defined $opt_d ? $opt_d : 0;
129
+ my $preserve_case = defined $opt_c ? 1 : 0;
130
+ my $split_non_ASCII = defined $opt_e ? 1 : 0;
131
+
132
+ my $METHOD = "BOTH";
133
+ if (defined $opt_b) { $METHOD = "BLEU"; }
134
+ if (defined $opt_n) { $METHOD = "NIST"; }
135
+ my $method;
136
+
137
+ my ($ref_file) = $opt_r;
138
+ my ($src_file) = $opt_s;
139
+ my ($tst_file) = $opt_t;
140
+
141
+ ######
142
+ # Global variables
143
+ my ($src_lang, $tgt_lang, @tst_sys, @ref_sys); # evaluation parameters
144
+ my (%tst_data, %ref_data); # the data -- with structure: {system}{document}[segments]
145
+ my ($src_id, $ref_id, $tst_id); # unique identifiers for ref and tst translation sets
146
+ my %eval_docs; # document information for the evaluation data set
147
+ my %ngram_info; # the information obtained from (the last word in) the ngram
148
+
149
+ ######
150
+ # Get source document ID's
151
+ ($src_id) = get_source_info ($src_file);
152
+
153
+ ######
154
+ # Get reference translations
155
+ ($ref_id) = get_MT_data (\%ref_data, "RefSet", $ref_file);
156
+
157
+ compute_ngram_info ();
158
+
159
+ ######
160
+ # Get translations to evaluate
161
+ ($tst_id) = get_MT_data (\%tst_data, "TstSet", $tst_file);
162
+
163
+ ######
164
+ # Check data for completeness and correctness
165
+ check_MT_data ();
166
+
167
+ ######
168
+ #
169
+ my %NISTmt = ();
170
+ my %BLEUmt = ();
171
+
172
+ ######
173
+ # Evaluate
174
+ print " Evaluation of $src_lang-to-$tgt_lang translation using:\n";
175
+ my $cum_seg = 0;
176
+ foreach my $doc (sort keys %eval_docs) {
177
+ $cum_seg += @{$eval_docs{$doc}{SEGS}};
178
+ }
179
+ print " src set \"$src_id\" (", scalar keys %eval_docs, " docs, $cum_seg segs)\n";
180
+ print " ref set \"$ref_id\" (", scalar keys %ref_data, " refs)\n";
181
+ print " tst set \"$tst_id\" (", scalar keys %tst_data, " systems)\n\n";
182
+
183
+ foreach my $sys (sort @tst_sys) {
184
+ for (my $n=1; $n<=$max_Ngram; $n++) {
185
+ $NISTmt{$n}{$sys}{cum} = 0;
186
+ $NISTmt{$n}{$sys}{ind} = 0;
187
+ $BLEUmt{$n}{$sys}{cum} = 0;
188
+ $BLEUmt{$n}{$sys}{ind} = 0;
189
+ }
190
+
191
+ if (($METHOD eq "BOTH") || ($METHOD eq "NIST")) {
192
+ $method="NIST";
193
+ score_system ($sys, %NISTmt);
194
+ }
195
+ if (($METHOD eq "BOTH") || ($METHOD eq "BLEU")) {
196
+ $method="BLEU";
197
+ score_system ($sys, %BLEUmt);
198
+ }
199
+ }
200
+
201
+ ######
202
+ printout_report ();
203
+
204
+ ($date, $time) = date_time_stamp();
205
+ print "MT evaluation scorer ended on $date at $time\n";
206
+
207
+ exit 0;
208
+
209
+ #################################
210
+
211
+ sub get_source_info {
212
+
213
+ my ($file) = @_;
214
+ my ($name, $id, $src, $doc);
215
+ my ($data, $tag, $span);
216
+
217
+
218
+ #read data from file
219
+ open (FILE, $file) or die "\nUnable to open translation data file '$file'", $usage;
220
+ binmode FILE, ":utf8";
221
+ $data .= $_ while <FILE>;
222
+ close (FILE);
223
+
224
+ #get source set info
225
+ die "\n\nFATAL INPUT ERROR: no 'src_set' tag in src_file '$file'\n\n"
226
+ unless ($tag, $span, $data) = extract_sgml_tag_and_span ("SrcSet", $data);
227
+
228
+ die "\n\nFATAL INPUT ERROR: no tag attribute '$name' in file '$file'\n\n"
229
+ unless ($id) = extract_sgml_tag_attribute ($name="SetID", $tag);
230
+
231
+ die "\n\nFATAL INPUT ERROR: no tag attribute '$name' in file '$file'\n\n"
232
+ unless ($src) = extract_sgml_tag_attribute ($name="SrcLang", $tag);
233
+ die "\n\nFATAL INPUT ERROR: $name ('$src') in file '$file' inconsistent\n"
234
+ ." with $name in previous input data ('$src_lang')\n\n"
235
+ unless (not defined $src_lang or $src eq $src_lang);
236
+ $src_lang = $src;
237
+
238
+ #get doc info -- ID and # of segs
239
+ $data = $span;
240
+ while (($tag, $span, $data) = extract_sgml_tag_and_span ("Doc", $data)) {
241
+ die "\n\nFATAL INPUT ERROR: no tag attribute '$name' in file '$file'\n\n"
242
+ unless ($doc) = extract_sgml_tag_attribute ($name="DocID", $tag);
243
+ die "\n\nFATAL INPUT ERROR: duplicate '$name' in file '$file'\n\n"
244
+ if defined $eval_docs{$doc};
245
+ $span =~ s/[\s\n\r]+/ /g; # concatenate records
246
+ my $jseg=0, my $seg_data = $span;
247
+ while (($tag, $span, $seg_data) = extract_sgml_tag_and_span ("Seg", $seg_data)) {
248
+ ($eval_docs{$doc}{SEGS}[$jseg++]) = NormalizeText ($span);
249
+ }
250
+ die "\n\nFATAL INPUT ERROR: no segments in document '$doc' in file '$file'\n\n"
251
+ if $jseg == 0;
252
+ }
253
+ die "\n\nFATAL INPUT ERROR: no documents in file '$file'\n\n"
254
+ unless keys %eval_docs > 0;
255
+ return $id;
256
+ }
257
+
258
+ #################################
259
+
260
+ sub get_MT_data {
261
+
262
+ my ($docs, $set_tag, $file) = @_;
263
+ my ($name, $id, $src, $tgt, $sys, $doc);
264
+ my ($tag, $span, $data);
265
+
266
+ #read data from file
267
+ open (FILE, $file) or die "\nUnable to open translation data file '$file'", $usage;
268
+ binmode FILE, ":utf8";
269
+ $data .= $_ while <FILE>;
270
+ close (FILE);
271
+
272
+ #get tag info
273
+ while (($tag, $span, $data) = extract_sgml_tag_and_span ($set_tag, $data)) {
274
+ die "\n\nFATAL INPUT ERROR: no tag attribute '$name' in file '$file'\n\n" unless
275
+ ($id) = extract_sgml_tag_attribute ($name="SetID", $tag);
276
+
277
+ die "\n\nFATAL INPUT ERROR: no tag attribute '$name' in file '$file'\n\n" unless
278
+ ($src) = extract_sgml_tag_attribute ($name="SrcLang", $tag);
279
+ die "\n\nFATAL INPUT ERROR: $name ('$src') in file '$file' inconsistent\n"
280
+ ." with $name of source ('$src_lang')\n\n"
281
+ unless $src eq $src_lang;
282
+
283
+ die "\n\nFATAL INPUT ERROR: no tag attribute '$name' in file '$file'\n\n" unless
284
+ ($tgt) = extract_sgml_tag_attribute ($name="TrgLang", $tag);
285
+ die "\n\nFATAL INPUT ERROR: $name ('$tgt') in file '$file' inconsistent\n"
286
+ ." with $name of the evaluation ('$tgt_lang')\n\n"
287
+ unless (not defined $tgt_lang or $tgt eq $tgt_lang);
288
+ $tgt_lang = $tgt;
289
+
290
+ my $mtdata = $span;
291
+ while (($tag, $span, $mtdata) = extract_sgml_tag_and_span ("Doc", $mtdata)) {
292
+ die "\n\nFATAL INPUT ERROR: no tag attribute '$name' in file '$file'\n\n" unless
293
+ (my $sys) = extract_sgml_tag_attribute ($name="SysID", $tag);
294
+
295
+ die "\n\nFATAL INPUT ERROR: no tag attribute '$name' in file '$file'\n\n" unless
296
+ $doc = extract_sgml_tag_attribute ($name="DocID", $tag);
297
+
298
+ die "\n\nFATAL INPUT ERROR: document '$doc' for system '$sys' in file '$file'\n"
299
+ ." previously loaded from file '$docs->{$sys}{$doc}{FILE}'\n\n"
300
+ unless (not defined $docs->{$sys}{$doc});
301
+
302
+ $span =~ s/[\s\n\r]+/ /g; # concatenate records
303
+ my $jseg=0, my $seg_data = $span;
304
+ while (($tag, $span, $seg_data) = extract_sgml_tag_and_span ("Seg", $seg_data)) {
305
+ ($docs->{$sys}{$doc}{SEGS}[$jseg++]) = NormalizeText ($span);
306
+ }
307
+ die "\n\nFATAL INPUT ERROR: no segments in document '$doc' in file '$file'\n\n"
308
+ if $jseg == 0;
309
+ $docs->{$sys}{$doc}{FILE} = $file;
310
+ }
311
+ }
312
+ return $id;
313
+ }
314
+
315
+ #################################
316
+
317
+ sub check_MT_data {
318
+
319
+ @tst_sys = sort keys %tst_data;
320
+ @ref_sys = sort keys %ref_data;
321
+
322
+ #every evaluation document must be represented for every system and every reference
323
+ foreach my $doc (sort keys %eval_docs) {
324
+ my $nseg_source = @{$eval_docs{$doc}{SEGS}};
325
+ foreach my $sys (@tst_sys) {
326
+ die "\n\nFATAL ERROR: no document '$doc' for system '$sys'\n\n"
327
+ unless defined $tst_data{$sys}{$doc};
328
+ my $nseg = @{$tst_data{$sys}{$doc}{SEGS}};
329
+ die "\n\nFATAL ERROR: translated documents must contain the same # of segments as the source, but\n"
330
+ ." document '$doc' for system '$sys' contains $nseg segments, while\n"
331
+ ." the source document contains $nseg_source segments.\n\n"
332
+ unless $nseg == $nseg_source;
333
+ }
334
+
335
+ foreach my $sys (@ref_sys) {
336
+ die "\n\nFATAL ERROR: no document '$doc' for reference '$sys'\n\n"
337
+ unless defined $ref_data{$sys}{$doc};
338
+ my $nseg = @{$ref_data{$sys}{$doc}{SEGS}};
339
+ die "\n\nFATAL ERROR: translated documents must contain the same # of segments as the source, but\n"
340
+ ." document '$doc' for system '$sys' contains $nseg segments, while\n"
341
+ ." the source document contains $nseg_source segments.\n\n"
342
+ unless $nseg == $nseg_source;
343
+ }
344
+ }
345
+ }
346
+
347
+ #################################
348
+
349
+ sub compute_ngram_info {
350
+
351
+ my ($ref, $doc, $seg);
352
+ my (@wrds, $tot_wrds, %ngrams, $ngram, $mgram);
353
+ my (%ngram_count, @tot_ngrams);
354
+
355
+ foreach $ref (keys %ref_data) {
356
+ foreach $doc (keys %{$ref_data{$ref}}) {
357
+ foreach $seg (@{$ref_data{$ref}{$doc}{SEGS}}) {
358
+ @wrds = split /\s+/, $seg;
359
+ $tot_wrds += @wrds;
360
+ %ngrams = %{Words2Ngrams (@wrds)};
361
+ foreach $ngram (keys %ngrams) {
362
+ $ngram_count{$ngram} += $ngrams{$ngram};
363
+ }
364
+ }
365
+ }
366
+ }
367
+
368
+ foreach $ngram (keys %ngram_count) {
369
+ @wrds = split / /, $ngram;
370
+ pop @wrds, $mgram = join " ", @wrds;
371
+ $ngram_info{$ngram} = - log
372
+ ($mgram ? $ngram_count{$ngram}/$ngram_count{$mgram}
373
+ : $ngram_count{$ngram}/$tot_wrds) / log 2;
374
+ if (defined $opt_x and $opt_x eq "ngram info") {
375
+ @wrds = split / /, $ngram;
376
+ printf "ngram info:%9.4f%6d%6d%8d%3d %s\n", $ngram_info{$ngram}, $ngram_count{$ngram},
377
+ $mgram ? $ngram_count{$mgram} : $tot_wrds, $tot_wrds, scalar @wrds, $ngram;
378
+ }
379
+ }
380
+ }
381
+
382
+ #################################
383
+
384
+ sub score_system {
385
+
386
+ my ($sys, $ref, $doc, %SCOREmt);
387
+ ($sys, %SCOREmt) = @_;
388
+ my ($shortest_ref_length, $match_cnt, $tst_cnt, $ref_cnt, $tst_info, $ref_info);
389
+ my ($cum_ref_length, @cum_match, @cum_tst_cnt, @cum_ref_cnt, @cum_tst_info, @cum_ref_info);
390
+
391
+ $cum_ref_length = 0;
392
+ for (my $j=1; $j<=$max_Ngram; $j++) {
393
+ $cum_match[$j] = $cum_tst_cnt[$j] = $cum_ref_cnt[$j] = $cum_tst_info[$j] = $cum_ref_info[$j] = 0;
394
+ }
395
+
396
+ foreach $doc (sort keys %eval_docs) {
397
+ ($shortest_ref_length, $match_cnt, $tst_cnt, $ref_cnt, $tst_info, $ref_info) = score_document ($sys, $doc);
398
+
399
+ #output document summary score
400
+ if (($detail >= 1 ) && ($METHOD eq "NIST")) {
401
+ my %DOCmt = ();
402
+ printf "$method score using 5-grams = %.4f for system \"$sys\" on document \"$doc\" (%d segments, %d words)\n",
403
+ nist_score (scalar @ref_sys, $match_cnt, $tst_cnt, $ref_cnt, $tst_info, $ref_info, $sys, %DOCmt),
404
+ scalar @{$tst_data{$sys}{$doc}{SEGS}}, $tst_cnt->[1];
405
+ }
406
+ if (($detail >= 1 ) && ($METHOD eq "BLEU")) {
407
+ my %DOCmt = ();
408
+ printf "$method score using 4-grams = %.4f for system \"$sys\" on document \"$doc\" (%d segments, %d words)\n",
409
+ bleu_score($shortest_ref_length, $match_cnt, $tst_cnt, $sys, %DOCmt),
410
+ scalar @{$tst_data{$sys}{$doc}{SEGS}}, $tst_cnt->[1];
411
+ }
412
+
413
+ $cum_ref_length += $shortest_ref_length;
414
+ for (my $j=1; $j<=$max_Ngram; $j++) {
415
+ $cum_match[$j] += $match_cnt->[$j];
416
+ $cum_tst_cnt[$j] += $tst_cnt->[$j];
417
+ $cum_ref_cnt[$j] += $ref_cnt->[$j];
418
+ $cum_tst_info[$j] += $tst_info->[$j];
419
+ $cum_ref_info[$j] += $ref_info->[$j];
420
+ printf "document info: $sys $doc %d-gram %d %d %d %9.4f %9.4f\n", $j, $match_cnt->[$j],
421
+ $tst_cnt->[$j], $ref_cnt->[$j], $tst_info->[$j], $ref_info->[$j]
422
+ if (defined $opt_x and $opt_x eq "document info");
423
+ }
424
+ }
425
+
426
+ #x #output system summary score
427
+ #x printf "$method score = %.4f for system \"$sys\"\n",
428
+ #x $method eq "BLEU" ? bleu_score($cum_ref_length, \@cum_match, \@cum_tst_cnt) :
429
+ #x nist_score (scalar @ref_sys, \@cum_match, \@cum_tst_cnt, \@cum_ref_cnt, \@cum_tst_info, \@cum_ref_info, $sys, %SCOREmt);
430
+ if ($method eq "BLEU") {
431
+ bleu_score($cum_ref_length, \@cum_match, \@cum_tst_cnt, $sys, %SCOREmt);
432
+ }
433
+ if ($method eq "NIST") {
434
+ nist_score (scalar @ref_sys, \@cum_match, \@cum_tst_cnt, \@cum_ref_cnt, \@cum_tst_info, \@cum_ref_info, $sys, %SCOREmt);
435
+ }
436
+ }
437
+
438
+ #################################
439
+
440
+ sub score_document {
441
+
442
+ my ($sys, $ref, $doc);
443
+ ($sys, $doc) = @_;
444
+ my ($shortest_ref_length, $match_cnt, $tst_cnt, $ref_cnt, $tst_info, $ref_info);
445
+ my ($cum_ref_length, @cum_match, @cum_tst_cnt, @cum_ref_cnt, @cum_tst_info, @cum_ref_info);
446
+
447
+ $cum_ref_length = 0;
448
+ for (my $j=1; $j<=$max_Ngram; $j++) {
449
+ $cum_match[$j] = $cum_tst_cnt[$j] = $cum_ref_cnt[$j] = $cum_tst_info[$j] = $cum_ref_info[$j] = 0;
450
+ }
451
+
452
+ #score each segment
453
+ for (my $jseg=0; $jseg<@{$tst_data{$sys}{$doc}{SEGS}}; $jseg++) {
454
+ my @ref_segments = ();
455
+ foreach $ref (@ref_sys) {
456
+ push @ref_segments, $ref_data{$ref}{$doc}{SEGS}[$jseg];
457
+ printf "ref '$ref', seg %d: %s\n", $jseg+1, $ref_data{$ref}{$doc}{SEGS}[$jseg]
458
+ if $detail >= 3;
459
+ }
460
+ printf "sys '$sys', seg %d: %s\n", $jseg+1, $tst_data{$sys}{$doc}{SEGS}[$jseg]
461
+ if $detail >= 3;
462
+ ($shortest_ref_length, $match_cnt, $tst_cnt, $ref_cnt, $tst_info, $ref_info) =
463
+ score_segment ($tst_data{$sys}{$doc}{SEGS}[$jseg], @ref_segments);
464
+
465
+ #output segment summary score
466
+ #x printf "$method score = %.4f for system \"$sys\" on segment %d of document \"$doc\" (%d words)\n",
467
+ #x $method eq "BLEU" ? bleu_score($shortest_ref_length, $match_cnt, $tst_cnt) :
468
+ #x nist_score (scalar @ref_sys, $match_cnt, $tst_cnt, $ref_cnt, $tst_info, $ref_info),
469
+ #x $jseg+1, $tst_cnt->[1]
470
+ #x if $detail >= 2;
471
+ if (($detail >=2) && ($METHOD eq "BLEU")) {
472
+ my %DOCmt = ();
473
+ printf " $method score using 4-grams = %.4f for system \"$sys\" on segment %d of document \"$doc\" (%d words)\n",
474
+ bleu_score($shortest_ref_length, $match_cnt, $tst_cnt, $sys, %DOCmt), $jseg+1, $tst_cnt->[1];
475
+ }
476
+ if (($detail >=2) && ($METHOD eq "NIST")) {
477
+ my %DOCmt = ();
478
+ printf " $method score using 5-grams = %.4f for system \"$sys\" on segment %d of document \"$doc\" (%d words)\n",
479
+ nist_score (scalar @ref_sys, $match_cnt, $tst_cnt, $ref_cnt, $tst_info, $ref_info, $sys, %DOCmt), $jseg+1, $tst_cnt->[1];
480
+ }
481
+
482
+
483
+ $cum_ref_length += $shortest_ref_length;
484
+ for (my $j=1; $j<=$max_Ngram; $j++) {
485
+ $cum_match[$j] += $match_cnt->[$j];
486
+ $cum_tst_cnt[$j] += $tst_cnt->[$j];
487
+ $cum_ref_cnt[$j] += $ref_cnt->[$j];
488
+ $cum_tst_info[$j] += $tst_info->[$j];
489
+ $cum_ref_info[$j] += $ref_info->[$j];
490
+ }
491
+ }
492
+ return ($cum_ref_length, [@cum_match], [@cum_tst_cnt], [@cum_ref_cnt], [@cum_tst_info], [@cum_ref_info]);
493
+ }
494
+
495
+ #################################
496
+
497
+ sub score_segment {
498
+
499
+ my ($tst_seg, @ref_segs) = @_;
500
+ my (@tst_wrds, %tst_ngrams, @match_count, @tst_count, @tst_info);
501
+ my (@ref_wrds, $ref_seg, %ref_ngrams, %ref_ngrams_max, @ref_count, @ref_info);
502
+ my ($ngram);
503
+ my (@nwrds_ref);
504
+ my $shortest_ref_length;
505
+
506
+ for (my $j=1; $j<= $max_Ngram; $j++) {
507
+ $match_count[$j] = $tst_count[$j] = $ref_count[$j] = $tst_info[$j] = $ref_info[$j] = 0;
508
+ }
509
+
510
+ # get the ngram counts for the test segment
511
+ @tst_wrds = split /\s+/, $tst_seg;
512
+ %tst_ngrams = %{Words2Ngrams (@tst_wrds)};
513
+ for (my $j=1; $j<=$max_Ngram; $j++) { # compute ngram counts
514
+ $tst_count[$j] = $j<=@tst_wrds ? (@tst_wrds - $j + 1) : 0;
515
+ }
516
+
517
+ # get the ngram counts for the reference segments
518
+ foreach $ref_seg (@ref_segs) {
519
+ @ref_wrds = split /\s+/, $ref_seg;
520
+ %ref_ngrams = %{Words2Ngrams (@ref_wrds)};
521
+ foreach $ngram (keys %ref_ngrams) { # find the maximum # of occurrences
522
+ my @wrds = split / /, $ngram;
523
+ $ref_info[@wrds] += $ngram_info{$ngram};
524
+ $ref_ngrams_max{$ngram} = defined $ref_ngrams_max{$ngram} ?
525
+ max ($ref_ngrams_max{$ngram}, $ref_ngrams{$ngram}) :
526
+ $ref_ngrams{$ngram};
527
+ }
528
+ for (my $j=1; $j<=$max_Ngram; $j++) { # update ngram counts
529
+ $ref_count[$j] += $j<=@ref_wrds ? (@ref_wrds - $j + 1) : 0;
530
+ }
531
+ $shortest_ref_length = scalar @ref_wrds # find the shortest reference segment
532
+ if (not defined $shortest_ref_length) or @ref_wrds < $shortest_ref_length;
533
+ }
534
+
535
+ # accumulate scoring stats for tst_seg ngrams that match ref_seg ngrams
536
+ foreach $ngram (keys %tst_ngrams) {
537
+ next unless defined $ref_ngrams_max{$ngram};
538
+ my @wrds = split / /, $ngram;
539
+ $tst_info[@wrds] += $ngram_info{$ngram} * min($tst_ngrams{$ngram},$ref_ngrams_max{$ngram});
540
+ $match_count[@wrds] += my $count = min($tst_ngrams{$ngram},$ref_ngrams_max{$ngram});
541
+ printf "%.2f info for each of $count %d-grams = '%s'\n", $ngram_info{$ngram}, scalar @wrds, $ngram
542
+ if $detail >= 3;
543
+ }
544
+
545
+ return ($shortest_ref_length, [@match_count], [@tst_count], [@ref_count], [@tst_info], [@ref_info]);
546
+ }
547
+
548
+ #################################
549
+
550
+ sub bleu_score {
551
+
552
+ my ($shortest_ref_length, $matching_ngrams, $tst_ngrams, $sys, %SCOREmt) = @_;
553
+
554
+ my $score = 0;
555
+ my $iscore = 0;
556
+ my $len_score = min (0, 1-$shortest_ref_length/$tst_ngrams->[1]);
557
+ print "length ratio: ".($tst_ngrams->[1]/$shortest_ref_length)." ($tst_ngrams->[1]/$shortest_ref_length), penalty (log): $len_score\n";
558
+
559
+ for (my $j=1; $j<=$max_Ngram; $j++) {
560
+ if ($matching_ngrams->[$j] == 0) {
561
+ $SCOREmt{$j}{$sys}{cum}=0;
562
+ } else {
563
+ # Cumulative N-Gram score
564
+ $score += log ($matching_ngrams->[$j]/$tst_ngrams->[$j]);
565
+ $SCOREmt{$j}{$sys}{cum} = exp($score/$j + $len_score);
566
+ # Individual N-Gram score
567
+ $iscore = log ($matching_ngrams->[$j]/$tst_ngrams->[$j]);
568
+ $SCOREmt{$j}{$sys}{ind} = exp($iscore);
569
+ }
570
+ }
571
+ return $SCOREmt{4}{$sys}{cum};
572
+ }
573
+
574
+ #################################
575
+
576
+ sub nist_score {
577
+
578
+ my ($nsys, $matching_ngrams, $tst_ngrams, $ref_ngrams, $tst_info, $ref_info, $sys, %SCOREmt) = @_;
579
+
580
+ my $score = 0;
581
+ my $iscore = 0;
582
+
583
+
584
+ for (my $n=1; $n<=$max_Ngram; $n++) {
585
+ $score += $tst_info->[$n]/max($tst_ngrams->[$n],1);
586
+ $SCOREmt{$n}{$sys}{cum} = $score * nist_length_penalty($tst_ngrams->[1]/($ref_ngrams->[1]/$nsys));
587
+
588
+ $iscore = $tst_info->[$n]/max($tst_ngrams->[$n],1);
589
+ $SCOREmt{$n}{$sys}{ind} = $iscore * nist_length_penalty($tst_ngrams->[1]/($ref_ngrams->[1]/$nsys));
590
+ }
591
+ return $SCOREmt{5}{$sys}{cum};
592
+ }
593
+
594
+ #################################
595
+
596
+ sub Words2Ngrams { #convert a string of words to an Ngram count hash
597
+
598
+ my %count = ();
599
+
600
+ for (; @_; shift) {
601
+ my ($j, $ngram, $word);
602
+ for ($j=0; $j<$max_Ngram and defined($word=$_[$j]); $j++) {
603
+ $ngram .= defined $ngram ? " $word" : $word;
604
+ $count{$ngram}++;
605
+ }
606
+ }
607
+ return {%count};
608
+ }
609
+
610
+ #################################
611
+
612
+ sub NormalizeText {
613
+ my ($norm_text) = @_;
614
+
615
+ $norm_text =~ s/<skipped>//g; # strip "skipped" tags
616
+ $norm_text =~ s/\p{Hyphen}\p{Zl}//g; # strip end-of-line hyphenation and join lines
617
+ $norm_text =~ s/\p{Zl}/ /g; # join lines
618
+
619
+ # replace entities
620
+ $norm_text =~ s/&quot;/\"/g; # quote to "
621
+ $norm_text =~ s/&amp;/&/g; # ampersand to &
622
+ $norm_text =~ s/&lt;/</g; # less-than to <
623
+ $norm_text =~ s/&gt;/>/g; # greater-than to >
624
+ $norm_text =~ s/&apos;/\'/g; # apostrophe to '
625
+
626
+ $norm_text = lc( $norm_text ) unless $preserve_case; # lowercasing if needed
627
+ $norm_text =~ s/([^[:ascii:]])/ $1 /g if ( $split_non_ASCII );
628
+
629
+ # punctuation: tokenize any punctuation unless followed AND preceded by a digit
630
+ $norm_text =~ s/(\P{N})(\p{P})/$1 $2 /g;
631
+ $norm_text =~ s/(\p{P})(\P{N})/ $1 $2/g;
632
+
633
+ $norm_text =~ s/(\p{S})/ $1 /g; # tokenize symbols
634
+
635
+ $norm_text =~ s/\p{Z}+/ /g; # one space only between words
636
+ $norm_text =~ s/^\p{Z}+//; # no leading space
637
+ $norm_text =~ s/\p{Z}+$//; # no trailing space
638
+
639
+ return $norm_text;
640
+ }
641
+
642
+ #################################
643
+
644
+ sub nist_length_penalty {
645
+
646
+ my ($ratio) = @_;
647
+ return 1 if $ratio >= 1;
648
+ return 0 if $ratio <= 0;
649
+ my $ratio_x = 1.5;
650
+ my $score_x = 0.5;
651
+ my $beta = -log($score_x)/log($ratio_x)/log($ratio_x);
652
+ return exp (-$beta*log($ratio)*log($ratio));
653
+ }
654
+
655
+ #################################
656
+
657
+ sub date_time_stamp {
658
+
659
+ my ($sec, $min, $hour, $mday, $mon, $year, $wday, $yday, $isdst) = localtime();
660
+ my @months = qw(Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec);
661
+ my ($date, $time);
662
+
663
+ $time = sprintf "%2.2d:%2.2d:%2.2d", $hour, $min, $sec;
664
+ $date = sprintf "%4.4s %3.3s %s", 1900+$year, $months[$mon], $mday;
665
+ return ($date, $time);
666
+ }
667
+
668
+ #################################
669
+
670
+ sub extract_sgml_tag_and_span {
671
+
672
+ my ($name, $data) = @_;
673
+
674
+ ($data =~ m|<$name\s*([^>]*)>(.*?)</$name\s*>(.*)|si) ? ($1, $2, $3) : ();
675
+ }
676
+
677
+ #################################
678
+
679
+ sub extract_sgml_tag_attribute {
680
+
681
+ my ($name, $data) = @_;
682
+
683
+ ($data =~ m|$name\s*=\s*\"([^\"]*)\"|si) ? ($1) : ();
684
+ }
685
+
686
+ #################################
687
+
688
+ sub max {
689
+
690
+ my ($max, $next);
691
+
692
+ return unless defined ($max=pop);
693
+ while (defined ($next=pop)) {
694
+ $max = $next if $next > $max;
695
+ }
696
+ return $max;
697
+ }
698
+
699
+ #################################
700
+
701
+ sub min {
702
+
703
+ my ($min, $next);
704
+
705
+ return unless defined ($min=pop);
706
+ while (defined ($next=pop)) {
707
+ $min = $next if $next < $min;
708
+ }
709
+ return $min;
710
+ }
711
+
712
+ #################################
713
+
714
+ sub printout_report
715
+ {
716
+
717
+ if ( $METHOD eq "BOTH" ) {
718
+ foreach my $sys (sort @tst_sys) {
719
+ printf "NIST score = %2.4f BLEU score = %.4f for system \"$sys\"\n",$NISTmt{5}{$sys}{cum},$BLEUmt{4}{$sys}{cum};
720
+ }
721
+ } elsif ($METHOD eq "NIST" ) {
722
+ foreach my $sys (sort @tst_sys) {
723
+ printf "NIST score = %2.4f for system \"$sys\"\n",$NISTmt{5}{$sys}{cum};
724
+ }
725
+ } elsif ($METHOD eq "BLEU" ) {
726
+ foreach my $sys (sort @tst_sys) {
727
+ printf "\nBLEU score = %.4f for system \"$sys\"\n",$BLEUmt{4}{$sys}{cum};
728
+ }
729
+ }
730
+
731
+
732
+ printf "\n# ------------------------------------------------------------------------\n\n";
733
+ printf "Individual N-gram scoring\n";
734
+ printf " 1-gram 2-gram 3-gram 4-gram 5-gram 6-gram 7-gram 8-gram 9-gram\n";
735
+ printf " ------ ------ ------ ------ ------ ------ ------ ------ ------\n";
736
+
737
+ if (( $METHOD eq "BOTH" ) || ($METHOD eq "NIST")) {
738
+ foreach my $sys (sort @tst_sys) {
739
+ printf " NIST:";
740
+ for (my $i=1; $i<=$max_Ngram; $i++) {
741
+ printf " %2.4f ",$NISTmt{$i}{$sys}{ind}
742
+ }
743
+ printf " \"$sys\"\n";
744
+ }
745
+ printf "\n";
746
+ }
747
+
748
+ if (( $METHOD eq "BOTH" ) || ($METHOD eq "BLEU")) {
749
+ foreach my $sys (sort @tst_sys) {
750
+ printf " BLEU:";
751
+ for (my $i=1; $i<=$max_Ngram; $i++) {
752
+ printf " %2.4f ",$BLEUmt{$i}{$sys}{ind}
753
+ }
754
+ printf " \"$sys\"\n";
755
+ }
756
+ }
757
+
758
+ printf "\n# ------------------------------------------------------------------------\n";
759
+ printf "Cumulative N-gram scoring\n";
760
+ printf " 1-gram 2-gram 3-gram 4-gram 5-gram 6-gram 7-gram 8-gram 9-gram\n";
761
+ printf " ------ ------ ------ ------ ------ ------ ------ ------ ------\n";
762
+
763
+ if (( $METHOD eq "BOTH" ) || ($METHOD eq "NIST")) {
764
+ foreach my $sys (sort @tst_sys) {
765
+ printf " NIST:";
766
+ for (my $i=1; $i<=$max_Ngram; $i++) {
767
+ printf " %2.4f ",$NISTmt{$i}{$sys}{cum}
768
+ }
769
+ printf " \"$sys\"\n";
770
+ }
771
+ }
772
+ printf "\n";
773
+
774
+
775
+ if (( $METHOD eq "BOTH" ) || ($METHOD eq "BLEU")) {
776
+ foreach my $sys (sort @tst_sys) {
777
+ printf " BLEU:";
778
+ for (my $i=1; $i<=$max_Ngram; $i++) {
779
+ printf " %2.4f ",$BLEUmt{$i}{$sys}{cum}
780
+ }
781
+ printf " \"$sys\"\n";
782
+ }
783
+ }
784
+ }
mosesdecoder/scripts/generic/mteval-v13a.pl ADDED
@@ -0,0 +1,1170 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env perl
2
+
3
+ use warnings;
4
+ use strict;
5
+ use utf8;
6
+ use Encode;
7
+ use XML::Twig;
8
+
9
+ binmode STDOUT, ":utf8";
10
+ binmode STDERR, ":utf8";
11
+
12
+
13
+ #################################
14
+ # History:
15
+ #
16
+ # version 13a
17
+ # * modified the scoring functions to prevent division-by-zero errors when a system segment is empty
18
+ # * affected methods: 'bleu_score' and 'bleu_score_smoothing'
19
+ # * use \p{Line_Breaks} instead of \p{Hyphen} when stripping end-of-line hyphenation and join lines
20
+ # * because \p{Hyphen} is deprecated since 2016-06-01, see http://www.unicode.org/reports/tr14/#Hyphen
21
+ #
22
+ # version 13
23
+ # * Uses a XML parser to read data (only when extension is .xml)
24
+ # * Smoothing of the segment-level BLEU scores, done by default
25
+ # * smoothing method similar to that of bleu-1.04.pl (IBM)
26
+ # * see comments above the 'bleu_score' method for more details on how the smoothing is computed
27
+ # * added a '--no-smoothing' option to simulate old scripts behavior
28
+ # * Introduction of the 'brevity-penalty' option, taking one of two values:
29
+ # * 'closest' (default) : act as IBM BLEU (taking the closest reference translation length)
30
+ # * in case two reference translations are at the same distance, will take the shortest one
31
+ # * for more details regarding how the BP is computed, see comments of the 'brevity_penalty_closest' function
32
+ # * 'shortest' : act as previous versions of the script (taking shortest reference translation length)
33
+ # * Introduction of the 'international-tokenization' option, boolean, disabled by default
34
+ # by default (when the option is not provided), uses 11b's tokenization function
35
+ # when option specified, uses v12's tokenization function
36
+ # * Introduction of a 'Metrics MATR output' flag (option '--metricsMATR')
37
+ # when used, creates three files for both BLEU score and NIST score:
38
+ # * BLEU-seg.scr and NIST-seg.scr: contain segment-level scores
39
+ # * BLEU-doc.scr and NIST-doc.scr: contain document-level scores
40
+ # * BLEU-sys.scr and NIST-sys.scr: contain system-level scores
41
+ # * SGML parsing
42
+ # * script will halt if source, reference and test files don't share the same setid attribute value (used for metricsMATR output)
43
+ # * correct segment IDs extracted from the files (was previously using an array, and using the index as a segID for output)
44
+ # * detailed output flag (-d) can now be used when running both BLEU and NIST
45
+ #
46
+ # version 12
47
+ # * Text normalization changes:
48
+ # * convert entity references (only the entities declared in the DTD)
49
+ # * now uses unicode categories
50
+ # * tokenize punctuation unless followed AND preceded by digits
51
+ # * tokenize symbols
52
+ # * UTF-8 handling:
53
+ # * files are now read using utf8 mode
54
+ # * Added the '-e' command-line option to enclose non-ASCII characters between spaces
55
+ #
56
+ # version 11b -- text normalization modified:
57
+ # * take out the join digit line because it joins digits
58
+ # when it shouldn't have
59
+ # $norm_text =~ s/(\d)\s+(?=\d)/$1/g; #join digits
60
+ #
61
+ # version 11a -- corrected output of individual n-gram precision values
62
+ #
63
+ # version 11 -- bug fixes:
64
+ # * make filehandle operate in binary mode to prevent Perl from operating
65
+ # (by default in Red Hat 9) in UTF-8
66
+ # * fix failure on joining digits
67
+ # version 10 -- updated output to include more details of n-gram scoring.
68
+ # Defaults to generate both NIST and BLEU scores. Use -b for BLEU
69
+ # only, use -n for NIST only
70
+ #
71
+ # version 09d -- bug fix (for BLEU scoring, ngrams were fixed at 4
72
+ # being the max, regardless what was entered on the command line.)
73
+ #
74
+ # version 09c -- bug fix (During the calculation of ngram information,
75
+ # each ngram was being counted only once for each segment. This has
76
+ # been fixed so that each ngram is counted correctly in each segment.)
77
+ #
78
+ # version 09b -- text normalization modified:
79
+ # * option flag added to preserve upper case
80
+ # * non-ASCII characters left in place.
81
+ #
82
+ # version 09a -- text normalization modified:
83
+ # * &quot; and &amp; converted to "" and &, respectively
84
+ # * non-ASCII characters kept together (bug fix)
85
+ #
86
+ # version 09 -- modified to accommodate sgml tag and attribute
87
+ # names revised to conform to default SGML conventions.
88
+ #
89
+ # version 08 -- modifies the NIST metric in accordance with the
90
+ # findings on the 2001 Chinese-English dry run corpus. Also
91
+ # incorporates the BLEU metric as an option and supports the
92
+ # output of ngram detail.
93
+ #
94
+ # version 07 -- in response to the MT meeting on 28 Jan 2002 at ISI
95
+ # Keep strings of non-ASCII characters together as one word
96
+ # (rather than splitting them into one-character words).
97
+ # Change length penalty so that translations that are longer than
98
+ # the average reference translation are not penalized.
99
+ #
100
+ # version 06
101
+ # Prevent divide-by-zero when a segment has no evaluation N-grams.
102
+ # Correct segment index for level 3 debug output.
103
+ #
104
+ # version 05
105
+ # improve diagnostic error messages
106
+ #
107
+ # version 04
108
+ # tag segments
109
+ #
110
+ # version 03
111
+ # add detailed output option (intermediate document and segment scores)
112
+ #
113
+ # version 02
114
+ # accommodation of modified sgml tags and attributes
115
+ #
116
+ # version 01
117
+ # same as bleu version 15, but modified to provide formal score output.
118
+ #
119
+ # original IBM version
120
+ # Author: Kishore Papineni
121
+ # Date: 06/10/2001
122
+ #################################
123
+
124
+ ######
125
+ # Intro
126
+ my ($date, $time) = date_time_stamp();
127
+ print "MT evaluation scorer began on $date at $time\n";
128
+ print "command line: ", $0, " ", join(" ", @ARGV), "\n";
129
+ my $usage = "\n\nUsage: $0 -r <ref_file> -s <src_file> -t <tst_file>\n\n".
130
+ "Description: This Perl script evaluates MT system performance.\n".
131
+ "\n".
132
+ "Required arguments:\n".
133
+ " -r <ref_file> is a file containing the reference translations for\n".
134
+ " the documents to be evaluated.\n".
135
+ " -s <src_file> is a file containing the source documents for which\n".
136
+ " translations are to be evaluated\n".
137
+ " -t <tst_file> is a file containing the translations to be evaluated\n".
138
+ "\n".
139
+ "Optional arguments:\n".
140
+ " -h prints this help message to STDOUT\n".
141
+ " -c preserves upper-case alphabetic characters\n".
142
+ " -b generate BLEU scores only\n".
143
+ " -n generate NIST scores only\n".
144
+ " -d detailed output flag:\n".
145
+ " 0 (default) for system-level score only\n".
146
+ " 1 to include document-level scores\n".
147
+ " 2 to include segment-level scores\n".
148
+ " 3 to include ngram-level scores\n".
149
+ " -e enclose non-ASCII characters between spaces\n".
150
+ " --brevity-penalty ( closest | shortest )\n" .
151
+ " closest (default) : acts as IBM BLEU (takes the closest reference translation length)\n" .
152
+ " shortest : acts as previous versions of the script (takes the shortest reference translation length)\n" .
153
+ " --international-tokenization\n" .
154
+ " when specified, uses Unicode-based (only) tokenization rules\n" .
155
+ " when not specified (default), uses default tokenization (some language-dependent rules)\n" .
156
+ " --metricsMATR : create three files for both BLEU scores and NIST scores:\n" .
157
+ " BLEU-seg.scr and NIST-seg.scr : segment-level scores\n" .
158
+ " BLEU-doc.scr and NIST-doc.scr : document-level scores\n" .
159
+ " BLEU-sys.scr and NIST-sys.scr : system-level scores\n" .
160
+ " --no-smoothing : disable smoothing on BLEU scores\n" .
161
+ "\n";
162
+
163
+ use vars qw ($opt_r $opt_s $opt_t $opt_d $opt_h $opt_b $opt_n $opt_c $opt_x $opt_e);
164
+ use Getopt::Long;
165
+ my $ref_file = '';
166
+ my $src_file = '';
167
+ my $tst_file = '';
168
+ my $detail = 0;
169
+ my $help = '';
170
+ my $preserve_case = '';
171
+ my $split_non_ASCII = '';
172
+ my $brevity_penalty = 'closest';
173
+ my $international_tokenization;
174
+ my $metricsMATR_output = '';
175
+ my $no_smoothing = '';
176
+ our $opt_x = '';
177
+ our $opt_b = '';
178
+ our $opt_n = '';
179
+ GetOptions(
180
+ 'r=s' => \$ref_file,
181
+ 's=s' => \$src_file,
182
+ 't=s' => \$tst_file,
183
+ 'd:i' => \$detail,
184
+ 'h|help' => \$help,
185
+ 'b',
186
+ 'n',
187
+ 'c' => \$preserve_case,
188
+ 'x:s',
189
+ 'e' => \$split_non_ASCII,
190
+ 'brevity-penalty:s' => \$brevity_penalty,
191
+ 'international-tokenization' => \$international_tokenization,
192
+ 'metricsMATR-output' => \$metricsMATR_output,
193
+ 'no-smoothing' => \$no_smoothing
194
+ );
195
+ die $usage if $help;
196
+
197
+ die "Error in command line: ref_file not defined$usage" unless ( $ref_file );
198
+ die "Error in command line: src_file not defined$usage" unless ( $src_file );
199
+ die "Error in command line: tst_file not defined$usage" unless ( $tst_file );
200
+ my $BLEU_BP;
201
+ if ( !( $brevity_penalty cmp 'closest' ) )
202
+ {
203
+ $BLEU_BP = \&brevity_penalty_closest;
204
+ }
205
+ elsif ( !( $brevity_penalty cmp 'shortest' ) )
206
+ {
207
+ $BLEU_BP = \&brevity_penalty_shortest;
208
+ }
209
+ else
210
+ {
211
+ die "Incorrect value supplied for 'brevity_penalty'$usage";
212
+ }
213
+ my $TOKENIZATION = \&tokenization;
214
+ $TOKENIZATION = \&tokenization_international if ( $international_tokenization );
215
+
216
+ my $BLEU_SCORE = \&bleu_score;
217
+ $BLEU_SCORE = \&bleu_score_nosmoothing if ( $no_smoothing );
218
+
219
+ my $max_Ngram = 9;
220
+
221
+ my $METHOD = "BOTH";
222
+ if ( $opt_b ) { $METHOD = "BLEU"; }
223
+ if ( $opt_n ) { $METHOD = "NIST"; }
224
+ my $method;
225
+
226
+ ######
227
+ # Global variables
228
+ my ($src_lang, $tgt_lang, @tst_sys, @ref_sys); # evaluation parameters
229
+ my (%tst_data, %ref_data); # the data -- with structure: {system}{document}{segments}
230
+ my ($src_id, $ref_id, $tst_id); # unique identifiers for ref and tst translation sets
231
+ my %eval_docs; # document information for the evaluation data set
232
+ my %ngram_info; # the information obtained from (the last word in) the ngram
233
+
234
+ ######
235
+ # Get source document ID's
236
+ ($src_id) = get_source_info ($src_file);
237
+
238
+ ######
239
+ # Get reference translations
240
+ ($ref_id) = get_MT_data (\%ref_data, "RefSet", $ref_file);
241
+
242
+ compute_ngram_info ();
243
+
244
+ ######
245
+ # Get translations to evaluate
246
+ ($tst_id) = get_MT_data (\%tst_data, "TstSet", $tst_file);
247
+
248
+ ######
249
+ # Check data for completeness and correctness
250
+ check_MT_data ();
251
+
252
+ ######
253
+ #
254
+ my %NISTmt;
255
+ my %NISTOverall;
256
+ my %BLEUmt;
257
+ my %BLEUOverall;
258
+
259
+ ######
260
+ # Evaluate
261
+ print " Evaluation of $src_lang-to-$tgt_lang translation using:\n";
262
+ my $cum_seg = 0;
263
+ foreach my $doc (sort keys %eval_docs)
264
+ {
265
+ $cum_seg += scalar( keys( %{$eval_docs{$doc}{SEGS}} ) );
266
+ }
267
+ print " src set \"$src_id\" (", scalar keys %eval_docs, " docs, $cum_seg segs)\n";
268
+ print " ref set \"$ref_id\" (", scalar keys %ref_data, " refs)\n";
269
+ print " tst set \"$tst_id\" (", scalar keys %tst_data, " systems)\n\n";
270
+
271
+ foreach my $sys (sort @tst_sys)
272
+ {
273
+ for (my $n=1; $n<=$max_Ngram; $n++)
274
+ {
275
+ $NISTmt{$n}{$sys}{cum} = 0;
276
+ $NISTmt{$n}{$sys}{ind} = 0;
277
+ $BLEUmt{$n}{$sys}{cum} = 0;
278
+ $BLEUmt{$n}{$sys}{ind} = 0;
279
+ }
280
+ if ( ($METHOD eq "BOTH") || ($METHOD eq "NIST") )
281
+ {
282
+ $method="NIST";
283
+ score_system ($sys, \%NISTmt, \%NISTOverall);
284
+ }
285
+ if ( ($METHOD eq "BOTH") || ($METHOD eq "BLEU") )
286
+ {
287
+ $method="BLEU";
288
+ score_system ($sys, \%BLEUmt, \%BLEUOverall);
289
+ }
290
+ }
291
+
292
+ ######
293
+ printout_report ();
294
+ if ( $metricsMATR_output )
295
+ {
296
+ outputMetricsMATR( 'NIST', %NISTOverall ) if ( ( $METHOD eq 'BOTH' ) || ( $METHOD eq 'NIST' ) );
297
+ outputMetricsMATR( 'BLEU', %BLEUOverall ) if ( ( $METHOD eq 'BOTH' ) || ( $METHOD eq 'BLEU' ) );
298
+ }
299
+
300
+ ($date, $time) = date_time_stamp();
301
+ print "MT evaluation scorer ended on $date at $time\n";
302
+
303
+ exit 0;
304
+
305
+ #################################
306
+
307
+ sub get_source_info
308
+ {
309
+ my ($file) = @_;
310
+ my ($name, $id, $src, $doc, $seg);
311
+ my ($data, $tag, $span);
312
+
313
+ # Extension of the file determines the parser used:
314
+ # .xml : XML::Twig
315
+ # otherwise : simple SGML parsing functions
316
+ if ( $file =~ /\.xml$/i )
317
+ {
318
+ my $twig = XML::Twig->new();
319
+ $twig->parsefile( $file );
320
+ my $root = $twig->root;
321
+ my $currentSet = $root->first_child( 'srcset' );
322
+ die "Source XML file '$file' does not contain the 'srcset' element" if ( not $currentSet );
323
+ $id = $currentSet->{ 'att' }->{ 'setid' } or die "No 'setid' attribute value in '$file'";
324
+ $src = $currentSet->{ 'att' }->{ 'srclang' } or die "No srcset 'srclang' attribute value in '$file'";
325
+ die "Not the same srclang attribute values across sets" unless ( not defined $src_lang or $src eq $src_lang );
326
+ $src_lang = $src;
327
+ foreach my $currentDoc ( $currentSet->get_xpath( './/doc' ) )
328
+ {
329
+ my $docID = $currentDoc->{ 'att' }->{ 'docid' } or die "No document 'docid' attribute value in '$file'";
330
+ foreach my $currentSeg ( $currentDoc->get_xpath( './/seg' ) )
331
+ {
332
+ my $segID = $currentSeg->{ 'att' }->{ 'id' } or die "No segment 'id' attribute value in '$file'";
333
+ my $segData = $currentSeg->text;
334
+ ($eval_docs{$docID}{SEGS}{$segID}) = &{ $TOKENIZATION }( $segData );
335
+ }
336
+ }
337
+ }
338
+ else
339
+ {
340
+ #read data from file
341
+ open (FILE, $file) or die "\nUnable to open translation data file '$file'", $usage;
342
+ binmode FILE, ":utf8";
343
+ $data .= $_ while <FILE>;
344
+ close (FILE);
345
+
346
+ #get source set info
347
+ die "\n\nFATAL INPUT ERROR: no 'src_set' tag in src_file '$file'\n\n"
348
+ unless ($tag, $span, $data) = extract_sgml_tag_and_span ("SrcSet", $data);
349
+ die "\n\nFATAL INPUT ERROR: no tag attribute '$name' in file '$file'\n\n"
350
+ unless ($id) = extract_sgml_tag_attribute ($name="SetID", $tag);
351
+ die "\n\nFATAL INPUT ERROR: no tag attribute '$name' in file '$file'\n\n"
352
+ unless ($src) = extract_sgml_tag_attribute ($name="SrcLang", $tag);
353
+ die "\n\nFATAL INPUT ERROR: $name ('$src') in file '$file' inconsistent\n"
354
+ ." with $name in previous input data ('$src_lang')\n\n"
355
+ unless (not defined $src_lang or $src eq $src_lang);
356
+ $src_lang = $src;
357
+
358
+ #get doc info -- ID and # of segs
359
+ $data = $span;
360
+ while (($tag, $span, $data) = extract_sgml_tag_and_span ("Doc", $data))
361
+ {
362
+ die "\n\nFATAL INPUT ERROR: no tag attribute '$name' in file '$file'\n\n"
363
+ unless ($doc) = extract_sgml_tag_attribute ($name="DocID", $tag);
364
+ die "\n\nFATAL INPUT ERROR: duplicate '$name' in file '$file'\n\n"
365
+ if defined $eval_docs{$doc};
366
+ $span =~ s/[\s\n\r]+/ /g; # concatenate records
367
+ my $nseg=0, my $seg_data = $span;
368
+ while (($tag, $span, $seg_data) = extract_sgml_tag_and_span ("Seg", $seg_data))
369
+ {
370
+ die "\n\nFATAL INPUT ERROR: no attribute '$name' in file '$file'\n\n"
371
+ unless ($seg) = extract_sgml_tag_attribute( $name='id', $tag );
372
+ ($eval_docs{$doc}{SEGS}{$seg}) = &{ $TOKENIZATION }( $span );
373
+ $nseg++;
374
+ }
375
+ die "\n\nFATAL INPUT ERROR: no segments in document '$doc' in file '$file'\n\n"
376
+ if $nseg == 0;
377
+ }
378
+ die "\n\nFATAL INPUT ERROR: no documents in file '$file'\n\n"
379
+ unless keys %eval_docs > 0;
380
+ }
381
+ return $id;
382
+ }
383
+
384
+ #################################
385
+
386
+ sub get_MT_data
387
+ {
388
+ my ($docs, $set_tag, $file) = @_;
389
+ my ($name, $id, $src, $tgt, $sys, $doc, $seg);
390
+ my ($tag, $span, $data);
391
+
392
+ # Extension of the file determines the parser used:
393
+ # .xml : XML::Twig
394
+ # otherwise : simple SGML parsing functions
395
+ if ( $file =~ /\.xml$/i )
396
+ {
397
+ my $twig = XML::Twig->new();
398
+ $twig->parsefile( $file );
399
+ my $root = $twig->root;
400
+ foreach my $currentSet ( $root->get_xpath( 'refset' ), $root->get_xpath( 'tstset' ) )
401
+ {
402
+ $id = $currentSet->{ 'att' }->{ 'setid' } or die "No 'setid' attribute value in '$file'";
403
+ $src = $currentSet->{ 'att' }->{ 'srclang' } or die "No 'srclang' attribute value in '$file'";
404
+ $tgt = $currentSet->{ 'att' }->{ 'trglang' } or die "No 'trglang' attribute value in '$file'";
405
+ die "Not the same 'srclang' attribute value across sets" unless ( $src eq $src_lang );
406
+ die "Not the same 'trglang' attribute value across sets" unless ( ( not defined $tgt_lang ) or ( $tgt eq $tgt_lang ) );
407
+ $tgt_lang = $tgt;
408
+ my $sys;
409
+ if ( $currentSet->name eq 'tstset' )
410
+ {
411
+ $sys = $currentSet->{ 'att' }->{ 'sysid' } or die "No 'sysid' attribute value in '$file'";
412
+ }
413
+ else
414
+ {
415
+ $sys = $currentSet->{ 'att' }->{ 'refid' } or die "No 'refid' attribute value in '$file'";
416
+ }
417
+ foreach my $currentDoc ( $currentSet->get_xpath( './/doc' ) )
418
+ {
419
+ my $docID = $currentDoc->{ 'att' }->{ 'docid' } or die "No document 'docid' attribute value in '$file'";
420
+ $docs->{ $sys }{ $docID }{ FILE } = $file;
421
+ foreach my $currentSeg ( $currentDoc->get_xpath( './/seg' ) )
422
+ {
423
+ my $segID = $currentSeg->{ 'att' }->{ 'id' } or die "No segment 'id' attribute value in '$file'";
424
+ my $segData = $currentSeg->text;
425
+ ($docs->{$sys}{$docID}{SEGS}{$segID}) = &{ $TOKENIZATION }( $segData );
426
+ }
427
+ }
428
+ }
429
+ }
430
+ else
431
+ {
432
+ #read data from file
433
+ open (FILE, $file) or die "\nUnable to open translation data file '$file'", $usage;
434
+ binmode FILE, ":utf8";
435
+ $data .= $_ while <FILE>;
436
+ close (FILE);
437
+
438
+ #get tag info
439
+ while (($tag, $span, $data) = extract_sgml_tag_and_span ($set_tag, $data))
440
+ {
441
+ die "\n\nFATAL INPUT ERROR: no tag attribute '$name' in file '$file'\n\n"
442
+ unless ($id) = extract_sgml_tag_attribute ($name="SetID", $tag);
443
+ die "\n\nFATAL INPUT ERROR: no tag attribute '$name' in file '$file'\n\n"
444
+ unless ($src) = extract_sgml_tag_attribute ($name="SrcLang", $tag);
445
+ die "\n\nFATAL INPUT ERROR: $name ('$src') in file '$file' inconsistent\n"
446
+ ." with $name of source ('$src_lang')\n\n"
447
+ unless $src eq $src_lang;
448
+ die "\n\nFATAL INPUT ERROR: no tag attribute '$name' in file '$file'\n\n"
449
+ unless ($tgt) = extract_sgml_tag_attribute ($name="TrgLang", $tag);
450
+ die "\n\nFATAL INPUT ERROR: $name ('$tgt') in file '$file' inconsistent\n"
451
+ ." with $name of the evaluation ('$tgt_lang')\n\n"
452
+ unless (not defined $tgt_lang or $tgt eq $tgt_lang);
453
+ $tgt_lang = $tgt;
454
+
455
+ my $mtdata = $span;
456
+ while (($tag, $span, $mtdata) = extract_sgml_tag_and_span ("Doc", $mtdata))
457
+ {
458
+ die "\n\nFATAL INPUT ERROR: no tag attribute '$name' in file '$file'\n\n"
459
+ unless (my $sys) = extract_sgml_tag_attribute ($name="SysID", $tag);
460
+ die "\n\nFATAL INPUT ERROR: no tag attribute '$name' in file '$file'\n\n"
461
+ unless $doc = extract_sgml_tag_attribute ($name="DocID", $tag);
462
+ die "\n\nFATAL INPUT ERROR: document '$doc' for system '$sys' in file '$file'\n"
463
+ ." previously loaded from file '$docs->{$sys}{$doc}{FILE}'\n\n"
464
+ unless (not defined $docs->{$sys}{$doc});
465
+
466
+ $span =~ s/[\s\n\r]+/ /g; # concatenate records
467
+ my $nseg=0, my $seg_data = $span;
468
+ while (($tag, $span, $seg_data) = extract_sgml_tag_and_span ("Seg", $seg_data))
469
+ {
470
+ die "\n\nFATAL INPUT ERROR: no tag attribute '$name' in file '$file'\n\n"
471
+ unless $seg = extract_sgml_tag_attribute( $name="id", $tag );
472
+ ($docs->{$sys}{$doc}{SEGS}{$seg}) = &{ $TOKENIZATION }( $span );
473
+ $nseg++;
474
+ }
475
+ die "\n\nFATAL INPUT ERROR: no segments in document '$doc' in file '$file'\n\n" if $nseg == 0;
476
+ $docs->{$sys}{$doc}{FILE} = $file;
477
+ }
478
+ }
479
+ }
480
+ return $id;
481
+ }
482
+
483
+ #################################
484
+
485
+ sub check_MT_data
486
+ {
487
+ @tst_sys = sort keys %tst_data;
488
+ @ref_sys = sort keys %ref_data;
489
+
490
+ die "Not the same 'setid' attribute values across files" unless ( ( $src_id eq $tst_id ) && ( $src_id eq $ref_id ) );
491
+
492
+ #every evaluation document must be represented for every system and every reference
493
+ foreach my $doc (sort keys %eval_docs)
494
+ {
495
+ my $nseg_source = scalar( keys( %{$eval_docs{$doc}{SEGS}} ) );
496
+ foreach my $sys (@tst_sys)
497
+ {
498
+ die "\n\nFATAL ERROR: no document '$doc' for system '$sys'\n\n" unless defined $tst_data{$sys}{$doc};
499
+ my $nseg = scalar( keys( %{$tst_data{$sys}{$doc}{SEGS}} ) );
500
+ die "\n\nFATAL ERROR: translated documents must contain the same # of segments as the source, but\n"
501
+ ." document '$doc' for system '$sys' contains $nseg segments, while\n"
502
+ ." the source document contains $nseg_source segments.\n\n"
503
+ unless $nseg == $nseg_source;
504
+ }
505
+ foreach my $sys (@ref_sys)
506
+ {
507
+ die "\n\nFATAL ERROR: no document '$doc' for reference '$sys'\n\n" unless defined $ref_data{$sys}{$doc};
508
+ my $nseg = scalar( keys( %{$ref_data{$sys}{$doc}{SEGS}} ) );
509
+ die "\n\nFATAL ERROR: translated documents must contain the same # of segments as the source, but\n"
510
+ ." document '$doc' for system '$sys' contains $nseg segments, while\n"
511
+ ." the source document contains $nseg_source segments.\n\n"
512
+ unless $nseg == $nseg_source;
513
+ }
514
+ }
515
+ }
516
+
517
+ #################################
518
+
519
+ sub compute_ngram_info
520
+ {
521
+ my ($ref, $doc, $seg);
522
+ my (@wrds, $tot_wrds, %ngrams, $ngram, $mgram);
523
+ my (%ngram_count, @tot_ngrams);
524
+
525
+ foreach $ref (keys %ref_data)
526
+ {
527
+ foreach $doc (keys %{$ref_data{$ref}})
528
+ {
529
+ foreach $seg ( keys %{$ref_data{$ref}{$doc}{SEGS}})
530
+ {
531
+ @wrds = split /\s+/, $ref_data{ $ref }{ $doc }{ SEGS }{ $seg };
532
+ $tot_wrds += @wrds;
533
+ %ngrams = %{Words2Ngrams (@wrds)};
534
+ foreach $ngram (keys %ngrams)
535
+ {
536
+ $ngram_count{$ngram} += $ngrams{$ngram};
537
+ }
538
+ }
539
+ }
540
+ }
541
+
542
+ foreach $ngram (keys %ngram_count)
543
+ {
544
+ @wrds = split / /, $ngram;
545
+ pop @wrds, $mgram = join " ", @wrds;
546
+ $ngram_info{$ngram} = - log ($mgram ? $ngram_count{$ngram}/$ngram_count{$mgram} : $ngram_count{$ngram}/$tot_wrds) / log 2;
547
+ if (defined $opt_x and $opt_x eq "ngram info")
548
+ {
549
+ @wrds = split / /, $ngram;
550
+ printf "ngram info:%9.4f%6d%6d%8d%3d %s\n", $ngram_info{$ngram}, $ngram_count{$ngram},
551
+ $mgram ? $ngram_count{$mgram} : $tot_wrds, $tot_wrds, scalar @wrds, $ngram;
552
+ }
553
+ }
554
+ }
555
+
556
+ #################################
557
+
558
+ sub score_system
559
+ {
560
+ my ($sys, $ref, $doc, $SCOREmt, $overallScore);
561
+ ($sys, $SCOREmt, $overallScore) = @_;
562
+ my ($ref_length, $match_cnt, $tst_cnt, $ref_cnt, $tst_info, $ref_info);
563
+ my ($cum_ref_length, @cum_match, @cum_tst_cnt, @cum_ref_cnt, @cum_tst_info, @cum_ref_info);
564
+
565
+ $cum_ref_length = 0;
566
+ for (my $j=1; $j<=$max_Ngram; $j++)
567
+ {
568
+ $cum_match[$j] = $cum_tst_cnt[$j] = $cum_ref_cnt[$j] = $cum_tst_info[$j] = $cum_ref_info[$j] = 0;
569
+ }
570
+ foreach $doc (sort keys %eval_docs)
571
+ {
572
+ ($ref_length, $match_cnt, $tst_cnt, $ref_cnt, $tst_info, $ref_info) = score_document ($sys, $doc, $overallScore);
573
+ if ( $method eq "NIST" )
574
+ {
575
+ my %DOCmt = ();
576
+ my $docScore = nist_score( scalar( @ref_sys ), $match_cnt, $tst_cnt, $ref_cnt, $tst_info, $ref_info, $sys, \%DOCmt );
577
+ $overallScore->{ $sys }{ 'documents' }{ $doc }{ 'score' } = $docScore;
578
+ if ( $detail >= 1 )
579
+ {
580
+ printf "$method score using 5-grams = %.4f for system \"$sys\" on document \"$doc\" (%d segments, %d words)\n",
581
+ $docScore, scalar keys %{$tst_data{$sys}{$doc}{SEGS}}, $tst_cnt->[1];
582
+ }
583
+ }
584
+
585
+ if ( $method eq "BLEU" )
586
+ {
587
+ my %DOCmt = ();
588
+ my $docScore = &{$BLEU_SCORE}( $ref_length, $match_cnt, $tst_cnt, $sys, \%DOCmt );
589
+ $overallScore->{ $sys }{ 'documents' }{ $doc }{ 'score' } = $docScore;
590
+ if ( $detail >= 1 )
591
+ {
592
+ printf "$method score using 4-grams = %.4f for system \"$sys\" on document \"$doc\" (%d segments, %d words)\n",
593
+ $docScore, scalar keys %{$tst_data{$sys}{$doc}{SEGS}}, $tst_cnt->[1];
594
+ }
595
+ }
596
+
597
+ $cum_ref_length += $ref_length;
598
+ for (my $j=1; $j<=$max_Ngram; $j++)
599
+ {
600
+ $cum_match[$j] += $match_cnt->[$j];
601
+ $cum_tst_cnt[$j] += $tst_cnt->[$j];
602
+ $cum_ref_cnt[$j] += $ref_cnt->[$j];
603
+ $cum_tst_info[$j] += $tst_info->[$j];
604
+ $cum_ref_info[$j] += $ref_info->[$j];
605
+ printf "document info: $sys $doc %d-gram %d %d %d %9.4f %9.4f\n", $j, $match_cnt->[$j],
606
+ $tst_cnt->[$j], $ref_cnt->[$j], $tst_info->[$j], $ref_info->[$j]
607
+ if (defined $opt_x and $opt_x eq "document info");
608
+ }
609
+ }
610
+
611
+ if ($method eq "BLEU")
612
+ {
613
+ $overallScore->{ $sys }{ 'score' } = &{$BLEU_SCORE}($cum_ref_length, \@cum_match, \@cum_tst_cnt, $sys, $SCOREmt, 1);
614
+ }
615
+ if ($method eq "NIST")
616
+ {
617
+ $overallScore->{ $sys }{ 'score' } = nist_score (scalar @ref_sys, \@cum_match, \@cum_tst_cnt, \@cum_ref_cnt, \@cum_tst_info, \@cum_ref_info, $sys, $SCOREmt);
618
+ }
619
+ }
620
+
621
+ #################################
622
+
623
+ sub score_document
624
+ {
625
+ my ($sys, $ref, $doc, $overallScore);
626
+ ($sys, $doc, $overallScore) = @_;
627
+ my ($ref_length, $match_cnt, $tst_cnt, $ref_cnt, $tst_info, $ref_info);
628
+ my ($cum_ref_length, @cum_match, @cum_tst_cnt, @cum_ref_cnt, @cum_tst_info, @cum_ref_info);
629
+
630
+ $cum_ref_length = 0;
631
+ for (my $j=1; $j<=$max_Ngram; $j++)
632
+ {
633
+ $cum_match[$j] = $cum_tst_cnt[$j] = $cum_ref_cnt[$j] = $cum_tst_info[$j] = $cum_ref_info[$j] = 0;
634
+ }
635
+
636
+ #score each segment
637
+ foreach my $seg ( sort{ $a <=> $b } keys( %{$tst_data{$sys}{$doc}{SEGS}} ) )
638
+ {
639
+ my @ref_segments = ();
640
+ foreach $ref (@ref_sys)
641
+ {
642
+ push @ref_segments, $ref_data{$ref}{$doc}{SEGS}{$seg};
643
+ if ( $detail >= 3 )
644
+ {
645
+ printf "ref '$ref', seg $seg: %s\n", $ref_data{$ref}{$doc}{SEGS}{$seg}
646
+ }
647
+
648
+ }
649
+
650
+ printf "sys '$sys', seg $seg: %s\n", $tst_data{$sys}{$doc}{SEGS}{$seg} if ( $detail >= 3 );
651
+ ($ref_length, $match_cnt, $tst_cnt, $ref_cnt, $tst_info, $ref_info) = score_segment ($tst_data{$sys}{$doc}{SEGS}{$seg}, @ref_segments);
652
+
653
+ if ( $method eq "BLEU" )
654
+ {
655
+ my %DOCmt = ();
656
+ my $segScore = &{$BLEU_SCORE}($ref_length, $match_cnt, $tst_cnt, $sys, %DOCmt);
657
+ $overallScore->{ $sys }{ 'documents' }{ $doc }{ 'segments' }{ $seg }{ 'score' } = $segScore;
658
+ if ( $detail >= 2 )
659
+ {
660
+ printf " $method score using 4-grams = %.4f for system \"$sys\" on segment $seg of document \"$doc\" (%d words)\n", $segScore, $tst_cnt->[1]
661
+ }
662
+ }
663
+ if ( $method eq "NIST" )
664
+ {
665
+ my %DOCmt = ();
666
+ my $segScore = nist_score (scalar @ref_sys, $match_cnt, $tst_cnt, $ref_cnt, $tst_info, $ref_info, $sys, %DOCmt);
667
+ $overallScore->{ $sys }{ 'documents' }{ $doc }{ 'segments' }{ $seg }{ 'score' } = $segScore;
668
+ if ( $detail >= 2 )
669
+ {
670
+ printf " $method score using 5-grams = %.4f for system \"$sys\" on segment $seg of document \"$doc\" (%d words)\n", $segScore, $tst_cnt->[1];
671
+ }
672
+ }
673
+ $cum_ref_length += $ref_length;
674
+ for (my $j=1; $j<=$max_Ngram; $j++)
675
+ {
676
+ $cum_match[$j] += $match_cnt->[$j];
677
+ $cum_tst_cnt[$j] += $tst_cnt->[$j];
678
+ $cum_ref_cnt[$j] += $ref_cnt->[$j];
679
+ $cum_tst_info[$j] += $tst_info->[$j];
680
+ $cum_ref_info[$j] += $ref_info->[$j];
681
+ }
682
+ }
683
+ return ($cum_ref_length, [@cum_match], [@cum_tst_cnt], [@cum_ref_cnt], [@cum_tst_info], [@cum_ref_info]);
684
+ }
685
+
686
+ ###############################################################################################################################
687
+ # function returning the shortest reference length
688
+ # takes as input:
689
+ # - currentLength : the current (shortest) reference length
690
+ # - referenceSentenceLength : the current reference sentence length
691
+ # - candidateSentenceLength : the current candidate sentence length (unused)
692
+ ###############################################################################################################################
693
+ sub brevity_penalty_shortest
694
+ {
695
+ my ( $currentLength, $referenceSentenceLength, $candidateSentenceLength ) = @_;
696
+ return ( $referenceSentenceLength < $currentLength ? $referenceSentenceLength : $currentLength );
697
+ }
698
+
699
+ ###############################################################################################################################
700
+ # function returning the closest reference length (to the candidate sentence length)
701
+ # takes as input:
702
+ # - currentLength: the current (closest) reference length.
703
+ # - referenceSentenceLength : the current reference sentence length
704
+ # - candidateSentenceLength : the current candidate sentence length
705
+ # when two reference sentences are at the same distance, it will return the shortest reference sentence length
706
+ # example of 4 iterations, given:
707
+ # - one candidate sentence containing 7 tokens
708
+ # - one reference translation containing 11 tokens
709
+ # - one reference translation containing 8 tokens
710
+ # - one reference translation containing 6 tokens
711
+ # - one reference translation containing 7 tokens
712
+ # the multiple invocations will return:
713
+ # - currentLength is set to 11 (outside of this function)
714
+ # - brevity_penalty_closest( 11, 8, 7 ) returns 8, since abs( 8 - 7 ) < abs( 11 - 7 )
715
+ # - brevity_penalty_closest( 8, 6, 7 ) returns 6, since abs( 8 - 7 ) == abs( 6 - 7 ) AND 6 < 8
716
+ # - brevity_penalty_closest( 7, 6, 7 ) returns 7, since abs( 7 - 7 ) < abs( 6 - 7 )
717
+ ###############################################################################################################################
718
+ sub brevity_penalty_closest
719
+ {
720
+ my ( $currentLength, $referenceSentenceLength, $candidateSentenceLength ) = @_;
721
+ my $result = $currentLength;
722
+ if ( abs( $candidateSentenceLength - $referenceSentenceLength ) <= abs( $candidateSentenceLength - $currentLength ) )
723
+ {
724
+ if ( abs( $candidateSentenceLength - $referenceSentenceLength ) == abs( $candidateSentenceLength - $currentLength ) )
725
+ {
726
+ if ( $currentLength > $referenceSentenceLength )
727
+ {
728
+ $result = $referenceSentenceLength;
729
+ }
730
+ }
731
+ else
732
+ {
733
+ $result = $referenceSentenceLength;
734
+ }
735
+ }
736
+ return $result;
737
+ }
738
+
739
+ #################################
740
+
741
+ sub score_segment
742
+ {
743
+ my ($tst_seg, @ref_segs) = @_;
744
+ my (@tst_wrds, %tst_ngrams, @match_count, @tst_count, @tst_info);
745
+ my (@ref_wrds, $ref_seg, %ref_ngrams, %ref_ngrams_max, @ref_count, @ref_info);
746
+ my ($ngram);
747
+ my (@nwrds_ref);
748
+ my $ref_length;
749
+
750
+ for (my $j=1; $j<= $max_Ngram; $j++)
751
+ {
752
+ $match_count[$j] = $tst_count[$j] = $ref_count[$j] = $tst_info[$j] = $ref_info[$j] = 0;
753
+ }
754
+
755
+ # get the ngram counts for the test segment
756
+ @tst_wrds = split /\s+/, $tst_seg;
757
+ %tst_ngrams = %{Words2Ngrams (@tst_wrds)};
758
+ for (my $j=1; $j<=$max_Ngram; $j++)
759
+ {
760
+ # compute ngram counts
761
+ $tst_count[$j] = $j<=@tst_wrds ? (@tst_wrds - $j + 1) : 0;
762
+ }
763
+
764
+ # get the ngram counts for the reference segments
765
+ foreach $ref_seg (@ref_segs)
766
+ {
767
+ @ref_wrds = split /\s+/, $ref_seg;
768
+ %ref_ngrams = %{Words2Ngrams (@ref_wrds)};
769
+ foreach $ngram (keys %ref_ngrams)
770
+ {
771
+ # find the maximum # of occurrences
772
+ my @wrds = split / /, $ngram;
773
+ $ref_info[@wrds] += $ngram_info{$ngram};
774
+ $ref_ngrams_max{$ngram} = defined $ref_ngrams_max{$ngram} ? max ($ref_ngrams_max{$ngram}, $ref_ngrams{$ngram}) : $ref_ngrams{$ngram};
775
+ }
776
+ for (my $j=1; $j<=$max_Ngram; $j++)
777
+ {
778
+ # update ngram counts
779
+ $ref_count[$j] += $j<=@ref_wrds ? (@ref_wrds - $j + 1) : 0;
780
+ }
781
+ if ( not defined( $ref_length ) )
782
+ {
783
+ $ref_length = scalar( @ref_wrds );
784
+ }
785
+ else
786
+ {
787
+ $ref_length = &{$BLEU_BP}( $ref_length, scalar( @ref_wrds ), scalar( @tst_wrds ) );
788
+ }
789
+ }
790
+
791
+ # accumulate scoring stats for tst_seg ngrams that match ref_seg ngrams
792
+ foreach $ngram (keys %tst_ngrams)
793
+ {
794
+ next unless defined $ref_ngrams_max{$ngram};
795
+ my @wrds = split / /, $ngram;
796
+ $tst_info[@wrds] += $ngram_info{$ngram} * min($tst_ngrams{$ngram},$ref_ngrams_max{$ngram});
797
+ $match_count[@wrds] += my $count = min($tst_ngrams{$ngram},$ref_ngrams_max{$ngram});
798
+ printf "%.2f info for each of $count %d-grams = '%s'\n", $ngram_info{$ngram}, scalar @wrds, $ngram
799
+ if $detail >= 3;
800
+ }
801
+
802
+ return ($ref_length, [@match_count], [@tst_count], [@ref_count], [@tst_info], [@ref_info]);
803
+ }
804
+
805
+ #################################
806
+
807
+ sub bleu_score_nosmoothing
808
+ {
809
+ my ($ref_length, $matching_ngrams, $tst_ngrams, $sys, $SCOREmt) = @_;
810
+ my $score = 0;
811
+ my $iscore = 0;
812
+
813
+ for ( my $j = 1; $j <= $max_Ngram; ++$j )
814
+ {
815
+ if ($matching_ngrams->[ $j ] == 0)
816
+ {
817
+ $SCOREmt->{ $j }{ $sys }{ cum }=0;
818
+ }
819
+ else
820
+ {
821
+ my $len_score = min (0, 1-$ref_length/$tst_ngrams->[1]);
822
+ # Cumulative N-Gram score
823
+ $score += log( $matching_ngrams->[ $j ] / $tst_ngrams->[ $j ] );
824
+ $SCOREmt->{ $j }{ $sys }{ cum } = exp( $score / $j + $len_score );
825
+ # Individual N-Gram score
826
+ $iscore = log( $matching_ngrams->[ $j ] / $tst_ngrams->[ $j ] );
827
+ $SCOREmt->{ $j }{ $sys }{ ind } = exp( $iscore );
828
+ }
829
+ }
830
+ return $SCOREmt->{ 4 }{ $sys }{ cum };
831
+ }
832
+
833
+ ###############################################################################################################################
834
+ # Default method used to compute the BLEU score, using smoothing.
835
+ # Note that the method used can be overridden using the '--no-smoothing' command-line argument
836
+ # The smoothing is computed by taking 1 / ( 2^k ), instead of 0, for each precision score whose matching n-gram count is null
837
+ # k is 1 for the first 'n' value for which the n-gram match count is null
838
+ # For example, if the text contains:
839
+ # - one 2-gram match
840
+ # - and (consequently) two 1-gram matches
841
+ # the n-gram count for each individual precision score would be:
842
+ # - n=1 => prec_count = 2 (two unigrams)
843
+ # - n=2 => prec_count = 1 (one bigram)
844
+ # - n=3 => prec_count = 1/2 (no trigram, taking 'smoothed' value of 1 / ( 2^k ), with k=1)
845
+ # - n=4 => prec_count = 1/4 (no fourgram, taking 'smoothed' value of 1 / ( 2^k ), with k=2)
846
+ ###############################################################################################################################
847
+ sub bleu_score
848
+ {
849
+ my ($ref_length, $matching_ngrams, $tst_ngrams, $sys, $SCOREmt,$report_length) = @_;
850
+ my $score = 0;
851
+ my $iscore = 0;
852
+ my $exp_len_score = 0;
853
+ $exp_len_score = exp( min (0, 1 - $ref_length / $tst_ngrams->[ 1 ] ) ) if ( $tst_ngrams->[ 1 ] > 0 );
854
+ print "length ratio: ".($tst_ngrams->[1]/$ref_length)." ($tst_ngrams->[1]/$ref_length), penalty (log): ".log($exp_len_score)."\n" if $report_length;
855
+ my $smooth = 1;
856
+ for ( my $j = 1; $j <= $max_Ngram; ++$j )
857
+ {
858
+ if ( $tst_ngrams->[ $j ] == 0 )
859
+ {
860
+ $iscore = 0;
861
+ }
862
+ elsif ( $matching_ngrams->[ $j ] == 0 )
863
+ {
864
+ $smooth *= 2;
865
+ $iscore = log( 1 / ( $smooth * $tst_ngrams->[ $j ] ) );
866
+ }
867
+ else
868
+ {
869
+ $iscore = log( $matching_ngrams->[ $j ] / $tst_ngrams->[ $j ] );
870
+ }
871
+ $SCOREmt->{ $j }{ $sys }{ ind } = exp( $iscore );
872
+ $score += $iscore;
873
+ $SCOREmt->{ $j }{ $sys }{ cum } = exp( $score / $j ) * $exp_len_score;
874
+ }
875
+ return $SCOREmt->{ 4 }{ $sys }{ cum };
876
+ }
877
+
878
+ #################################
879
+
880
+ sub nist_score
881
+ {
882
+ my ($nsys, $matching_ngrams, $tst_ngrams, $ref_ngrams, $tst_info, $ref_info, $sys, $SCOREmt) = @_;
883
+ my $score = 0;
884
+ my $iscore = 0;
885
+
886
+ for (my $n=1; $n<=$max_Ngram; $n++)
887
+ {
888
+ $score += $tst_info->[$n]/max($tst_ngrams->[$n],1);
889
+ $SCOREmt->{$n}{$sys}{cum} = $score * nist_length_penalty($tst_ngrams->[1]/($ref_ngrams->[1]/$nsys));
890
+ $iscore = $tst_info->[$n]/max($tst_ngrams->[$n],1);
891
+ $SCOREmt->{$n}{$sys}{ind} = $iscore * nist_length_penalty($tst_ngrams->[1]/($ref_ngrams->[1]/$nsys));
892
+ }
893
+ return $SCOREmt->{5}{$sys}{cum};
894
+ }
895
+
896
+ #################################
897
+
898
+ sub Words2Ngrams
899
+ {
900
+ #convert a string of words to an Ngram count hash
901
+ my %count = ();
902
+
903
+ for (; @_; shift)
904
+ {
905
+ my ($j, $ngram, $word);
906
+ for ($j=0; $j<$max_Ngram and defined($word=$_[$j]); $j++)
907
+ {
908
+ $ngram .= defined $ngram ? " $word" : $word;
909
+ $count{$ngram}++;
910
+ }
911
+ }
912
+ return {%count};
913
+ }
914
+
915
+ #################################
916
+
917
+ sub tokenization
918
+ {
919
+ my ($norm_text) = @_;
920
+
921
+ # language-independent part:
922
+ $norm_text =~ s/<skipped>//g; # strip "skipped" tags
923
+ $norm_text =~ s/-\n//g; # strip end-of-line hyphenation and join lines
924
+ $norm_text =~ s/\n/ /g; # join lines
925
+ $norm_text =~ s/&quot;/"/g; # convert SGML tag for quote to "
926
+ $norm_text =~ s/&amp;/&/g; # convert SGML tag for ampersand to &
927
+ $norm_text =~ s/&lt;/</g; # convert SGML tag for less-than to <
928
+ $norm_text =~ s/&gt;/>/g; # convert SGML tag for greater-than to >
929
+
930
+ # language-dependent part (assuming Western languages):
931
+ $norm_text = " $norm_text ";
932
+ $norm_text =~ tr/[A-Z]/[a-z]/ unless $preserve_case;
933
+ $norm_text =~ s/([\{-\~\[-\` -\&\(-\+\:-\@\/])/ $1 /g; # tokenize punctuation
934
+ $norm_text =~ s/([^0-9])([\.,])/$1 $2 /g; # tokenize period and comma unless preceded by a digit
935
+ $norm_text =~ s/([\.,])([^0-9])/ $1 $2/g; # tokenize period and comma unless followed by a digit
936
+ $norm_text =~ s/([0-9])(-)/$1 $2 /g; # tokenize dash when preceded by a digit
937
+ $norm_text =~ s/\s+/ /g; # one space only between words
938
+ $norm_text =~ s/^\s+//; # no leading space
939
+ $norm_text =~ s/\s+$//; # no trailing space
940
+
941
+ return $norm_text;
942
+ }
943
+
944
+
945
+ sub tokenization_international
946
+ {
947
+ my ($norm_text) = @_;
948
+
949
+ $norm_text =~ s/<skipped>//g; # strip "skipped" tags
950
+ $norm_text =~ s/\p{Line_Break: Hyphen}\p{Zl}//g; # strip end-of-line hyphenation and join lines
951
+ $norm_text =~ s/\p{Zl}/ /g; # join lines
952
+
953
+ # replace entities
954
+ $norm_text =~ s/&quot;/\"/g; # quote to "
955
+ $norm_text =~ s/&amp;/&/g; # ampersand to &
956
+ $norm_text =~ s/&lt;/</g; # less-than to <
957
+ $norm_text =~ s/&gt;/>/g; # greater-than to >
958
+ $norm_text =~ s/&apos;/\'/g; # apostrophe to '
959
+
960
+ $norm_text = lc( $norm_text ) unless $preserve_case; # lowercasing if needed
961
+ $norm_text =~ s/([^[:ascii:]])/ $1 /g if ( $split_non_ASCII );
962
+
963
+ # punctuation: tokenize any punctuation unless followed AND preceded by a digit
964
+ $norm_text =~ s/(\P{N})(\p{P})/$1 $2 /g;
965
+ $norm_text =~ s/(\p{P})(\P{N})/ $1 $2/g;
966
+
967
+ $norm_text =~ s/(\p{S})/ $1 /g; # tokenize symbols
968
+
969
+ $norm_text =~ s/\p{Z}+/ /g; # one space only between words
970
+ $norm_text =~ s/^\p{Z}+//; # no leading space
971
+ $norm_text =~ s/\p{Z}+$//; # no trailing space
972
+
973
+ return $norm_text;
974
+ }
975
+
976
+ #################################
977
+
978
+ sub nist_length_penalty
979
+ {
980
+ my ($ratio) = @_;
981
+ return 1 if $ratio >= 1;
982
+ return 0 if $ratio <= 0;
983
+ my $ratio_x = 1.5;
984
+ my $score_x = 0.5;
985
+ my $beta = -log($score_x)/log($ratio_x)/log($ratio_x);
986
+ return exp (-$beta*log($ratio)*log($ratio));
987
+ }
988
+
989
+ #################################
990
+
991
+ sub date_time_stamp
992
+ {
993
+ my ($sec, $min, $hour, $mday, $mon, $year, $wday, $yday, $isdst) = localtime();
994
+ my @months = qw(Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec);
995
+ my ($date, $time);
996
+ $time = sprintf "%2.2d:%2.2d:%2.2d", $hour, $min, $sec;
997
+ $date = sprintf "%4.4s %3.3s %s", 1900+$year, $months[$mon], $mday;
998
+ return ($date, $time);
999
+ }
1000
+
1001
+ #################################
1002
+
1003
+ sub extract_sgml_tag_and_span
1004
+ {
1005
+ my ($name, $data) = @_;
1006
+ ($data =~ m|<$name\s*([^>]*)>(.*?)</$name\s*>(.*)|si) ? ($1, $2, $3) : ();
1007
+ }
1008
+
1009
+ #################################
1010
+
1011
+ sub extract_sgml_tag_attribute
1012
+ {
1013
+ my ($name, $data) = @_;
1014
+ ($data =~ m|$name\s*=\s*\"?([^\"]*)\"?|si) ? ($1) : ();
1015
+ }
1016
+
1017
+ #################################
1018
+
1019
+ sub max
1020
+ {
1021
+ my ($max, $next);
1022
+
1023
+ return unless defined ($max=pop);
1024
+ while (defined ($next=pop))
1025
+ {
1026
+ $max = $next if $next > $max;
1027
+ }
1028
+ return $max;
1029
+ }
1030
+
1031
+ #################################
1032
+
1033
+ sub min
1034
+ {
1035
+ my ($min, $next);
1036
+
1037
+ return unless defined ($min=pop);
1038
+ while (defined ($next=pop))
1039
+ {
1040
+ $min = $next if $next < $min;
1041
+ }
1042
+ return $min;
1043
+ }
1044
+
1045
+ #################################
1046
+
1047
+ sub printout_report
1048
+ {
1049
+ if ( $METHOD eq "BOTH" )
1050
+ {
1051
+ foreach my $sys (sort @tst_sys)
1052
+ {
1053
+ printf "NIST score = %2.4f BLEU score = %.4f for system \"$sys\"\n",$NISTmt{5}{$sys}{cum},$BLEUmt{4}{$sys}{cum};
1054
+ }
1055
+ }
1056
+ elsif ($METHOD eq "NIST" )
1057
+ {
1058
+ foreach my $sys (sort @tst_sys)
1059
+ {
1060
+ printf "NIST score = %2.4f for system \"$sys\"\n",$NISTmt{5}{$sys}{cum};
1061
+ }
1062
+ }
1063
+ elsif ($METHOD eq "BLEU" )
1064
+ {
1065
+ foreach my $sys (sort @tst_sys)
1066
+ {
1067
+ printf "\nBLEU score = %.4f for system \"$sys\"\n",$BLEUmt{4}{$sys}{cum};
1068
+ }
1069
+ }
1070
+ printf "\n# ------------------------------------------------------------------------\n\n";
1071
+ printf "Individual N-gram scoring\n";
1072
+ printf " 1-gram 2-gram 3-gram 4-gram 5-gram 6-gram 7-gram 8-gram 9-gram\n";
1073
+ printf " ------ ------ ------ ------ ------ ------ ------ ------ ------\n";
1074
+
1075
+ if ( ( $METHOD eq "BOTH" ) || ($METHOD eq "NIST") )
1076
+ {
1077
+ foreach my $sys (sort @tst_sys)
1078
+ {
1079
+ printf " NIST:";
1080
+ for (my $i=1; $i<=$max_Ngram; $i++)
1081
+ {
1082
+ printf " %2.4f ",$NISTmt{$i}{$sys}{ind}
1083
+ }
1084
+ printf " \"$sys\"\n";
1085
+ }
1086
+ printf "\n";
1087
+ }
1088
+
1089
+ if ( ( $METHOD eq "BOTH" ) || ($METHOD eq "BLEU") )
1090
+ {
1091
+ foreach my $sys (sort @tst_sys)
1092
+ {
1093
+ printf " BLEU:";
1094
+ for (my $i=1; $i<=$max_Ngram; $i++)
1095
+ {
1096
+ printf " %2.4f ",$BLEUmt{$i}{$sys}{ind}
1097
+ }
1098
+ printf " \"$sys\"\n";
1099
+ }
1100
+ }
1101
+
1102
+ printf "\n# ------------------------------------------------------------------------\n";
1103
+ printf "Cumulative N-gram scoring\n";
1104
+ printf " 1-gram 2-gram 3-gram 4-gram 5-gram 6-gram 7-gram 8-gram 9-gram\n";
1105
+ printf " ------ ------ ------ ------ ------ ------ ------ ------ ------\n";
1106
+
1107
+ if (( $METHOD eq "BOTH" ) || ($METHOD eq "NIST"))
1108
+ {
1109
+ foreach my $sys (sort @tst_sys)
1110
+ {
1111
+ printf " NIST:";
1112
+ for (my $i=1; $i<=$max_Ngram; $i++)
1113
+ {
1114
+ printf " %2.4f ",$NISTmt{$i}{$sys}{cum}
1115
+ }
1116
+ printf " \"$sys\"\n";
1117
+ }
1118
+ }
1119
+ printf "\n";
1120
+ if ( ( $METHOD eq "BOTH" ) || ($METHOD eq "BLEU") )
1121
+ {
1122
+ foreach my $sys (sort @tst_sys)
1123
+ {
1124
+ printf " BLEU:";
1125
+ for (my $i=1; $i<=$max_Ngram; $i++)
1126
+ {
1127
+ printf " %2.4f ",$BLEUmt{$i}{$sys}{cum}
1128
+ }
1129
+ printf " \"$sys\"\n";
1130
+ }
1131
+ }
1132
+ }
1133
+
1134
+ ###############################################################################################################################
1135
+ # Create three files, by using:
1136
+ # - $prefix : the prefix used for the output file names
1137
+ # - %overall : a hash containing seg/doc/sys-level scores:
1138
+ # - $overall{ $SYSTEM_ID }{ 'score' } => system-level score
1139
+ # - $overall{ $SYSTEM_ID }{ 'documents' }{ $DOCUMENT_ID }{ 'score' } => document-level score
1140
+ # - $overall{ $SYSTEM_ID }{ 'documents' }{ $DOCUMENT_ID }{ 'segments' }{ $SEGMENT_ID } => segment-level score
1141
+ ###############################################################################################################################
1142
+ sub outputMetricsMATR
1143
+ {
1144
+ my ( $prefix, %overall ) = @_;
1145
+ my $fileNameSys = $prefix . '-sys.scr';
1146
+ my $fileNameDoc = $prefix . '-doc.scr';
1147
+ my $fileNameSeg = $prefix . '-seg.scr';
1148
+ open FILEOUT_SYS, '>', $fileNameSys or die "Could not open file: ${fileNameSys}";
1149
+ open FILEOUT_DOC, '>', $fileNameDoc or die "Could not open file: ${fileNameDoc}";
1150
+ open FILEOUT_SEG, '>', $fileNameSeg or die "Could not open file: ${fileNameSeg}";
1151
+ foreach my $sys ( sort( keys( %overall ) ) )
1152
+ {
1153
+ my $scoreSys = $overall{ $sys }{ 'score' };
1154
+ print FILEOUT_SYS "${tst_id}\t${sys}\t${scoreSys}\n";
1155
+ foreach my $doc ( sort( keys( %{$overall{ $sys }{ 'documents' }} ) ) )
1156
+ {
1157
+ my $scoreDoc = $overall{ $sys }{ 'documents' }{ $doc }{ 'score' };
1158
+ print FILEOUT_DOC "${tst_id}\t${sys}\t${doc}\t${scoreDoc}\n";
1159
+ foreach my $seg ( sort{ $a <=> $b }( keys( %{$overall{ $sys }{ 'documents' }{ $doc }{ 'segments' }} ) ) )
1160
+ {
1161
+ my $scoreSeg = $overall{ $sys }{ 'documents' }{ $doc }{ 'segments' }{ $seg }{ 'score' };
1162
+ print FILEOUT_SEG "${tst_id}\t${sys}\t${doc}\t${seg}\t${scoreSeg}\n";
1163
+ }
1164
+ }
1165
+ }
1166
+ close FILEOUT_SEG;
1167
+ close FILEOUT_DOC;
1168
+ close FILEOUT_SYS;
1169
+ }
1170
+
mosesdecoder/scripts/generic/mteval-v14.pl ADDED
@@ -0,0 +1,1179 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env perl
2
+
3
+ use warnings;
4
+ use strict;
5
+ use utf8;
6
+ use Encode;
7
+ use XML::Twig;
8
+ use Sort::Naturally;
9
+
10
+ binmode STDOUT, ":utf8";
11
+ binmode STDERR, ":utf8";
12
+
13
+
14
+ #################################
15
+ # History:
16
+ #
17
+ # version 14
18
+ # (2016-03-29 lukas.diduch@nist.gov)
19
+ # * Fixed warning message in case seg-id is a string, by sorting in correct order using Sort::Naturally.
20
+ #
21
+ # version 13b
22
+ # * Fixed die 'bug' in case seg->id = 0
23
+ #
24
+ # version 13a
25
+ # * modified the scoring functions to prevent division-by-zero errors when a system segment is empty
26
+ # * affected methods: 'bleu_score' and 'bleu_score_smoothing'
27
+ #
28
+ # version 13
29
+ # * Uses a XML parser to read data (only when extension is .xml)
30
+ # * Smoothing of the segment-level BLEU scores, done by default
31
+ # * smoothing method similar to that of bleu-1.04.pl (IBM)
32
+ # * see comments above the 'bleu_score' method for more details on how the smoothing is computed
33
+ # * added a '--no-smoothing' option to simulate old scripts behavior
34
+ # * Introduction of the 'brevity-penalty' option, taking one of two values:
35
+ # * 'closest' (default) : act as IBM BLEU (taking the closest reference translation length)
36
+ # * in case two reference translations are at the same distance, will take the shortest one
37
+ # * for more details regarding how the BP is computed, see comments of the 'brevity_penalty_closest' function
38
+ # * 'shortest' : act as previous versions of the script (taking shortest reference translation length)
39
+ # * Introduction of the 'international-tokenization' option, boolean, disabled by default
40
+ # by default (when the option is not provided), uses 11b's tokenization function
41
+ # when option specified, uses v12's tokenization function
42
+ # * Introduction of a 'Metrics MATR output' flag (option '--metricsMATR')
43
+ # when used, creates three files for both BLEU score and NIST score:
44
+ # * BLEU-seg.scr and NIST-seg.scr: contain segment-level scores
45
+ # * BLEU-doc.scr and NIST-doc.scr: contain document-level scores
46
+ # * BLEU-sys.scr and NIST-sys.scr: contain system-level scores
47
+ # * SGML parsing
48
+ # * script will halt if source, reference and test files don't share the same setid attribute value (used for metricsMATR output)
49
+ # * correct segment IDs extracted from the files (was previously using an array, and using the index as a segID for output)
50
+ # * detailed output flag (-d) can now be used when running both BLEU and NIST
51
+ #
52
+ # version 12
53
+ # * Text normalization changes:
54
+ # * convert entity references (only the entities declared in the DTD)
55
+ # * now uses unicode categories
56
+ # * tokenize punctuation unless followed AND preceded by digits
57
+ # * tokenize symbols
58
+ # * UTF-8 handling:
59
+ # * files are now read using utf8 mode
60
+ # * Added the '-e' command-line option to enclose non-ASCII characters between spaces
61
+ #
62
+ # version 11b -- text normalization modified:
63
+ # * take out the join digit line because it joins digits
64
+ # when it shouldn't have
65
+ # $norm_text =~ s/(\d)\s+(?=\d)/$1/g; #join digits
66
+ #
67
+ # version 11a -- corrected output of individual n-gram precision values
68
+ #
69
+ # version 11 -- bug fixes:
70
+ # * make filehandle operate in binary mode to prevent Perl from operating
71
+ # (by default in Red Hat 9) in UTF-8
72
+ # * fix failure on joining digits
73
+ # version 10 -- updated output to include more details of n-gram scoring.
74
+ # Defaults to generate both NIST and BLEU scores. Use -b for BLEU
75
+ # only, use -n for NIST only
76
+ #
77
+ # version 09d -- bug fix (for BLEU scoring, ngrams were fixed at 4
78
+ # being the max, regardless what was entered on the command line.)
79
+ #
80
+ # version 09c -- bug fix (During the calculation of ngram information,
81
+ # each ngram was being counted only once for each segment. This has
82
+ # been fixed so that each ngram is counted correctly in each segment.)
83
+ #
84
+ # version 09b -- text normalization modified:
85
+ # * option flag added to preserve upper case
86
+ # * non-ASCII characters left in place.
87
+ #
88
+ # version 09a -- text normalization modified:
89
+ # * &quot; and &amp; converted to "" and &, respectively
90
+ # * non-ASCII characters kept together (bug fix)
91
+ #
92
+ # version 09 -- modified to accommodate sgml tag and attribute
93
+ # names revised to conform to default SGML conventions.
94
+ #
95
+ # version 08 -- modifies the NIST metric in accordance with the
96
+ # findings on the 2001 Chinese-English dry run corpus. Also
97
+ # incorporates the BLEU metric as an option and supports the
98
+ # output of ngram detail.
99
+ #
100
+ # version 07 -- in response to the MT meeting on 28 Jan 2002 at ISI
101
+ # Keep strings of non-ASCII characters together as one word
102
+ # (rather than splitting them into one-character words).
103
+ # Change length penalty so that translations that are longer than
104
+ # the average reference translation are not penalized.
105
+ #
106
+ # version 06
107
+ # Prevent divide-by-zero when a segment has no evaluation N-grams.
108
+ # Correct segment index for level 3 debug output.
109
+ #
110
+ # version 05
111
+ # improve diagnostic error messages
112
+ #
113
+ # version 04
114
+ # tag segments
115
+ #
116
+ # version 03
117
+ # add detailed output option (intermediate document and segment scores)
118
+ #
119
+ # version 02
120
+ # accommodation of modified sgml tags and attributes
121
+ #
122
+ # version 01
123
+ # same as bleu version 15, but modified to provide formal score output.
124
+ #
125
+ # original IBM version
126
+ # Author: Kishore Papineni
127
+ # Date: 06/10/2001
128
+ #################################
129
+
130
+ ######
131
+ # Intro
132
+ my ($date, $time) = date_time_stamp();
133
+ print "MT evaluation scorer began on $date at $time\n";
134
+ print "\ncommand line: ", $0, " ", join(" ", @ARGV), "\n";
135
+ my $usage = "\n\nUsage: $0 -r <ref_file> -s <src_file> -t <tst_file>\n\n".
136
+ "Description: This Perl script evaluates MT system performance.\n".
137
+ "\n".
138
+ "Required arguments:\n".
139
+ " -r <ref_file> is a file containing the reference translations for\n".
140
+ " the documents to be evaluated.\n".
141
+ " -s <src_file> is a file containing the source documents for which\n".
142
+ " translations are to be evaluated\n".
143
+ " -t <tst_file> is a file containing the translations to be evaluated\n".
144
+ "\n".
145
+ "Optional arguments:\n".
146
+ " -h prints this help message to STDOUT\n".
147
+ " -c preserves upper-case alphabetic characters\n".
148
+ " -b generate BLEU scores only\n".
149
+ " -n generate NIST scores only\n".
150
+ " -d detailed output flag:\n".
151
+ " 0 (default) for system-level score only\n".
152
+ " 1 to include document-level scores\n".
153
+ " 2 to include segment-level scores\n".
154
+ " 3 to include ngram-level scores\n".
155
+ " -e enclose non-ASCII characters between spaces\n".
156
+ " --brevity-penalty ( closest | shortest )\n" .
157
+ " closest (default) : acts as IBM BLEU (takes the closest reference translation length)\n" .
158
+ " shortest : acts as previous versions of the script (takes the shortest reference translation length)\n" .
159
+ " --international-tokenization\n" .
160
+ " when specified, uses Unicode-based (only) tokenization rules\n" .
161
+ " when not specified (default), uses default tokenization (some language-dependant rules)\n" .
162
+ " --metricsMATR : create three files for both BLEU scores and NIST scores:\n" .
163
+ " BLEU-seg.scr and NIST-seg.scr : segment-level scores\n" .
164
+ " BLEU-doc.scr and NIST-doc.scr : document-level scores\n" .
165
+ " BLEU-sys.scr and NIST-sys.scr : system-level scores\n" .
166
+ " --no-smoothing : disable smoothing on BLEU scores\n" .
167
+ "\n";
168
+
169
+ use vars qw ($opt_r $opt_s $opt_t $opt_d $opt_h $opt_b $opt_n $opt_c $opt_x $opt_e);
170
+ use Getopt::Long;
171
+ my $ref_file = '';
172
+ my $src_file = '';
173
+ my $tst_file = '';
174
+ my $detail = 0;
175
+ my $help = '';
176
+ my $preserve_case = '';
177
+ my $split_non_ASCII = '';
178
+ my $brevity_penalty = 'closest';
179
+ my $international_tokenization;
180
+ my $metricsMATR_output = '';
181
+ my $no_smoothing = '';
182
+ our $opt_x = '';
183
+ our $opt_b = '';
184
+ our $opt_n = '';
185
+ GetOptions(
186
+ 'r=s' => \$ref_file,
187
+ 's=s' => \$src_file,
188
+ 't=s' => \$tst_file,
189
+ 'd:i' => \$detail,
190
+ 'h|help' => \$help,
191
+ 'b',
192
+ 'n',
193
+ 'c' => \$preserve_case,
194
+ 'x:s',
195
+ 'e' => \$split_non_ASCII,
196
+ 'brevity-penalty:s' => \$brevity_penalty,
197
+ 'international-tokenization' => \$international_tokenization,
198
+ 'metricsMATR-output' => \$metricsMATR_output,
199
+ 'no-smoothing' => \$no_smoothing
200
+ );
201
+ die $usage if $help;
202
+
203
+ die "Error in command line: ref_file not defined$usage" unless ( $ref_file );
204
+ die "Error in command line: src_file not defined$usage" unless ( $src_file );
205
+ die "Error in command line: tst_file not defined$usage" unless ( $tst_file );
206
+ my $BLEU_BP;
207
+ if ( !( $brevity_penalty cmp 'closest' ) )
208
+ {
209
+ $BLEU_BP = \&brevity_penalty_closest;
210
+ }
211
+ elsif ( !( $brevity_penalty cmp 'shortest' ) )
212
+ {
213
+ $BLEU_BP = \&brevity_penalty_shortest;
214
+ }
215
+ else
216
+ {
217
+ die "Incorrect value supplied for 'brevity_penalty'$usage";
218
+ }
219
+ my $TOKENIZATION = \&tokenization;
220
+ $TOKENIZATION = \&tokenization_international if ( $international_tokenization );
221
+
222
+ my $BLEU_SCORE = \&bleu_score;
223
+ $BLEU_SCORE = \&bleu_score_nosmoothing if ( $no_smoothing );
224
+
225
+ my $max_Ngram = 9;
226
+
227
+ my $METHOD = "BOTH";
228
+ if ( $opt_b ) { $METHOD = "BLEU"; }
229
+ if ( $opt_n ) { $METHOD = "NIST"; }
230
+ my $method;
231
+
232
+ ######
233
+ # Global variables
234
+ my ($src_lang, $tgt_lang, @tst_sys, @ref_sys); # evaluation parameters
235
+ my (%tst_data, %ref_data); # the data -- with structure: {system}{document}{segments}
236
+ my ($src_id, $ref_id, $tst_id); # unique identifiers for ref and tst translation sets
237
+ my %eval_docs; # document information for the evaluation data set
238
+ my %ngram_info; # the information obtained from (the last word in) the ngram
239
+
240
+ ######
241
+ # Get source document ID's
242
+ ($src_id) = get_source_info ($src_file);
243
+
244
+ ######
245
+ # Get reference translations
246
+ ($ref_id) = get_MT_data (\%ref_data, "RefSet", $ref_file);
247
+
248
+ compute_ngram_info ();
249
+
250
+ ######
251
+ # Get translations to evaluate
252
+ ($tst_id) = get_MT_data (\%tst_data, "TstSet", $tst_file);
253
+
254
+ ######
255
+ # Check data for completeness and correctness
256
+ check_MT_data ();
257
+
258
+ ######
259
+ #
260
+ my %NISTmt;
261
+ my %NISTOverall;
262
+ my %BLEUmt;
263
+ my %BLEUOverall;
264
+
265
+ ######
266
+ # Evaluate
267
+ print "\nEvaluation of $src_lang-to-$tgt_lang translation using:\n";
268
+ my $cum_seg = 0;
269
+ foreach my $doc (sort keys %eval_docs)
270
+ {
271
+ $cum_seg += scalar( keys( %{$eval_docs{$doc}{SEGS}} ) );
272
+ }
273
+ print " src set \"$src_id\" (", scalar keys %eval_docs, " docs, $cum_seg segs)\n";
274
+ print " ref set \"$ref_id\" (", scalar keys %ref_data, " refs)\n";
275
+ print " tst set \"$tst_id\" (", scalar keys %tst_data, " systems)\n\n";
276
+
277
+ foreach my $sys (sort @tst_sys)
278
+ {
279
+ for (my $n=1; $n<=$max_Ngram; $n++)
280
+ {
281
+ $NISTmt{$n}{$sys}{cum} = 0;
282
+ $NISTmt{$n}{$sys}{ind} = 0;
283
+ $BLEUmt{$n}{$sys}{cum} = 0;
284
+ $BLEUmt{$n}{$sys}{ind} = 0;
285
+ }
286
+ if ( ($METHOD eq "BOTH") || ($METHOD eq "NIST") )
287
+ {
288
+ $method="NIST";
289
+ score_system ($sys, \%NISTmt, \%NISTOverall);
290
+ }
291
+ if ( ($METHOD eq "BOTH") || ($METHOD eq "BLEU") )
292
+ {
293
+ $method="BLEU";
294
+ score_system ($sys, \%BLEUmt, \%BLEUOverall);
295
+ }
296
+ }
297
+
298
+ ######
299
+ printout_report ();
300
+ if ( $metricsMATR_output )
301
+ {
302
+ outputMetricsMATR( 'NIST', %NISTOverall ) if ( ( $METHOD eq 'BOTH' ) || ( $METHOD eq 'NIST' ) );
303
+ outputMetricsMATR( 'BLEU', %BLEUOverall ) if ( ( $METHOD eq 'BOTH' ) || ( $METHOD eq 'BLEU' ) );
304
+ }
305
+
306
+ ($date, $time) = date_time_stamp();
307
+ print "\nMT evaluation scorer ended on $date at $time\n";
308
+
309
+ exit 0;
310
+
311
+ #################################
312
+
313
+ sub get_source_info
314
+ {
315
+ my ($file) = @_;
316
+ my ($name, $id, $src, $doc, $seg);
317
+ my ($data, $tag, $span);
318
+
319
+ # Extension of the file determines the parser used:
320
+ # .xml : XML::Twig
321
+ # otherwise : simple SGML parsing functions
322
+ if ( $file =~ /\.xml$/i )
323
+ {
324
+ my $twig = XML::Twig->new();
325
+ $twig->parsefile( $file );
326
+ my $root = $twig->root;
327
+ my $currentSet = $root->first_child( 'srcset' );
328
+ die "Source XML file '$file' does not contain the 'srcset' element" if ( not $currentSet );
329
+ $id = $currentSet->{ 'att' }->{ 'setid' } or die "No 'setid' attribute value in '$file'";
330
+ $src = $currentSet->{ 'att' }->{ 'srclang' } or die "No srcset 'srclang' attribute value in '$file'";
331
+ die "Not the same srclang attribute values across sets" unless ( not defined $src_lang or $src eq $src_lang );
332
+ $src_lang = $src;
333
+ foreach my $currentDoc ( $currentSet->get_xpath( './/doc' ) )
334
+ {
335
+ my $docID = $currentDoc->{ 'att' }->{ 'docid' } or die "No document 'docid' attribute value in '$file'";
336
+ foreach my $currentSeg ( $currentDoc->get_xpath( './/seg' ) )
337
+ {
338
+
339
+ my $segID = $currentSeg->{ 'att' }->{ 'id' };
340
+ die "No segment 'id' attribute value in '$file'" if (! defined $segID);
341
+ my $segData = $currentSeg->text;
342
+ ($eval_docs{$docID}{SEGS}{$segID}) = &{ $TOKENIZATION }( $segData );
343
+ }
344
+ }
345
+ }
346
+ else
347
+ {
348
+ #read data from file
349
+ open (FILE, $file) or die "\nUnable to open translation data file '$file'", $usage;
350
+ binmode FILE, ":utf8";
351
+ $data .= $_ while <FILE>;
352
+ close (FILE);
353
+
354
+ #get source set info
355
+ die "\n\nFATAL INPUT ERROR: no 'src_set' tag in src_file '$file'\n\n"
356
+ unless ($tag, $span, $data) = extract_sgml_tag_and_span ("SrcSet", $data);
357
+ die "\n\nFATAL INPUT ERROR: no tag attribute '$name' in file '$file'\n\n"
358
+ unless ($id) = extract_sgml_tag_attribute ($name="SetID", $tag);
359
+ die "\n\nFATAL INPUT ERROR: no tag attribute '$name' in file '$file'\n\n"
360
+ unless ($src) = extract_sgml_tag_attribute ($name="SrcLang", $tag);
361
+ die "\n\nFATAL INPUT ERROR: $name ('$src') in file '$file' inconsistent\n"
362
+ ." with $name in previous input data ('$src_lang')\n\n"
363
+ unless (not defined $src_lang or $src eq $src_lang);
364
+ $src_lang = $src;
365
+
366
+ #get doc info -- ID and # of segs
367
+ $data = $span;
368
+ while (($tag, $span, $data) = extract_sgml_tag_and_span ("Doc", $data))
369
+ {
370
+ die "\n\nFATAL INPUT ERROR: no tag attribute '$name' in file '$file'\n\n"
371
+ unless ($doc) = extract_sgml_tag_attribute ($name="DocID", $tag);
372
+ die "\n\nFATAL INPUT ERROR: duplicate '$name' in file '$file'\n\n"
373
+ if defined $eval_docs{$doc};
374
+ $span =~ s/[\s\n\r]+/ /g; # concatenate records
375
+ my $nseg=0, my $seg_data = $span;
376
+ while (($tag, $span, $seg_data) = extract_sgml_tag_and_span ("Seg", $seg_data))
377
+ {
378
+ die "\n\nFATAL INPUT ERROR: no attribute '$name' in file '$file'\n\n"
379
+ unless ($seg) = extract_sgml_tag_attribute( $name='id', $tag );
380
+ ($eval_docs{$doc}{SEGS}{$seg}) = &{ $TOKENIZATION }( $span );
381
+ $nseg++;
382
+ }
383
+ die "\n\nFATAL INPUT ERROR: no segments in document '$doc' in file '$file'\n\n"
384
+ if $nseg == 0;
385
+ }
386
+ die "\n\nFATAL INPUT ERROR: no documents in file '$file'\n\n"
387
+ unless keys %eval_docs > 0;
388
+ }
389
+ return $id;
390
+ }
391
+
392
+ #################################
393
+
394
+ sub get_MT_data
395
+ {
396
+ my ($docs, $set_tag, $file) = @_;
397
+ my ($name, $id, $src, $tgt, $sys, $doc, $seg);
398
+ my ($tag, $span, $data);
399
+
400
+ # Extension of the file determines the parser used:
401
+ # .xml : XML::Twig
402
+ # otherwise : simple SGML parsing functions
403
+ if ( $file =~ /\.xml$/i )
404
+ {
405
+ my $twig = XML::Twig->new();
406
+ $twig->parsefile( $file );
407
+ my $root = $twig->root;
408
+ foreach my $currentSet ( $root->get_xpath( 'refset' ), $root->get_xpath( 'tstset' ) )
409
+ {
410
+ $id = $currentSet->{ 'att' }->{ 'setid' } or die "No 'setid' attribute value in '$file'";
411
+ $src = $currentSet->{ 'att' }->{ 'srclang' } or die "No 'srclang' attribute value in '$file'";
412
+ $tgt = $currentSet->{ 'att' }->{ 'trglang' } or die "No 'trglang' attribute value in '$file'";
413
+ die "Not the same 'srclang' attribute value across sets" unless ( $src eq $src_lang );
414
+ die "Not the same 'trglang' attribute value across sets" unless ( ( not defined $tgt_lang ) or ( $tgt = $tgt_lang ) );
415
+ $tgt_lang = $tgt;
416
+ my $sys;
417
+ if ( $currentSet->name eq 'tstset' )
418
+ {
419
+ $sys = $currentSet->{ 'att' }->{ 'sysid' } or die "No 'sysid' attribute value in '$file'";
420
+ }
421
+ else
422
+ {
423
+ $sys = $currentSet->{ 'att' }->{ 'refid' } or die "No 'refid' attribute value in '$file'";
424
+ }
425
+ foreach my $currentDoc ( $currentSet->get_xpath( './/doc' ) )
426
+ {
427
+ my $docID = $currentDoc->{ 'att' }->{ 'docid' } or die "No document 'docid' attribute value in '$file'";
428
+ $docs->{ $sys }{ $docID }{ FILE } = $file;
429
+ foreach my $currentSeg ( $currentDoc->get_xpath( './/seg' ) )
430
+ {
431
+ my $segID = $currentSeg->{ 'att' }->{ 'id' };
432
+ die "No segment 'id' attribute value in '$file'" if (! defined $segID);
433
+ my $segData = $currentSeg->text;
434
+ ($docs->{$sys}{$docID}{SEGS}{$segID}) = &{ $TOKENIZATION }( $segData );
435
+ }
436
+ }
437
+ }
438
+ }
439
+ else
440
+ {
441
+ #read data from file
442
+ open (FILE, $file) or die "\nUnable to open translation data file '$file'", $usage;
443
+ binmode FILE, ":utf8";
444
+ $data .= $_ while <FILE>;
445
+ close (FILE);
446
+
447
+ #get tag info
448
+ while (($tag, $span, $data) = extract_sgml_tag_and_span ($set_tag, $data))
449
+ {
450
+ die "\n\nFATAL INPUT ERROR: no tag attribute '$name' in file '$file'\n\n"
451
+ unless ($id) = extract_sgml_tag_attribute ($name="SetID", $tag);
452
+ die "\n\nFATAL INPUT ERROR: no tag attribute '$name' in file '$file'\n\n"
453
+ unless ($src) = extract_sgml_tag_attribute ($name="SrcLang", $tag);
454
+ die "\n\nFATAL INPUT ERROR: $name ('$src') in file '$file' inconsistent\n"
455
+ ." with $name of source ('$src_lang')\n\n"
456
+ unless $src eq $src_lang;
457
+ die "\n\nFATAL INPUT ERROR: no tag attribute '$name' in file '$file'\n\n"
458
+ unless ($tgt) = extract_sgml_tag_attribute ($name="TrgLang", $tag);
459
+ die "\n\nFATAL INPUT ERROR: $name ('$tgt') in file '$file' inconsistent\n"
460
+ ." with $name of the evaluation ('$tgt_lang')\n\n"
461
+ unless (not defined $tgt_lang or $tgt eq $tgt_lang);
462
+ $tgt_lang = $tgt;
463
+
464
+ my $mtdata = $span;
465
+ while (($tag, $span, $mtdata) = extract_sgml_tag_and_span ("Doc", $mtdata))
466
+ {
467
+ die "\n\nFATAL INPUT ERROR: no tag attribute '$name' in file '$file'\n\n"
468
+ unless (my $sys) = extract_sgml_tag_attribute ($name="SysID", $tag);
469
+ die "\n\nFATAL INPUT ERROR: no tag attribute '$name' in file '$file'\n\n"
470
+ unless $doc = extract_sgml_tag_attribute ($name="DocID", $tag);
471
+ die "\n\nFATAL INPUT ERROR: document '$doc' for system '$sys' in file '$file'\n"
472
+ ." previously loaded from file '$docs->{$sys}{$doc}{FILE}'\n\n"
473
+ unless (not defined $docs->{$sys}{$doc});
474
+
475
+ $span =~ s/[\s\n\r]+/ /g; # concatenate records
476
+ my $nseg=0, my $seg_data = $span;
477
+ while (($tag, $span, $seg_data) = extract_sgml_tag_and_span ("Seg", $seg_data))
478
+ {
479
+ die "\n\nFATAIL INPUT ERROR: no tag attribute '$name' in file '$file'\n\n"
480
+ unless $seg = extract_sgml_tag_attribute( $name="id", $tag );
481
+ ($docs->{$sys}{$doc}{SEGS}{$seg}) = &{ $TOKENIZATION }( $span );
482
+ $nseg++;
483
+ }
484
+ die "\n\nFATAL INPUT ERROR: no segments in document '$doc' in file '$file'\n\n" if $nseg == 0;
485
+ $docs->{$sys}{$doc}{FILE} = $file;
486
+ }
487
+ }
488
+ }
489
+ return $id;
490
+ }
491
+
492
+ #################################
493
+
494
+ sub check_MT_data
495
+ {
496
+ @tst_sys = sort keys %tst_data;
497
+ @ref_sys = sort keys %ref_data;
498
+
499
+ die "Not the same 'setid' attribute values across files" unless ( ( $src_id eq $tst_id ) && ( $src_id eq $ref_id ) );
500
+
501
+ #every evaluation document must be represented for every system and every reference
502
+ foreach my $doc (sort keys %eval_docs)
503
+ {
504
+ my $nseg_source = scalar( keys( %{$eval_docs{$doc}{SEGS}} ) );
505
+ foreach my $sys (@tst_sys)
506
+ {
507
+ die "\n\nFATAL ERROR: no document '$doc' for system '$sys'\n\n" unless defined $tst_data{$sys}{$doc};
508
+ my $nseg = scalar( keys( %{$tst_data{$sys}{$doc}{SEGS}} ) );
509
+ die "\n\nFATAL ERROR: translated documents must contain the same # of segments as the source, but\n"
510
+ ." document '$doc' for system '$sys' contains $nseg segments, while\n"
511
+ ." the source document contains $nseg_source segments.\n\n"
512
+ unless $nseg == $nseg_source;
513
+ }
514
+ foreach my $sys (@ref_sys)
515
+ {
516
+ die "\n\nFATAL ERROR: no document '$doc' for reference '$sys'\n\n" unless defined $ref_data{$sys}{$doc};
517
+ my $nseg = scalar( keys( %{$ref_data{$sys}{$doc}{SEGS}} ) );
518
+ die "\n\nFATAL ERROR: translated documents must contain the same # of segments as the source, but\n"
519
+ ." document '$doc' for system '$sys' contains $nseg segments, while\n"
520
+ ." the source document contains $nseg_source segments.\n\n"
521
+ unless $nseg == $nseg_source;
522
+ }
523
+ }
524
+ }
525
+
526
+ #################################
527
+
528
+ sub compute_ngram_info
529
+ {
530
+ my ($ref, $doc, $seg);
531
+ my (@wrds, $tot_wrds, %ngrams, $ngram, $mgram);
532
+ my (%ngram_count, @tot_ngrams);
533
+
534
+ foreach $ref (keys %ref_data)
535
+ {
536
+ foreach $doc (keys %{$ref_data{$ref}})
537
+ {
538
+ foreach $seg ( keys %{$ref_data{$ref}{$doc}{SEGS}})
539
+ {
540
+ @wrds = split /\s+/, $ref_data{ $ref }{ $doc }{ SEGS }{ $seg };
541
+ $tot_wrds += @wrds;
542
+ %ngrams = %{Words2Ngrams (@wrds)};
543
+ foreach $ngram (keys %ngrams)
544
+ {
545
+ $ngram_count{$ngram} += $ngrams{$ngram};
546
+ }
547
+ }
548
+ }
549
+ }
550
+
551
+ foreach $ngram (keys %ngram_count)
552
+ {
553
+ @wrds = split / /, $ngram;
554
+ pop @wrds, $mgram = join " ", @wrds;
555
+ $ngram_info{$ngram} = - log ($mgram ? $ngram_count{$ngram}/$ngram_count{$mgram} : $ngram_count{$ngram}/$tot_wrds) / log 2;
556
+ if (defined $opt_x and $opt_x eq "ngram info")
557
+ {
558
+ @wrds = split / /, $ngram;
559
+ printf "ngram info:%9.4f%6d%6d%8d%3d %s\n", $ngram_info{$ngram}, $ngram_count{$ngram},
560
+ $mgram ? $ngram_count{$mgram} : $tot_wrds, $tot_wrds, scalar @wrds, $ngram;
561
+ }
562
+ }
563
+ }
564
+
565
+ #################################
566
+
567
+ sub score_system
568
+ {
569
+ my ($sys, $ref, $doc, $SCOREmt, $overallScore);
570
+ ($sys, $SCOREmt, $overallScore) = @_;
571
+ my ($ref_length, $match_cnt, $tst_cnt, $ref_cnt, $tst_info, $ref_info);
572
+ my ($cum_ref_length, @cum_match, @cum_tst_cnt, @cum_ref_cnt, @cum_tst_info, @cum_ref_info);
573
+
574
+ $cum_ref_length = 0;
575
+ for (my $j=1; $j<=$max_Ngram; $j++)
576
+ {
577
+ $cum_match[$j] = $cum_tst_cnt[$j] = $cum_ref_cnt[$j] = $cum_tst_info[$j] = $cum_ref_info[$j] = 0;
578
+ }
579
+ foreach $doc (sort keys %eval_docs)
580
+ {
581
+ ($ref_length, $match_cnt, $tst_cnt, $ref_cnt, $tst_info, $ref_info) = score_document ($sys, $doc, $overallScore);
582
+ if ( $method eq "NIST" )
583
+ {
584
+ my %DOCmt = ();
585
+ my $docScore = nist_score( scalar( @ref_sys ), $match_cnt, $tst_cnt, $ref_cnt, $tst_info, $ref_info, $sys, \%DOCmt );
586
+ $overallScore->{ $sys }{ 'documents' }{ $doc }{ 'score' } = $docScore;
587
+ if ( $detail >= 1 )
588
+ {
589
+ printf "$method score using 5-grams = %.4f for system \"$sys\" on document \"$doc\" (%d segments, %d words)\n",
590
+ $docScore, scalar keys %{$tst_data{$sys}{$doc}{SEGS}}, $tst_cnt->[1];
591
+ }
592
+ }
593
+
594
+ if ( $method eq "BLEU" )
595
+ {
596
+ my %DOCmt = ();
597
+ my $docScore = &{$BLEU_SCORE}( $ref_length, $match_cnt, $tst_cnt, $sys, \%DOCmt );
598
+ $overallScore->{ $sys }{ 'documents' }{ $doc }{ 'score' } = $docScore;
599
+ if ( $detail >= 1 )
600
+ {
601
+ printf "$method score using 4-grams = %.4f for system \"$sys\" on document \"$doc\" (%d segments, %d words)\n",
602
+ $docScore, scalar keys %{$tst_data{$sys}{$doc}{SEGS}}, $tst_cnt->[1];
603
+ }
604
+ }
605
+
606
+ $cum_ref_length += $ref_length;
607
+ for (my $j=1; $j<=$max_Ngram; $j++)
608
+ {
609
+ $cum_match[$j] += $match_cnt->[$j];
610
+ $cum_tst_cnt[$j] += $tst_cnt->[$j];
611
+ $cum_ref_cnt[$j] += $ref_cnt->[$j];
612
+ $cum_tst_info[$j] += $tst_info->[$j];
613
+ $cum_ref_info[$j] += $ref_info->[$j];
614
+ printf "document info: $sys $doc %d-gram %d %d %d %9.4f %9.4f\n", $j, $match_cnt->[$j],
615
+ $tst_cnt->[$j], $ref_cnt->[$j], $tst_info->[$j], $ref_info->[$j]
616
+ if (defined $opt_x and $opt_x eq "document info");
617
+ }
618
+ }
619
+
620
+ if ($method eq "BLEU")
621
+ {
622
+ $overallScore->{ $sys }{ 'score' } = &{$BLEU_SCORE}($cum_ref_length, \@cum_match, \@cum_tst_cnt, $sys, $SCOREmt);
623
+ }
624
+ if ($method eq "NIST")
625
+ {
626
+ $overallScore->{ $sys }{ 'score' } = nist_score (scalar @ref_sys, \@cum_match, \@cum_tst_cnt, \@cum_ref_cnt, \@cum_tst_info, \@cum_ref_info, $sys, $SCOREmt);
627
+ }
628
+ }
629
+
630
+ #################################
631
+
632
+ sub score_document
633
+ {
634
+ my ($sys, $ref, $doc, $overallScore);
635
+ ($sys, $doc, $overallScore) = @_;
636
+ my ($ref_length, $match_cnt, $tst_cnt, $ref_cnt, $tst_info, $ref_info);
637
+ my ($cum_ref_length, @cum_match, @cum_tst_cnt, @cum_ref_cnt, @cum_tst_info, @cum_ref_info);
638
+
639
+ $cum_ref_length = 0;
640
+ for (my $j=1; $j<=$max_Ngram; $j++)
641
+ {
642
+ $cum_match[$j] = $cum_tst_cnt[$j] = $cum_ref_cnt[$j] = $cum_tst_info[$j] = $cum_ref_info[$j] = 0;
643
+ }
644
+
645
+ # score each segment
646
+ foreach my $seg ( nsort keys( %{$tst_data{$sys}{$doc}{SEGS}} ) )
647
+ {
648
+
649
+ my @ref_segments = ();
650
+ foreach $ref (@ref_sys)
651
+ {
652
+ push @ref_segments, $ref_data{$ref}{$doc}{SEGS}{$seg};
653
+ if ( $detail >= 3 )
654
+ {
655
+ printf "ref '$ref', seg $seg: %s\n", $ref_data{$ref}{$doc}{SEGS}{$seg}
656
+ }
657
+
658
+ }
659
+
660
+ printf "sys '$sys', seg $seg: %s\n", $tst_data{$sys}{$doc}{SEGS}{$seg} if ( $detail >= 3 );
661
+ ($ref_length, $match_cnt, $tst_cnt, $ref_cnt, $tst_info, $ref_info) = score_segment ($tst_data{$sys}{$doc}{SEGS}{$seg}, @ref_segments);
662
+
663
+ if ( $method eq "BLEU" )
664
+ {
665
+ my %DOCmt = ();
666
+ my $segScore = &{$BLEU_SCORE}($ref_length, $match_cnt, $tst_cnt, $sys, %DOCmt);
667
+ $overallScore->{ $sys }{ 'documents' }{ $doc }{ 'segments' }{ $seg }{ 'score' } = $segScore;
668
+ if ( $detail >= 2 )
669
+ {
670
+ printf " $method score using 4-grams = %.4f for system \"$sys\" on segment $seg of document \"$doc\" (%d words)\n", $segScore, $tst_cnt->[1]
671
+ }
672
+ }
673
+ if ( $method eq "NIST" )
674
+ {
675
+ my %DOCmt = ();
676
+ my $segScore = nist_score (scalar @ref_sys, $match_cnt, $tst_cnt, $ref_cnt, $tst_info, $ref_info, $sys, %DOCmt);
677
+ $overallScore->{ $sys }{ 'documents' }{ $doc }{ 'segments' }{ $seg }{ 'score' } = $segScore;
678
+ if ( $detail >= 2 )
679
+ {
680
+ printf " $method score using 5-grams = %.4f for system \"$sys\" on segment $seg of document \"$doc\" (%d words)\n", $segScore, $tst_cnt->[1];
681
+ }
682
+ }
683
+ $cum_ref_length += $ref_length;
684
+ for (my $j=1; $j<=$max_Ngram; $j++)
685
+ {
686
+ $cum_match[$j] += $match_cnt->[$j];
687
+ $cum_tst_cnt[$j] += $tst_cnt->[$j];
688
+ $cum_ref_cnt[$j] += $ref_cnt->[$j];
689
+ $cum_tst_info[$j] += $tst_info->[$j];
690
+ $cum_ref_info[$j] += $ref_info->[$j];
691
+ }
692
+ }
693
+ return ($cum_ref_length, [@cum_match], [@cum_tst_cnt], [@cum_ref_cnt], [@cum_tst_info], [@cum_ref_info]);
694
+ }
695
+
696
+ ###############################################################################################################################
697
+ # function returning the shortest reference length
698
+ # takes as input:
699
+ # - currentLength : the current (shortest) reference length
700
+ # - referenceSentenceLength : the current reference sentence length
701
+ # - candidateSentenceLength : the current candidate sentence length (unused)
702
+ ###############################################################################################################################
703
+ sub brevity_penalty_shortest
704
+ {
705
+ my ( $currentLength, $referenceSentenceLength, $candidateSentenceLength ) = @_;
706
+ return ( $referenceSentenceLength < $currentLength ? $referenceSentenceLength : $currentLength );
707
+ }
708
+
709
+ ###############################################################################################################################
710
+ # function returning the closest reference length (to the candidate sentence length)
711
+ # takes as input:
712
+ # - currentLength: the current (closest) reference length.
713
+ # - referenceSentenceLength : the current reference sentence length
714
+ # - candidateSentenceLength : the current candidate sentence length
715
+ # when two reference sentences are at the same distance, it will return the shortest reference sentence length
716
+ # example of 4 iterations, given:
717
+ # - one candidate sentence containing 7 tokens
718
+ # - one reference translation containing 11 tokens
719
+ # - one reference translation containing 8 tokens
720
+ # - one reference translation containing 6 tokens
721
+ # - one reference translation containing 7 tokens
722
+ # the multiple invocations will return:
723
+ # - currentLength is set to 11 (outside of this function)
724
+ # - brevity_penalty_closest( 11, 8, 7 ) returns 8, since abs( 8 - 7 ) < abs( 11 - 7 )
725
+ # - brevity_penalty_closest( 8, 6, 7 ) returns 6, since abs( 8 - 7 ) == abs( 6 - 7 ) AND 6 < 8
726
+ # - brevity_penalty_closest( 7, 6, 7 ) returns 7, since abs( 7 - 7 ) < abs( 6 - 7 )
727
+ ###############################################################################################################################
728
+ sub brevity_penalty_closest
729
+ {
730
+ my ( $currentLength, $referenceSentenceLength, $candidateSentenceLength ) = @_;
731
+ my $result = $currentLength;
732
+ if ( abs( $candidateSentenceLength - $referenceSentenceLength ) <= abs( $candidateSentenceLength - $currentLength ) )
733
+ {
734
+ if ( abs( $candidateSentenceLength - $referenceSentenceLength ) == abs( $candidateSentenceLength - $currentLength ) )
735
+ {
736
+ if ( $currentLength > $referenceSentenceLength )
737
+ {
738
+ $result = $referenceSentenceLength;
739
+ }
740
+ }
741
+ else
742
+ {
743
+ $result = $referenceSentenceLength;
744
+ }
745
+ }
746
+ return $result;
747
+ }
748
+
749
+ #################################
750
+
751
+ sub score_segment
752
+ {
753
+ my ($tst_seg, @ref_segs) = @_;
754
+ my (@tst_wrds, %tst_ngrams, @match_count, @tst_count, @tst_info);
755
+ my (@ref_wrds, $ref_seg, %ref_ngrams, %ref_ngrams_max, @ref_count, @ref_info);
756
+ my ($ngram);
757
+ my (@nwrds_ref);
758
+ my $ref_length;
759
+
760
+ for (my $j=1; $j<= $max_Ngram; $j++)
761
+ {
762
+ $match_count[$j] = $tst_count[$j] = $ref_count[$j] = $tst_info[$j] = $ref_info[$j] = 0;
763
+ }
764
+
765
+ # get the ngram counts for the test segment
766
+ @tst_wrds = split /\s+/, $tst_seg;
767
+ %tst_ngrams = %{Words2Ngrams (@tst_wrds)};
768
+ for (my $j=1; $j<=$max_Ngram; $j++)
769
+ {
770
+ # compute ngram counts
771
+ $tst_count[$j] = $j<=@tst_wrds ? (@tst_wrds - $j + 1) : 0;
772
+ }
773
+
774
+ # get the ngram counts for the reference segments
775
+ foreach $ref_seg (@ref_segs)
776
+ {
777
+ @ref_wrds = split /\s+/, $ref_seg;
778
+ %ref_ngrams = %{Words2Ngrams (@ref_wrds)};
779
+ foreach $ngram (keys %ref_ngrams)
780
+ {
781
+ # find the maximum # of occurrences
782
+ my @wrds = split / /, $ngram;
783
+ $ref_info[@wrds] += $ngram_info{$ngram};
784
+ $ref_ngrams_max{$ngram} = defined $ref_ngrams_max{$ngram} ? max ($ref_ngrams_max{$ngram}, $ref_ngrams{$ngram}) : $ref_ngrams{$ngram};
785
+ }
786
+ for (my $j=1; $j<=$max_Ngram; $j++)
787
+ {
788
+ # update ngram counts
789
+ $ref_count[$j] += $j<=@ref_wrds ? (@ref_wrds - $j + 1) : 0;
790
+ }
791
+ if ( not defined( $ref_length ) )
792
+ {
793
+ $ref_length = scalar( @ref_wrds );
794
+ }
795
+ else
796
+ {
797
+ $ref_length = &{$BLEU_BP}( $ref_length, scalar( @ref_wrds ), scalar( @tst_wrds ) );
798
+ }
799
+ }
800
+
801
+ # accumulate scoring stats for tst_seg ngrams that match ref_seg ngrams
802
+ foreach $ngram (keys %tst_ngrams)
803
+ {
804
+ next unless defined $ref_ngrams_max{$ngram};
805
+ my @wrds = split / /, $ngram;
806
+ $tst_info[@wrds] += $ngram_info{$ngram} * min($tst_ngrams{$ngram},$ref_ngrams_max{$ngram});
807
+ $match_count[@wrds] += my $count = min($tst_ngrams{$ngram},$ref_ngrams_max{$ngram});
808
+ printf "%.2f info for each of $count %d-grams = '%s'\n", $ngram_info{$ngram}, scalar @wrds, $ngram
809
+ if $detail >= 3;
810
+ }
811
+
812
+ return ($ref_length, [@match_count], [@tst_count], [@ref_count], [@tst_info], [@ref_info]);
813
+ }
814
+
815
+ #################################
816
+
817
+ sub bleu_score_nosmoothing
818
+ {
819
+ my ($ref_length, $matching_ngrams, $tst_ngrams, $sys, $SCOREmt) = @_;
820
+ my $score = 0;
821
+ my $iscore = 0;
822
+
823
+ for ( my $j = 1; $j <= $max_Ngram; ++$j )
824
+ {
825
+ if ($matching_ngrams->[ $j ] == 0)
826
+ {
827
+ $SCOREmt->{ $j }{ $sys }{ cum }=0;
828
+ }
829
+ else
830
+ {
831
+ my $len_score = min (0, 1-$ref_length/$tst_ngrams->[1]);
832
+ # Cumulative N-Gram score
833
+ $score += log( $matching_ngrams->[ $j ] / $tst_ngrams->[ $j ] );
834
+ $SCOREmt->{ $j }{ $sys }{ cum } = exp( $score / $j + $len_score );
835
+ # Individual N-Gram score
836
+ $iscore = log( $matching_ngrams->[ $j ] / $tst_ngrams->[ $j ] );
837
+ $SCOREmt->{ $j }{ $sys }{ ind } = exp( $iscore );
838
+ }
839
+ }
840
+ return $SCOREmt->{ 4 }{ $sys }{ cum };
841
+ }
842
+
843
+ ###############################################################################################################################
844
+ # Default method used to compute the BLEU score, using smoothing.
845
+ # Note that the method used can be overridden using the '--no-smoothing' command-line argument
846
+ # The smoothing is computed by taking 1 / ( 2^k ), instead of 0, for each precision score whose matching n-gram count is null
847
+ # k is 1 for the first 'n' value for which the n-gram match count is null
848
+ # For example, if the text contains:
849
+ # - one 2-gram match
850
+ # - and (consequently) two 1-gram matches
851
+ # the n-gram count for each individual precision score would be:
852
+ # - n=1 => prec_count = 2 (two unigrams)
853
+ # - n=2 => prec_count = 1 (one bigram)
854
+ # - n=3 => prec_count = 1/2 (no trigram, taking 'smoothed' value of 1 / ( 2^k ), with k=1)
855
+ # - n=4 => prec_count = 1/4 (no fourgram, taking 'smoothed' value of 1 / ( 2^k ), with k=2)
856
+ ###############################################################################################################################
857
+ sub bleu_score
858
+ {
859
+ my ($ref_length, $matching_ngrams, $tst_ngrams, $sys, $SCOREmt) = @_;
860
+ my $score = 0;
861
+ my $iscore = 0;
862
+ my $exp_len_score = 0;
863
+ $exp_len_score = exp( min (0, 1 - $ref_length / $tst_ngrams->[ 1 ] ) ) if ( $tst_ngrams->[ 1 ] > 0 );
864
+ my $smooth = 1;
865
+ for ( my $j = 1; $j <= $max_Ngram; ++$j )
866
+ {
867
+ if ( $tst_ngrams->[ $j ] == 0 )
868
+ {
869
+ $iscore = 0;
870
+ }
871
+ elsif ( $matching_ngrams->[ $j ] == 0 )
872
+ {
873
+ $smooth *= 2;
874
+ $iscore = log( 1 / ( $smooth * $tst_ngrams->[ $j ] ) );
875
+ }
876
+ else
877
+ {
878
+ $iscore = log( $matching_ngrams->[ $j ] / $tst_ngrams->[ $j ] );
879
+ }
880
+ $SCOREmt->{ $j }{ $sys }{ ind } = exp( $iscore );
881
+ $score += $iscore;
882
+ $SCOREmt->{ $j }{ $sys }{ cum } = exp( $score / $j ) * $exp_len_score;
883
+ }
884
+ return $SCOREmt->{ 4 }{ $sys }{ cum };
885
+ }
886
+
887
+ #################################
888
+
889
+ sub nist_score
890
+ {
891
+ my ($nsys, $matching_ngrams, $tst_ngrams, $ref_ngrams, $tst_info, $ref_info, $sys, $SCOREmt) = @_;
892
+ my $score = 0;
893
+ my $iscore = 0;
894
+
895
+ for (my $n=1; $n<=$max_Ngram; $n++)
896
+ {
897
+ $score += $tst_info->[$n]/max($tst_ngrams->[$n],1);
898
+ $SCOREmt->{$n}{$sys}{cum} = $score * nist_length_penalty($tst_ngrams->[1]/($ref_ngrams->[1]/$nsys));
899
+ $iscore = $tst_info->[$n]/max($tst_ngrams->[$n],1);
900
+ $SCOREmt->{$n}{$sys}{ind} = $iscore * nist_length_penalty($tst_ngrams->[1]/($ref_ngrams->[1]/$nsys));
901
+ }
902
+ return $SCOREmt->{5}{$sys}{cum};
903
+ }
904
+
905
+ #################################
906
+
907
+ sub Words2Ngrams
908
+ {
909
+ #convert a string of words to an Ngram count hash
910
+ my %count = ();
911
+
912
+ for (; @_; shift)
913
+ {
914
+ my ($j, $ngram, $word);
915
+ for ($j=0; $j<$max_Ngram and defined($word=$_[$j]); $j++)
916
+ {
917
+ $ngram .= defined $ngram ? " $word" : $word;
918
+ $count{$ngram}++;
919
+ }
920
+ }
921
+ return {%count};
922
+ }
923
+
924
+ #################################
925
+
926
+ sub tokenization
927
+ {
928
+ my ($norm_text) = @_;
929
+
930
+ # language-independent part:
931
+ $norm_text =~ s/<skipped>//g; # strip "skipped" tags
932
+ $norm_text =~ s/-\n//g; # strip end-of-line hyphenation and join lines
933
+ $norm_text =~ s/\n/ /g; # join lines
934
+ $norm_text =~ s/&quot;/"/g; # convert SGML tag for quote to "
935
+ $norm_text =~ s/&amp;/&/g; # convert SGML tag for ampersand to &
936
+ $norm_text =~ s/&lt;/</g; # convert SGML tag for less-than to <
937
+ $norm_text =~ s/&gt;/>/g; # convert SGML tag for greater-than to >
938
+
939
+ # language-dependent part (assuming Western languages):
940
+ $norm_text = " $norm_text ";
941
+ $norm_text =~ tr/[A-Z]/[a-z]/ unless $preserve_case;
942
+ $norm_text =~ s/([\{-\~\[-\` -\&\(-\+\:-\@\/])/ $1 /g; # tokenize punctuation
943
+ $norm_text =~ s/([^0-9])([\.,])/$1 $2 /g; # tokenize period and comma unless preceded by a digit
944
+ $norm_text =~ s/([\.,])([^0-9])/ $1 $2/g; # tokenize period and comma unless followed by a digit
945
+ $norm_text =~ s/([0-9])(-)/$1 $2 /g; # tokenize dash when preceded by a digit
946
+ $norm_text =~ s/\s+/ /g; # one space only between words
947
+ $norm_text =~ s/^\s+//; # no leading space
948
+ $norm_text =~ s/\s+$//; # no trailing space
949
+
950
+ return $norm_text;
951
+ }
952
+
953
+
954
+ sub tokenization_international
955
+ {
956
+ my ($norm_text) = @_;
957
+
958
+ $norm_text =~ s/<skipped>//g; # strip "skipped" tags
959
+ #$norm_text =~ s/\p{Hyphen}\p{Zl}//g; # strip end-of-line hyphenation and join lines
960
+ $norm_text =~ s/\p{Zl}/ /g; # join lines
961
+
962
+ # replace entities
963
+ $norm_text =~ s/&quot;/\"/g; # quote to "
964
+ $norm_text =~ s/&amp;/&/g; # ampersand to &
965
+ $norm_text =~ s/&lt;/</g; # less-than to <
966
+ $norm_text =~ s/&gt;/>/g; # greater-than to >
967
+ $norm_text =~ s/&apos;/\'/g; # apostrophe to '
968
+
969
+ $norm_text = lc( $norm_text ) unless $preserve_case; # lowercasing if needed
970
+ $norm_text =~ s/([^[:ascii:]])/ $1 /g if ( $split_non_ASCII );
971
+
972
+ # punctuation: tokenize any punctuation unless followed AND preceded by a digit
973
+ $norm_text =~ s/(\P{N})(\p{P})/$1 $2 /g;
974
+ $norm_text =~ s/(\p{P})(\P{N})/ $1 $2/g;
975
+
976
+ $norm_text =~ s/(\p{S})/ $1 /g; # tokenize symbols
977
+
978
+ $norm_text =~ s/\p{Z}+/ /g; # one space only between words
979
+ $norm_text =~ s/^\p{Z}+//; # no leading space
980
+ $norm_text =~ s/\p{Z}+$//; # no trailing space
981
+
982
+ return $norm_text;
983
+ }
984
+
985
+ #################################
986
+
987
+ sub nist_length_penalty
988
+ {
989
+ my ($ratio) = @_;
990
+ return 1 if $ratio >= 1;
991
+ return 0 if $ratio <= 0;
992
+ my $ratio_x = 1.5;
993
+ my $score_x = 0.5;
994
+ my $beta = -log($score_x)/log($ratio_x)/log($ratio_x);
995
+ return exp (-$beta*log($ratio)*log($ratio));
996
+ }
997
+
998
+ #################################
999
+
1000
+ sub date_time_stamp
1001
+ {
1002
+ my ($sec, $min, $hour, $mday, $mon, $year, $wday, $yday, $isdst) = localtime();
1003
+ my @months = qw(Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec);
1004
+ my ($date, $time);
1005
+ $time = sprintf "%2.2d:%2.2d:%2.2d", $hour, $min, $sec;
1006
+ $date = sprintf "%4.4s %3.3s %s", 1900+$year, $months[$mon], $mday;
1007
+ return ($date, $time);
1008
+ }
1009
+
1010
+ #################################
1011
+
1012
+ sub extract_sgml_tag_and_span
1013
+ {
1014
+ my ($name, $data) = @_;
1015
+ ($data =~ m|<$name\s*([^>]*)>(.*?)</$name\s*>(.*)|si) ? ($1, $2, $3) : ();
1016
+ }
1017
+
1018
+ #################################
1019
+
1020
+ sub extract_sgml_tag_attribute
1021
+ {
1022
+ my ($name, $data) = @_;
1023
+ ($data =~ m|$name\s*=\s*\"([^\"]*)\"|si) ? ($1) : ();
1024
+ }
1025
+
1026
+ #################################
1027
+
1028
+ sub max
1029
+ {
1030
+ my ($max, $next);
1031
+
1032
+ return unless defined ($max=pop);
1033
+ while (defined ($next=pop))
1034
+ {
1035
+ $max = $next if $next > $max;
1036
+ }
1037
+ return $max;
1038
+ }
1039
+
1040
+ #################################
1041
+
1042
+ sub min
1043
+ {
1044
+ my ($min, $next);
1045
+
1046
+ return unless defined ($min=pop);
1047
+ while (defined ($next=pop))
1048
+ {
1049
+ $min = $next if $next < $min;
1050
+ }
1051
+ return $min;
1052
+ }
1053
+
1054
+ #################################
1055
+
1056
+ sub printout_report
1057
+ {
1058
+ if ( $METHOD eq "BOTH" )
1059
+ {
1060
+ foreach my $sys (sort @tst_sys)
1061
+ {
1062
+ printf "NIST score = %2.4f BLEU score = %.4f for system \"$sys\"\n",$NISTmt{5}{$sys}{cum},$BLEUmt{4}{$sys}{cum};
1063
+ }
1064
+ }
1065
+ elsif ($METHOD eq "NIST" )
1066
+ {
1067
+ foreach my $sys (sort @tst_sys)
1068
+ {
1069
+ printf "NIST score = %2.4f for system \"$sys\"\n",$NISTmt{5}{$sys}{cum};
1070
+ }
1071
+ }
1072
+ elsif ($METHOD eq "BLEU" )
1073
+ {
1074
+ foreach my $sys (sort @tst_sys)
1075
+ {
1076
+ printf "\nBLEU score = %.4f for system \"$sys\"\n",$BLEUmt{4}{$sys}{cum};
1077
+ }
1078
+ }
1079
+ printf "\n# ------------------------------------------------------------------------\n\n";
1080
+ printf "Individual N-gram scoring\n";
1081
+ printf " 1-gram 2-gram 3-gram 4-gram 5-gram 6-gram 7-gram 8-gram 9-gram\n";
1082
+ printf " ------ ------ ------ ------ ------ ------ ------ ------ ------\n";
1083
+
1084
+ if ( ( $METHOD eq "BOTH" ) || ($METHOD eq "NIST") )
1085
+ {
1086
+ foreach my $sys (sort @tst_sys)
1087
+ {
1088
+ printf " NIST:";
1089
+ for (my $i=1; $i<=$max_Ngram; $i++)
1090
+ {
1091
+ printf " %2.4f ",$NISTmt{$i}{$sys}{ind}
1092
+ }
1093
+ printf " \"$sys\"\n";
1094
+ }
1095
+ printf "\n";
1096
+ }
1097
+
1098
+ if ( ( $METHOD eq "BOTH" ) || ($METHOD eq "BLEU") )
1099
+ {
1100
+ foreach my $sys (sort @tst_sys)
1101
+ {
1102
+ printf " BLEU:";
1103
+ for (my $i=1; $i<=$max_Ngram; $i++)
1104
+ {
1105
+ printf " %2.4f ",$BLEUmt{$i}{$sys}{ind}
1106
+ }
1107
+ printf " \"$sys\"\n";
1108
+ }
1109
+ }
1110
+
1111
+ printf "\n# ------------------------------------------------------------------------\n";
1112
+ printf "\nCumulative N-gram scoring\n";
1113
+ printf " 1-gram 2-gram 3-gram 4-gram 5-gram 6-gram 7-gram 8-gram 9-gram\n";
1114
+ printf " ------ ------ ------ ------ ------ ------ ------ ------ ------\n";
1115
+
1116
+ if (( $METHOD eq "BOTH" ) || ($METHOD eq "NIST"))
1117
+ {
1118
+ foreach my $sys (sort @tst_sys)
1119
+ {
1120
+ printf " NIST:";
1121
+ for (my $i=1; $i<=$max_Ngram; $i++)
1122
+ {
1123
+ printf " %2.4f ",$NISTmt{$i}{$sys}{cum}
1124
+ }
1125
+ printf " \"$sys\"\n";
1126
+ }
1127
+ }
1128
+ printf "\n";
1129
+ if ( ( $METHOD eq "BOTH" ) || ($METHOD eq "BLEU") )
1130
+ {
1131
+ foreach my $sys (sort @tst_sys)
1132
+ {
1133
+ printf " BLEU:";
1134
+ for (my $i=1; $i<=$max_Ngram; $i++)
1135
+ {
1136
+ printf " %2.4f ",$BLEUmt{$i}{$sys}{cum}
1137
+ }
1138
+ printf " \"$sys\"\n";
1139
+ }
1140
+ }
1141
+ }
1142
+
1143
+ ###############################################################################################################################
1144
+ # Create three files, by using:
1145
+ # - $prefix : the prefix used for the output file names
1146
+ # - %overall : a hash containing seg/doc/sys-level scores:
1147
+ # - $overall{ $SYSTEM_ID }{ 'score' } => system-level score
1148
+ # - $overall{ $SYSTEM_ID }{ 'documents' }{ $DOCUMENT_ID }{ 'score' } => document-level score
1149
+ # - $overall{ $SYSTEM_ID }{ 'documents' }{ $DOCUMENT_ID }{ 'segments' }{ $SEGMENT_ID } => segment-level score
1150
+ ###############################################################################################################################
1151
+ sub outputMetricsMATR
1152
+ {
1153
+ my ( $prefix, %overall ) = @_;
1154
+ my $fileNameSys = $prefix . '-sys.scr';
1155
+ my $fileNameDoc = $prefix . '-doc.scr';
1156
+ my $fileNameSeg = $prefix . '-seg.scr';
1157
+ open FILEOUT_SYS, '>', $fileNameSys or die "Could not open file: ${fileNameSys}";
1158
+ open FILEOUT_DOC, '>', $fileNameDoc or die "Could not open file: ${fileNameDoc}";
1159
+ open FILEOUT_SEG, '>', $fileNameSeg or die "Could not open file: ${fileNameSeg}";
1160
+ foreach my $sys ( sort( keys( %overall ) ) )
1161
+ {
1162
+ my $scoreSys = $overall{ $sys }{ 'score' };
1163
+ print FILEOUT_SYS "${tst_id}\t${sys}\t${scoreSys}\n";
1164
+ foreach my $doc ( sort( keys( %{$overall{ $sys }{ 'documents' }} ) ) )
1165
+ {
1166
+ my $scoreDoc = $overall{ $sys }{ 'documents' }{ $doc }{ 'score' };
1167
+ print FILEOUT_DOC "${tst_id}\t${sys}\t${doc}\t${scoreDoc}\n";
1168
+ foreach my $seg ( nsort keys( %{$overall{ $sys }{ 'documents' }{ $doc }{ 'segments' }} ) )
1169
+ {
1170
+ my $scoreSeg = $overall{ $sys }{ 'documents' }{ $doc }{ 'segments' }{ $seg }{ 'score' };
1171
+ print FILEOUT_SEG "${tst_id}\t${sys}\t${doc}\t${seg}\t${scoreSeg}\n";
1172
+ }
1173
+ }
1174
+ }
1175
+ close FILEOUT_SEG;
1176
+ close FILEOUT_DOC;
1177
+ close FILEOUT_SYS;
1178
+ }
1179
+
mosesdecoder/scripts/generic/multi-bleu-detok.perl ADDED
@@ -0,0 +1,214 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/usr/bin/env perl
#
# This file is part of moses. Its use is licensed under the GNU Lesser General
# Public License version 2.1 or, at your option, any later version.

# This file uses the internal tokenization of mteval-v13a.pl,
# giving the exact same (case-sensitive) results on untokenized text.
# Using this script with detokenized output and untokenized references is
# preferrable over multi-bleu.perl, since scores aren't affected by tokenization differences.
#
# like multi-bleu.perl , it supports plain text input and multiple references.

# $Id$
use warnings;
use strict;

binmode(STDIN, ":utf8");
use open ':encoding(UTF-8)';

# Fix: check that $ARGV[0] is defined before comparing it - under
# "use warnings" running the script with no arguments previously emitted
# an "uninitialized value" warning before the usage message.
my $lowercase = 0;
if (defined $ARGV[0] && $ARGV[0] eq "-lc") {
  $lowercase = 1;
  shift;
}

# First positional argument: reference file, or stem of reference0, reference1, ...
my $stem = $ARGV[0];
if (!defined $stem) {
  print STDERR "usage: multi-bleu-detok.pl [-lc] reference < hypothesis\n";
  print STDERR "Reads the references from reference or reference0, reference1, ...\n";
  exit(1);
}

# Accept "stem.ref0" style names by appending ".ref" to the stem.
$stem .= ".ref" if !-e $stem && !-e $stem."0" && -e $stem.".ref0";

# $REF[$sentence] holds the list of reference translations for that sentence.
my @REF;
my $ref=0;
while(-e "$stem$ref") {
  &add_to_ref("$stem$ref",\@REF);
  $ref++;
}
&add_to_ref($stem,\@REF) if -e $stem;
die("ERROR: could not find reference file $stem") unless scalar @REF;

# add additional references explicitly specified on the command line
shift;
foreach my $stem (@ARGV) {
  &add_to_ref($stem,\@REF) if -e $stem;
}
49
+
50
+
51
+
52
# Read one reference file (plain text or gzipped) and append each tokenized
# line to the per-sentence reference lists in @$REF.
#   $file - path of the reference file
#   $REF  - array ref; $$REF[$i] collects all references for sentence $i
sub add_to_ref {
  my ($file,$REF) = @_;
  my $s=0;
  # Fix: match a literal ".gz" suffix - the dot was previously unescaped,
  # so any name ending in "gz" (e.g. "foogz") was piped through gzip.
  if ($file =~ /\.gz$/) {
    open(REF,"gzip -dc $file|") or die "Can't read $file";
  } else {
    open(REF,$file) or die "Can't read $file";
  }
  while(<REF>) {
    # Fix: chomp instead of chop - chop removes the last character
    # unconditionally and would corrupt the final line of a file that
    # lacks a trailing newline.
    chomp;
    $_ = tokenization($_);
    push @{$$REF[$s++]}, $_;
  }
  close(REF);
}
67
+
68
# Main scoring loop: accumulate modified n-gram precision counts (n=1..4)
# and length statistics over every hypothesis line read from STDIN, then
# compute and print the corpus-level BLEU score.
my(@CORRECT,@TOTAL,$length_translation,$length_reference);
my $s=0;
while(<STDIN>) {
  # NOTE(review): chop drops the final character if the last input line has
  # no trailing newline; chomp would be safer - confirm inputs always end
  # with a newline.
  chop;
  $_ = lc if $lowercase;
  $_ = tokenization($_);
  my @WORD = split;
  # %REF_NGRAM: per-sentence clipped reference counts - for each n-gram the
  # maximum count over all references of this sentence.
  my %REF_NGRAM = ();
  my $length_translation_this_sentence = scalar(@WORD);
  my ($closest_diff,$closest_length) = (9999,9999);
  foreach my $reference (@{$REF[$s]}) {
#      print "$s $_ <=> $reference\n";
    $reference = lc($reference) if $lowercase;
    my @WORD = split(' ',$reference);
    my $length = scalar(@WORD);
    my $diff = abs($length_translation_this_sentence-$length);
    # Track the reference length closest to the hypothesis length (ties
    # broken toward the shorter reference) for the brevity penalty.
    if ($diff < $closest_diff) {
      $closest_diff = $diff;
      $closest_length = $length;
      # print STDERR "$s: closest diff ".abs($length_translation_this_sentence-$length)." = abs($length_translation_this_sentence-$length), setting len: $closest_length\n";
    } elsif ($diff == $closest_diff) {
      $closest_length = $length if $length < $closest_length;
      # from two references with the same closeness to me
      # take the *shorter* into account, not the "first" one.
    }
    # Count this reference's n-grams; keys are "<n> w1 .. wn".
    for(my $n=1;$n<=4;$n++) {
      my %REF_NGRAM_N = ();
      for(my $start=0;$start<=$#WORD-($n-1);$start++) {
        my $ngram = "$n";
        for(my $w=0;$w<$n;$w++) {
          $ngram .= " ".$WORD[$start+$w];
        }
        $REF_NGRAM_N{$ngram}++;
      }
      # Clip: keep the maximum count seen across references.
      foreach my $ngram (keys %REF_NGRAM_N) {
        if (!defined($REF_NGRAM{$ngram}) ||
            $REF_NGRAM{$ngram} < $REF_NGRAM_N{$ngram}) {
          $REF_NGRAM{$ngram} = $REF_NGRAM_N{$ngram};
          # print "$i: REF_NGRAM{$ngram} = $REF_NGRAM{$ngram}<BR>\n";
        }
      }
    }
  }
  $length_translation += $length_translation_this_sentence;
  $length_reference += $closest_length;
  # Count hypothesis n-grams and credit each up to its clipped
  # reference count (standard modified precision).
  for(my $n=1;$n<=4;$n++) {
    my %T_NGRAM = ();
    for(my $start=0;$start<=$#WORD-($n-1);$start++) {
      my $ngram = "$n";
      for(my $w=0;$w<$n;$w++) {
        $ngram .= " ".$WORD[$start+$w];
      }
      $T_NGRAM{$ngram}++;
    }
    foreach my $ngram (keys %T_NGRAM) {
      $ngram =~ /^(\d+) /;
      my $n = $1;
      # my $corr = 0;
      # print "$i e $ngram $T_NGRAM{$ngram}<BR>\n";
      $TOTAL[$n] += $T_NGRAM{$ngram};
      if (defined($REF_NGRAM{$ngram})) {
        if ($REF_NGRAM{$ngram} >= $T_NGRAM{$ngram}) {
          $CORRECT[$n] += $T_NGRAM{$ngram};
          # $corr = $T_NGRAM{$ngram};
          # print "$i e correct1 $T_NGRAM{$ngram}<BR>\n";
        }
        else {
          $CORRECT[$n] += $REF_NGRAM{$ngram};
          # $corr = $REF_NGRAM{$ngram};
          # print "$i e correct2 $REF_NGRAM{$ngram}<BR>\n";
        }
      }
      # $REF_NGRAM{$ngram} = 0 if !defined $REF_NGRAM{$ngram};
      # print STDERR "$ngram: {$s, $REF_NGRAM{$ngram}, $T_NGRAM{$ngram}, $corr}\n"
    }
  }
  $s++;
}
my $brevity_penalty = 1;
my $bleu = 0;

# Per-order precisions; index 1..4.
my @bleu=();

for(my $n=1;$n<=4;$n++) {
  if (defined ($TOTAL[$n])){
    $bleu[$n]=($TOTAL[$n])?$CORRECT[$n]/$TOTAL[$n]:0;
    # print STDERR "CORRECT[$n]:$CORRECT[$n] TOTAL[$n]:$TOTAL[$n]\n";
  }else{
    $bleu[$n]=0;
  }
}

# No reference tokens at all: report zero and exit with failure status.
if ($length_reference==0){
  printf "BLEU = 0, 0/0/0/0 (BP=0, ratio=0, hyp_len=0, ref_len=0)\n";
  exit(1);
}

# Brevity penalty applies only when the hypothesis is shorter overall.
if ($length_translation<$length_reference) {
  $brevity_penalty = exp(1-$length_reference/$length_translation);
}
# Geometric mean of the four precisions (via my_log's floor for zeros),
# scaled by the brevity penalty.
$bleu = $brevity_penalty * exp((my_log( $bleu[1] ) +
				my_log( $bleu[2] ) +
				my_log( $bleu[3] ) +
				my_log( $bleu[4] ) ) / 4) ;
printf "BLEU = %.2f, %.1f/%.1f/%.1f/%.1f (BP=%.3f, ratio=%.3f, hyp_len=%d, ref_len=%d)\n",
	100*$bleu,
	100*$bleu[1],
	100*$bleu[2],
	100*$bleu[3],
	100*$bleu[4],
	$brevity_penalty,
	$length_translation / $length_reference,
	$length_translation,
	$length_reference;
182
+
183
# Natural logarithm with a large negative floor for zero/false inputs,
# so a zero n-gram precision does not blow up the geometric mean.
sub my_log {
  my ($value) = @_;
  return $value ? log($value) : -9999999999;
}
187
+
188
+
189
+
190
# mteval-v13a-style tokenization: unescape a few SGML entities, split
# punctuation from words, and normalize whitespace. Returns the
# tokenized string; the input is not modified.
sub tokenization
{
  my ($norm_text) = @_;

# language-independent part:
  $norm_text =~ s/<skipped>//g; # strip "skipped" tags
  $norm_text =~ s/-\n//g; # strip end-of-line hyphenation and join lines
  $norm_text =~ s/\n/ /g; # join lines
  $norm_text =~ s/&quot;/"/g;  # convert SGML tag for quote to "
  $norm_text =~ s/&amp;/&/g;   # convert SGML tag for ampersand to &
  $norm_text =~ s/&lt;/</g;    # convert SGML tag for less-than to <
  $norm_text =~ s/&gt;/>/g;    # convert SGML tag for greater-than to >

# language-dependent part (assuming Western languages):
  $norm_text = " $norm_text ";
  $norm_text =~ s/([\{-\~\[-\` -\&\(-\+\:-\@\/])/ $1 /g;   # tokenize punctuation
  $norm_text =~ s/([^0-9])([\.,])/$1 $2 /g; # tokenize period and comma unless preceded by a digit
  $norm_text =~ s/([\.,])([^0-9])/ $1 $2/g; # tokenize period and comma unless followed by a digit
  $norm_text =~ s/([0-9])(-)/$1 $2 /g; # tokenize dash when preceded by a digit
  $norm_text =~ s/\s+/ /g; # one space only between words
  $norm_text =~ s/^\s+//; # no leading space
  $norm_text =~ s/\s+$//; # no trailing space

  return $norm_text;
}
mosesdecoder/scripts/generic/multi-bleu.perl ADDED
@@ -0,0 +1,177 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/usr/bin/env perl
#
# This file is part of moses. Its use is licensed under the GNU Lesser General
# Public License version 2.1 or, at your option, any later version.

# $Id$
use warnings;
use strict;

# Fix: check that $ARGV[0] is defined before comparing it - under
# "use warnings" running the script with no arguments previously emitted
# an "uninitialized value" warning before the usage message.
my $lowercase = 0;
if (defined $ARGV[0] && $ARGV[0] eq "-lc") {
  $lowercase = 1;
  shift;
}

# First positional argument: reference file, or stem of reference0, reference1, ...
my $stem = $ARGV[0];
if (!defined $stem) {
  print STDERR "usage: multi-bleu.pl [-lc] reference < hypothesis\n";
  print STDERR "Reads the references from reference or reference0, reference1, ...\n";
  exit(1);
}

# Accept "stem.ref0" style names by appending ".ref" to the stem.
$stem .= ".ref" if !-e $stem && !-e $stem."0" && -e $stem.".ref0";

# $REF[$sentence] holds the list of reference translations for that sentence.
my @REF;
my $ref=0;
while(-e "$stem$ref") {
  &add_to_ref("$stem$ref",\@REF);
  $ref++;
}
&add_to_ref($stem,\@REF) if -e $stem;
die("ERROR: could not find reference file $stem") unless scalar @REF;

# add additional references explicitly specified on the command line
shift;
foreach my $stem (@ARGV) {
  &add_to_ref($stem,\@REF) if -e $stem;
}
+
40
+
41
+
42
# Read one reference file (plain text or gzipped) and append each line to
# the per-sentence reference lists in @$REF.
#   $file - path of the reference file
#   $REF  - array ref; $$REF[$i] collects all references for sentence $i
sub add_to_ref {
  my ($file,$REF) = @_;
  my $s=0;
  # Fix: match a literal ".gz" suffix - the dot was previously unescaped,
  # so any name ending in "gz" (e.g. "foogz") was piped through gzip.
  if ($file =~ /\.gz$/) {
    open(REF,"gzip -dc $file|") or die "Can't read $file";
  } else {
    open(REF,$file) or die "Can't read $file";
  }
  while(<REF>) {
    chomp;
    push @{$$REF[$s++]}, $_;
  }
  close(REF);
}
56
+
57
# Main scoring loop: accumulate modified n-gram precision counts (n=1..4)
# and length statistics over every tokenized hypothesis line from STDIN.
my(@CORRECT,@TOTAL,$length_translation,$length_reference);
my $s=0;
while(<STDIN>) {
  chomp;
  $_ = lc if $lowercase;
  my @WORD = split;
  # %REF_NGRAM: per-sentence clipped reference counts - for each n-gram the
  # maximum count over all references of this sentence.
  my %REF_NGRAM = ();
  my $length_translation_this_sentence = scalar(@WORD);
  my ($closest_diff,$closest_length) = (9999,9999);
  foreach my $reference (@{$REF[$s]}) {
#      print "$s $_ <=> $reference\n";
    $reference = lc($reference) if $lowercase;
    my @WORD = split(' ',$reference);
    my $length = scalar(@WORD);
    my $diff = abs($length_translation_this_sentence-$length);
    # Track the reference length closest to the hypothesis length (ties
    # broken toward the shorter reference) for the brevity penalty.
    if ($diff < $closest_diff) {
      $closest_diff = $diff;
      $closest_length = $length;
      # print STDERR "$s: closest diff ".abs($length_translation_this_sentence-$length)." = abs($length_translation_this_sentence-$length), setting len: $closest_length\n";
    } elsif ($diff == $closest_diff) {
      $closest_length = $length if $length < $closest_length;
      # from two references with the same closeness to me
      # take the *shorter* into account, not the "first" one.
    }
    # Count this reference's n-grams; keys are "<n> w1 .. wn".
    for(my $n=1;$n<=4;$n++) {
      my %REF_NGRAM_N = ();
      for(my $start=0;$start<=$#WORD-($n-1);$start++) {
        my $ngram = "$n";
        for(my $w=0;$w<$n;$w++) {
          $ngram .= " ".$WORD[$start+$w];
        }
        $REF_NGRAM_N{$ngram}++;
      }
      # Clip: keep the maximum count seen across references.
      foreach my $ngram (keys %REF_NGRAM_N) {
        if (!defined($REF_NGRAM{$ngram}) ||
            $REF_NGRAM{$ngram} < $REF_NGRAM_N{$ngram}) {
          $REF_NGRAM{$ngram} = $REF_NGRAM_N{$ngram};
          # print "$i: REF_NGRAM{$ngram} = $REF_NGRAM{$ngram}<BR>\n";
        }
      }
    }
  }
  $length_translation += $length_translation_this_sentence;
  $length_reference += $closest_length;
  # Count hypothesis n-grams and credit each up to its clipped
  # reference count (standard modified precision).
  for(my $n=1;$n<=4;$n++) {
    my %T_NGRAM = ();
    for(my $start=0;$start<=$#WORD-($n-1);$start++) {
      my $ngram = "$n";
      for(my $w=0;$w<$n;$w++) {
        $ngram .= " ".$WORD[$start+$w];
      }
      $T_NGRAM{$ngram}++;
    }
    foreach my $ngram (keys %T_NGRAM) {
      $ngram =~ /^(\d+) /;
      my $n = $1;
      # my $corr = 0;
      # print "$i e $ngram $T_NGRAM{$ngram}<BR>\n";
      $TOTAL[$n] += $T_NGRAM{$ngram};
      if (defined($REF_NGRAM{$ngram})) {
        if ($REF_NGRAM{$ngram} >= $T_NGRAM{$ngram}) {
          $CORRECT[$n] += $T_NGRAM{$ngram};
          # $corr = $T_NGRAM{$ngram};
          # print "$i e correct1 $T_NGRAM{$ngram}<BR>\n";
        }
        else {
          $CORRECT[$n] += $REF_NGRAM{$ngram};
          # $corr = $REF_NGRAM{$ngram};
          # print "$i e correct2 $REF_NGRAM{$ngram}<BR>\n";
        }
      }
      # $REF_NGRAM{$ngram} = 0 if !defined $REF_NGRAM{$ngram};
      # print STDERR "$ngram: {$s, $REF_NGRAM{$ngram}, $T_NGRAM{$ngram}, $corr}\n"
    }
  }
  $s++;
}
134
# Combine the accumulated counts into the corpus-level BLEU score
# and print the standard one-line report.
my $brevity_penalty = 1;
my $bleu = 0;

# Per-order precisions; index 1..4.
my @bleu=();

for(my $n=1;$n<=4;$n++) {
  if (defined ($TOTAL[$n])){
    $bleu[$n]=($TOTAL[$n])?$CORRECT[$n]/$TOTAL[$n]:0;
    # print STDERR "CORRECT[$n]:$CORRECT[$n] TOTAL[$n]:$TOTAL[$n]\n";
  }else{
    $bleu[$n]=0;
  }
}

# No reference tokens at all: report zero and exit with failure status.
if ($length_reference==0){
  printf "BLEU = 0, 0/0/0/0 (BP=0, ratio=0, hyp_len=0, ref_len=0)\n";
  exit(1);
}

# Brevity penalty applies only when the hypothesis is shorter overall.
if ($length_translation<$length_reference) {
  $brevity_penalty = exp(1-$length_reference/$length_translation);
}
# Geometric mean of the four precisions (via my_log's floor for zeros),
# scaled by the brevity penalty.
$bleu = $brevity_penalty * exp((my_log( $bleu[1] ) +
				my_log( $bleu[2] ) +
				my_log( $bleu[3] ) +
				my_log( $bleu[4] ) ) / 4) ;
printf "BLEU = %.2f, %.1f/%.1f/%.1f/%.1f (BP=%.3f, ratio=%.3f, hyp_len=%d, ref_len=%d)\n",
	100*$bleu,
	100*$bleu[1],
	100*$bleu[2],
	100*$bleu[3],
	100*$bleu[4],
	$brevity_penalty,
	$length_translation / $length_reference,
	$length_translation,
	$length_reference;


print STDERR "It is not advisable to publish scores from multi-bleu.perl. The scores depend on your tokenizer, which is unlikely to be reproducible from your paper or consistent across research groups. Instead you should detokenize then use mteval-v14.pl, which has a standard tokenization. Scores from multi-bleu.perl can still be used for internal purposes when you have a consistent tokenizer.\n";
173
+
174
# Natural logarithm with a large negative floor for zero/false inputs,
# so a zero n-gram precision does not blow up the geometric mean.
sub my_log {
  my ($value) = @_;
  return $value ? log($value) : -9999999999;
}
mosesdecoder/scripts/generic/multi_moses.py ADDED
@@ -0,0 +1,332 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/usr/bin/env python

# Written by Michael Denkowski
#
# This file is part of moses. Its use is licensed under the GNU Lesser General
# Public License version 2.1 or, at your option, any later version.

'''Parallelize decoding with multiple instances of moses on a local machine

To use with mert-moses.pl, activate --multi-moses and set the number of moses
instances and threads per instance with --decoder-flags='--threads P:T:E'

This script runs a specified number of moses instances, each using one or more
threads. The highest speed is generally seen with many single-threaded
instances while the lowest memory usage is seen with a single many-threaded
instance. It is recommended to use the maximum number of instances that will
fit into memory (up to the number of available CPUs) and distribute CPUs across
them equally. For example, a machine with 32 CPUs that can fit 3 copies of
moses into memory would use --threads 2:11:10 for 2 instances with 11 threads
each and an extra instance with 10 threads (3 instances total using all CPUs).

Memory mapped models can be shared by multiple processes and increase the number
of instances that can fit into memory:

Mmaped phrase tables (Ulrich Germann)
http://www.statmt.org/moses/?n=Advanced.Incremental#ntoc3

Mmaped mapped language models (Kenneth Heafield)
http://www.statmt.org/moses/?n=FactoredTraining.BuildingLanguageModel#ntoc19
'''

# Python 2 script: uses the "Queue" module (renamed "queue" in Python 3).
# NOTE(review): gzopen() below calls gzip.open but gzip is not imported at
# module scope.
import collections
import os
import Queue
import signal
import subprocess
import sys
import threading
import time

HELP = '''Multiple process decoding with Moses

Usage:
    {} moses --config moses.ini [options] [decoder flags]

Options:
    --threads P:T:E
        P: Number of parallel instances to run
        T: Number of threads per instance
        E: Number of threads in optional extra instance
        (default 1:1:0, overrides [threads] in moses.ini.  Specifying T
         and E is optional, e.g. --threads 16 starts 16 single-threaded
         instances)
    --n-best-list nbest.out N [distinct]: location and size of N-best list
    --show-weights: for mert-moses.pl, just call moses and exit

Other options (decoder flags) are passed through to moses instances
'''

# Defaults
INPUT = sys.stdin
PROCS = 1
THREADS = 1
EXTRA = 0
# Sentinel event used to tell worker/writer threads to shut down.
DONE = threading.Event()
PID = os.getpid()
# A very long time, used as Queue operation timeout even though we don't
# actually want a timeout but we do want interruptibility
# (https://bugs.python.org/issue1360)
NEVER = 60 * 60 * 24 * 365 * 1000

# Single unit of computation: decode a line, output result, signal done
Task = collections.namedtuple('Task', ['id', 'line', 'out', 'event'])
74
+
75
+
76
def kill_main(msg):
    '''kill -9 the main thread to stop everything immediately.

    Writes msg to stderr, then sends SIGKILL to this process (PID is the
    module-level pid recorded at startup), taking down all threads.
    '''
    sys.stderr.write('{}\n'.format(msg))
    os.kill(PID, signal.SIGKILL)
80
+
81
+
82
def gzopen(f):
    '''Open plain or gzipped text for reading.

    f: file path; names ending in ".gz" are opened via gzip.

    Fix: the module never imports gzip at top level, so calling this on a
    .gz file raised NameError.  Import it locally here to keep the fix
    self-contained.
    '''
    import gzip
    return gzip.open(f, 'rb') if f.endswith('.gz') else open(f, 'r')
85
+
86
+
87
def run_instance(cmd_base, threads, tasks, cpu_affinity, cpu_offset, n_best=False):
    '''Run an instance of moses that processes tasks (input lines) from a
    queue using a specified number of threads.

    cmd_base:     moses command line (copied; --threads and optionally
                  --cpu-affinity-offset are appended)
    threads:      thread count passed to this moses instance
    tasks:        shared Queue of Task tuples; a Task whose event is the
                  DONE sentinel stops the instance
    cpu_affinity: if true, pin the instance via --cpu-affinity-offset
    cpu_offset:   value for --cpu-affinity-offset
    n_best:       if true, speak the N-best protocol (sentinel blank line
                  after each input, "id ||| ..." output lines)

    Runs until the tasks queue yields DONE; on any exception the whole
    program is killed via kill_main().
    '''
    cmd = cmd_base[:]
    cmd.append('--threads')
    cmd.append(str(threads))

    if cpu_affinity:
        cmd.append('--cpu-affinity-offset')
        cmd.append(str(cpu_offset))

    #print 'BEFORE'
    #print cmd
    #print 'AFTER\n'

    try:
        # Queue of tasks instance is currently working on, limited to the number
        # of threads * 2 (minimal buffering).  The queue should be kept full for
        # optimal CPU usage.
        work = Queue.Queue(maxsize=(threads * 2))
        # Multi-threaded instance
        moses = subprocess.Popen(cmd, stdin=subprocess.PIPE, stdout=subprocess.PIPE)

        # Read and handle instance output as available
        def handle_output():
            while True:
                # Output line triggers task completion
                line = moses.stdout.readline()
                # End of output (instance finished)
                if not line:
                    break
                # Tasks complete in FIFO order: the oldest queued task
                # corresponds to this output line.
                task = work.get(timeout=NEVER)
                if n_best:
                    # Read and copy lines until sentinel line, copy real line id
                    # id ||| hypothesis words ||| feature scores ||| total score
                    (first_i, rest) = line.split(' ||| ', 1)
                    task.out.append(' ||| '.join((task.id, rest)))
                    while True:
                        line = moses.stdout.readline()
                        (i, rest) = line.split(' ||| ', 1)
                        # Sentinel
                        if i != first_i:
                            break
                        task.out.append(' ||| '.join((task.id, rest)))
                else:
                    task.out.append(line)
                # Signal task done
                task.event.set()
        # Output thread
        handler = threading.Thread(target=handle_output, args=())
        # Daemon: guaranteed to finish before non-daemons
        handler.setDaemon(True)
        handler.start()

        # Input thread: take tasks as they are available and add them to work
        # queue.  Stop when DONE encountered.
        while True:
            task = tasks.get(timeout=NEVER)
            work.put(task, timeout=NEVER)
            if task.event == DONE:
                break
            if n_best:
                # Input line followed by blank line (sentinel)
                moses.stdin.write(task.line)
                moses.stdin.write('\n')
            else:
                moses.stdin.write(task.line)

        # Cleanup: close stdin so moses drains and exits, then wait for the
        # process and its output handler.
        moses.stdin.close()
        moses.wait()
        handler.join()

    except:
        kill_main('Error with moses instance: see stderr')
162
+
163
+
164
def write_results(results, n_best=False, n_best_out=None):
    '''Write out results (output lines) from a queue as they are populated.

    results:    Queue of Task tuples in input order; a Task whose event is
                the DONE sentinel terminates the writer
    n_best:     if true, task.out holds N-best lines ("id ||| ...")
    n_best_out: open file for the N-best list (may be sys.stdout)

    Blocks on each task's event, so output is emitted strictly in input
    order regardless of which instance finished first.
    '''
    while True:
        task = results.get(timeout=NEVER)
        if task.event == DONE:
            break
        task.event.wait()
        if n_best:
            # Write top-best and N-best
            # id ||| hypothesis words ||| feature scores ||| total score
            top_best = task.out[0].split(' ||| ', 2)[1]
            # Except don't write top-best if writing N-best to stdout "-"
            if n_best_out != sys.stdout:
                sys.stdout.write('{}\n'.format(top_best))
                sys.stdout.flush()
            for line in task.out:
                n_best_out.write(line)
            n_best_out.flush()
        else:
            sys.stdout.write(task.out[0])
            sys.stdout.flush()
185
+
186
+
187
def main(argv):
    '''Entry point: parse wrapper options, start the moses instances and the
    result writer, then feed input lines as Tasks and shut everything down.
    '''
    # Defaults
    moses_ini = None
    input = INPUT
    procs = PROCS
    threads = THREADS
    extra = EXTRA
    n_best = False
    n_best_file = None
    n_best_size = None
    n_best_distinct = False
    n_best_out = None
    show_weights = False
    cpu_affinity = False

    # Decoder command
    cmd = argv[1:]

    # Parse special options and remove from cmd; everything unrecognized is
    # passed through to moses unchanged.
    i = 1
    while i < len(cmd):
        if cmd[i] in ('-f', '-config', '--config'):
            moses_ini = cmd[i + 1]
            # Do not remove from cmd
            i += 2
        elif cmd[i] in ('-i', '-input-file', '--input-file'):
            input = gzopen(cmd[i + 1])
            cmd = cmd[:i] + cmd[i + 2:]
        elif cmd[i] in ('-th', '-threads', '--threads'):
            # P:T:E
            args = cmd[i + 1].split(':')
            procs = int(args[0])
            if len(args) > 1:
                threads = int(args[1])
            if len(args) > 2:
                extra = int(args[2])
            cmd = cmd[:i] + cmd[i + 2:]
        elif cmd[i] in ('-n-best-list', '--n-best-list'):
            n_best = True
            n_best_file = cmd[i + 1]
            n_best_size = cmd[i + 2]
            # Optional "distinct"
            if i + 3 < len(cmd) and cmd[i + 3] == 'distinct':
                n_best_distinct = True
                cmd = cmd[:i] + cmd[i + 4:]
            else:
                cmd = cmd[:i] + cmd[i + 3:]
        # Handled specially for mert-moses.pl
        elif cmd[i] in ('-show-weights', '--show-weights'):
            show_weights = True
            # Do not remove from cmd
            i += 1
        elif cmd[i] in ('-cpu-affinity', '--cpu-affinity'):
            cpu_affinity = True
            cmd = cmd[:i] + cmd[i + 1:]
        else:
            i += 1

    # If mert-moses.pl passes -show-weights, just call moses
    if show_weights:
        sys.stdout.write(subprocess.check_output(cmd))
        sys.stdout.flush()
        return

    # Check inputs
    if not (len(cmd) > 0 and moses_ini):
        sys.stderr.write(HELP.format(os.path.basename(argv[0])))
        sys.exit(2)
    if not (os.path.isfile(cmd[0]) and os.access(cmd[0], os.X_OK)):
        raise Exception('moses "{}" is not executable\n'.format(cmd[0]))

    # Report settings
    sys.stderr.write('Moses flags: {}\n'.format(' '.join('\'{}\''.format(s) if ' ' in s else s for s in cmd[1:])))
    sys.stderr.write('Instances:   {}\n'.format(procs))
    sys.stderr.write('Threads per: {}\n'.format(threads))
    if extra:
        sys.stderr.write('Extra:       {}\n'.format(extra))
    if n_best:
        sys.stderr.write('N-best list: {} ({}{})\n'.format(n_best_file, n_best_size, ', distinct' if n_best_distinct else ''))

    # Task and result queues (buffer 8 * total threads input lines)
    tasks = Queue.Queue(maxsize=(8 * ((procs * threads) + extra)))
    results = Queue.Queue()

    # N-best capture: have each instance write its N-best list to stdout so
    # run_instance() can intercept and re-number it.
    if n_best:
        cmd.append('--n-best-list')
        cmd.append('-')
        cmd.append(n_best_size)
        if n_best_distinct:
            cmd.append('distinct')
        if n_best_file == '-':
            n_best_out = sys.stdout
        else:
            n_best_out = open(n_best_file, 'w')

    # Start instances (plus one extra instance if requested)
    cpu_offset = -threads
    instances = []
    for i in range(procs + (1 if extra else 0)):
        if cpu_affinity:
            cpu_offset += threads

        t = threading.Thread(target=run_instance, args=(cmd, (threads if i < procs else extra), tasks, cpu_affinity, cpu_offset, n_best))
        instances.append(t)
        # Daemon: guaranteed to finish before non-daemons
        t.setDaemon(True)
        t.start()
        #time.sleep(1)

    # Start results writer
    writer = threading.Thread(target=write_results, args=(results, n_best, n_best_out))
    writer.start()

    # Main loop: queue task for each input line.  Each task goes to both
    # queues: results (ordered output) and tasks (work distribution).
    id = 0
    while True:
        line = input.readline()
        if not line:
            break
        # (input, out lines, err lines, "done" event)
        task = Task(str(id), line, [], threading.Event())
        results.put(task, timeout=NEVER)
        tasks.put(task, timeout=NEVER)
        id += 1

    # Tell instances to exit
    for t in instances:
        tasks.put(Task(None, None, None, DONE), timeout=NEVER)
    for t in instances:
        t.join()

    # Stop results writer
    results.put(Task(None, None, None, DONE), timeout=NEVER)
    writer.join()

    # Cleanup
    if n_best:
        n_best_out.close()
326
+
327
+
328
# Script entry point: any unhandled exception hard-kills the process so no
# worker thread is left hanging.
if __name__ == '__main__':
    try:
        main(sys.argv)
    except:
        kill_main('Error with main I/O: see stderr')
mosesdecoder/scripts/generic/ph_numbers.perl ADDED
@@ -0,0 +1,106 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/usr/bin/env perl

package ph_numbers;

# Script to recognize and replace numbers in Moses training corpora
# and decoder input
#
# (c) 2013 TAUS
#
# This file is part of moses. Its use is licensed under the GNU Lesser General
# Public License version 2.1 or, at your option, any later version.

use warnings;
use strict;

# Modulino pattern: run() executes only when this file is invoked directly
# as a script; under require/use, caller() is true and nothing runs.
# (Getopt::Std below is loaded at compile time, before run() executes.)
run() unless caller();
use Getopt::Std;

# Debug tracing toggled via the DEBUG environment variable.
my $debug = $ENV{DEBUG} || 0;
20
+
21
# Command-line entry point: parse options and rewrite STDIN to STDOUT,
# replacing recognized numbers with a placeholder symbol.
#   -s / -t  source/target locale (parsed but not used in this sub)
#   -c       corpus mode: plain placeholder, no XML markup
#   -l       legacy <ne translation="..."> markup
#   -m       placeholder symbol (default '@num@')
#   -h       print usage and exit
sub run {
  my %opts;
  if(!getopts('s:t:cm:hl',\%opts) || $opts{h}) {
    print "Usage: perl $0 [-s source_locale][-t target_locale][-c][-h][-l][-m symbol] < in > out\n";
    exit;
  }
  my $sourceLocale = $opts{s} || "";
  my $targetLocale = $opts{t} || "";
  my $numberSymbol = $opts{m} || '@num@';
  while(<>) {
    chomp;
    # NOTE(review): mark_numbers() reads four arguments; the trailing $_
    # appears to be unused by the callee - confirm it can be dropped.
    print mark_numbers($_,$opts{c},$opts{l},$numberSymbol,$_),"\n";
  }
}
35
+
36
# Replace every number span found by recognize() in $input with the
# placeholder (corpus mode) or with <ne> markup (decoder-input modes),
# copying the non-number text through unchanged.  Returns the new string.
#   $input        - line of text
#   $corpusMode   - true: emit bare placeholder only
#   $legacyMode   - true: old-style <ne translation="NUMBER">sym</ne>
#   $numberSymbol - placeholder token (default '@num@')
sub mark_numbers {
  my $input = shift;
  my $corpusMode = shift;
  my $legacyMode = shift;
  my $numberSymbol = shift || '@num@';

  # List of [start, end) offsets of recognized numbers, left to right.
  my $numref = recognize($input);
  my $input_length = length($input);
  my $output = "";
  my $position = 0;
  for(my $i = 0 ; $i < scalar(@{$numref}) ; $i++) {
    my $numstart = $numref->[$i][0];
    my $numend = $numref->[$i][1];
    # Copy the literal text between the previous number and this one.
    if($position < $numstart) {
      $output .= substr($input,$position,$numstart-$position);
    }
    my $number = substr($input,$numstart,$numend-$numstart);
    if($corpusMode) {
      $output .= $numberSymbol;
    }
    else {
      if($legacyMode) {
        $output .= "<ne translation=\"$number\">$numberSymbol</ne>";
      }
      else {
        $output .= "<ne translation=\"$numberSymbol\" entity=\"$number\">$numberSymbol</ne>";
      }
    }
    $position = $numend;
  }
  # Copy any trailing text after the last number.
  $output .= substr($input,$position);
  return $output;
}
69
+
70
# Scan $input for number-like spans and return a reference to a list of
# [start, end) character offsets.  A span is kept only when it is bounded
# by the string edges or spaces on both sides (whole whitespace-delimited
# words only).
sub recognize {
  my $input = shift;
  #print STDERR "input=$input\n";

  my @recognized = ();
  # First regex finds a number token (optional sign, digits with . , e E);
  # \G with /g resumes each iteration where the previous match ended.
  while($input =~ /\G(.*?)(\s*)([+\-]?\p{Digit}*[\.,]?\p{Digit}+[\p{Digit}\.,+\-eE]*)/g) {
    my $start = $-[3];
    my $end = $+[3];
    # Greedily absorb following space-separated digit groups (e.g.
    # "1 000 000"); /c keeps pos() on failure so the outer loop resumes.
    while($input =~ /\G(\s+)(\p{Digit}+[\p{Digit}\.,+\-eE]*)/gc) {
      $end = $+[2];
    }

    # ALL characters in the word must be
    my $isRecognized = 1;
    if ($start == 0 || substr($input, $start - 1, 1) eq " ") {
      # 1st word, or previous char is a space
    }
    else {
      $isRecognized = 0;
    }

    if ($end == length($input) || substr($input, $end, 1) eq " ") {
      # last word, or next char is a space
    }
    else {
      $isRecognized = 0;
    }

    #print STDERR "start=$start end=$end len=" .length($input) ."\n";
    if ($isRecognized) {
      push @recognized,[$start,$end];
    }
  }
  return \@recognized;
}
105
+
106
+ 1;
mosesdecoder/scripts/generic/reverse-alignment.perl ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/usr/bin/env perl
#
# This file is part of moses. Its use is licensed under the GNU Lesser General
# Public License version 2.1 or, at your option, any later version.

use warnings;
use strict;

# Swap the two sides of each word-alignment point: every "s-t" token on a
# line becomes "t-s".  Output keeps one trailing space before the newline,
# matching the original format.
while (my $line = <STDIN>)
{
	chomp($line);
	foreach my $tok (split(/ /, $line))
	{
		my @pair = split(/-/, $tok);
		(@pair == 2) or die("Something wrong");
		print $pair[1] . "-" . $pair[0] . " ";
	}
	print "\n";
}
24
+
mosesdecoder/scripts/generic/score-parallel.perl ADDED
@@ -0,0 +1,428 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/usr/bin/env perl
#
# This file is part of moses. Its use is licensed under the GNU Lesser General
# Public License version 2.1 or, at your option, any later version.

# example
# ./score-parallel.perl 8 "gsort --batch-size=253" ./score ./extract.2.sorted.gz ./lex.2.f2e ./phrase-table.2.half.f2e --GoodTuring ./phrase-table.2.coc 0
# ./score-parallel.perl 8 "gsort --batch-size=253" ./score ./extract.2.inv.sorted.gz ./lex.2.e2f ./phrase-table.2.half.e2f --Inverse 1

# Parallel driver for the Moses "score" program: splits the (sorted)
# extract file into chunks on source-phrase boundaries, scores each chunk
# in a forked worker, then merges the partial phrase tables (and the
# optional .coc / label / parts-of-speech side files) back together.

use warnings;
use strict;
use File::Basename;

sub RunFork($);
sub systemCheck($);
sub GetSourcePhrase($);
sub NumStr($);
sub CutContextFile($$$);

# Prefer pigz (parallel gzip) when it is on the PATH.
my $GZIP_EXEC;
if(`which pigz`) {
  $GZIP_EXEC = 'pigz';
}
else {
  $GZIP_EXEC = 'gzip';
}
print STDERR "using $GZIP_EXEC \n";

# Maximum number of extract lines per chunk; a chunk may run longer so
# that it always ends on a source-phrase boundary.
#my $EXTRACT_SPLIT_LINES = 5000000;
my $EXTRACT_SPLIT_LINES = 50000000;

print STDERR "Started ".localtime() ."\n";

# Positional arguments (see examples above).
my $numParallel = $ARGV[0];
$numParallel = 1 if $numParallel < 1;

my $sortCmd = $ARGV[1];
my $scoreCmd = $ARGV[2];

my $extractFile = $ARGV[3]; # 1st arg of extract argument
my $lexFile = $ARGV[4];
my $ptHalf = $ARGV[5]; # output
my $inverse = 0;
my $sourceLabelsFile;
my $partsOfSpeechFile;
my $targetSyntacticPreferencesLabelsFile;

# Collect all remaining flags (except the trailing sort flag) to forward
# to the score program; a few flags also set local state here.
my $otherExtractArgs= "";
for (my $i = 6; $i < $#ARGV; ++$i)
{
  if ($ARGV[$i] eq '--SourceLabels') {
    $sourceLabelsFile = $ARGV[++$i];
    $otherExtractArgs .= "--SourceLabels --SourceLabelCountsLHS ";
    next;
  }
  if ($ARGV[$i] eq '--PartsOfSpeech') {
    $partsOfSpeechFile = $ARGV[++$i];
    $otherExtractArgs .= "--PartsOfSpeech ";
    next;
  }
  if ($ARGV[$i] eq '--TargetSyntacticPreferences') {
    $targetSyntacticPreferencesLabelsFile = $ARGV[++$i];
    $otherExtractArgs .= "--TargetSyntacticPreferences ";
    next;
  }
  if ($ARGV[$i] eq '--Inverse') {
    $inverse = 1;
    $otherExtractArgs .= $ARGV[$i] ." ";
    next;
  }
  $otherExtractArgs .= $ARGV[$i] ." ";
}
#$scoreCmd $extractFile $lexFile $ptHalf $otherExtractArgs

# --FlexibilityScore=<cmd> names a post-processing script; extract the
# command and strip the flag so it is not passed to the score program.
my $FlexibilityScore = $otherExtractArgs =~ /--FlexibilityScore/;
my $FlexibilityCmd = $otherExtractArgs;
$otherExtractArgs =~ s/--FlexibilityScore=\S+//; # don't pass flexibility_score command to score program
if ($FlexibilityCmd =~ /--FlexibilityScore=(\S+)/) {
  $FlexibilityCmd = $1;
}

my $doSort = $ARGV[$#ARGV]; # last arg

# Per-run scratch directory next to the output file; removed at the end.
my $TMPDIR=dirname($ptHalf) ."/tmp.$$";
mkdir $TMPDIR;

my $cmd;

# The extract context file (for flexibility scoring) is named by
# convention after the extract file.
my $extractFileContext;
if ($FlexibilityScore) {
  $extractFileContext = $extractFile;
  $extractFileContext =~ s/extract./extract.context./;
}

my $fileCount = 0;
if ($numParallel <= 1)
{ # don't do parallel. Just link the extract file into place
  $cmd = "ln -s $extractFile $TMPDIR/extract.0.gz";
  if ($FlexibilityScore) {
    $cmd .= " && ln -s $extractFileContext $TMPDIR/extract.context.0.gz";
  }
  print STDERR "$cmd \n";
  systemCheck($cmd);

  $fileCount = 1;
}
else
{ # cut up extract file into smaller mini-extract files.
  if ($extractFile =~ /\.gz$/) {
    open(IN, "gunzip -c $extractFile |") || die "can't open pipe to $extractFile";
  }
  else {
    open(IN, $extractFile) || die "can't open $extractFile";
  }

  # When flexibility scoring, the context file is split in lock-step with
  # the extract file (see CutContextFile); $lastlineContext carries the
  # one read-ahead line between chunk cuts.
  my $lastlineContext;
  if ($FlexibilityScore) {
    $lastlineContext = "";
    if ($extractFileContext =~ /\.gz$/) {
      open(IN_CONTEXT, "gunzip -c $extractFileContext |") || die "can't open pipe to $extractFileContext";
    }
    else {
      open(IN_CONTEXT, $extractFileContext) || die "can't open $extractFileContext";
    }
  }

  my $filePath = "$TMPDIR/extract.$fileCount.gz";
  open (OUT, "| $GZIP_EXEC -c > $filePath") or die "error starting $GZIP_EXEC $!";

  my $lineCount = 0;
  my $line;
  my $prevSourcePhrase = "";
  while ($line=<IN>)
  {
    chomp($line);
    ++$lineCount;

    if ($lineCount > $EXTRACT_SPLIT_LINES)
    { # over line limit. Cut off at next source phrase change
      my $sourcePhrase = GetSourcePhrase($line);

      if ($prevSourcePhrase eq "")
      { # start comparing
        $prevSourcePhrase = $sourcePhrase;
      }
      elsif ($sourcePhrase eq $prevSourcePhrase)
      { # can't cut off yet. Do nothing
      }
      else
      { # cut off, open next min-extract file & write to that instead
        close OUT;

        if ($FlexibilityScore) {
          $lastlineContext = CutContextFile($prevSourcePhrase, $fileCount, $lastlineContext);
        }
        $prevSourcePhrase = "";
        $lineCount = 0;
        ++$fileCount;
        my $filePath = $fileCount;
        $filePath = "$TMPDIR/extract.$filePath.gz";
        open (OUT, "| $GZIP_EXEC -c > $filePath") or die "error starting $GZIP_EXEC $!";
      }
    }
    else
    { # keep on writing to current mini-extract file
    }

    print OUT "$line\n";

  }
  close OUT;
  if ($FlexibilityScore) {
    $lastlineContext = CutContextFile($prevSourcePhrase, $fileCount, $lastlineContext);
  }
  ++$fileCount;
}


# create run scripts
my @runFiles = (0..($numParallel-1));
for (my $i = 0; $i < $numParallel; ++$i)
{
  my $path = "$TMPDIR/run.$i.sh";
  open(my $fh, ">", $path) or die "cannot open $path: $!";
  $runFiles[$i] = $fh;
}

# write scoring of mini-extracts to run scripts
# (chunk $i is assigned round-robin to worker $i % $numParallel)
for (my $i = 0; $i < $fileCount; ++$i)
{
  my $numStr = NumStr($i);

  my $fileInd = $i % $numParallel;
  my $fh = $runFiles[$fileInd];

  my $cmd = "$scoreCmd $TMPDIR/extract.$i.gz $lexFile $TMPDIR/phrase-table.half.$numStr.gz $otherExtractArgs 2>> /dev/stderr \n";
  print STDERR $cmd;

  # Optionally post-process each half phrase table with the flexibility
  # score script, replacing the chunk in place.
  if ($FlexibilityScore) {
    $cmd .= "gzip -cd $TMPDIR/phrase-table.half.$numStr.gz | $FlexibilityCmd $TMPDIR/extract.context.$i.gz";
    $cmd .= " --Inverse" if ($otherExtractArgs =~ /--Inverse/);
    $cmd .= " --Hierarchical" if ($otherExtractArgs =~ /--Hierarchical/);
    $cmd .= " | $GZIP_EXEC -c > $TMPDIR/phrase-table.half.$numStr.flex.gz\n";
    $cmd .= "mv $TMPDIR/phrase-table.half.$numStr.flex.gz $TMPDIR/phrase-table.half.$numStr.gz\n";
  }

  print $fh $cmd;
}

# close run script files
for (my $i = 0; $i < $numParallel; ++$i)
{
  close($runFiles[$i]);
  my $path = "$TMPDIR/run.$i.sh";
  systemCheck("chmod +x $path");
}

# run each score script in parallel
my @children;
for (my $i = 0; $i < $numParallel; ++$i)
{
  my $cmd = "$TMPDIR/run.$i.sh";
  my $pid = RunFork($cmd);
  push(@children, $pid);
}

# wait for everything is finished
foreach (@children) {
  waitpid($_, 0);
}

# merge & sort
$cmd = "\n\nOH SHIT. This should have been filled in \n\n";
if ($fileCount == 1 && !$doSort && !$FlexibilityScore)
{
  # Single chunk, no sorting requested: just move it into place.
  my $numStr = NumStr(0);
  $cmd = "mv $TMPDIR/phrase-table.half.$numStr.gz $ptHalf";
}
else
{
  # Concatenate (and optionally sort) all chunks into the final table.
  $cmd = "gunzip -c $TMPDIR/phrase-table.half.*.gz 2>> /dev/stderr";

  if ($doSort) {
    $cmd .= "| LC_ALL=C $sortCmd -T $TMPDIR ";
  }

  $cmd .= " | $GZIP_EXEC -c > $ptHalf 2>> /dev/stderr ";
}
print STDERR $cmd;
systemCheck($cmd);

# merge coc
# (.coc files hold one integer per line; merging sums them line-wise.
# NOTE(review): files after the first are assumed to have no more lines
# than the first -- extra lines would extend @arrayCOC only via the
# first file. TODO confirm all chunks emit equal-length .coc files.)
my $numStr = NumStr(0);
my $cocPath = "$TMPDIR/phrase-table.half.$numStr.gz.coc";

if (-e $cocPath)
{
  my @arrayCOC;
  my $line;

  # 1st file
  open(FHCOC, $cocPath) || die "can't open pipe to $cocPath";
  while ($line = <FHCOC>)
  {
    my $coc = int($line);
    push(@arrayCOC, $coc);
  }
  close(FHCOC);

  # all other files
  for (my $i = 1; $i < $fileCount; ++$i)
  {
    $numStr = NumStr($i);
    $cocPath = "$TMPDIR/phrase-table.half.$numStr.gz.coc";
    open(FHCOC, $cocPath) || die "can't open pipe to $cocPath";
    my $arrayInd = 0;
    while ($line = <FHCOC>)
    {
      my $coc = int($line);
      $arrayCOC[$arrayInd] += $coc;

      ++$arrayInd;
    }

    close(FHCOC);
  }

  # output
  $cocPath = "$ptHalf.coc";
  open(FHCOC, ">", $cocPath) or die "cannot open $cocPath: $!";
  for (my $i = 0; $i < @arrayCOC; ++$i)
  {
    print FHCOC $arrayCOC[$i]."\n";
  }
  close(FHCOC);
}

# merge source labels files
if (!$inverse && defined($sourceLabelsFile))
{
  my $cmd = "(echo \"GlueTop 0\"; echo \"GlueX 1\"; echo \"SSTART 2\"; echo \"SEND 3\"; cat $TMPDIR/phrase-table.half.*.gz.syntaxLabels.src | LC_ALL=C sort | uniq | perl -pe \"s/\$/ \@{[\$.+3]}/\") > $sourceLabelsFile";
  print STDERR "Merging source labels files: $cmd \n";
  `$cmd`;
}

# merge parts-of-speech files
if (!$inverse && defined($partsOfSpeechFile))
{
  my $cmd = "(echo \"SSTART 0\"; echo \"SEND 1\"; cat $TMPDIR/phrase-table.half.*.gz.partsOfSpeech | LC_ALL=C sort | uniq | perl -pe \"s/\$/ \@{[\$.+1]}/\") > $partsOfSpeechFile";
  print STDERR "Merging parts-of-speech files: $cmd \n";
  `$cmd`;
}

# merge target syntactic preferences labels files
if (!$inverse && defined($targetSyntacticPreferencesLabelsFile))
{
  my $cmd = "(echo \"GlueTop 0\"; echo \"GlueX 1\"; cat $TMPDIR/phrase-table.half.*.gz.syntaxLabels.tgtpref | LC_ALL=C sort | uniq | perl -pe \"s/\$/ \@{[\$.+1]}/\") > $targetSyntacticPreferencesLabelsFile";
  print STDERR "Merging target syntactic preferences labels files: $cmd \n";
  `$cmd`;
}

# Clean up the scratch directory.
$cmd = "rm -rf $TMPDIR \n";
print STDERR $cmd;
systemCheck($cmd);

print STDERR "Finished ".localtime() ."\n";
328
+ # -----------------------------------------
329
+ # -----------------------------------------
330
+
331
# Fork a child process that runs $cmd (via systemCheck) and exits.
# Returns the child's pid to the parent.
#
# Fix: fork() returns undef on failure, and "undef == 0" is numerically
# true, so the original made the PARENT fall into the child branch on a
# failed fork -- it would run the command itself and then exit. Check
# for undef explicitly and die instead.
sub RunFork($)
{
  my $cmd = shift;

  my $pid = fork();
  die "fork failed: $!" if !defined($pid);

  if ($pid == 0)
  { # child
    print STDERR $cmd;
    systemCheck($cmd);
    exit();
  }
  return $pid;
}
345
# Run a shell command and exit(1) if it fails.
#
# Fix: the original exited silently, leaving no clue which of the many
# commands this script runs was the one that failed. Report the command
# and its exit status on STDERR first.
sub systemCheck($)
{
  my $cmd = shift;
  my $retVal = system($cmd);
  if ($retVal != 0)
  {
    print STDERR "ERROR: command failed (exit status ".($retVal >> 8)."): $cmd\n";
    exit(1);
  }
}
354
+
355
# Return everything before the first "|||" separator on an extract line
# (including the trailing space, which callers rely on for comparison).
# NOTE(review): if the separator is missing, index() returns -1 and
# substr($line, 0, -1) yields the line minus its last character -- the
# same behaviour as the original code.
sub GetSourcePhrase($)
{
  my ($line) = @_;
  return substr($line, 0, index($line, "|||"));
}
362
+
363
+
364
# Zero-pad a chunk index to 7 digits so that lexicographic file-name
# order matches numeric order (e.g. 5 -> "0000005"). Indexes with more
# than 7 digits are returned unpadded, exactly as before.
sub NumStr($)
{
  my ($i) = @_;
  return sprintf("%07d", $i);
}
391
+
392
+
393
# Write one chunk of the extract *context* file, in lock-step with the
# extract-file chunk that was just closed.
#
# Arguments:
#   $lastsourcePhrase - last source phrase belonging to this chunk; copy
#                       context lines up to and including this phrase.
#   $fileCount        - chunk index, used to name the output file.
#   $lastline         - read-ahead line left over from the previous call
#                       (or "" on the first call); written first.
# Returns the one line read past the chunk boundary, so the next call
# can write it (the filehandle cursor has already consumed it).
#
# Uses globals: $TMPDIR, $GZIP_EXEC, and the IN_CONTEXT filehandle
# opened by the main script.
sub CutContextFile($$$)
{
  my($lastsourcePhrase, $fileCount, $lastline) = @_;
  my $line;
  my $sourcePhrase;

  my $filePath = "$TMPDIR/extract.context.$fileCount.gz";
  open (OUT_CONTEXT, "| $GZIP_EXEC -c > $filePath") or die "error starting $GZIP_EXEC $!";

  # Flush the read-ahead line from the previous chunk cut, if any.
  if ($lastline ne "") {
    print OUT_CONTEXT "$lastline\n";
  }

  #write all lines in context file until we meet last source phrase in extract file
  while ($line=<IN_CONTEXT>)
  {
    chomp($line);
    $sourcePhrase = GetSourcePhrase($line);
    print OUT_CONTEXT "$line\n";
    if ($sourcePhrase eq $lastsourcePhrase) {last;}
  }

  #write all lines in context file that correspond to last source phrase in extract file
  while ($line=<IN_CONTEXT>)
  {
    chomp($line);
    $sourcePhrase = GetSourcePhrase($line);
    if ($sourcePhrase ne $lastsourcePhrase) {last;}
    print OUT_CONTEXT "$line\n";
  }

  close(OUT_CONTEXT);

  # NOTE(review): $line is the first line of the NEXT chunk (or undef at
  # EOF); the caller stores it and passes it back in as $lastline.
  return $line;

}
mosesdecoder/scripts/generic/score_parallel.py ADDED
@@ -0,0 +1,776 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #! /usr/bin/env python
2
+ #
3
+ # This file is part of moses. Its use is licensed under the GNU Lesser General
4
+ # Public License version 2.1 or, at your option, any later version.
5
+ #
6
+ # Script contributed by Precision Translation Tools.
7
+
8
+ """Run Moses `score` jobs in parallel.
9
+
10
+ This script is a replacement for `score-parallel.perl`. The two are similar,
11
+ but there are differences in usage. In addition, this script can be called
12
+ directly from Python code without the need to run it as a separate process.
13
+ """
14
+
15
+ from __future__ import (
16
+ absolute_import,
17
+ print_function,
18
+ unicode_literals,
19
+ )
20
+
21
+ __metaclass__ = type
22
+
23
+ from argparse import ArgumentParser
24
+ from contextlib import contextmanager
25
+ from datetime import datetime
26
+ import errno
27
+ import gzip
28
+ from multiprocessing import Pool
29
+ import os
30
+ import os.path
31
+ import pipes
32
+ from shutil import rmtree
33
+ from subprocess import check_call
34
+ import sys
35
+ import tempfile
36
+
37
+
38
def get_unicode_type():
    """Return the Unicode string type appropriate to this Python version."""
    # Python 2 keeps text in the "unicode" type ("str" is a byte string
    # there); from Python 3 onward the default "str" type is the text type.
    return unicode if sys.version_info.major <= 2 else str


UNICODE_TYPE = get_unicode_type()
51
+
52
+
53
class CommandLineError(Exception):
    """Invalid command line."""
    # Raised for user-facing argument problems, e.g. a bad --jobs value
    # or a qualified executable path that is not actually executable.
55
+
56
+
57
class ProgramFailure(Exception):
    """Failure, not a bug, which is reported neatly to the user."""
    # Raised for environment problems such as a required external program
    # (score, gzip, sort) not being found on the PATH.
59
+
60
+
61
def parse_args():
    """Parse command line arguments, return as `Namespace`."""
    # Required inputs/outputs first, then optional behaviour switches,
    # then tool-location overrides. Defaults for the tool options are
    # resolved later by sanitize_args(), not here.
    parser = ArgumentParser(description=__doc__)
    parser.add_argument(
        '--extract-file', '-e', metavar='PATH', required=True,
        help=(
            "Path to input file: extract file (e.g. 'extract.sorted.gz' or "
            "'extract.inv.sorted.gz'). Required."))
    parser.add_argument(
        '--lex-file', '-l', metavar='PATH', required=True,
        help=(
            "Path to input file: lex file (e.g. 'lex.f2e' or 'lex.e2f'). "
            "Required."))
    parser.add_argument(
        '--output', '-o', metavar='PATH', required=True,
        help=(
            "Write phrase table to file PATH (e.g. 'phrase-table.half.f2e' "
            "or 'phrase-table.half.e2f'). Required."))
    parser.add_argument(
        '--inverse', '-i', action='store_true',
        help="Inverse scoring. Defaults to direct scoring.")
    parser.add_argument(
        '--labels-file', '-L', metavar='PATH',
        help="Also write source labels to file PATH.")
    parser.add_argument(
        '--parts-of-speech', '-p', metavar='PATH',
        help="Also write parts-of-speech file to PATH.")
    parser.add_argument(
        '--flexibility-score', '-F', metavar='PATH',
        help="Path to the 'flexibility_score.py' script. Defaults to none.")
    parser.add_argument(
        '--hierarchical', '-H', action='store_true',
        help="Process hierarchical rules.")
    parser.add_argument(
        '--args', '-a', metavar='ARGUMENTS',
        help="Additional arguments for `score` and `flexibility_score`.")
    parser.add_argument(
        '--sort', '-s', action='store_true',
        help="Sort output file.")
    parser.add_argument(
        '--jobs', '-j', metavar='N', type=int, default=1,
        help="Run up to N jobs in parallel. Defaults to %(default)s.")
    parser.add_argument(
        '--score-exe', '-x', metavar='PROGRAM',
        help="Name of, or path to, the 'score' executable.")
    parser.add_argument(
        '--sort-command', '-S', metavar='COMMAND-LINE',
        help=(
            "Command line for sorting text files to standard output. "
            "Must support operation as a pipe, as well as input files named "
            "as command-line arguments."))
    parser.add_argument(
        '--gzip-command', '-z', metavar='PROGRAM',
        help="Path to a gzip or pigz executable.")
    parser.add_argument(
        '--verbose', '-v', action='store_true',
        help="Print what's going on.")
    parser.add_argument(
        '--debug', '-d', action='store_true',
        help="Don't delete temporary directories when done.")
    return parser.parse_args()
122
+
123
+
124
def normalize_path(optional_path=None):
    """Return a cleaned-up version of a given filesystem path, or None.

    Converts the path to the operating system's native conventions, and
    removes redundancies like `.`.

    The return value will be `None`, an absolute path, or a relative path,
    same as the argument. But it will have redundant path separators,
    unnecessary detours through parent directories, and use of the current
    directory "." removed.
    """
    if optional_path is None:
        return None
    cleaned = os.path.normpath(optional_path)
    # Fold both separator styles into the native one.
    for separator in ('/', '\\'):
        cleaned = cleaned.replace(separator, os.path.sep)
    return cleaned
142
+
143
+
144
def quote(path):
    """Quote and escape a filename for use in a shell command.

    The Windows implementation is very limited and will break on anything
    more advanced than a space.
    """
    if os.name == 'posix':
        # pipes.quote was deprecated and removed in Python 3.13;
        # shlex.quote is its direct replacement. Fall back to pipes on
        # old interpreters (Python 2) that lack shlex.quote.
        try:
            from shlex import quote as shell_quote
        except ImportError:
            from pipes import quote as shell_quote
        return shell_quote(path)
    else:
        # TODO: Improve escaping for Windows.
        return '"%s"' % path
155
+
156
+
157
def sanitize_args(args):
    """Check `args` for sanity, clean up, and set nontrivial defaults.

    Mutates the argparse `Namespace` in place: resolves default sort,
    gzip, and score executables, and normalizes all path arguments.

    :raises CommandLineError: for an invalid --jobs value, or when no
        sort/gzip command can be found on the PATH.
    """
    if args.jobs < 1:
        raise CommandLineError("Number of parallel jobs must be 1 or more.")
    # Prefer neandersort (handles compressed files natively -- see
    # merge_and_sort), then GNU gsort, then plain sort.
    if args.sort_command is None:
        args.sort_command = find_first_executable(
            ['neandersort', 'gsort', 'sort'])
    if args.sort_command is None:
        raise CommandLineError(
            "No 'sort' command is available. "
            "Choose one using the --sort-command option.")
    # Prefer pigz (parallel gzip) when available.
    if args.gzip_command is None:
        args.gzip_command = find_first_executable(['pigz', 'gzip'])
    if args.gzip_command is None:
        raise CommandLineError(
            "No 'gzip' or 'pigz' command is available. "
            "Choose one using the --gzip-command option.")
    # NOTE(review): find_first_executable raises ProgramFailure rather
    # than returning None, so the two "is None" checks above look
    # unreachable -- confirm intended behaviour.
    if args.score_exe is None:
        # Look for "score" executable. It may be in the current project
        # directory somewhere, or in the PATH.
        moses_dir = os.path.dirname(os.path.dirname(
            os.path.abspath(__file__)))
        args.score_exe = find_first_executable(
            ['score'],
            [
                moses_dir,
                os.path.join(moses_dir, 'phrase-extract'),
                os.path.join(moses_dir, 'binaries'),
            ])
    # Normalize every path-valued argument to native conventions.
    args.extract_file = normalize_path(args.extract_file)
    args.lex_file = normalize_path(args.lex_file)
    args.output = normalize_path(args.output)
    args.labels_file = normalize_path(args.labels_file)
    args.parts_of_speech = normalize_path(args.parts_of_speech)
    args.flexibility_score = normalize_path(args.flexibility_score)
    args.score_exe = normalize_path(args.score_exe)
193
+
194
+
195
def add_exe_suffix(program):
    """Return the full filename for an executable.

    On Windows this appends an `.exe` suffix to the name; on other
    (POSIX-like) systems the original name is returned unchanged.
    """
    suffix = '.exe' if os.name == 'nt' else ''
    return program + suffix
207
+
208
+
209
def find_executable(exe, extra_path=None):
    """Return full path to an executable of the given name, or `None`.

    If the given name is a qualified path to an executable, it will be returned
    unchanged. A qualified path where no executable is found results in a
    `CommandLineError`.

    :param exe: Program name (with platform suffix) or qualified path.
    :param extra_path: Directories to search before `$PATH`.
    """
    if extra_path is None:
        extra_path = []

    if os.path.sep in exe:
        # The executable name includes a path. Only one place it can be.
        if not os.path.isfile(exe) or not os.access(exe, os.X_OK):
            raise CommandLineError("Not an executable: '%s'." % exe)
        return exe

    # Fix: PATH may be unset, in which case os.getenv returned None and
    # .split crashed; default to the empty string. Also require each
    # candidate to be a regular file -- os.access(path, X_OK) is true
    # for directories, so a directory named like the program used to
    # match.
    for path in extra_path + os.getenv('PATH', '').split(os.pathsep):
        full_path = os.path.join(path, exe)
        if os.path.isfile(full_path) and os.access(full_path, os.X_OK):
            return full_path
    return None
230
+
231
+
232
def find_first_executable(candidates, extra_path=None):
    """Find the first available of the given candidate programs.

    :raise ProgramFailure: If none of `candidates` was found.
    """
    for candidate in candidates:
        located = find_executable(add_exe_suffix(candidate), extra_path)
        if located is not None:
            return located
    raise ProgramFailure(
        "Could not find any of these executables in path: %s."
        % ', '.join(candidates))
244
+
245
+
246
def execute_shell(command, verbose=False):
    """Run `command` string through the shell.

    Inherits environment, but sets `LC_ALL` to `C` for predictable results,
    especially from sort commands.

    This uses a full-featured shell, including pipes, substitution, etc. So
    remember to quote/escape arguments where appropriate!
    """
    assert isinstance(command, UNICODE_TYPE), (
        "Wrong argument for execute_shell.")
    if verbose:
        print("Executing: %s" % command)
    environment = dict(os.environ)
    if os.name == 'posix':
        environment['LC_ALL'] = 'C'
    check_call(command, shell=True, env=environment)
263
+
264
+
265
@contextmanager
def tempdir(keep=False):
    """Context manager: temporary directory.

    Yields the path of a freshly created temporary directory. Unless
    `keep` is true (useful for debugging), the directory and everything
    in it is removed on exit.

    Fix: cleanup now runs in a `finally` block, so the directory is no
    longer leaked when the body raises an exception.
    """
    directory = tempfile.mkdtemp()
    try:
        yield directory
    finally:
        if not keep:
            rmtree(directory)
272
+
273
+
274
def make_dirs(path):
    """Equivalent to `mkdir -p -- path`."""
    try:
        os.makedirs(path)
    except OSError as error:
        # An already-existing directory is fine; re-raise anything else.
        if error.errno == errno.EEXIST:
            return
        raise
281
+
282
+
283
def open_file(path, mode='r'):
    """Open a file, which may be gzip-compressed.

    Files whose name ends in `.gz` are opened through the gzip module;
    anything else is opened as a plain file.
    """
    opener = gzip.open if path.endswith('.gz') else open
    return opener(path, mode)
289
+
290
+
291
def count_lines(filename):
    """Count the number of lines in `filename` (may be gzip-compressed)."""
    # Inline the gzip-or-plain choice so this helper stands alone.
    opener = gzip.open if filename.endswith('.gz') else open
    total = 0
    with opener(filename) as stream:
        for _ in stream:
            total += 1
    return total
298
+
299
+
300
def set_temp_dir():
    """Set temporary directory to `$MOSES_TEMP_DIR`, if set.

    Creates the directory if it does not exist yet; does nothing when
    the environment variable is unset.
    """
    configured = os.getenv('MOSES_TEMP_DIR')
    if configured is None:
        return
    make_dirs(configured)
    tempfile.tempdir = configured
309
+
310
+
311
def strip_newline(line):
    """Remove trailing carriage return and/or line feed, if present.

    Strips at most one LF and then at most one CR, so exactly one
    "\\r\\n" or "\\n" line ending disappears.
    """
    for terminator in ('\n', '\r'):
        if line.endswith(terminator):
            line = line[:-1]
    return line
318
+
319
+
320
def open_chunk_file(split_dir, chunk_number):
    """Open a file to write one chunk of the extract file."""
    chunk_path = os.path.join(split_dir, 'extract.%d.gz' % chunk_number)
    return open_file(chunk_path, 'w')
324
+
325
+
326
def name_context_chunk_file(split_dir, chunk_number):
    """Compose file name for one chunk of the extract context file."""
    basename = 'extract.context.%d.gz' % chunk_number
    return os.path.join(split_dir, basename)
330
+
331
+
332
def extract_source_phrase(line):
    """Extract the source phrase from an extract-file line.

    Accepts either bytes or text. Extract lines arrive as bytes when
    read straight from a gzip stream (as in `cut_context_file`), but
    `split_extract_files` decodes them to unicode before calling this
    function -- and splitting a text line on a bytes separator raises
    TypeError on Python 3. Pick the separator type to match the input.
    """
    separator = b'|||' if isinstance(line, bytes) else '|||'
    return line.split(separator, 1)[0]
335
+
336
+
337
def cut_context_file(last_source_phrase, chunk_file, last_line,
                     context_stream):
    """Write one chunk of extract context file into its own file.

    :param last_source_phrase: Last source phrase that should be in the
        chunk. Stop processing after this source phrase.
    :param chunk_file: Path to the extract context file for this chunk.
    :param last_line: Previously read line that may still need writing.
    :param context_stream: Extract context file, opened for reading.
    :return: Last line read from `context_stream`. This line will still
        need processing.
    """
    # NOTE(review): gzip.open(..., 'w') is a *binary* stream on Python 3,
    # while the lines written below are formatted with '%s\n' (text);
    # this path looks Python-2-only as written -- TODO confirm.
    # TODO: Use open_file.
    with gzip.open(chunk_file, 'w') as chunk:
        if last_line is not None:
            chunk.write('%s\n' % last_line)

        # Are we processing our last source phrase yet?
        on_last_source_phrase = False

        # Write all lines in context file until we meet last source phrase
        # in extract file.
        for line in context_stream:
            # Reading from a gzip file returns lines *including the newline*.
            # Either way, we want to ignore carriage returns as well.
            line = strip_newline(line)
            source_phrase = extract_source_phrase(line)
            if on_last_source_phrase and source_phrase != last_source_phrase:
                # First new source phrase after our last one. We're done.
                # The returned line has been consumed from the stream, so
                # the *next* chunk must write it first (see last_line).
                return line
            else:
                # Still adding lines to our chunk.
                chunk.write('%s\n' % line)
                if source_phrase == last_source_phrase:
                    # We're on our last source phrase now.
                    on_last_source_phrase = True
    # NOTE(review): falls off the end (returns None) at EOF of the
    # context stream -- callers treat None as "nothing left over".
373
+
374
+
375
def split_extract_files(split_dir, extract_file, extract_context_file=None,
                        jobs=1):
    """Split extract file into chunks, so we can process them in parallel.

    :param split_dir: A temporary directory where this function can write
        temporary files. The caller must ensure that this directory will be
        cleaned up after it's done with the files.
    :return: An iterable of tuples. Each tuple holds a partial extract file,
        and the corresponding context file (or None). The files may be in
        `split_dir`, or there may just be the original extract file.
    """
    if jobs == 1:
        # No splitting needed. Read the original file(s).
        return [(extract_file, extract_context_file)]

    # Otherwise: split files.
    files = []
    num_lines = count_lines(extract_file)
    # Ceiling division. Fix: the original used "/", which on Python 3
    # produces a float and trips the isinstance assertion below.
    chunk_size = (num_lines + jobs - 1) // jobs
    assert isinstance(chunk_size, int)

    line_count = 0
    chunk_number = 0
    prev_source_phrase = None
    last_line_context = None
    context_stream = None
    chunk_context_file = None
    extract_stream = open_file(extract_file)
    chunk_file = open_chunk_file(split_dir, chunk_number)
    if extract_context_file is not None:
        context_stream = open_file(extract_context_file)

    for line in extract_stream:
        line_count += 1
        line = line.decode('utf-8')
        line = strip_newline(line)
        if line_count >= chunk_size:
            # At or over chunk size. Cut off at next source phrase change.
            source_phrase = extract_source_phrase(line)
            if prev_source_phrase is None:
                # Start looking for a different source phrase.
                prev_source_phrase = source_phrase
            elif source_phrase == prev_source_phrase:
                # Can't cut yet. Still working on the same source phrase.
                pass
            else:
                # Hit first new source phrase after chunk limit. Cut new
                # file(s).
                chunk_file.close()
                if extract_context_file is not None:
                    chunk_context_file = name_context_chunk_file(
                        split_dir, chunk_number)
                    last_line_context = cut_context_file(
                        prev_source_phrase, chunk_context_file,
                        last_line_context, context_stream)
                files.append((chunk_file.name, chunk_context_file))

                # Start on new chunk.
                prev_source_phrase = None
                line_count = 0
                chunk_number += 1
                chunk_file = open_chunk_file(split_dir, chunk_number)
        chunk_file.write(('%s\n' % line).encode('utf-8'))

    chunk_file.close()
    if extract_context_file is not None:
        chunk_context_file = name_context_chunk_file(split_dir, chunk_number)
        # Fix: the original passed chunk_number (an int) where
        # cut_context_file expects the chunk file *path*.
        last_line_context = cut_context_file(
            prev_source_phrase, chunk_context_file, last_line_context,
            context_stream)
    files.append((chunk_file.name, chunk_context_file))
    return files
447
+
448
+
449
def compose_score_command(extract_file, context_file, half_file,
                          flex_half_file, args):
    """Compose command line text to run one instance of `score`.

    :param extract_file: One chunk of extract file.
    :param context_file: If doing flexibility scoring, one chunk of
        extract context file. Otherwise, None.
    :param half_file: Output path for this chunk's half phrase table
        (gzip-compressed).
    :param flex_half_file: Output path for this chunk's flexibility-scored
        half phrase table; None when not doing flexibility scoring.
    :param args: Arguments namespace.
    """
    command = [
        args.score_exe,
        extract_file,
        args.lex_file,
        half_file,
    ]
    # Fix: build_score_args() already includes args.args; the original
    # also appended args.args directly here, duplicating the extra
    # arguments on the score command line.
    other_args = build_score_args(args)
    if other_args != '':
        command.append(other_args)
    if context_file is not None:
        command += [
            '&&',
            # Fix: the half file is gzip-compressed, so decompress it
            # with the configured gzip/pigz tool; the original piped it
            # through bzcat (a bzip2 tool), which cannot read gzip data.
            quote(args.gzip_command),
            '-c', '-d',
            half_file,
            '|',
            quote(args.flexibility_score),
            quote(context_file),
        ]
        if args.inverse:
            command.append('--Inverse')
        if args.hierarchical:
            command.append('--Hierarchical')
        command += [
            '|',
            quote(args.gzip_command),
            '-c',
            '>%s' % quote(flex_half_file),
        ]
    return ' '.join(command)
491
+
492
+
493
def score_parallel(split_dir, file_pairs, args):
    """Run the `score` command in parallel.

    :param split_dir: Temporary directory where we can create split files.
    :param file_pairs: Sequence of tuples for the input files, one tuple
        per chunk of the work. Each tuple consists of a partial extract
        file, and optionally a partial extract context file.
    :param args: Arguments namespace.
    :return: A list of tuples. Each tuple contains two file paths. The first
        is for a partial half-phrase-table file. The second is for the
        corresponding partial flex file, if a context file is given; or
        `None` otherwise.
    :raises CalledProcessError: if any of the score commands failed.
    """
    partial_files = []
    async_results = []
    # Pool of worker processes for executing the partial "score" invocations
    # concurrently.
    pool = Pool(args.jobs)
    try:
        for chunk_num, file_pair in enumerate(file_pairs):
            half_file = os.path.join(
                split_dir, 'phrase-table.half.%06d.gz' % chunk_num)
            extract_file, context_file = file_pair
            if context_file is None:
                flex_half_file = None
            else:
                flex_half_file = os.path.join(
                    split_dir, 'phrase-table.half.%06d.flex.gz' % chunk_num)
            # Pickling of arguments for the pool is awkward on Windows, so
            # keep them simple. Compose the command line in the parent
            # process, then hand them to worker processes which execute them.
            command_line = compose_score_command(
                extract_file, context_file, half_file, flex_half_file, args)
            async_results.append(pool.apply_async(
                execute_shell, (command_line, ), {'verbose': args.verbose}))
            partial_files.append((half_file, flex_half_file))
        pool.close()
    except BaseException:
        pool.terminate()
        raise
    finally:
        pool.join()
    # Fix: the original discarded the AsyncResult objects, silently
    # ignoring failed score commands. Calling get() re-raises any
    # exception from the worker processes.
    for result in async_results:
        result.get()
    return partial_files
535
+
536
+
537
def merge_and_sort(files, output, sort_command=None, gzip_exe=None,
                   verbose=False):
    """Merge partial files.

    :param files: List of partial half-phrase-table files.
    :param output: Path for resulting combined phrase-table file.
    """
    # TODO: The Perl code mentioned "sort" and "flexibility_score" here.
    # What do we do with those?

    # Sort whether we're asked to or not, as a way of combining the input
    # files.
    if sort_command == 'neandersort':
        # Neandersort transparently decompresses input and compresses output.
        check_call([
            'neandersort',
            '-o', output,
        ] + files)
    else:
        # Fix: the final redirect was ">>" (append), so re-running over an
        # existing output file would silently duplicate its contents; the
        # Perl original truncates with ">".
        command = (
            "%(gzip)s -c -d %(files)s | "
            "%(sort)s | "
            "%(gzip)s -c >%(output)s"
            % {
                'gzip': quote(gzip_exe),
                'sort': sort_command,
                'files': ' '.join(map(quote, files)),
                'output': quote(output),
            })
        execute_shell(command, verbose=verbose)
567
+
568
+
569
def build_score_args(args):
    """Compose command line for the `score` program.

    Translates the relevant entries of the arguments namespace into the
    flags `score` understands, plus any free-form extra arguments.
    """
    parts = []
    if args.labels_file:
        parts.extend([
            '--SourceLabels',
            '--SourceLabelCountsLHS',
            '--SourceLabelSet',
        ])
    if args.parts_of_speech:
        parts.append('--PartsOfSpeech')
    if args.inverse:
        parts.append('--Inverse')
    if args.args is not None:
        parts.append(args.args)
    return ' '.join(parts)
585
+
586
+
587
+ def list_existing(paths):
588
+ """Return, in the same order, those of the given files which exist."""
589
+ return filter(os.path.exists, paths)
590
+
591
+
592
+ def compose_coc_path_for(path):
593
+ """Compose COC-file path for the given file."""
594
+ return '%s.coc' % path
595
+
596
+
597
+ def read_cocs(path):
598
+ """Read COC file at `path`, return contents as tuple of ints."""
599
+ with open(path) as lines:
600
+ return tuple(
601
+ int(line.rstrip('\r\n'))
602
+ for line in lines
603
+ )
604
+
605
+
606
+ def add_cocs(original, additional):
607
+ """Add two tuples of COCs. Extend as needed."""
608
+ assert not (original is None and additional is None), "No COCs to add!"
609
+ if original is None:
610
+ return additional
611
+ elif additional is None:
612
+ return original
613
+ else:
614
+ common = tuple(lhs + rhs for lhs, rhs in zip(original, additional))
615
+ return (
616
+ common +
617
+ tuple(original[len(common):]) +
618
+ tuple(additional[len(common):]))
619
+
620
+
621
+ def merge_coc(files, output):
622
+ """Merge COC files for the given partial files.
623
+
624
+ Each COC file is a series of integers, one per line. This reads them, and
625
+ adds them up line-wise into one file of the same format: the sum of the
626
+ numbers the respective files have at line 1, the sum of the numbers the
627
+ respective files have at line 2, and so on.
628
+ """
629
+ assert len(files) > 0, "No partial files - no work to do."
630
+ extract_files = [extract_file for extract_file, _ in files]
631
+ if not os.path.exists(compose_coc_path_for(extract_files[0])):
632
+ # Nothing to merge.
633
+ return
634
+ totals = None
635
+ # TODO: Shouldn't we just fail if any of these files is missing?
636
+ for coc_path in list_existing(map(compose_coc_path_for, extract_files)):
637
+ totals = add_cocs(totals, read_cocs(coc_path))
638
+
639
+ # Write to output file.
640
+ with open(output, 'w') as output_stream:
641
+ for entry in totals:
642
+ output_stream.write('%d\n' % entry)
643
+
644
+
645
+ def suffix_line_numbers(infile, outfile):
646
+ """Rewrite `infile` to `outfile`; suffix line number to each line.
647
+
648
+ The line number is zero-based, and separated from the rest of the line
649
+ by a single space.
650
+ """
651
+ temp_file = '%s.numbering' % outfile
652
+ with open(infile, 'r') as instream, open(outfile, 'w') as outstream:
653
+ line_no = 0
654
+ for line in instream:
655
+ outstream.write(line)
656
+ outstream.write(' %d\n' % line_no)
657
+ line_no += 1
658
+ os.rename(temp_file, outfile)
659
+
660
+
661
+ def compose_source_labels_path_for(path):
662
+ """Return source labels file path for given file."""
663
+ return '%s.syntaxLabels.src' % path
664
+
665
+
666
+ def merge_numbered_files(inputs, output, header_lines, sort_command,
667
+ verbose=False):
668
+ """Sort and merge files `inputs`, add header and line numbers.
669
+
670
+ :param inputs: Iterable of input files.
671
+ :param output: Output file.
672
+ :header_lines: Iterable of header lines.
673
+ :sort_command: Command line for sorting input files.
674
+ """
675
+ sort_temp = '%s.sorting' % output
676
+ with open(sort_temp, 'w') as stream:
677
+ for line in header_lines:
678
+ stream.write(line)
679
+ stream.write('\n')
680
+ execute_shell(
681
+ "%s %s >>%s" % (
682
+ sort_command,
683
+ ' '.join(map(quote, inputs)),
684
+ quote(sort_temp)),
685
+ verbose=verbose)
686
+ suffix_line_numbers(sort_temp, output)
687
+
688
+
689
+ def merge_source_labels(files, output, sort_command, verbose=False):
690
+ """Merge source labels files."""
691
+ # TODO: Shouldn't we just fail if any of these files is missing?
692
+ labels_files = list_existing(map(compose_source_labels_path_for, files))
693
+ header = [
694
+ 'GlueTop',
695
+ 'GlueX',
696
+ 'SSTART',
697
+ 'SEND',
698
+ ]
699
+ merge_numbered_files(
700
+ labels_files, output, header, sort_command, verbose=verbose)
701
+
702
+
703
+ def compose_parts_of_speech_path_for(path):
704
+ """Return parts-of-speech file path for given file."""
705
+ return '%s.partsOfSpeech' % path
706
+
707
+
708
+ def merge_parts_of_speech(files, output, sort_command, verbose=False):
709
+ """Merge parts-of-speech files into output."""
710
+ # TODO: Shouldn't we just fail if any of these files is missing?
711
+ parts_files = list_existing(map(compose_parts_of_speech_path_for, files))
712
+ header = [
713
+ 'SSTART',
714
+ 'SEND',
715
+ ]
716
+ merge_numbered_files(
717
+ parts_files, output, header, sort_command, verbose=verbose)
718
+
719
+
720
+ def main():
721
+ """Command-line entry point. Marshals and forwards to `score_parallel`."""
722
+ args = parse_args()
723
+ sanitize_args(args)
724
+ set_temp_dir()
725
+
726
+ if args.flexibility_score is None:
727
+ extract_context_file = None
728
+ else:
729
+ extract_context_file = args.extract_file.replace(
730
+ 'extract.', 'extract.context.')
731
+
732
+ if args.verbose:
733
+ print("Started %s." % datetime.now())
734
+ print("Using '%s' for gzip." % args.gzip_command)
735
+
736
+ with tempdir(args.debug) as split_dir:
737
+ extract_files = split_extract_files(
738
+ split_dir, args.extract_file,
739
+ extract_context_file=extract_context_file, jobs=args.jobs)
740
+
741
+ scored_files = score_parallel(split_dir, extract_files, args)
742
+
743
+ if args.verbose:
744
+ sys.stderr.write("Finished score %s.\n" % datetime.now())
745
+
746
+ # TODO: Pass on "sort" and "flexibility-score" arguments?
747
+ merge_and_sort(
748
+ [phrase_chunk for phrase_chunk, _ in scored_files], args.output,
749
+ sort_command=args.sort_command, gzip_exe=args.gzip_command,
750
+ verbose=args.verbose)
751
+ merge_coc(extract_files, compose_coc_path_for(args.output))
752
+
753
+ if not args.inverse and args.labels_file is not None:
754
+ if args.verbose:
755
+ print("Merging source labels files.")
756
+ merge_source_labels(
757
+ extract_files, args.labels_file,
758
+ sort_command=args.sort_command, verbose=args.verbose)
759
+
760
+ if not args.inverse and args.parts_of_speech is not None:
761
+ if args.verbose:
762
+ print("Merging parts-of-speech files.")
763
+ merge_parts_of_speech(
764
+ extract_files, args.parts_of_speech,
765
+ sort_command=args.sort_command, verbose=args.verbose)
766
+
767
+
768
+ if __name__ == '__main__':
769
+ try:
770
+ main()
771
+ except ProgramFailure as error:
772
+ sys.stderr.write('%s\n' % error)
773
+ sys.exit(1)
774
+ except CommandLineError as error:
775
+ sys.stderr.write("Command line error: %s\n" % error)
776
+ sys.exit(2)
mosesdecoder/scripts/generic/strip-xml.perl ADDED
@@ -0,0 +1,48 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env perl
2
+ #
3
+ # This file is part of moses. Its use is licensed under the GNU Lesser General
4
+ # Public License version 2.1 or, at your option, any later version.
5
+
6
+ use warnings;
7
+ use strict;
8
+
9
+ while (my $line = <STDIN>) {
10
+ chomp($line);
11
+ #print "$line\n";
12
+
13
+ my $len = length($line);
14
+ my $inXML = 0;
15
+ my $prevSpace = 1;
16
+ my $prevBar = 0;
17
+
18
+ for (my $i = 0; $i < $len; ++$i) {
19
+ my $c = substr($line, $i, 1);
20
+ if ($c eq "<" && !$prevBar) {
21
+ ++$inXML;
22
+ }
23
+ elsif ($c eq ">" && $inXML>0) {
24
+ --$inXML;
25
+ }
26
+ elsif ($prevSpace == 1 && $c eq " ")
27
+ { # duplicate space. Do nothing
28
+ }
29
+ elsif ($inXML == 0) {
30
+ if ($c eq " ") {
31
+ $prevSpace = 1;
32
+ $prevBar = 0;
33
+ }
34
+ elsif ($c eq "|") {
35
+ $prevSpace = 0;
36
+ $prevBar = 1;
37
+ }
38
+ else {
39
+ $prevSpace = 0;
40
+ $prevBar = 0;
41
+ }
42
+ print $c;
43
+ }
44
+ }
45
+
46
+ print "\n";
47
+ }
48
+
mosesdecoder/scripts/generic/trainlm-irst2.perl ADDED
@@ -0,0 +1,72 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env perl
2
+ #
3
+ # This file is part of moses. Its use is licensed under the GNU Lesser General
4
+ # Public License version 2.1 or, at your option, any later version.
5
+
6
+ # Compatible with sri LM-creating script, eg.
7
+ # ngram-count -order 5 -interpolate -wbdiscount -unk -text corpus.txt -lm lm.txt
8
+ # To use it in the EMS, add this to the [LM] section
9
+ # lm-training = "$moses-script-dir/generic/trainlm-irst2.perl -cores $cores -irst-dir $irst-dir"
10
+ # settings = ""
11
+ # Also, make sure that $irst-dir is defined (in the [LM] or [GENERAL] section.
12
+ # It should point to the root of the LM toolkit, eg
13
+ # irst-dir = /Users/hieu/workspace/irstlm/trunk/bin
14
+ # Set smoothing method in settings, if different from modified Kneser-Ney
15
+
16
+ use warnings;
17
+ use strict;
18
+ use FindBin qw($RealBin);
19
+ use Getopt::Long;
20
+
21
+ my $order = 3; # order of language model (default trigram)
22
+ my $corpusPath; # input text data
23
+ my $lmPath; # generated language model
24
+ my $cores = 2; # number of CPUs used
25
+ my $irstPath; # bin directory of IRSTLM
26
+ my $tempPath = "tmp"; # temp dir
27
+ my $pruneSingletons = 1; # 1 = prune singletons, 0 = keep singletons
28
+ my $smoothing = "msb"; # smoothing method: wb = witten-bell, sb = kneser-ney, msb = modified-kneser-ney
29
+ my $dummy;
30
+
31
+ GetOptions("order=s" => \$order,
32
+ "text=s" => \$corpusPath,
33
+ "lm=s" => \$lmPath,
34
+ "cores=s" => \$cores,
35
+ "irst-dir=s" => \$irstPath,
36
+ "temp-dir=s" => \$tempPath,
37
+ "p=i" => \$pruneSingletons, # irstlm parameter: prune singletons
38
+ "s=s" => \$smoothing, # irstlm parameter: smoothing method
39
+ "interpolate!" => \$dummy, #ignore
40
+ "kndiscount!" => \$dummy #ignore
41
+ ) or exit 1;
42
+
43
+ #die("ERROR: please set order") unless defined($order);
44
+ die("ERROR: please set text") unless defined($corpusPath);
45
+ die("ERROR: please set lm") unless defined($lmPath);
46
+ die("ERROR: please set irst-dir") unless defined($irstPath);
47
+
48
+
49
+ $tempPath .= "/irstlm-build-tmp.$$";
50
+ `mkdir -p $tempPath`;
51
+
52
+ # add <s> and </s>
53
+ my $cmd = "cat $corpusPath | $irstPath/add-start-end.sh > $tempPath/setagged";
54
+ print STDERR "EXECUTING $cmd\n";
55
+ `$cmd`;
56
+
57
+ # collect n-gram counts
58
+ $cmd = "$irstPath/ngt -i=$tempPath/setagged -n=$order -b=yes -o=$tempPath/counts";
59
+ print STDERR "EXECUTING $cmd\n";
60
+ `$cmd`;
61
+
62
+ # build lm
63
+ $cmd = "$irstPath/tlm -o=$lmPath -lm=$smoothing -bo=yes -n=$order -tr=$tempPath/counts";
64
+ $cmd .= " -ps=no" unless $pruneSingletons;
65
+ print STDERR "EXECUTING $cmd\n";
66
+ `$cmd`;
67
+
68
+ $cmd = "rm -rf $tempPath";
69
+ print STDERR "EXECUTING $cmd\n";
70
+ `$cmd`;
71
+
72
+ print STDERR "FINISH.\n";
mosesdecoder/scripts/share/nonbreaking_prefixes/README.txt ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ The language suffix can be found here:
2
+
3
+ http://www.loc.gov/standards/iso639-2/php/code_list.php
4
+
5
+ This code includes data from Daniel Naber's Language Tools (czech abbreviations).
6
+ This code includes data from czech wiktionary (also czech abbreviations).
7
+
8
+
mosesdecoder/scripts/share/nonbreaking_prefixes/nonbreaking_prefix.as ADDED
@@ -0,0 +1,65 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker.
2
+
3
+ #common exceptions
4
+ # Dr
5
+
6
+
7
+ #others
8
+
9
+
10
+ #phonetics
11
+ # A
12
+
13
+ # B
14
+ বি
15
+ # C
16
+ সি
17
+ # D
18
+ ডি
19
+ # E
20
+
21
+ # F
22
+ এফ
23
+ # G
24
+ জি
25
+ # H
26
+ এইচ
27
+ # I
28
+ আম
29
+ # J
30
+ জে
31
+ # K
32
+ কে
33
+ # L
34
+ এল
35
+ # M
36
+ এম
37
+ # N
38
+ এন
39
+ # O
40
+ হে
41
+ # P
42
+ পি
43
+ # Q
44
+ কিউ
45
+ # R
46
+ আর
47
+ # S
48
+ এস
49
+ # T
50
+ টি
51
+ # U
52
+ ইউ
53
+ # V
54
+ ভি
55
+ # W
56
+ ডব্লু
57
+ # X
58
+ এক্স
59
+ # Y
60
+ ওয়াই
61
+ # Z
62
+ জেড
63
+
64
+ #consonants
65
+
mosesdecoder/scripts/share/nonbreaking_prefixes/nonbreaking_prefix.ca ADDED
@@ -0,0 +1,75 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Dr
2
+ Dra
3
+ pàg
4
+ p
5
+ c
6
+ av
7
+ Sr
8
+ Sra
9
+ adm
10
+ esq
11
+ Prof
12
+ S.A
13
+ S.L
14
+ p.e
15
+ ptes
16
+ Sta
17
+ St
18
+ pl
19
+ màx
20
+ cast
21
+ dir
22
+ nre
23
+ fra
24
+ admdora
25
+ Emm
26
+ Excma
27
+ espf
28
+ dc
29
+ admdor
30
+ tel
31
+ angl
32
+ aprox
33
+ ca
34
+ dept
35
+ dj
36
+ dl
37
+ dt
38
+ ds
39
+ dg
40
+ dv
41
+ ed
42
+ entl
43
+ al
44
+ i.e
45
+ maj
46
+ smin
47
+ n
48
+ núm
49
+ pta
50
+ A
51
+ B
52
+ C
53
+ D
54
+ E
55
+ F
56
+ G
57
+ H
58
+ I
59
+ J
60
+ K
61
+ L
62
+ M
63
+ N
64
+ O
65
+ P
66
+ Q
67
+ R
68
+ S
69
+ T
70
+ U
71
+ V
72
+ W
73
+ X
74
+ Y
75
+ Z
mosesdecoder/scripts/share/nonbreaking_prefixes/nonbreaking_prefix.cs ADDED
@@ -0,0 +1,390 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Bc
2
+ BcA
3
+ Ing
4
+ Ing.arch
5
+ MUDr
6
+ MVDr
7
+ MgA
8
+ Mgr
9
+ JUDr
10
+ PhDr
11
+ RNDr
12
+ PharmDr
13
+ ThLic
14
+ ThDr
15
+ Ph.D
16
+ Th.D
17
+ prof
18
+ doc
19
+ CSc
20
+ DrSc
21
+ dr. h. c
22
+ PaedDr
23
+ Dr
24
+ PhMr
25
+ DiS
26
+ abt
27
+ ad
28
+ a.i
29
+ aj
30
+ angl
31
+ anon
32
+ apod
33
+ atd
34
+ atp
35
+ aut
36
+ bd
37
+ biogr
38
+ b.m
39
+ b.p
40
+ b.r
41
+ cca
42
+ cit
43
+ cizojaz
44
+ c.k
45
+ col
46
+ čes
47
+ čín
48
+ čj
49
+ ed
50
+ facs
51
+ fasc
52
+ fol
53
+ fot
54
+ franc
55
+ h.c
56
+ hist
57
+ hl
58
+ hrsg
59
+ ibid
60
+ il
61
+ ind
62
+ inv.č
63
+ jap
64
+ jhdt
65
+ jv
66
+ koed
67
+ kol
68
+ korej
69
+ kl
70
+ krit
71
+ lat
72
+ lit
73
+ m.a
74
+ maď
75
+ mj
76
+ mp
77
+ násl
78
+ např
79
+ nepubl
80
+ něm
81
+ no
82
+ nr
83
+ n.s
84
+ okr
85
+ odd
86
+ odp
87
+ obr
88
+ opr
89
+ orig
90
+ phil
91
+ pl
92
+ pokrač
93
+ pol
94
+ port
95
+ pozn
96
+ př.kr
97
+ př.n.l
98
+ přel
99
+ přeprac
100
+ příl
101
+ pseud
102
+ pt
103
+ red
104
+ repr
105
+ resp
106
+ revid
107
+ rkp
108
+ roč
109
+ roz
110
+ rozš
111
+ samost
112
+ sect
113
+ sest
114
+ seš
115
+ sign
116
+ sl
117
+ srv
118
+ stol
119
+ sv
120
+ šk
121
+ šk.ro
122
+ špan
123
+ tab
124
+ t.č
125
+ tis
126
+ tj
127
+
128
+ tzv
129
+ univ
130
+ uspoř
131
+ vol
132
+ vl.jm
133
+ vs
134
+ vyd
135
+ vyobr
136
+ zal
137
+ zejm
138
+ zkr
139
+ zprac
140
+ zvl
141
+ n.p
142
+ např
143
+ než
144
+ MUDr
145
+ abl
146
+ absol
147
+ adj
148
+ adv
149
+ ak
150
+ ak. sl
151
+ akt
152
+ alch
153
+ amer
154
+ anat
155
+ angl
156
+ anglosas
157
+ arab
158
+ arch
159
+ archit
160
+ arg
161
+ astr
162
+ astrol
163
+ att
164
+ bás
165
+ belg
166
+ bibl
167
+ biol
168
+ boh
169
+ bot
170
+ bulh
171
+ círk
172
+ csl
173
+ č
174
+ čas
175
+ čes
176
+ dat
177
+ děj
178
+ dep
179
+ dět
180
+ dial
181
+ dór
182
+ dopr
183
+ dosl
184
+ ekon
185
+ epic
186
+ etnonym
187
+ eufem
188
+ f
189
+ fam
190
+ fem
191
+ fil
192
+ film
193
+ form
194
+ fot
195
+ fr
196
+ fut
197
+ fyz
198
+ gen
199
+ geogr
200
+ geol
201
+ geom
202
+ germ
203
+ gram
204
+ hebr
205
+ herald
206
+ hist
207
+ hl
208
+ hovor
209
+ hud
210
+ hut
211
+ chcsl
212
+ chem
213
+ ie
214
+ imp
215
+ impf
216
+ ind
217
+ indoevr
218
+ inf
219
+ instr
220
+ interj
221
+ ión
222
+ iron
223
+ it
224
+ kanad
225
+ katalán
226
+ klas
227
+ kniž
228
+ komp
229
+ konj
230
+
231
+ konkr
232
+
233
+ kuch
234
+ lat
235
+ lék
236
+ les
237
+ lid
238
+ lit
239
+ liturg
240
+ lok
241
+ log
242
+ m
243
+ mat
244
+ meteor
245
+ metr
246
+ mod
247
+ ms
248
+ mysl
249
+ n
250
+ náb
251
+ námoř
252
+ neklas
253
+ něm
254
+ nesklon
255
+ nom
256
+ ob
257
+ obch
258
+ obyč
259
+ ojed
260
+ opt
261
+ part
262
+ pas
263
+ pejor
264
+ pers
265
+ pf
266
+ pl
267
+ plpf
268
+
269
+ práv
270
+ prep
271
+ předl
272
+ přivl
273
+ r
274
+ rcsl
275
+ refl
276
+ reg
277
+ rkp
278
+ ř
279
+ řec
280
+ s
281
+ samohl
282
+ sg
283
+ sl
284
+ souhl
285
+ spec
286
+ srov
287
+ stfr
288
+ střv
289
+ stsl
290
+ subj
291
+ subst
292
+ superl
293
+ sv
294
+ sz
295
+ táz
296
+ tech
297
+ telev
298
+ teol
299
+ trans
300
+ typogr
301
+ var
302
+ vedl
303
+ verb
304
+ vl. jm
305
+ voj
306
+ vok
307
+ vůb
308
+ vulg
309
+ výtv
310
+ vztaž
311
+ zahr
312
+ zájm
313
+ zast
314
+ zejm
315
+
316
+ zeměd
317
+ zkr
318
+
319
+ mj
320
+ dl
321
+ atp
322
+ sport
323
+ Mgr
324
+ horn
325
+ MVDr
326
+ JUDr
327
+ RSDr
328
+ Bc
329
+ PhDr
330
+ ThDr
331
+ Ing
332
+ aj
333
+ apod
334
+ PharmDr
335
+ pomn
336
+ ev
337
+ slang
338
+ nprap
339
+ odp
340
+ dop
341
+ pol
342
+ st
343
+ stol
344
+ p. n. l
345
+ před n. l
346
+ n. l
347
+ př. Kr
348
+ po Kr
349
+ př. n. l
350
+ odd
351
+ RNDr
352
+ tzv
353
+ atd
354
+ tzn
355
+ resp
356
+ tj
357
+ p
358
+ br
359
+ č. j
360
+ čj
361
+ č. p
362
+ čp
363
+ a. s
364
+ s. r. o
365
+ spol. s r. o
366
+ p. o
367
+ s. p
368
+ v. o. s
369
+ k. s
370
+ o. p. s
371
+ o. s
372
+ v. r
373
+ v z
374
+ ml
375
+
376
+ kr
377
+ mld
378
+ hod
379
+ popř
380
+ ap
381
+ event
382
+ rus
383
+ slov
384
+ rum
385
+ švýc
386
+ P. T
387
+ zvl
388
+ hor
389
+ dol
390
+ S.O.S