Spaces:

suricodes
/

hindi-sindhi-translator

Paused

App Files Files Community

suricodes committed on Oct 18, 2024

Commit

a691d7a

verified ·

1 Parent(s): b7a24d4

Upload 49 files

Browse files

Files changed (50) hide show

.gitattributes +2 -0
mosesdecoder/probingpt/CreateProbingPT.cpp +113 -0
mosesdecoder/probingpt/InputFileStream.cpp +59 -0
mosesdecoder/probingpt/InputFileStream.h +46 -0
mosesdecoder/probingpt/Jamfile +32 -0
mosesdecoder/probingpt/OutputFileStream.cpp +87 -0
mosesdecoder/probingpt/OutputFileStream.h +81 -0
mosesdecoder/probingpt/StoreTarget.cpp +264 -0
mosesdecoder/probingpt/StoreTarget.h +51 -0
mosesdecoder/probingpt/StoreVocab.cpp +13 -0
mosesdecoder/probingpt/StoreVocab.h +60 -0
mosesdecoder/probingpt/bin/gcc-9/release/link-static/threading-multi/CreateProbingPT +3 -0
mosesdecoder/probingpt/bin/gcc-9/release/link-static/threading-multi/CreateProbingPT.o +0 -0
mosesdecoder/probingpt/bin/gcc-9/release/link-static/threading-multi/InputFileStream.o +0 -0
mosesdecoder/probingpt/bin/gcc-9/release/link-static/threading-multi/OutputFileStream.o +0 -0
mosesdecoder/probingpt/bin/gcc-9/release/link-static/threading-multi/StoreTarget.o +0 -0
mosesdecoder/probingpt/bin/gcc-9/release/link-static/threading-multi/StoreVocab.o +0 -0
mosesdecoder/probingpt/bin/gcc-9/release/link-static/threading-multi/hash.o +0 -0
mosesdecoder/probingpt/bin/gcc-9/release/link-static/threading-multi/libprobingpt.a +3 -0
mosesdecoder/probingpt/bin/gcc-9/release/link-static/threading-multi/line_splitter.o +0 -0
mosesdecoder/probingpt/bin/gcc-9/release/link-static/threading-multi/probing_hash_utils.o +0 -0
mosesdecoder/probingpt/bin/gcc-9/release/link-static/threading-multi/querying.o +0 -0
mosesdecoder/probingpt/bin/gcc-9/release/link-static/threading-multi/storing.o +0 -0
mosesdecoder/probingpt/bin/gcc-9/release/link-static/threading-multi/util.o +0 -0
mosesdecoder/probingpt/bin/gcc-9/release/link-static/threading-multi/vocabid.o +0 -0
mosesdecoder/probingpt/gzfilebuf.h +94 -0
mosesdecoder/probingpt/hash.cpp +44 -0
mosesdecoder/probingpt/hash.h +17 -0
mosesdecoder/probingpt/line_splitter.cpp +103 -0
mosesdecoder/probingpt/line_splitter.h +56 -0
mosesdecoder/probingpt/probing_hash_utils.cpp +40 -0
mosesdecoder/probingpt/probing_hash_utils.h +55 -0
mosesdecoder/probingpt/querying.cpp +179 -0
mosesdecoder/probingpt/querying.h +79 -0
mosesdecoder/probingpt/storing.cpp +302 -0
mosesdecoder/probingpt/storing.h +92 -0
mosesdecoder/probingpt/util.cpp +25 -0
mosesdecoder/probingpt/util.h +24 -0
mosesdecoder/probingpt/vocabid.cpp +59 -0
mosesdecoder/probingpt/vocabid.h +29 -0
mosesdecoder/regression-testing/Jamfile +81 -0
mosesdecoder/regression-testing/MosesRegressionTesting.pm +93 -0
mosesdecoder/regression-testing/compare-results.perl +91 -0
mosesdecoder/regression-testing/ensure-regression-data-here.perl +35 -0
mosesdecoder/regression-testing/run-single-test.perl +247 -0
mosesdecoder/regression-testing/run-test-detokenizer.perl +309 -0
mosesdecoder/regression-testing/run-test-extract.perl +95 -0
mosesdecoder/regression-testing/run-test-mert.perl +130 -0
mosesdecoder/regression-testing/run-test-misc.perl +90 -0
mosesdecoder/regression-testing/run-test-scorer.perl +96 -0

.gitattributes CHANGED Viewed

@@ -124,3 +124,5 @@ mosesdecoder/phrase-extract/score-stsg/bin/gcc-9/release/link-static/threading-m
 mosesdecoder/phrase-extract/syntax-common/bin/gcc-9/release/link-static/threading-multi/libsyntax_common.a filter=lfs diff=lfs merge=lfs -text
 mosesdecoder/phrase-extract/syntax-common/bin/gcc-9/release/link-static/threading-multi/tree_fragment_tokenizer_test filter=lfs diff=lfs merge=lfs -text
 mosesdecoder/phrase-extract/syntax-common/bin/gcc-9/release/link-static/threading-multi/tree_test filter=lfs diff=lfs merge=lfs -text

 mosesdecoder/phrase-extract/syntax-common/bin/gcc-9/release/link-static/threading-multi/libsyntax_common.a filter=lfs diff=lfs merge=lfs -text
 mosesdecoder/phrase-extract/syntax-common/bin/gcc-9/release/link-static/threading-multi/tree_fragment_tokenizer_test filter=lfs diff=lfs merge=lfs -text
 mosesdecoder/phrase-extract/syntax-common/bin/gcc-9/release/link-static/threading-multi/tree_test filter=lfs diff=lfs merge=lfs -text
+mosesdecoder/probingpt/bin/gcc-9/release/link-static/threading-multi/CreateProbingPT filter=lfs diff=lfs merge=lfs -text
+mosesdecoder/probingpt/bin/gcc-9/release/link-static/threading-multi/libprobingpt.a filter=lfs diff=lfs merge=lfs -text

mosesdecoder/probingpt/CreateProbingPT.cpp ADDED Viewed

	@@ -0,0 +1,113 @@

+#include <string>
+#include <boost/program_options.hpp>
+#include "util/usage.hh"
+#include "storing.h"
+#include "InputFileStream.h"
+#include "OutputFileStream.h"
+#include "moses/Util.h"
+using namespace std;
+std::string ReformatSCFGFile(const std::string &path);
+/**
+ * Entry point of CreateProbingPT: parses the command line and hands the
+ * resulting settings to probingpt::createProbingPT().
+ *
+ * Returns EXIT_SUCCESS after printing the help text, EXIT_FAILURE when the
+ * options cannot be parsed or validated, and 0 once the table was created.
+ */
+int main(int argc, char* argv[])
+{
+  namespace po = boost::program_options;
+
+  // Built-in defaults; the numeric ones are also advertised as option defaults.
+  int num_scores = 4;
+  int num_lex_scores = 0;
+  int max_cache_size = 50000;
+  bool log_prob = false;
+  bool scfg = false;
+  string inPath, outPath;
+
+  po::options_description desc("Options");
+  desc.add_options()
+  ("help", "Print help messages")
+  ("input-pt", po::value<string>()->required(), "Text pt")
+  ("output-dir", po::value<string>()->required(), "Directory when binary files will be written")
+  ("num-scores", po::value<int>()->default_value(num_scores), "Number of pt scores")
+  ("num-lex-scores", po::value<int>()->default_value(num_lex_scores), "Number of lexicalized reordering scores")
+  ("log-prob", "log (and floor) probabilities before storing")
+  ("max-cache-size", po::value<int>()->default_value(max_cache_size), "Maximum number of high-count source lines to write to cache file. 0=no cache, negative=no limit")
+  ("scfg", "Rules are SCFG in Moses format (ie. with non-terms and LHS")
+  ;
+
+  po::variables_map vm;
+  try {
+    po::store(po::parse_command_line(argc, argv, desc), vm); // can throw
+
+    // --help is handled before notify() so that a missing required option
+    // does not prevent the usage text from being printed.
+    if (vm.count("help") != 0) {
+      std::cout << desc << std::endl;
+      return EXIT_SUCCESS;
+    }
+
+    po::notify(vm); // throws on error
+  } catch(po::error& e) {
+    std::cerr << "ERROR: " << e.what() << std::endl << std::endl;
+    std::cerr << desc << std::endl;
+    return EXIT_FAILURE;
+  }
+
+  // Copy the parsed values over the defaults.
+  if (vm.count("input-pt") != 0) {
+    inPath = vm["input-pt"].as<string>();
+  }
+  if (vm.count("output-dir") != 0) {
+    outPath = vm["output-dir"].as<string>();
+  }
+  if (vm.count("num-scores") != 0) {
+    num_scores = vm["num-scores"].as<int>();
+  }
+  if (vm.count("num-lex-scores") != 0) {
+    num_lex_scores = vm["num-lex-scores"].as<int>();
+  }
+  if (vm.count("max-cache-size") != 0) {
+    max_cache_size = vm["max-cache-size"].as<int>();
+  }
+  // Switches without a value: presence on the command line turns them on.
+  if (vm.count("log-prob") != 0) {
+    log_prob = true;
+  }
+  if (vm.count("scfg") != 0) {
+    scfg = true;
+  }
+
+  // SCFG tables are rewritten and sorted first; the rewritten file becomes
+  // the input of the binarization step.
+  if (scfg) {
+    inPath = ReformatSCFGFile(inPath);
+  }
+
+  probingpt::createProbingPT(inPath, outPath, num_scores, num_lex_scores, log_prob, max_cache_size, scfg);
+  //util::PrintUsage(std::cout);
+  return 0;
+}
+/**
+ * Rewrites an SCFG rule table into the layout expected by the binarizer.
+ *
+ * For every input line the last token of the source column is dropped
+ * (presumably the LHS label of the Moses SCFG format -- confirm against the
+ * rule-table spec); all remaining "|||"-separated columns are copied as is.
+ * The rewritten file is then sorted with `LC_ALL=C sort` and gzip-compressed
+ * by external shell commands.
+ *
+ * @param path  rule table to read (read via InputFileStream)
+ * @return      path + ".reformat.sorted.gz", the sorted, compressed result
+ *
+ * NOTE(review): `path` is interpolated into shell commands without quoting,
+ * so paths containing spaces or shell metacharacters are not supported.
+ */
+std::string ReformatSCFGFile(const std::string &path)
+{
+  probingpt::InputFileStream inFile(path);
+  string reformattedPath = path + ".reformat.gz";
+  probingpt::OutputFileStream outFile(reformattedPath);
+  string line;
+  while (getline(inFile, line)) {
+    vector<string> toks = Moses::TokenizeMultiCharSeparator(line, "|||");
+    assert(toks.size() >= 3);
+    // source: every token except the last one
+    vector<string> sourceToks = Moses::Tokenize(toks[0], " ");
+    // "i + 1 < size()" rather than "i < size() - 1": the latter wraps around
+    // to SIZE_MAX when the source column is empty and indexes out of bounds.
+    for (size_t i = 0; i + 1 < sourceToks.size(); ++i) {
+      outFile << sourceToks[i] << " ";
+    }
+    // other columns
+    for (size_t i = 1; i < toks.size(); ++i) {
+      outFile << "|||" << toks[i];
+    }
+    outFile << endl;
+  }
+  inFile.Close();
+  outFile.Close();
+  string sortedPath = path + ".reformat.sorted.gz";
+  // scratch directory for sort(1); removed again below
+  string tmpPath = path + ".tmp";
+  string cmd = "mkdir " + tmpPath
+               + " && gzip -dc " + reformattedPath + " | LC_ALL=C sort -T " + tmpPath + " | gzip -c > " + sortedPath;
+  // Report, but do not abort on, a failing pipeline: the caller still gets
+  // the path back exactly as before.
+  if (system(cmd.c_str()) != 0) {
+    std::cerr << "WARNING: command failed: " << cmd << std::endl;
+  }
+  cmd = "rm -rf " + tmpPath + " " + reformattedPath;
+  if (system(cmd.c_str()) != 0) {
+    std::cerr << "WARNING: command failed: " << cmd << std::endl;
+  }
+  return sortedPath;
+}

mosesdecoder/probingpt/InputFileStream.cpp ADDED Viewed

	@@ -0,0 +1,59 @@

+// $Id$
+/***********************************************************************
+ Moses - factored phrase-based language decoder
+ Copyright (C) 2006 University of Edinburgh
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ Lesser General Public License for more details.
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ ***********************************************************************/
+#include "InputFileStream.h"
+#include "gzfilebuf.h"
+#include <iostream>
+using namespace std;
+namespace probingpt
+{
+/** Opens filePath for reading.
+ *
+ * A name longer than three characters that ends in ".gz" is read through a
+ * gzfilebuf; anything else through a std::filebuf.  If a plain file cannot
+ * be opened, a message is printed to stderr and the process exits with
+ * status 1.
+ */
+InputFileStream::InputFileStream(const std::string &filePath) :
+  std::istream(NULL), m_streambuf(NULL)
+{
+  const std::string::size_type nameLen = filePath.size();
+  const bool gzipped = nameLen > 3 && filePath.compare(nameLen - 3, 3, ".gz") == 0;
+
+  if (gzipped) {
+    m_streambuf = new gzfilebuf(filePath.c_str());
+  } else {
+    std::filebuf *plain = new std::filebuf();
+    // filebuf::open() returns NULL on failure
+    if (plain->open(filePath.c_str(), std::ios::in) == NULL) {
+      cerr << "Can't read " << filePath.c_str() << endl;
+      exit(1);
+    }
+    m_streambuf = plain;
+  }
+
+  // attach the buffer chosen above to the istream base
+  this->init(m_streambuf);
+}
+/** Releases the stream buffer created by the constructor. */
+InputFileStream::~InputFileStream()
+{
+  delete m_streambuf;
+  m_streambuf = NULL;
+}
+/** Does nothing; the underlying buffer is only released by the destructor. */
+void InputFileStream::Close()
+{
+}
+}

mosesdecoder/probingpt/InputFileStream.h ADDED Viewed

	@@ -0,0 +1,46 @@

+// $Id$
+/***********************************************************************
+ Moses - factored phrase-based language decoder
+ Copyright (C) 2006 University of Edinburgh
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ Lesser General Public License for more details.
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ ***********************************************************************/
+#pragma once
+#include <cstdlib>
+#include <fstream>
+#include <string>
+namespace probingpt
+{
+/** Input stream used in place of a plain std::istream; a file whose name
+ *  ends in ".gz" is read as gzip-compressed data.
+ */
+class InputFileStream: public std::istream
+{
+protected:
+  std::streambuf *m_streambuf; // owned: allocated in the constructor, deleted in the destructor
+public:
+  explicit InputFileStream(const std::string &filePath); // opens filePath for reading
+  ~InputFileStream(); // deletes m_streambuf
+  void Close(); // currently a no-op
+};
+}

mosesdecoder/probingpt/Jamfile ADDED Viewed

	@@ -0,0 +1,32 @@

+alias deps :  ..//z ..//boost_iostreams ..//boost_filesystem  ;
+lib probingpt :
+  StoreTarget.cpp
+  StoreVocab.cpp
+  hash.cpp
+  line_splitter.cpp
+  probing_hash_utils.cpp
+  querying.cpp
+  storing.cpp
+  vocabid.cpp
+  OutputFileStream.cpp
+  InputFileStream.cpp
+  util.cpp
+#  ../util/string_piece.cc
+#  ../util/exception.cc
+#  ../util/file.cc
+#  ../util/file_piece.cc
+#  ../util/murmur_hash.cc
+#  ../util/mmap.cc
+#  ../util/read_compressed.cc
+#  ../util/parallel_read.cc
+#  ../util/ersatz_progress.cc
+  deps
+   ;
+exe CreateProbingPT : CreateProbingPT.cpp probingpt ../util//kenutil ;
+alias programs : CreateProbingPT ;

mosesdecoder/probingpt/OutputFileStream.cpp ADDED Viewed

	@@ -0,0 +1,87 @@

+// $Id: OutputFileStream.cpp 2780 2010-01-29 17:11:17Z bojar $
+/***********************************************************************
+ Moses - factored phrase-based language decoder
+ Copyright (C) 2006 University of Edinburgh
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ Lesser General Public License for more details.
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ ***********************************************************************/
+#include <iostream>
+#include <boost/algorithm/string/predicate.hpp>
+#include <boost/iostreams/filter/gzip.hpp>
+#include "OutputFileStream.h"
+#include "gzfilebuf.h"
+using namespace std;
+using namespace boost::algorithm;
+namespace probingpt
+{
+/** Creates a stream that is not attached to anything until Open() is called. */
+OutputFileStream::OutputFileStream() :
+  boost::iostreams::filtering_ostream(),
+  m_outFile(NULL),
+  m_open(false)
+{
+}
+/** Creates a stream and immediately opens filePath; the result of Open() is
+ *  discarded here.
+ */
+OutputFileStream::OutputFileStream(const std::string &filePath) :
+  m_outFile(NULL),
+  m_open(false)
+{
+  Open(filePath);
+}
+/** Flushes and closes the stream if it is still open. */
+OutputFileStream::~OutputFileStream()
+{
+  Close();
+}
+/** Opens the stream for writing.
+ *
+ * "-" attaches the stream to std::cout.  Any other name is opened as a
+ * binary file; if the name ends in ".gz" a gzip compressor is pushed in
+ * front of the file so that output is compressed transparently.
+ *
+ * Must not be called while the stream is already open (asserted).
+ *
+ * @param filePath  file to write, or "-" for standard output
+ * @return true on success; false if the file could not be opened, in which
+ *         case the stream stays closed and may be opened again.
+ */
+bool OutputFileStream::Open(const std::string &filePath)
+{
+  assert(!m_open);
+  if (filePath == std::string("-")) {
+    // Write to standard output.  Leave m_outFile null.
+    this->push(std::cout);
+  } else {
+    m_outFile = new ofstream(filePath.c_str(),
+                             ios_base::out | ios_base::binary);
+    if (m_outFile->fail()) {
+      // Close() returns early while m_open is false, so nothing else would
+      // ever free this object; release it here and reset the pointer.
+      delete m_outFile;
+      m_outFile = NULL;
+      return false;
+    }
+    if (ends_with(filePath, ".gz")) {
+      this->push(boost::iostreams::gzip_compressor());
+    }
+    // the file is the final device of the chain
+    this->push(*m_outFile);
+  }
+  m_open = true;
+  return true;
+}
+/** Flushes pending output and, when writing to a file, detaches and closes
+ *  that file.  Calling it on a stream that is not open does nothing.
+ */
+void OutputFileStream::Close()
+{
+  if (m_open) {
+    this->flush();
+    if (m_outFile != NULL) {
+      this->pop(); // file
+      m_outFile->close();
+      delete m_outFile;
+      m_outFile = NULL;
+    }
+    m_open = false;
+  }
+}
+}

mosesdecoder/probingpt/OutputFileStream.h ADDED Viewed

	@@ -0,0 +1,81 @@

+// $Id: InputFileStream.h 2939 2010-02-24 11:15:44Z jfouet $
+/***********************************************************************
+ Moses - factored phrase-based language decoder
+ Copyright (C) 2006 University of Edinburgh
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ Lesser General Public License for more details.
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ ***********************************************************************/
+#pragma once
+#include <cstdlib>
+#include <fstream>
+#include <string>
+#include <iostream>
+#include <boost/iostreams/filtering_stream.hpp>
+namespace probingpt
+{
+/** Version of std::ostream with transparent compression.
+ *
+ * Transparently compresses output when writing to a file whose name ends in
+ * ".gz".  Or, writes to stdout instead of a file when given a filename
+ * consisting of just a dash ("-").
+ */
+class OutputFileStream: public boost::iostreams::filtering_ostream
+{
+private:
+  /** File that needs flushing & closing when we close this stream.
+   *
+   * Is NULL when no file is opened, e.g. when writing to standard output.
+   * Allocated by Open() and deleted by Close().
+   */
+  std::ofstream *m_outFile;
+  /// Is this stream open?  Set by a successful Open(), cleared by Close().
+  bool m_open;
+public:
+  /** Create an unopened OutputFileStream.
+   *
+   * Until it's been opened, nothing can be done with this stream.
+   */
+  OutputFileStream();
+  /// Create an OutputFileStream, and open it by calling Open().
+  /// The return value of Open() is not reported to the caller.
+  OutputFileStream(const std::string &filePath);
+  /// Calls Close().
+  virtual ~OutputFileStream();
+  // TODO: Can we please just always throw an exception when this fails?
+  /** Open stream.
+   *
+   * If filePath is "-" (just a dash), this opens the stream for writing to
+   * standard output.  Otherwise, it opens the given file.  If the filename
+   * has the ".gz" suffix, output will be transparently compressed.
+   *
+   * Call Close() to close the file.
+   *
+   * Returns whether opening the file was successful.  It may also throw an
+   * exception on failure.
+   */
+  bool Open(const std::string &filePath);
+  /// Flush and close stream.  After this, the stream can be opened again.
+  void Close();
+};
+}

mosesdecoder/probingpt/StoreTarget.cpp ADDED Viewed

	@@ -0,0 +1,264 @@

+/*
+ * StoreTarget.cpp
+ *
+ *  Created on: 19 Jan 2016
+ *      Author: hieu
+ */
+#include <boost/foreach.hpp>
+#include "StoreTarget.h"
+#include "line_splitter.h"
+#include "probing_hash_utils.h"
+#include "OutputFileStream.h"
+#include "moses2/legacy/Util2.h"
+using namespace std;
+namespace probingpt
+{
+/** Prepares the target-side output below basepath.
+ *
+ * Creates (truncating) "<basepath>/TargetColl.dat" for binary writing and
+ * sets up the vocabulary that will be saved to "<basepath>/TargetVocab.dat".
+ * Throws a C string (const char*) if the collection file cannot be created.
+ */
+StoreTarget::StoreTarget(const std::string &basepath)
+  :m_basePath(basepath)
+  ,m_vocab(basepath + "/TargetVocab.dat")
+{
+  const std::string collPath = basepath + "/TargetColl.dat";
+  const std::ios::openmode mode =
+    std::ios::out | std::ios::binary | std::ios::ate | std::ios::trunc;
+  m_fileTargetColl.open(collPath.c_str(), mode);
+  if (m_fileTargetColl.is_open()) {
+    return;
+  }
+  throw "can't create file ";
+}
+/** Closes the collection file and writes the target vocabulary to disk.
+ *  All appended rules are expected to have been flushed by Save() already.
+ */
+StoreTarget::~StoreTarget()
+{
+  // nothing may be left pending at this point
+  assert(m_coll.empty());
+
+  m_fileTargetColl.close();
+
+  // persist the word -> id mapping collected while appending
+  m_vocab.Save();
+}
+/** Writes all pending target phrases to the collection file and forgets them.
+ *
+ * Layout written: a uint64_t count followed by one record per phrase.
+ *
+ * @return the file offset at which this collection starts
+ */
+uint64_t StoreTarget::Save()
+{
+  // offset of the collection about to be written
+  const uint64_t startPos = m_fileTargetColl.tellp();
+
+  // header: number of target phrases that follow
+  const uint64_t numTP = m_coll.size();
+  m_fileTargetColl.write((char*) &numTP, sizeof(numTP));
+
+  // body: one record per queued phrase
+  for (std::vector<target_text*>::const_iterator rule = m_coll.begin();
+       rule != m_coll.end(); ++rule) {
+    Save(**rule);
+  }
+
+  // the queued rules are no longer needed
+  Moses2::RemoveAllInColl(m_coll);
+  m_coll.clear();
+
+  return startPos;
+}
+/** Writes one target phrase record: a TargetPhraseInfo header, then every
+ *  score as a float, then every target vocab id as a uint32_t.  The
+ *  property string itself is not written yet.
+ */
+void StoreTarget::Save(const target_text &rule)
+{
+  // fixed-size header describing the record
+  TargetPhraseInfo header;
+  header.alignTerm = GetAlignId(rule.word_align_term);
+  header.alignNonTerm = GetAlignId(rule.word_align_non_term);
+  header.numWords = rule.target_phrase.size();
+  header.propLength = rule.property.size();
+  //cerr << "TPInfo=" << sizeof(TPInfo);
+  m_fileTargetColl.write((char*) &header, sizeof(header));
+
+  // scores
+  const size_t numScores = rule.prob.size();
+  for (size_t s = 0; s < numScores; ++s) {
+    float score = rule.prob[s];
+    m_fileTargetColl.write((char*) &score, sizeof(score));
+  }
+
+  // target words, one vocab id each
+  const size_t numIds = rule.target_phrase.size();
+  for (size_t w = 0; w < numIds; ++w) {
+    uint32_t id = rule.target_phrase[w];
+    m_fileTargetColl.write((char*) &id, sizeof(id));
+  }
+
+  // prop TODO
+}
+/** Writes the alignment table to "<basepath>/Alignments.dat", one entry per
+ *  line: the alignment id, a tab, then the alignment positions, each
+ *  followed by a space.
+ */
+void StoreTarget::SaveAlignment()
+{
+  const std::string alignPath = m_basePath + "/Alignments.dat";
+  probingpt::OutputFileStream out(alignPath);
+
+  for (Alignments::iterator entry = m_aligns.begin();
+       entry != m_aligns.end(); ++entry) {
+    // id first, then the positions that map to it
+    out << entry->second << "\t";
+    const std::vector<size_t> &positions = entry->first;
+    for (std::vector<size_t>::const_iterator pos = positions.begin();
+         pos != positions.end(); ++pos) {
+      out << *pos << " ";
+    }
+    out << endl;
+  }
+}
+void StoreTarget::Append(const line_text &line, bool log_prob, bool scfg)
+{ /* Parses one phrase-table line into a new target_text and queues it in
+   * m_coll until the next Save().
+   *   line     - split phrase-table line (target phrase, scores, alignment, property)
+   *   log_prob - when true, scores are stored as floored logs
+   *   scfg     - when true, words of the form "[...]" are treated as non-terminals
+   */
+  target_text *rule = new target_text; // released later via RemoveAllInColl(m_coll) in Save()
+  //cerr << "line.target_phrase=" << line.target_phrase << endl;
+  // target_phrase: space-separated words, each word made of '|'-separated factors
+  vector<bool> nonTerms; // one flag per target word; only filled when scfg is set
+  util::TokenIter<util::SingleCharacter> it;
+  it = util::TokenIter<util::SingleCharacter>(line.target_phrase,
+       util::SingleCharacter(' '));
+  while (it) {
+    StringPiece word = *it;
+    //cerr << "word=" << word << endl;
+    bool nonTerm = false;
+    if (scfg) {
+      // not really sure how to handle factored SCFG and NT
+      // NOTE(review): word[0] assumes a non-empty word -- confirm the tokenizer never yields one
+      if (scfg && word[0] == '[' && word[word.size() - 1] == ']') {
+        //cerr << "NON-TERM=" << tok << " " << nonTerms.size() << endl;
+        nonTerm = true;
+      }
+      nonTerms.push_back(nonTerm);
+    }
+    util::TokenIter<util::SingleCharacter> itFactor;
+    itFactor = util::TokenIter<util::SingleCharacter>(word,
+               util::SingleCharacter('|'));
+    while (itFactor) {
+      StringPiece factor = *itFactor;
+      string factorStr = factor.as_string();
+      uint32_t vocabId = m_vocab.GetVocabId(factorStr); // assigns a new id on first sight
+      rule->target_phrase.push_back(vocabId); // one entry per factor, not per word
+      itFactor++;
+    }
+    it++;
+  }
+  // probs: space-separated scores, optionally converted to floored logs
+  it = util::TokenIter<util::SingleCharacter>(line.prob,
+       util::SingleCharacter(' '));
+  while (it) {
+    string tok = it->as_string();
+    float prob = Moses2::Scan<float>(tok);
+    if (log_prob) {
+      prob = Moses2::FloorScore(log(prob));
+      if (prob == 0.0f) prob = 0.0000000001; // an exact 0 is replaced by 1e-10
+    }
+    rule->prob.push_back(prob);
+    it++;
+  }
+  /*
+  cerr << "nonTerms=";
+  for (size_t i = 0; i < nonTerms.size(); ++i) {
+    cerr << nonTerms[i] << " ";
+  }
+  cerr << endl;
+  */
+  // alignment: space-separated "sourcePos-targetPos" pairs
+  it = util::TokenIter<util::SingleCharacter>(line.word_align,
+       util::SingleCharacter(' '));
+  while (it) {
+    string tokPair = Moses2::Trim(it->as_string());
+    if (tokPair.empty()) {
+      break; // parsing stops at the first empty token
+    }
+    vector<size_t> alignPair = Moses2::Tokenize<size_t>(tokPair, "-");
+    assert(alignPair.size() == 2);
+    bool nonTerm = false;
+    size_t sourcePos = alignPair[0];
+    size_t targetPos = alignPair[1];
+    if (scfg) {
+      nonTerm = nonTerms[targetPos]; // NOTE(review): no bounds check on targetPos
+    }
+    //cerr << targetPos << "=" << nonTerm << endl;
+    if (nonTerm) {
+      // stored flat: source position followed by target position
+      rule->word_align_non_term.push_back(sourcePos);
+      rule->word_align_non_term.push_back(targetPos);
+      //cerr << (int) rule->word_all1.back() << " ";
+    } else {
+      rule->word_align_term.push_back(sourcePos);
+      rule->word_align_term.push_back(targetPos);
+    }
+    it++;
+  }
+  // extra scores: LexRO values found in the property column are appended to rule->prob
+  string prop = line.property.as_string();
+  AppendLexRO(prop, rule->prob, log_prob);
+  //cerr << "line.property=" << line.property << endl;
+  //cerr << "prop=" << prop << endl;
+  // properties: storing the remaining property text is disabled below
+  /*
+   for (size_t i = 0; i < prop.size(); ++i) {
+   rule->property.push_back(prop[i]);
+   }
+   */
+  m_coll.push_back(rule);
+}
+/** Returns the id of an alignment vector, registering it first if it has
+ *  not been seen before.  New ids are handed out consecutively, starting at 0.
+ */
+uint32_t StoreTarget::GetAlignId(const std::vector<size_t> &align)
+{
+  Alignments::iterator known = m_aligns.find(align);
+  if (known != m_aligns.end()) {
+    return known->second;
+  }
+
+  // unseen alignment: the current table size becomes its id
+  const uint32_t newId = m_aligns.size();
+  m_aligns[align] = newId;
+  return newId;
+}
+/** Moves the lexicalized-reordering scores out of the property column.
+ *
+ * Looks for the first "{{LexRO ...}}" section in prop.  If there is one, its
+ * scores are appended to retvector (as floored logs when log_prob is set,
+ * with an exact 0 replaced by 1e-10) and the section is removed from prop.
+ * Without such a section both arguments are left untouched.
+ *
+ * An unterminated section (no closing "}}") is read up to the end of prop
+ * and everything from "{{LexRO " onwards is removed.
+ *
+ * @param prop       property column; modified in place
+ * @param retvector  score vector the LexRO scores are appended to
+ * @param log_prob   store log scores instead of raw probabilities
+ */
+void StoreTarget::AppendLexRO(std::string &prop, std::vector<float> &retvector,
+                              bool log_prob) const
+{
+  const string openTag = "{{LexRO ";
+  const string closeTag = "}}";
+
+  const size_t startPos = prop.find(openTag);
+  if (startPos == string::npos) {
+    return;
+  }
+
+  const size_t valuePos = startPos + openTag.size();
+  const size_t endPos = prop.find(closeTag, valuePos);
+  // Without this check endPos + 2 wraps around when "}}" is missing and the
+  // rebuilt property string ends up containing garbage.
+  const bool terminated = (endPos != string::npos);
+
+  const string lexProb = terminated
+                         ? prop.substr(valuePos, endPos - valuePos)
+                         : prop.substr(valuePos);
+  //cerr << "lexProb=" << lexProb << endl;
+
+  // append lex probs to pt probs
+  vector<float> scores = Moses2::Tokenize<float>(lexProb);
+  for (size_t i = 0; i < scores.size(); ++i) {
+    if (log_prob) {
+      scores[i] = Moses2::FloorScore(log(scores[i]));
+      if (scores[i] == 0.0f) scores[i] = 0.0000000001;
+    }
+    retvector.push_back(scores[i]);
+  }
+
+  // exclude LexRO property from property column
+  const string tail = terminated
+                      ? prop.substr(endPos + closeTag.size())
+                      : string();
+  prop = prop.substr(0, startPos) + tail;
+}
+} /* namespace probingpt */

mosesdecoder/probingpt/StoreTarget.h ADDED Viewed

	@@ -0,0 +1,51 @@

+/*
+ * StoreTarget.h
+ *
+ *  Created on: 19 Jan 2016
+ *      Author: hieu
+ */
+#pragma once
+#include <string>
+#include <fstream>
+#include <vector>
+#include <inttypes.h>
+#include <boost/unordered_map.hpp>
+#include <boost/unordered_set.hpp>
+#include "StoreVocab.h"
+namespace probingpt
+{
+class line_text;
+class target_text;
+class StoreTarget // collects target phrases and writes them below a base directory
+{
+public:
+  StoreTarget(const std::string &basepath); // creates <basepath>/TargetColl.dat
+  virtual ~StoreTarget(); // closes the collection file and saves the vocabulary
+  uint64_t Save(); // writes the queued phrases; returns the file offset where they start
+  void SaveAlignment(); // writes <basepath>/Alignments.dat
+  void Append(const line_text &line, bool log_prob, bool scfg); // queues one parsed line
+protected:
+  std::string m_basePath; // directory all output files are written to
+  std::fstream m_fileTargetColl; // binary output: TargetColl.dat
+  StoreVocab<uint32_t> m_vocab; // target word -> id, saved as TargetVocab.dat
+  typedef boost::unordered_map<std::vector<size_t>, uint32_t> Alignments;
+  Alignments m_aligns; // alignment vector -> id
+  std::vector<target_text*> m_coll; // phrases queued by Append() until the next Save()
+  uint32_t GetAlignId(const std::vector<size_t> &align); // looks up or assigns an alignment id
+  void Save(const target_text &rule); // writes a single phrase record
+  void AppendLexRO(std::string &prop, std::vector<float> &retvector,
+                   bool log_prob) const; // moves "{{LexRO ...}}" scores from prop into retvector
+};
+} /* namespace probingpt */

mosesdecoder/probingpt/StoreVocab.cpp ADDED Viewed

	@@ -0,0 +1,13 @@

+/*
+ * StoreVocab.cpp
+ *
+ *  Created on: 15 Jun 2016
+ *      Author: hieu
+ */
+#include <fstream>
+#include "StoreVocab.h"
+namespace probingpt
+{
+} /* namespace probingpt */

mosesdecoder/probingpt/StoreVocab.h ADDED Viewed

	@@ -0,0 +1,60 @@

+/*
+ * StoreVocab.h
+ *
+ *  Created on: 15 Jun 2016
+ *      Author: hieu
+ */
+#pragma once
+#include <string>
+#include <boost/unordered_map.hpp>
+#include "OutputFileStream.h"
+#include "moses2/legacy/Util2.h"
+namespace probingpt
+{
+/** Maps words to numeric ids and writes the mapping to a text file.
+ *
+ * Ids handed out by GetVocabId() start at 1 and grow with the size of the
+ * table.  VOCABID is the integer type used for the ids.
+ */
+template<typename VOCABID>
+class StoreVocab
+{
+protected:
+  std::string m_path; // file written by Save()
+  typedef boost::unordered_map<std::string, VOCABID> Coll;
+  Coll m_vocab;       // word -> id
+
+public:
+  /// Remembers the output path; nothing is written until Save().
+  StoreVocab(const std::string &path) : m_path(path) {}
+
+  virtual ~StoreVocab() {}
+
+  /// Returns the id of word, assigning "current size + 1" to unseen words.
+  VOCABID GetVocabId(const std::string &word) {
+    typename Coll::iterator known = m_vocab.find(word);
+    if (known != m_vocab.end()) {
+      return known->second;
+    }
+    VOCABID fresh = m_vocab.size() + 1;
+    m_vocab[word] = fresh;
+    return fresh;
+  }
+
+  /// Registers word under an explicit id, replacing any previous id.
+  void Insert(VOCABID id, const std::string &word) {
+    m_vocab[word] = id;
+  }
+
+  /// Writes one "word<TAB>id" line per entry to m_path.
+  void Save() {
+    OutputFileStream out(m_path);
+    for (typename Coll::const_iterator entry = m_vocab.begin();
+         entry != m_vocab.end(); ++entry) {
+      out << entry->first << "\t" << entry->second << std::endl;
+    }
+    out.Close();
+  }
+};
+} /* namespace probingpt */

mosesdecoder/probingpt/bin/gcc-9/release/link-static/threading-multi/CreateProbingPT ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:7fc6b9a78300ab10f59601d7e9650d090b3f50f6696b61ccb969f4ca349ae21c
+size 1571424

mosesdecoder/probingpt/bin/gcc-9/release/link-static/threading-multi/CreateProbingPT.o ADDED Viewed

Binary file (274 kB). View file

mosesdecoder/probingpt/bin/gcc-9/release/link-static/threading-multi/InputFileStream.o ADDED Viewed

Binary file (23 kB). View file

mosesdecoder/probingpt/bin/gcc-9/release/link-static/threading-multi/OutputFileStream.o ADDED Viewed

Binary file (213 kB). View file

mosesdecoder/probingpt/bin/gcc-9/release/link-static/threading-multi/StoreTarget.o ADDED Viewed

Binary file (170 kB). View file

mosesdecoder/probingpt/bin/gcc-9/release/link-static/threading-multi/StoreVocab.o ADDED Viewed

Binary file (113 kB). View file

mosesdecoder/probingpt/bin/gcc-9/release/link-static/threading-multi/hash.o ADDED Viewed

Binary file (15.8 kB). View file

mosesdecoder/probingpt/bin/gcc-9/release/link-static/threading-multi/libprobingpt.a ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:cca17ced5f31f0998cebf65d6fc9c7508886356fbac3586505cbe916634c9b7c
+size 1227034

mosesdecoder/probingpt/bin/gcc-9/release/link-static/threading-multi/line_splitter.o ADDED Viewed

Binary file (25.7 kB). View file

mosesdecoder/probingpt/bin/gcc-9/release/link-static/threading-multi/probing_hash_utils.o ADDED Viewed

Binary file (7.44 kB). View file

mosesdecoder/probingpt/bin/gcc-9/release/link-static/threading-multi/querying.o ADDED Viewed

Binary file (213 kB). View file

mosesdecoder/probingpt/bin/gcc-9/release/link-static/threading-multi/storing.o ADDED Viewed

Binary file (190 kB). View file

mosesdecoder/probingpt/bin/gcc-9/release/link-static/threading-multi/util.o ADDED Viewed

Binary file (19 kB). View file

mosesdecoder/probingpt/bin/gcc-9/release/link-static/threading-multi/vocabid.o ADDED Viewed

Binary file (144 kB). View file

mosesdecoder/probingpt/gzfilebuf.h ADDED Viewed

	@@ -0,0 +1,94 @@

+#ifndef moses_gzfile_buf_h
+#define moses_gzfile_buf_h
+#include <stdexcept>
+#include <streambuf>
+#include <zlib.h>
+#include <cstring>
+namespace probingpt
+{
+/** Read-only std::streambuf over a gzip file (zlib gzFile).  Unknown parentage.
+ *
+ *  The internal buffer keeps the first sizeof(int) bytes as a putback area;
+ *  the rest is refilled from gzread() whenever the get area runs empty.
+ *  Writing and seeking are not supported and raise std::runtime_error.
+ *
+ *  @todo replace with boost version - output stream already uses it
+ */
+class gzfilebuf: public std::streambuf
+{
+public:
+  /** Opens filename for reading; throws std::runtime_error on failure. */
+  gzfilebuf(const char *filename) {
+    _gzf = gzopen(filename, "rb");
+    if (!_gzf) throw std::runtime_error(
+        "Could not open " + std::string(filename) + ".");
+    // start with an empty get area positioned right after the putback area
+    setg(_buff + sizeof(int),     // beginning of putback area
+         _buff + sizeof(int),     // read position
+         _buff + sizeof(int));    // end position
+  }
+  ~gzfilebuf() {
+    gzclose(_gzf);
+  }
+protected:
+  // The three overrides below used a bare "throw;", which calls
+  // std::terminate() when no exception is being handled.  They now report
+  // the unsupported operation with a real exception.
+  virtual int_type overflow(int_type /* c */) {
+    throw std::runtime_error("gzfilebuf is read-only: overflow() not supported");
+  }
+  // write multiple characters
+  virtual std::streamsize xsputn(const char* /* s */, std::streamsize /* num */) {
+    throw std::runtime_error("gzfilebuf is read-only: xsputn() not supported");
+  }
+  virtual std::streampos seekpos(std::streampos /* sp */,
+                                 std::ios_base::openmode /* which = std::ios_base::in | std::ios_base::out */) {
+    throw std::runtime_error("gzfilebuf does not support seeking");
+  }
+  //read one character
+  virtual int_type underflow() {
+    // is read position before end of _buff?
+    if (gptr() < egptr()) {
+      return traits_type::to_int_type(*gptr());
+    }
+    /* process size of putback area
+     * - use number of characters read
+     * - but at most four
+     */
+    unsigned int numPutback = gptr() - eback();
+    if (numPutback > sizeof(int)) {
+      numPutback = sizeof(int);
+    }
+    /* copy up to four characters previously read into
+     * the putback _buff (area of first four characters)
+     */
+    std::memmove(_buff + (sizeof(int) - numPutback), gptr() - numPutback,
+                 numPutback);
+    // read new characters
+    int num = gzread(_gzf, _buff + sizeof(int), _buffsize - sizeof(int));
+    if (num <= 0) {
+      // ERROR or EOF
+      return EOF;
+    }
+    // reset _buff pointers
+    setg(_buff + (sizeof(int) - numPutback),   // beginning of putback area
+         _buff + sizeof(int),                // read position
+         _buff + sizeof(int) + num);           // end of buffer
+    // return next character
+    return traits_type::to_int_type(*gptr());
+  }
+  /** Bulk read of up to num characters into s; returns the number stored.
+   *
+   *  Characters already sitting in the get area are handed out first;
+   *  reading straight from the file would skip them and return data out of
+   *  order.  A gzread() error counts as "no more characters" instead of
+   *  being returned as a negative size.
+   */
+  virtual std::streamsize xsgetn(char* s, std::streamsize num) {
+    std::streamsize copied = 0;
+    const std::streamsize buffered = egptr() - gptr();
+    if (buffered > 0 && num > 0) {
+      copied = (buffered < num) ? buffered : num;
+      std::memcpy(s, gptr(), static_cast<size_t>(copied));
+      gbump(static_cast<int>(copied));
+    }
+    if (copied < num) {
+      // the remainder comes directly from the file, bypassing _buff
+      const int got = gzread(_gzf, s + copied,
+                             static_cast<unsigned int>(num - copied));
+      if (got > 0) {
+        copied += got;
+      }
+    }
+    return copied;
+  }
+private:
+  // Not copyable: a copy would share _gzf and close it twice.
+  gzfilebuf(const gzfilebuf &);
+  gzfilebuf &operator=(const gzfilebuf &);
+  gzFile _gzf;                                 // zlib handle, closed in the destructor
+  static const unsigned int _buffsize = 1024;  // putback area + read area
+  char _buff[_buffsize];
+};
+}
+#endif

mosesdecoder/probingpt/hash.cpp ADDED Viewed

	@@ -0,0 +1,44 @@

+#include <iostream>
+#include "hash.h"
+using namespace std;
+namespace probingpt
+{
+// Hashes the bytes of text with util::MurmurHashNative.
+uint64_t getHash(StringPiece text)
+{
+  return util::MurmurHashNative(text.data(), text.size());
+}
+// Turns a space-separated phrase into one id per word.  A word's id is the
+// sum of the hashes of its '|'-separated factors.
+std::vector<uint64_t> getVocabIDs(const StringPiece &textin)
+{
+  typedef util::TokenIter<util::SingleCharacter> Tokens;
+
+  std::vector<uint64_t> ids;
+  for (Tokens word(textin, util::SingleCharacter(' ')); word; ++word) {
+    uint64_t wordId = 0;
+    for (Tokens factor(*word, util::SingleCharacter('|')); factor; ++factor) {
+      //cerr << "factor=" << *factor << endl;
+      wordId += getHash(*factor);
+    }
+    ids.push_back(wordId);
+  }
+  return ids;
+}
+}

mosesdecoder/probingpt/hash.h ADDED Viewed

	@@ -0,0 +1,17 @@

+#pragma once
+#include "util/string_piece.hh"
+#include "util/murmur_hash.hh"
+#include "util/string_piece.hh"  //Tokenization and work with StringPiece
+#include "util/tokenize_piece.hh"
+#include <vector>
+namespace probingpt
+{
+//Gets the MurmurHash of the given string
+uint64_t getHash(StringPiece text);
+std::vector<uint64_t> getVocabIDs(const StringPiece &textin);
+}

mosesdecoder/probingpt/line_splitter.cpp ADDED Viewed

	@@ -0,0 +1,103 @@

+#include "line_splitter.h"
+namespace probingpt
+{
+//Split one phrase-table line on "|||" into its trimmed fields.
+//The first three fields (source phrase, target phrase, probabilities) are
+//read unconditionally; word alignment, counts, sparse scores and property
+//are optional and stay empty once the line runs out of fields.
+line_text splitLine(const StringPiece &textin, bool scfg)
+{
+  typedef util::TokenIter<util::MultiCharacter> FieldIter;
+  const char delim[] = "|||";
+  line_text output;
+  FieldIter field(textin, util::MultiCharacter(delim));
+  // Mandatory fields.
+  output.source_phrase = Trim(*field);
+  field++;
+  output.target_phrase = Trim(*field);
+  if (scfg) {
+    // SCFG reformatting (reformatSCFG) is currently disabled.
+  }
+  field++;
+  output.prob = Trim(*field);
+  // Optional fields in file order; stop at the first one that is missing.
+  StringPiece *const optionalFields[] = {
+    &output.word_align, &output.counts, &output.sparse_score, &output.property
+  };
+  const size_t numOptional = sizeof(optionalFields) / sizeof(optionalFields[0]);
+  for (size_t i = 0; i < numOptional; ++i) {
+    field++;
+    if (field == FieldIter::end()) return output;
+    *optionalFields[i] = Trim(*field);
+  }
+  return output;
+}
+//Parse the leading run of decimal digits of tok.
+//Unlike atoi() on tok.data(), this never reads past the end of the token
+//(a StringPiece is not NUL-terminated) and copes with an empty token,
+//which yields 0. Values that do not fit in one byte wrap, as they did with
+//the previous cast of the atoi() result.
+static unsigned char parseAlignIndex(const StringPiece &tok)
+{
+  unsigned int value = 0;
+  for (size_t i = 0; i < tok.size(); ++i) {
+    const char c = tok.data()[i];
+    if (c < '0' || c > '9') break;
+    value = value * 10 + static_cast<unsigned int>(c - '0');
+  }
+  return static_cast<unsigned char>(value);
+}
+//Turn a word-alignment string such as "0-0 1-2" into a flat byte vector.
+//Entries 0 and 1 of the result form the first pair, 2 and 3 the second, etc.
+//unsigned char is used instead of int to save space, as word alignments are
+//small numbers. Tokens that do not contain a '-' (including the empty tokens
+//produced by repeated spaces) are skipped instead of dereferencing an
+//exhausted iterator.
+std::vector<unsigned char> splitWordAll1(const StringPiece &textin)
+{
+  const char delim[] = " ";
+  const char delim2[] = "-";
+  std::vector<unsigned char> output;
+  //Case with no word alignments.
+  if (textin.size() == 0) {
+    return output;
+  }
+  //Split on space
+  util::TokenIter<util::MultiCharacter> it(textin, util::MultiCharacter(delim));
+  while (it) {
+    //Split on dash (-)
+    util::TokenIter<util::MultiCharacter> itInner(*it,
+        util::MultiCharacter(delim2));
+    if (itInner) {
+      const StringPiece first = *itInner;
+      itInner++;
+      if (itInner) {
+        output.push_back(parseAlignIndex(first));
+        output.push_back(parseAlignIndex(*itInner));
+      }
+    }
+    it++;
+  }
+  return output;
+}
+//Placeholder for SCFG-specific reformatting of a split line.
+//The body is empty, so output is left untouched.
+void reformatSCFG(line_text &output)
+{
+}
+}

mosesdecoder/probingpt/line_splitter.h ADDED Viewed

	@@ -0,0 +1,56 @@

+#pragma once
+#include <vector>
+#include <cstdlib> //atof
+#include "util/string_piece.hh"
+#include "util/tokenize_piece.hh"
+#include "util/file_piece.hh"
+#include "util/string_piece.hh"  //Tokenization and work with StringPiece
+#include "util/tokenize_piece.hh"
+namespace probingpt
+{
+//Struct for holding one phrase-table line after it has been split into its
+//"|||"-separated fields by splitLine().
+struct line_text {
+  StringPiece source_phrase;  //first field
+  StringPiece target_phrase;  //second field
+  StringPiece prob;           //third field (probabilities)
+  StringPiece word_align;     //optional fourth field (word alignment)
+  StringPiece counts;         //optional fifth field
+  StringPiece sparse_score;   //optional sixth field
+  StringPiece property;       //optional seventh field
+  //Not filled in by splitLine(); presumably set by later processing -- TODO confirm
+  std::string property_to_be_binarized;
+};
+//Struct holding the fields of a target entry as typed containers rather
+//than text views.
+//NOTE(review): nothing in the code shown here fills or reads this struct;
+//presumably it is the parsed counterpart of line_text -- confirm before use.
+struct target_text {
+  std::vector<unsigned int> target_phrase;
+  std::vector<float> prob;
+  std::vector<size_t> word_align_term;
+  std::vector<size_t> word_align_non_term;
+  std::vector<char> counts;
+  std::vector<char> sparse_score;
+  std::vector<char> property;
+  /*
+  Disabled helper. Note that it refers to a member word_all1 that is not
+  declared in this struct, so it would not compile if re-enabled as is.
+  void Reset()
+  {
+    target_phrase.clear();
+    prob.clear();
+    word_all1.clear();
+    counts.clear();
+    sparse_score.clear();
+    property.clear();
+  }
+  */
+};
+//Ask if it's better to have it receive a pointer to a line_text struct
+line_text splitLine(const StringPiece &textin, bool scfg);
+void reformatSCFG(line_text &output);
+std::vector<unsigned char> splitWordAll1(const StringPiece &textin);
+}

mosesdecoder/probingpt/probing_hash_utils.cpp ADDED Viewed

	@@ -0,0 +1,40 @@

+#include <iostream>
+#include "probing_hash_utils.h"
+#include "util/file.hh"
+namespace probingpt
+{
+//Read table from disk, return memory map location.
+//Opens filename read-only, stores the descriptor in file, loads the whole
+//file through MapRead with the requested load method into memory, and
+//returns memory.get() as a char pointer.
+char * readTable(const char * filename, util::LoadMethod load_method, util::scoped_fd &file, util::scoped_memory &memory)
+{
+  file.reset(util::OpenReadOrThrow(filename));
+  const int fd = file.get();
+  const uint64_t numBytes = util::SizeFile(fd);
+  MapRead(load_method, fd, 0, numBytes, memory);
+  return (char*) memory.get();
+}
+//Write size bytes starting at mem to filename as a binary file.
+//NOTE(review): neither opening the file nor the write is checked for failure.
+void serialize_table(char *mem, size_t size, const std::string &filename)
+{
+  std::ofstream out(filename.c_str(), std::ios::binary);
+  const char *bytes = mem;
+  out.write(bytes, size);
+  out.close();
+}
+//Compute the hash-table key of a source phrase given as size vocabulary ids.
+//The key is the sum (modulo 2^64) of every id shifted left by its position.
+//Hashing the whole array with MurmurHash was tried and found too slow:
+//  uint64_t key = util::MurmurHashNative(&source_phrase[0], source_phrase.size());
+uint64_t getKey(const uint64_t source_phrase[], size_t size)
+{
+  uint64_t key = 0;
+  for (size_t i = 0; i < size; i++) {
+    // Shifting a 64-bit value by 64 or more bits is undefined behaviour, so
+    // the shift count is wrapped into 0..63. Keys of phrases shorter than 64
+    // words are unchanged.
+    key += (source_phrase[i] << (i & 63));
+  }
+  return key;
+}
+}

mosesdecoder/probingpt/probing_hash_utils.h ADDED Viewed

	@@ -0,0 +1,55 @@

+#pragma once
+#include "util/probing_hash_table.hh"
+#if defined(_WIN32) || defined(_WIN64)
+#include <mman.h>
+#else
+#include <sys/mman.h>
+#endif
+#include <boost/functional/hash.hpp>
+#include <fcntl.h>
+#include <fstream>
+namespace probingpt
+{
+#define API_VERSION 15
+//Hash table entry: one key/value record of the probing hash table
+//(the element type of Table below).
+struct Entry {
+  typedef uint64_t Key;
+  //Hash key of the entry
+  Key key;
+  //Accessor pair for key
+  Key GetKey() const {
+    return key;
+  }
+  void SetKey(Key to) {
+    key = to;
+  }
+  //Payload stored for the key
+  uint64_t value;
+};
+#define NONE       std::numeric_limits<uint64_t>::max()
+//Define table
+typedef util::ProbingHashTable<Entry, boost::hash<uint64_t> > Table;
+void serialize_table(char *mem, size_t size, const std::string &filename);
+char * readTable(const char * filename, util::LoadMethod load_method, util::scoped_fd &file, util::scoped_memory &memory);
+uint64_t getKey(const uint64_t source_phrase[], size_t size);
+//Fixed-size record of per-target-phrase metadata.
+//NOTE(review): presumably the header written in front of each target phrase
+//in TargetColl.dat -- confirm against StoreTarget before relying on layout.
+struct TargetPhraseInfo {
+  uint32_t alignTerm;
+  uint32_t alignNonTerm;
+  uint16_t numWords;
+  uint16_t propLength;
+  uint16_t filler;  //not otherwise described here; looks like padding -- TODO confirm
+};
+}

mosesdecoder/probingpt/querying.cpp ADDED Viewed

	@@ -0,0 +1,179 @@

+#include "querying.h"
+#include "util/exception.hh"
+#include "moses2/legacy/Util2.h"
+using namespace std;
+namespace probingpt
+{
+//Load a binarized ProbingPT from the directory filepath.
+//Steps: check that all required files exist, read the source vocabulary and
+//the alignments, map TargetColl.dat, parse the tab-separated key/value
+//config file and finally map probing_hash.dat as the probing hash table.
+//A missing file or malformed config line throws (UTIL_THROW*); a missing
+//config key or an API version mismatch terminates the process.
+QueryEngine::QueryEngine(const char * filepath, util::LoadMethod load_method)
+{
+  //Create filepaths
+  std::string basepath(filepath);
+  std::string path_to_config = basepath + "/config";
+  std::string path_to_hashtable = basepath + "/probing_hash.dat";
+  std::string path_to_source_vocabid = basepath + "/source_vocabids";
+  std::string alignPath = basepath + "/Alignments.dat";
+  file_exits(basepath);
+  ///Source phrase vocabids
+  read_map(source_vocabids, path_to_source_vocabid.c_str());
+  // alignments
+  read_alignments(alignPath);
+  // target phrase
+  string targetCollPath = basepath + "/TargetColl.dat";
+  memTPS = readTable(targetCollPath.c_str(), load_method, fileTPS_, memoryTPS_);
+  //Read config file: one "key<TAB>value" pair per line
+  boost::unordered_map<std::string, std::string> keyValue;
+  std::ifstream config(path_to_config.c_str());
+  std::string line;
+  while (getline(config, line)) {
+    std::vector<std::string> toks = Moses2::Tokenize(line, "\t");
+    UTIL_THROW_IF2(toks.size() != 2, "Wrong config format:" << line);
+    keyValue[ toks[0] ] = toks[1];
+  }
+  bool found;
+  //Check API version:
+  int version;
+  found = Get(keyValue, "API_VERSION", version);
+  if (!found) {
+    std::cerr << "Old or corrupted version of ProbingPT. Please rebinarize your phrase tables." << std::endl;
+    // A table without a version cannot be trusted to match the current
+    // layout; stop here like for every other missing key instead of loading it.
+    exit(EXIT_FAILURE);
+  } else if (version != API_VERSION) {
+    std::cerr << "The ProbingPT API has changed. " << version << "!="
+              << API_VERSION << " Please rebinarize your phrase tables." << std::endl;
+    exit(EXIT_FAILURE);
+  }
+  //Get tablesize.
+  int tablesize;
+  found = Get(keyValue, "uniq_entries", tablesize);
+  if (!found) {
+    std::cerr << "uniq_entries not found" << std::endl;
+    exit(EXIT_FAILURE);
+  }
+  //Number of scores
+  found = Get(keyValue, "num_scores", num_scores);
+  if (!found) {
+    std::cerr << "num_scores not found" << std::endl;
+    exit(EXIT_FAILURE);
+  }
+  //How many scores from lex reordering models
+  found = Get(keyValue, "num_lex_scores", num_lex_scores);
+  if (!found) {
+    std::cerr << "num_lex_scores not found" << std::endl;
+    exit(EXIT_FAILURE);
+  }
+  // have the scores been log() and FloorScore()?
+  found = Get(keyValue, "log_prob", logProb);
+  if (!found) {
+    std::cerr << "logProb not found" << std::endl;
+    exit(EXIT_FAILURE);
+  }
+  config.close();
+  //Read hashtable. The size must be computed with the same multiplier (1.2)
+  //that was used when the table was created.
+  table_filesize = Table::Size(tablesize, 1.2);
+  mem = readTable(path_to_hashtable.c_str(), load_method, file_, memory_);
+  Table table_init(mem, table_filesize);
+  table = table_init;
+  std::cerr << "Initialized successfully! " << std::endl;
+}
+//Destructor. Does nothing by hand: the explicit munmap below is disabled.
+//NOTE(review): presumably the util::scoped_fd / util::scoped_memory members
+//release the descriptors and mappings themselves -- confirm.
+QueryEngine::~QueryEngine()
+{
+  //Clear mmap content from memory.
+  //munmap(mem, table_filesize);
+}
+//Key of a source phrase given as an array of size vocabulary ids.
+//Thin wrapper that forwards to the free function probingpt::getKey.
+uint64_t QueryEngine::getKey(uint64_t source_phrase[], size_t size) const
+{
+  //TOO SLOW
+  //uint64_t key = util::MurmurHashNative(&source_phrase[0], source_phrase.size());
+  return probingpt::getKey(source_phrase, size);
+}
+//Look up key in the probing hash table.
+//Returns (true, stored value) on a hit and (false, 0) on a miss.
+std::pair<bool, uint64_t> QueryEngine::query(uint64_t key)
+{
+  const Entry *hit;
+  if (!table.Find(key, hit)) {
+    // value-initialized pair: first == false, second == 0
+    return std::pair<bool, uint64_t>();
+  }
+  return std::pair<bool, uint64_t>(true, hit->value);
+}
+//Read the alignment file into alignColl.
+//Every line is split on tabs and spaces: the first token is the index of
+//the alignment set, the remaining tokens are its positions. alignColl is
+//grown as needed so that the index is valid.
+void QueryEngine::read_alignments(const std::string &alignPath)
+{
+  std::ifstream in(alignPath.c_str());
+  for (string line; getline(in, line); ) {
+    const vector<string> fields = Moses2::Tokenize(line, "\t ");
+    UTIL_THROW_IF2(fields.size() == 0, "Corrupt alignment file");
+    const uint32_t index = Moses2::Scan<uint32_t>(fields[0]);
+    if (alignColl.size() <= index) {
+      alignColl.resize(index + 1);
+    }
+    Alignments &target = alignColl[index];
+    for (vector<string>::const_iterator it = fields.begin() + 1; it != fields.end(); ++it) {
+      // positions are stored as unsigned char, so larger values are truncated
+      const size_t pos = Moses2::Scan<size_t>(*it);
+      target.push_back(pos);
+    }
+  }
+}
+//Verify that every file of a binarized ProbingPT directory is present.
+//Throws (UTIL_THROW2) naming the first file, in the order listed below,
+//that does not exist under basePath.
+void QueryEngine::file_exits(const std::string &basePath)
+{
+  static const char *const requiredFiles[] = {
+    "/Alignments.dat",
+    "/TargetColl.dat",
+    "/TargetVocab.dat",
+    "/cache",
+    "/config",
+    "/probing_hash.dat",
+    "/source_vocabids"
+  };
+  const size_t numRequired = sizeof(requiredFiles) / sizeof(requiredFiles[0]);
+  for (size_t i = 0; i < numRequired; ++i) {
+    if (!Moses2::FileExists(basePath + requiredFiles[i])) {
+      UTIL_THROW2("Require file does not exist in: " << basePath << requiredFiles[i]);
+    }
+  }
+}
+}

mosesdecoder/probingpt/querying.h ADDED Viewed

	@@ -0,0 +1,79 @@

+#pragma once
+#include <boost/iostreams/device/mapped_file.hpp>
+#include <boost/unordered_map.hpp>
+#include <sys/stat.h> //For finding size of file
+#include <algorithm> //toLower
+#include <deque>
+#include "vocabid.h"
+#include "probing_hash_utils.h"
+#include "hash.h" //Includes line splitter
+#include "line_splitter.h"
+#include "util.h"
+#include "moses2/legacy/Util2.h"
+namespace probingpt
+{
+//Read-only access to a binarized ProbingPT directory.
+//The constructor loads the source vocabulary, the alignments, the target
+//phrase collection, the config values and the probing hash table; query()
+//then looks up source-phrase keys in that table.
+class QueryEngine
+{
+  //Source vocabulary filled by read_map() from the source_vocabids file
+  std::map<uint64_t, std::string> source_vocabids;
+  //One alignment set; positions are stored as single bytes
+  typedef std::vector<unsigned char> Alignments;
+  //All alignment sets, indexed by the first token of each alignment-file line
+  std::vector<Alignments> alignColl;
+  //Probing hash table laid over mem
+  Table table;
+  char *mem; //Memory for the table, necessary so that we can correctly destroy the object
+  //Size passed to Table for mem: Table::Size(uniq_entries, 1.2)
+  size_t table_filesize;
+  //NOTE(review): not set or read by any code shown in querying.cpp
+  bool is_reordering;
+  //Descriptor and mapping backing mem (probing_hash.dat)
+  util::scoped_fd file_;
+  util::scoped_memory memory_;
+  // target phrases
+  //NOTE(review): this mapped_file_source is not used in querying.cpp
+  boost::iostreams::mapped_file_source file;
+  //Descriptor and mapping backing memTPS (TargetColl.dat)
+  util::scoped_fd fileTPS_;
+  util::scoped_memory memoryTPS_;
+  //Fill alignColl from the alignment file
+  void read_alignments(const std::string &alignPath);
+  //Throw unless all required files exist below basePath
+  void file_exits(const std::string &basePath);
+public:
+  //Values of the config keys num_scores, num_lex_scores and log_prob
+  int num_scores;
+  int num_lex_scores;
+  bool logProb;
+  //Start of the loaded TargetColl.dat contents
+  const char *memTPS;
+  //Load the table stored in the given directory with the given load method
+  QueryEngine(const char *, util::LoadMethod load_method);
+  ~QueryEngine();
+  //Look up key; first is true on a hit and second then holds the stored value
+  std::pair<bool, uint64_t> query(uint64_t key);
+  const std::map<uint64_t, std::string> &getSourceVocab() const {
+    return source_vocabids;
+  }
+  const std::vector<Alignments> &getAlignments() const {
+    return alignColl;
+  }
+  //Key of a source phrase given as size vocabulary ids
+  uint64_t getKey(uint64_t source_phrase[], size_t size) const;
+  //Look up sought in keyValue. On success the value is parsed with Scan<T>
+  //into found and true is returned; when the key is missing, found is left
+  //untouched and false is returned.
+  template<typename T>
+  inline bool Get(const boost::unordered_map<std::string, std::string> &keyValue, const std::string &sought, T &found) const {
+    boost::unordered_map<std::string, std::string>::const_iterator iter = keyValue.find(sought);
+    if (iter == keyValue.end()) {
+      return false;
+    }
+    const std::string &foundStr = iter->second;
+    found = Scan<T>(foundStr);
+    return true;
+  }
+};
+}

mosesdecoder/probingpt/storing.cpp ADDED Viewed

	@@ -0,0 +1,302 @@

+#include <sys/stat.h>
+#include <boost/foreach.hpp>
+#include "line_splitter.h"
+#include "storing.h"
+#include "StoreTarget.h"
+#include "StoreVocab.h"
+#include "moses2/legacy/Util2.h"
+#include "InputFileStream.h"
+using namespace std;
+namespace probingpt
+{
+///////////////////////////////////////////////////////////////////////
+//Record sourcePhrase in the prefix trie rooted at this node, starting at
+//word index pos.
+//When a word is not yet a child of this node, all existing children are
+//first written to table and discarded, then a fresh child is created.
+//The node reached at the end of the phrase is marked done so that Write()
+//does not emit a prefix entry for it.
+void Node::Add(Table &table, const SourcePhrase &sourcePhrase, size_t pos)
+{
+  if (pos < sourcePhrase.size()) {
+    uint64_t vocabId = sourcePhrase[pos];
+    Node *child;
+    Children::iterator iter = m_children.find(vocabId);
+    if (iter == m_children.end()) {
+      // New node. Write other children then discard them
+      BOOST_FOREACH(Children::value_type &valPair, m_children) {
+        Node &otherChild = valPair.second;
+        otherChild.Write(table);
+      }
+      m_children.clear();
+      // create new node
+      child = &m_children[vocabId];
+      assert(!child->done);
+      // Prefix key = parent key + id shifted by its position. Shifting a
+      // 64-bit value by 64 or more is undefined behaviour, so the shift count
+      // is wrapped into 0..63; keys for positions below 64 are unchanged.
+      child->key = key + (vocabId << (pos & 63));
+    } else {
+      child = &iter->second;
+    }
+    child->Add(table, sourcePhrase, pos + 1);
+  } else {
+    // this node was written previously 'cos it has rules
+    done = true;
+  }
+}
+//Write this subtree to table, children first.
+//A node that is not marked done is inserted as an entry whose value is
+//NONE; done nodes are skipped.
+void Node::Write(Table &table)
+{
+  for (Children::iterator it = m_children.begin(); it != m_children.end(); ++it) {
+    it->second.Write(table);
+  }
+  if (done) {
+    return;
+  }
+  Entry prefixEntry;
+  prefixEntry.value = NONE;
+  prefixEntry.key = key;
+  //Put into table
+  table.Insert(prefixEntry);
+}
+///////////////////////////////////////////////////////////////////////
+//Binarize the text phrase table phrasetable_path into the directory basepath.
+//  num_scores, num_lex_scores, log_prob - recorded in the config file;
+//                                         log_prob is also passed to StoreTarget
+//  max_cache_size - 0 disables the cache; a positive value caps the number of
+//                   cached source phrases
+//  scfg           - also store an entry for every prefix of a source phrase
+//The input is expected to be grouped by source phrase: consecutive lines with
+//the same source are collected into one target group. Writes
+//probing_hash.dat, cache and config below basepath, plus whatever StoreTarget
+//and StoreVocab save. Not implemented on Windows.
+void createProbingPT(const std::string &phrasetable_path,
+                     const std::string &basepath, int num_scores, int num_lex_scores,
+                     bool log_prob, int max_cache_size, bool scfg)
+{
+#if defined(_WIN32) || defined(_WIN64)
+  std::cerr << "Create not implemented for Windows" << std::endl;
+#else
+  std::cerr << "Starting..." << std::endl;
+  //Get basepath and create directory if missing
+  mkdir(basepath.c_str(), S_IRWXU | S_IRWXG | S_IROTH | S_IXOTH);
+  StoreTarget storeTarget(basepath);
+  //Get uniq lines:
+  unsigned long uniq_entries = countUniqueSource(phrasetable_path);
+  //Source phrase vocabids
+  StoreVocab<uint64_t> sourceVocab(basepath + "/source_vocabids");
+  //Read the file
+  util::FilePiece filein(phrasetable_path.c_str());
+  //Init the probing hash table
+  size_t size = Table::Size(uniq_entries, 1.2);
+  char * mem = new char[size];
+  memset(mem, 0, size);
+  Table sourceEntries(mem, size);
+  std::priority_queue<CacheItem*, std::vector<CacheItem*>, CacheItemOrderer> cache;
+  float totalSourceCount = 0;
+  //Keep track of the size of each group of target phrases
+  size_t line_num = 0;
+  //Read everything and process
+  std::string prevSource;
+  Node sourcePhrases;
+  sourcePhrases.done = true;
+  sourcePhrases.key = 0;
+  while (true) {
+    try {
+      //Process line read
+      line_text line;
+      line = splitLine(filein.ReadLine(), scfg);
+      //cerr << "line=" << line.source_phrase << endl;
+      ++line_num;
+      if (line_num % 1000000 == 0) {
+        std::cerr << line_num << " " << std::flush;
+      }
+      //Add source phrases to vocabularyIDs
+      add_to_map(sourceVocab, line.source_phrase);
+      if (prevSource.empty()) {
+        // 1st line
+        prevSource = line.source_phrase.as_string();
+        storeTarget.Append(line, log_prob, scfg);
+      } else if (prevSource == line.source_phrase) {
+        //If we still have the same line, just append to it:
+        storeTarget.Append(line, log_prob, scfg);
+      } else {
+        assert(prevSource != line.source_phrase);
+        //Create a new entry even
+        // save
+        uint64_t targetInd = storeTarget.Save();
+        // next line
+        storeTarget.Append(line, log_prob, scfg);
+        //Create an entry for the previous source phrase:
+        Entry sourceEntry;
+        sourceEntry.value = targetInd;
+        //The key is the sum of hashes of individual words bitshifted by their position in the phrase.
+        //Probably not entirely correct, but fast and seems to work fine in practice.
+        std::vector<uint64_t> vocabid_source = getVocabIDs(prevSource);
+        if (scfg) {
+          // storing prefixes?
+          sourcePhrases.Add(sourceEntries, vocabid_source);
+        }
+        sourceEntry.key = getKey(vocabid_source);
+        /*
+        cerr << "prevSource=" << prevSource << flush
+            << " vocabids=" << Debug(vocabid_source) << flush
+            << " key=" << sourceEntry.key << endl;
+        */
+        //Put into table
+        sourceEntries.Insert(sourceEntry);
+        // update cache - CURRENT source phrase, not prev
+        if (max_cache_size) {
+          std::string countStr = line.counts.as_string();
+          countStr = Moses2::Trim(countStr);
+          if (!countStr.empty()) {
+            std::vector<float> toks = Moses2::Tokenize<float>(countStr);
+            //cerr << "CACHE:" << line.source_phrase << " " << countStr << " " << toks[1] << endl;
+            if (toks.size() >= 2) {
+              totalSourceCount += toks[1];
+              // compute key for CURRENT source
+              std::vector<uint64_t> currVocabidSource = getVocabIDs(line.source_phrase.as_string());
+              uint64_t currKey = getKey(currVocabidSource);
+              CacheItem *item = new CacheItem(
+                Moses2::Trim(line.source_phrase.as_string()),
+                currKey,
+                toks[1]);
+              cache.push(item);
+              if (max_cache_size > 0 && cache.size() > static_cast<size_t>(max_cache_size)) {
+                // The queue only holds raw pointers: free the evicted item,
+                // otherwise every eviction leaks one CacheItem.
+                delete cache.top();
+                cache.pop();
+              }
+            }
+          }
+        }
+        //Set prevLine
+        prevSource = line.source_phrase.as_string();
+      }
+    } catch (const util::EndOfFileException &) {
+      std::cerr
+          << "Reading phrase table finished, writing remaining files to disk."
+          << std::endl;
+      //After the final entry is constructed we need to add it to the phrase_table
+      //Create an entry for the previous source phrase:
+      uint64_t targetInd = storeTarget.Save();
+      Entry sourceEntry;
+      sourceEntry.value = targetInd;
+      //The key is the sum of hashes of individual words. Probably not entirely correct, but fast
+      std::vector<uint64_t> vocabid_source = getVocabIDs(prevSource);
+      if (scfg) {
+        // The last source phrase needs its prefixes recorded exactly like
+        // every earlier one; this also marks its own node as done so that
+        // Write() below does not emit a second entry for the same key.
+        sourcePhrases.Add(sourceEntries, vocabid_source);
+      }
+      sourceEntry.key = getKey(vocabid_source);
+      //Put into table
+      sourceEntries.Insert(sourceEntry);
+      break;
+    }
+  }
+  sourcePhrases.Write(sourceEntries);
+  storeTarget.SaveAlignment();
+  serialize_table(mem, size, (basepath + "/probing_hash.dat"));
+  sourceVocab.Save();
+  serialize_cache(cache, (basepath + "/cache"), totalSourceCount);
+  delete[] mem;
+  //Write configfile
+  std::ofstream configfile;
+  configfile.open((basepath + "/config").c_str());
+  configfile << "API_VERSION\t" << API_VERSION << '\n';
+  configfile << "uniq_entries\t" << uniq_entries << '\n';
+  configfile << "num_scores\t" << num_scores << '\n';
+  configfile << "num_lex_scores\t" << num_lex_scores << '\n';
+  configfile << "log_prob\t" << log_prob << '\n';
+  configfile.close();
+#endif
+}
+//Count the groups of consecutive lines in the phrase table at path that
+//share the same first "|||"-separated field (the source phrase).
+//The field is compared untrimmed, exactly as it appears in the file.
+size_t countUniqueSource(const std::string &path)
+{
+  InputFileStream strme(path);
+  size_t numGroups = 0;
+  std::string previous;
+  for (std::string line; std::getline(strme, line); ) {
+    const std::vector<std::string> fields = Moses2::TokenizeMultiCharSeparator(line, "|||");
+    assert(fields.size() != 0);
+    const std::string &source = fields[0];
+    if (source == previous) {
+      continue;
+    }
+    previous = source;
+    ++numGroups;
+  }
+  return numGroups;
+}
+//Write the cache to path and free its items.
+//The first line is totalSourceCount; every following line is
+//"count<TAB>sourceKey<TAB>source". The queue is emptied in the process.
+//Items are popped from the queue and stored back to front, so the file
+//lists them in the reverse of the pop order.
+void serialize_cache(
+  std::priority_queue<CacheItem*, std::vector<CacheItem*>, CacheItemOrderer> &cache,
+  const std::string &path, float totalSourceCount)
+{
+  typedef std::vector<const CacheItem*> ItemVec;
+  ItemVec ordered(cache.size());
+  for (ItemVec::reverse_iterator slot = ordered.rbegin(); !cache.empty(); ++slot) {
+    *slot = cache.top();
+    cache.pop();
+  }
+  std::ofstream os(path.c_str());
+  os << totalSourceCount << std::endl;
+  for (ItemVec::const_iterator it = ordered.begin(); it != ordered.end(); ++it) {
+    const CacheItem *item = *it;
+    os << item->count << "\t" << item->sourceKey << "\t" << item->source << std::endl;
+    delete item;
+  }
+  os.close();
+}
+//Vector convenience overload: forwards the ids and their count to
+//probingpt::getKey(const uint64_t[], size_t).
+uint64_t getKey(const std::vector<uint64_t> &vocabid_source)
+{
+  const uint64_t *ids = vocabid_source.data();
+  const size_t numIds = vocabid_source.size();
+  return probingpt::getKey(ids, numIds);
+}
+//Return the first endPos + 1 ids of vocabid_source, i.e. the prefix that
+//ends at index endPos (inclusive). endPos must be a valid index.
+std::vector<uint64_t> CreatePrefix(const std::vector<uint64_t> &vocabid_source, size_t endPos)
+{
+  assert(endPos < vocabid_source.size());
+  const std::vector<uint64_t>::const_iterator first = vocabid_source.begin();
+  return std::vector<uint64_t>(first, first + (endPos + 1));
+}
+}

mosesdecoder/probingpt/storing.h ADDED Viewed

	@@ -0,0 +1,92 @@

+#pragma once
+#include <boost/unordered_set.hpp>
+#include <boost/unordered_map.hpp>
+#include <cstdio>
+#include <sstream>
+#include <fstream>
+#include <iostream>
+#include <string>
+#include <queue>
+#include <sys/stat.h> //mkdir
+#include "hash.h" //Includes line_splitter
+#include "probing_hash_utils.h"
+#include "vocabid.h"
+#include "util/file_piece.hh"
+#include "util/file.hh"
+namespace probingpt
+{
+typedef std::vector<uint64_t> SourcePhrase;
+//Node of the prefix trie used to store prefixes of source phrases.
+//Children are keyed by vocabulary id.
+class Node
+{
+  typedef boost::unordered_map<uint64_t, Node> Children;
+  Children m_children;
+public:
+  //Hash-table key of the prefix that ends at this node
+  uint64_t key;
+  //True when no prefix entry must be written for this node by Write()
+  bool done;
+  //key is initialized as well so that a freshly created node never carries
+  //an indeterminate value.
+  Node()
+    :key(0)
+    ,done(false)
+  {}
+  //Record sourcePhrase (from word index pos on) below this node
+  void Add(Table &table, const SourcePhrase &sourcePhrase, size_t pos = 0);
+  //Write this subtree to table; nodes that are not done get a NONE entry
+  void Write(Table &table);
+};
+void createProbingPT(const std::string &phrasetable_path,
+                     const std::string &basepath, int num_scores, int num_lex_scores,
+                     bool log_prob, int max_cache_size, bool scfg);
+uint64_t getKey(const std::vector<uint64_t> &source_phrase);
+std::vector<uint64_t> CreatePrefix(const std::vector<uint64_t> &vocabid_source, size_t endPos);
+//Render vec for debugging: every element is streamed followed by one space
+//(so a non-empty result ends with a space).
+template<typename T>
+std::string Debug(const std::vector<T> &vec)
+{
+  typedef typename std::vector<T>::const_iterator Iter;
+  std::stringstream strm;
+  for (Iter it = vec.begin(); it != vec.end(); ++it) {
+    strm << *it << " ";
+  }
+  return strm.str();
+}
+size_t countUniqueSource(const std::string &path);
+//One cache candidate: a source phrase, its hash-table key and its count.
+class CacheItem
+{
+public:
+  std::string source;
+  uint64_t sourceKey;
+  float count;
+  CacheItem(const std::string &vSource, uint64_t vSourceKey, float vCount)
+    : source(vSource), sourceKey(vSourceKey), count(vCount)
+  {}
+  //Deliberately inverted: an item is "less" when its count is LARGER.
+  bool operator<(const CacheItem &other) const {
+    return other.count < count;
+  }
+};
+//Comparator for containers of CacheItem pointers: compares the pointed-to
+//items with CacheItem::operator<.
+class CacheItemOrderer
+{
+public:
+  bool operator()(const CacheItem* a, const CacheItem* b) const {
+    const CacheItem &lhs = *a;
+    const CacheItem &rhs = *b;
+    return lhs < rhs;
+  }
+};
+void serialize_cache(
+  std::priority_queue<CacheItem*, std::vector<CacheItem*>, CacheItemOrderer> &cache,
+  const std::string &path, float totalSourceCount);
+}

mosesdecoder/probingpt/util.cpp ADDED Viewed

	@@ -0,0 +1,25 @@

+#include <cctype>
+#include "util.h"
+#include "util/exception.hh"
+namespace probingpt
+{
+//Specialisation of Scan for bool. The input is lowercased first;
+//yes / y / true / 1 give true, no / n / false / 0 give false,
+//anything else throws (UTIL_THROW2).
+template<>
+bool Scan<bool>(const std::string &input)
+{
+  static const char *const trueWords[] = { "yes", "y", "true", "1" };
+  static const char *const falseWords[] = { "no", "n", "false", "0" };
+  const std::string lc = ToLower(input);
+  for (size_t i = 0; i < sizeof(trueWords) / sizeof(trueWords[0]); ++i) {
+    if (lc == trueWords[i]) return true;
+  }
+  for (size_t i = 0; i < sizeof(falseWords) / sizeof(falseWords[0]); ++i) {
+    if (lc == falseWords[i]) return false;
+  }
+  UTIL_THROW2("Could not interpret " << input << " as a boolean.  After lowercasing, valid values are yes, y, true, 1, no, n, false, and 0.");
+}
+//Return a copy of str with every character lowercased by std::tolower.
+//Each character is converted to unsigned char first: passing a negative
+//char (any byte above 0x7F where char is signed, e.g. UTF-8 text) to
+//std::tolower is undefined behaviour. The plain loop also avoids relying
+//on <algorithm>, which this file does not include.
+const std::string ToLower(const std::string& str)
+{
+  std::string lc(str);
+  for (std::string::size_type i = 0; i < lc.size(); ++i) {
+    lc[i] = static_cast<char>(std::tolower(static_cast<unsigned char>(lc[i])));
+  }
+  return lc;
+}
+}

mosesdecoder/probingpt/util.h ADDED Viewed

	@@ -0,0 +1,24 @@

+#pragma once
+#include <string>
+#include <sstream>
+namespace probingpt
+{
+//! convert string to variable of type T. Used for reading floats, ints etc. from files.
+//! The result is value-initialized, so an input that cannot be parsed
+//! yields T() (0 for arithmetic types) instead of an indeterminate value.
+template<typename T>
+inline T Scan(const std::string &input)
+{
+  std::stringstream stream(input);
+  T ret = T();
+  stream >> ret;
+  return ret;
+}
+//! Specialisation to understand yes/no y/n true/false 0/1
+template<>
+bool Scan<bool>(const std::string &input);
+const std::string ToLower(const std::string& str);
+}

mosesdecoder/probingpt/vocabid.cpp ADDED Viewed

	@@ -0,0 +1,59 @@

+#include <boost/foreach.hpp>
+#include "vocabid.h"
+#include "StoreVocab.h"
+#include "moses2/legacy/Util2.h"
+namespace probingpt
+{
+//Register every factor of every word of textin in sourceVocab.
+//Words are separated by ' ', factors within a word by '|'; each factor is
+//inserted under its hash (getHash) together with its text.
+void add_to_map(StoreVocab<uint64_t> &sourceVocab,
+                const StringPiece &textin)
+{
+  typedef util::TokenIter<util::SingleCharacter> CharTokenIter;
+  for (CharTokenIter word(textin, util::SingleCharacter(' ')); word; ++word) {
+    for (CharTokenIter factor(*word, util::SingleCharacter('|')); factor; ++factor) {
+      const StringPiece text = *factor;
+      sourceVocab.Insert(getHash(text), text.as_string());
+    }
+  }
+}
+//Write karta to filename as text, one "id<TAB>string" line per entry in
+//ascending id order.
+//NOTE(review): read_map() in this file parses the SECOND column as the id,
+//so it does not read back files written here -- confirm which format is
+//intended before using both together.
+void serialize_map(const std::map<uint64_t, std::string> &karta,
+                   const std::string &filename)
+{
+  typedef std::map<uint64_t, std::string>::const_iterator MapIter;
+  std::ofstream os(filename.c_str());
+  for (MapIter entry = karta.begin(); entry != karta.end(); ++entry) {
+    const uint64_t id = entry->first;
+    const std::string &text = entry->second;
+    os << id << '\t' << text << std::endl;
+  }
+  os.close();
+}
+//Fill karta from the tab-separated text file filename.
+//Every line must consist of exactly two columns: the string first and its
+//numeric id second; the map is keyed by the id.
+void read_map(std::map<uint64_t, std::string> &karta, const char* filename)
+{
+  std::ifstream is(filename);
+  for (std::string line; getline(is, line); ) {
+    const std::vector<std::string> fields = Moses2::Tokenize(line, "\t");
+    assert(fields.size() == 2);
+    const uint64_t id = Moses2::Scan<uint64_t>(fields[1]);
+    karta[id] = fields[0];
+  }
+  //Close the stream after we are done.
+  is.close();
+}
+}

mosesdecoder/probingpt/vocabid.h ADDED Viewed

	@@ -0,0 +1,29 @@

+//Serialization
+#include <boost/serialization/serialization.hpp>
+#include <boost/serialization/map.hpp>
+#include <boost/archive/text_iarchive.hpp>
+#include <boost/archive/text_oarchive.hpp>
+#include <fstream>
+#include <iostream>
+#include <vector>
+#include <map> //Container
+#include "hash.h" //Hash of elements
+#include "util/string_piece.hh"  //Tokenization and work with StringPiece
+#include "util/tokenize_piece.hh"
+namespace probingpt
+{
+template<typename VOCABID>
+class StoreVocab;
+void add_to_map(StoreVocab<uint64_t> &sourceVocab,
+                const StringPiece &textin);
+void serialize_map(const std::map<uint64_t, std::string> &karta,
+                   const std::string &filename);
+void read_map(std::map<uint64_t, std::string> &karta, const char* filename);
+}

mosesdecoder/regression-testing/Jamfile ADDED Viewed

	@@ -0,0 +1,81 @@

+import option path ;
+with-regtest = [ option.get "with-regtest" ] ;
+skip-compact = [ option.get "regtest-skip-compactpt" : : "yes" ] ;
+with-xmlrpc  = [ option.get "with-xmlrpc-c" ] ;
+if $(with-regtest) {
+  with-regtest = [ path.root $(with-regtest) [ path.pwd ] ] ;
+} else if [ option.get "with-regtest" : : "yes" ] {
+  shell_or_fail "git submodule init" ;
+  shell_or_fail "git submodule update" ;
+  with-regtest = $(TOP)/regression-testing/tests ;
+}
+if $(with-regtest) {
+  test-dir = $(with-regtest)/tests ;
+  # Declare one group of regression tests.
+  #   name     - alias that stands for all tests of the group
+  #   tests    - test paths; each one gets a <basename>.passed target
+  #   programs - sources of every .passed target (the programs under test)
+  #   action   - actions block used to produce a .passed target
+  # Each test is additionally reachable through an alias named after its path.
+  rule reg_test ( name : tests * : programs * : action ) {
+    alias $(name) : $(tests:D=).passed ;
+    for test in $(tests) {
+      make $(test:D=).passed : $(programs) : $(action) ;
+      alias $(test) : $(test:D=).passed ;
+    }
+  }
+  actions reg_test_decode {
+    $(TOP)/regression-testing/run-single-test.perl --decoder=$(>) --test=$(<:B) --data-dir=$(with-regtest) --test-dir=$(test-dir) && touch $(<)
+  }
+  if $(with-xmlrpc)  {
+    actions reg_test_decode_server {
+      $(TOP)/regression-testing/run-single-test.perl --server --decoder=$(>) --test=$(<:B) --data-dir=$(with-regtest) --test-dir=$(test-dir) && touch $(<)
+    }
+    reg_test phrase-server : [ glob $(test-dir)/phrase-server.* ] : ../moses-cmd//moses : @reg_test_decode_server ;
+  }
+  # Decoder test groups. With skip-compact set, the *compactptable tests are
+  # excluded in addition to the *withDALM ones. Both branches must run the
+  # same decoder targets: the moses2 group uses ../moses2//moses2 in either
+  # case (the skip-compact branch previously pointed at ../moses-cmd//moses2).
+  if $(skip-compact) {
+    reg_test phrase : [ glob $(test-dir)/phrase.* : $(test-dir)/*withDALM $(test-dir)/*compactptable ] : ../moses-cmd//moses : @reg_test_decode ;
+    reg_test chart  : [ glob $(test-dir)/chart.*  : $(test-dir)/*withDALM $(test-dir)/*compactptable ] : ../moses-cmd//moses : @reg_test_decode ;
+    reg_test moses2  : [ glob $(test-dir)/moses2.*  : $(test-dir)/*withDALM $(test-dir)/*compactptable ] : ../moses2//moses2 : @reg_test_decode ;
+  } else {
+    reg_test phrase : [ glob $(test-dir)/phrase.* : $(test-dir)/*withDALM ] : ../moses-cmd//moses : @reg_test_decode ;
+    reg_test chart  : [ glob $(test-dir)/chart.*  : $(test-dir)/*withDALM ] : ../moses-cmd//moses : @reg_test_decode ;
+    reg_test moses2  : [ glob $(test-dir)/moses2.*  : $(test-dir)/*withDALM ] : ../moses2//moses2 : @reg_test_decode ;
+  }
+  if [ option.get "with-dalm" : : "yes" ] {
+    reg_test dalm : [ glob $(test-dir)/*withDALM ] : ../moses-cmd//moses : @reg_test_decode ;
+  } else {
+    alias dalm ;
+  }
+  actions reg_test_score {
+    $(TOP)/regression-testing/run-test-scorer.perl --scorer=$(>) --test=$(<:B) --data-dir=$(with-regtest) --test-dir=$(test-dir) && touch $(<)
+  }
+  reg_test score : [ glob $(test-dir)/score.* :  ] : ../phrase-extract//score : @reg_test_score ;
+  actions reg_test_extract {
+    $(TOP)/regression-testing/run-test-extract.perl --extractor=$(>) --test=$(<:B) --data-dir=$(with-regtest) --test-dir=$(test-dir) && touch $(<)
+  }
+  reg_test extract : [ glob $(test-dir)/extract.* ] : ../phrase-extract//extract : @reg_test_extract ;
+  actions reg_test_extractrules {
+    $(TOP)/regression-testing/run-test-extract.perl --extractor=$(>) --test=$(<:B) --data-dir=$(with-regtest) --test-dir=$(test-dir) && touch $(<)
+  }
+  reg_test extractrules : [ glob $(test-dir)/extract-rules.* : $(with-regtest)/extract-rules.hierarchical ] : ../phrase-extract//extract-rules : @reg_test_extractrules ;
+  pwd = [ path.pwd ] ;
+  actions reg_test_mert {
+    $(TOP)/regression-testing/run-test-mert.perl --test=$(<:B) --data-dir=$(with-regtest) --test-dir=$(test-dir) --bin-dir=$(pwd)/$(>:D) && touch $(<)
+  }
+  reg_test mert : [ glob $(test-dir)/mert.* ] : ../mert//mert ../mert//extractor ../mert//pro ../mert//hgdecode : @reg_test_mert ;
+  actions reg_test_misc {
+    $(TOP)/regression-testing/run-test-misc.perl --moses-root=$(TOP) --moses-bin=$(BINDIR) --test=$(<:B) --data-dir=$(with-regtest) --test-dir=$(test-dir) && touch $(<)
+  }
+  reg_test misc : [ glob $(test-dir)/misc.* : $(test-dir)/misc.mml*  ] : ..//prefix-bin ..//prefix-lib : @reg_test_misc ;
+  reg_test misc-mml : [ glob $(test-dir)/misc.mml*  ] : $(TOP)/scripts/ems/support/mml-filter.py $(TOP)/scripts/ems/support/defaultconfig.py  : @reg_test_misc ;
+   alias all : phrase chart mert score extract extractrules misc misc-mml dalm ;
+}

mosesdecoder/regression-testing/MosesRegressionTesting.pm ADDED Viewed

	@@ -0,0 +1,93 @@

+package MosesRegressionTesting;
+use strict;
+# If your tests need a new version of the test data, increment the data
+# version and make sure that a moses-regression-tests-vX.Y is available for
+# download from statmt.org (redpony AT umd dot edu for more info).
+# NOTE: no data-version constant is currently defined in this package.
+#
+# Find the data directory in a few likely locations and make sure
+# that it is well-formed (it must contain models/ and lm/ subdirectories).
+# Locate the regression-test data directory.
+#   $test_script_root - directory that may contain a moses-reg-test-data checkout
+#   $data_dir         - optional explicit location; tried first when defined
+# Returns the first candidate that exists and contains both models/ and lm/.
+# Prints installation instructions to STDERR and exits with status 1 when no
+# candidate qualifies.
+sub find_data_directory
+{
+  my ($test_script_root, $data_dir) = @_;
+  my $mrtp = "moses-reg-test-data";
+  # Candidates in priority order: the explicit directory, then the usual places.
+  my @candidates = (
+    (defined $data_dir ? ($data_dir) : ()),
+    "$test_script_root/$mrtp",
+    "/export/ws06osmt/regression-testing/$mrtp",
+    "/tmp/$mrtp",
+    "/var/tmp/$mrtp",
+  );
+  CANDIDATE: for my $dir (@candidates) {
+    next CANDIDATE if !-d $dir;
+    # A usable data directory needs both of these sub-directories.
+    for my $required (qw(models lm)) {
+      next if -d "$dir/$required";
+      print STDERR "Found $dir but it is malformed: missing subdir $required/\n";
+      next CANDIDATE;
+    }
+    return $dir;
+  }
+  print STDERR<<EOT;
+You do not appear to have the regression testing data installed.  You may
+either specify a non-standard location when running the test suite with
+the --data-dir option, or, you may install it in any one of the following
+standard locations: $test_script_root, /tmp, or /var/tmp with these
+commands:
+  cd <DESIRED_INSTALLATION_DIRECTORY>
+  git clone https://github.com/hieuhoang/moses-reg-test-data.git
+EOT
+  exit 1;
+}
+# Write a private copy of a test's moses.ini with its path placeholders expanded.
+#   $moses_ini   - path of the template ini file
+#   $data_dir    - data directory; ${LM_PATH} becomes "$data_dir/lm" and
+#                  ${MODEL_PATH} becomes "$data_dir/models"
+#   $results_dir - replacement for ${RESULTS_PATH}
+# ${TEST_PATH} is replaced by the directory containing $moses_ini.
+# Returns the name of the generated temporary .ini file. The file is NOT removed
+# automatically (UNLINK => 0); the caller is responsible for deleting it.
+# Dies if the template cannot be read or the copy cannot be written.
+sub get_localized_moses_ini
+{
+  use File::Temp;
+  use Cwd qw/ abs_path /;
+  use File::Basename;
+  my ($moses_ini, $data_dir, $results_dir) = @_;
+  my $LM_PATH = "$data_dir/lm";
+  my $MODEL_PATH = "$data_dir/models";
+  my $TEST_PATH = dirname(abs_path($moses_ini));
+  # File::Temp->new croaks by itself if the temporary file cannot be created.
+  my $local_moses_ini = File::Temp->new( UNLINK => 0, SUFFIX => '.ini' );
+  open my $template_fh, '<', $moses_ini or die "Couldn't read $moses_ini: $!";
+  while (my $line = <$template_fh>) {
+    $line =~ s/\$\{LM_PATH\}/$LM_PATH/g;
+    $line =~ s/\$\{MODEL_PATH\}/$MODEL_PATH/g;
+    $line =~ s/\$\{TEST_PATH\}/$TEST_PATH/g;
+    $line =~ s/\$\{RESULTS_PATH\}/$results_dir/g;
+    # The File::Temp object is itself the output handle; no second handle is needed.
+    print {$local_moses_ini} $line;
+  }
+  close $template_fh;
+  # Buffered write errors only surface at close time.
+  close $local_moses_ini or die "Couldn't write $local_moses_ini: $!";
+  return $local_moses_ini->filename;
+}
+# Read the n-best settings from a moses.ini file.
+#   $moses_ini - path of the ini file
+# Returns ($nbestfile, $nbestsize): the two lines that follow a "[n-best-list]"
+# section header (matched case-insensitively), with line endings removed.
+# Either value is undef when the section is absent or truncated. If the header
+# occurs more than once, the last occurrence wins.
+# Dies if the file cannot be opened.
+sub get_nbestlist
+{
+  my ($moses_ini) = @_;
+  my ($nbestfile, $nbestsize);
+  # Three-argument open with a lexical handle: the path is never interpreted as
+  # a mode or a pipe, and the handle is private to this call.
+  open my $ini_fh, '<', $moses_ini or die "Couldn't read $moses_ini: $!";
+  while (my $line = <$ini_fh>) {
+    next unless $line =~ /^\[n-best-list\]/i;
+    $nbestfile = <$ini_fh>;
+    $nbestsize = <$ini_fh>;
+    # The header may be the last line(s) of the file; only chomp what was read.
+    chomp $nbestfile if defined $nbestfile;
+    chomp $nbestsize if defined $nbestsize;
+  }
+  close $ini_fh;
+  return ($nbestfile,$nbestsize);
+}
+1;

mosesdecoder/regression-testing/compare-results.perl ADDED Viewed

	@@ -0,0 +1,91 @@

+#!/usr/bin/env perl
+# $Id$
+use warnings;
+use strict;
+# Usage: compare-results.perl <results-dir> <truth-dir>
+# Compares <results-dir>/results.txt with <truth-dir>/results.txt, prints the
+# report to STDOUT and also stores it in <results-dir>/Summary.
+# Exit status: 0 when nothing failed, 1 when at least one comparison failed.
+my ($results, $truth) = @ARGV;
+die "Usage: $0 <results-dir> <truth-dir>\n" unless defined $results && defined $truth;
+my ($report, $pass, $fail) = compare_results("$results/results.txt", "$truth/results.txt");
+# Writing the Summary file is best effort: the report still goes to STDOUT and
+# the exit status still reflects the comparison when the file cannot be written.
+if (open my $summary_fh, '>', "$results/Summary") {
+  print {$summary_fh} $report;
+  close $summary_fh or warn "Could not finish writing $results/Summary: $!\n";
+} else {
+  warn "Could not write $results/Summary: $!\n";
+}
+print $report;
+if ($fail > 0) {
+  print <<EOT;
+There were failures in this test run.  Please analyze the results carefully.
+EOT
+  exit 1;
+}
+exit 0;
+# Compare a results file against its truth file.
+#   $testf  - path of the results produced by this run
+#   $truthf - path of the reference results
+# Keys whose truth entry uses '=' are compared as strings and counted as pass
+# or fail. All other keys are treated as numeric: the difference (and the
+# percentage change when the baseline is non-zero) is reported but counted
+# neither as pass nor as fail. Keys missing from the test results, and keys
+# present only in the test results, count as failures.
+# Returns ($report, $pass, $fail).
+sub compare_results {
+  my ($testf, $truthf) = @_;
+  my $observed = read_results($testf);
+  my $expected = read_results($truthf);
+  # The comparison operators come from the truth file; the entry is removed from
+  # both hashes so it is not mistaken for a test name.
+  my $mode_for = delete $expected->{'COMPARISON_TYPE'};
+  delete $observed->{'COMPARISON_TYPE'};
+  my ($pass, $fail, $report) = (0, 0, '');
+  for my $name (sort keys %$expected) {
+    $report .= "test-name=$name\tresult=";
+    unless (exists $observed->{$name}) {
+      $report .= "missing from test results\n";
+      $fail++;
+      next;
+    }
+    my $truthv = defined $expected->{$name} ? $expected->{$name} : '';
+    my $testv = '';
+    # Consume the key so that only unexpected keys remain afterwards.
+    $testv = delete $observed->{$name} if defined $observed->{$name};
+    if ($mode_for->{$name} eq '=') {
+      if ($truthv ne $testv) {
+        $report .= "fail\n\tTRUTH=$truthv\n\t TEST=$testv\n";
+        $fail++;
+      } else {
+        $report .= "pass\n";
+        $pass++;
+      }
+      next;
+    }
+    # numeric difference
+    my $diff = $testv - $truthv;
+    if ($diff == 0) {
+      $report .= "identical\n";
+      next;
+    }
+    $report .= "BASELINE=$truthv, TEST=$testv\t  DELTA=$diff";
+    $report .= sprintf("\t PCT CHANGE=%4.2f", ($diff/$truthv)*100) if $truthv != 0;
+    $report .= "\n";
+  }
+  # Whatever is left was produced by the test run but is unknown to the truth file.
+  for my $extra (sort keys %$observed) {
+    $fail++;
+    $report .= "test-name=$extra\tfound in TEST but not in TRUTH.\n";
+  }
+  $report .= "\nTESTS PASSED=$pass\nTESTS FAILED=$fail\n";
+  return ($report, $pass, $fail);
+}
+# Parse a results file made of "KEY = value" (exact match) and "KEY ~ value"
+# (numeric comparison) lines; lines of any other shape are ignored.
+#   $file - path of the file to read
+# Returns a hash reference mapping each key to its value. The comparison
+# operators are stored in the nested hash under the 'COMPARISON_TYPE' key.
+# Dies, including the system error, if the file cannot be opened.
+sub read_results {
+  my ($file) = @_;
+  # Three-argument open with a lexical handle: the path is taken literally.
+  open my $in, '<', $file or die "Could not open $file: $!";
+  my %res;
+  while (my $line = <$in>) {
+    if ($line =~ /^([A-Za-z0-9_]+)\s*([=~])\s*(.+)$/) {
+      my ($key, $comparison_type, $value) = ($1, $2, $3);
+      $res{$key} = $value;
+      $res{'COMPARISON_TYPE'}->{$key}=$comparison_type;
+    }
+  }
+  close $in;
+  return \%res;
+}

mosesdecoder/regression-testing/ensure-regression-data-here.perl ADDED Viewed

	@@ -0,0 +1,35 @@

+#!/usr/bin/env perl
+# downloads the regression data
+use warnings;
+use strict;
+use MosesRegressionTesting;
+# Download and unpack the regression data into the current directory unless a
+# directory for the wanted data version is already present.
+# NOTE(review): the MosesRegressionTesting package that accompanies this script
+# does not appear to define TESTING_DATA_VERSION -- confirm this line still
+# compiles under "use strict".
+my $data_version = MosesRegressionTesting::TESTING_DATA_VERSION;
+exit 0 if -d "moses-reg-test-data-$data_version";
+  # data in place
+# Fetch the tarball; safesystem returns false on a non-zero exit status.
+safesystem("wget http://www.statmt.org/moses/reg-testing/moses-reg-test-data-$data_version.tgz")
+  or die "wget failed";
+safesystem("tar xzf moses-reg-test-data-$data_version.tgz")
+  or die "untar failed";
+# Removing the tarball is best effort: the result is deliberately not checked.
+safesystem("rm moses-reg-test-data-$data_version.tgz");
+# Run a command through system() and classify the outcome.
+# Arguments are passed to system() unchanged.
+# Returns true when the command exited with status 0 and false otherwise
+# (the non-zero exit code is reported on STDERR). If the command could not be
+# started, or was terminated by a signal, a diagnostic is printed and the
+# whole script exits with status 1.
+sub safesystem {
+  # print STDERR "Executing: @_\n";
+  system(@_);
+  my $status = $?;
+  if ($status == -1) {
+      # system() itself failed; $! holds the reason.
+      print STDERR "Failed to execute: @_\n  $!\n";
+      exit(1);
+  }
+  my $signal = $status & 127;
+  if ($signal) {
+      # Low seven bits: terminating signal; bit 128: core-dump flag.
+      printf STDERR "Execution of: @_\n  died with signal %d, %s coredump\n",
+          $signal, ($status & 128) ? 'with' : 'without';
+      exit(1);
+  }
+  my $exitcode = $status >> 8;
+  print STDERR "Exit code: $exitcode\n" if $exitcode;
+  return ! $exitcode;
+}

mosesdecoder/regression-testing/run-single-test.perl ADDED Viewed

	@@ -0,0 +1,247 @@

+#!/usr/bin/env perl
+# $Id$
+use Encode;
+use utf8;
+use warnings;
+use strict;
+my $script_dir; BEGIN { use Cwd qw/ abs_path /; use File::Basename; $script_dir = dirname(abs_path($0)); push @INC, $script_dir; }
+use MosesRegressionTesting;
+use Getopt::Long;
+use File::Temp qw ( tempfile );
+use POSIX qw ( strftime );
+use POSIX ":sys_wait_h";
+my @SIGS = qw ( SIGHUP SIGINT SIGQUIT SIGILL SIGTRAP SIGABRT SIGIOT SIGBUS SIGFPE SIGKILL SIGUSR1 SIGSEGV SIGUSR2 SIGPIPE SIGALRM SIGTERM SIGSTKFLT SIGCHLD SIGCONT SIGSTOP SIGTSTP SIGTTIN SIGTTOU SIGURG SIGXCPU SIGXFSZ SIGVTALRM SIGPROF SIGWINCH SIGIO SIGPWR SIGSYS SIGUNUSED SIGRTMIN );
+# ---- Command-line handling -------------------------------------------------
+# --decoder, --test and --data-dir are required; --test-dir defaults to the
+# tests/ directory next to this script; --results-dir defaults to
+# "<data-dir>/results" (set later on).
+my ($decoder, $test_name);
+my $test_dir = "$script_dir/tests";
+my $data_dir;
+my $BIN_TEST = $script_dir;
+my $results_dir;
+# Size of the n-best list requested by the test's moses.ini; 0 means none.
+my $NBEST = 0;
+my $run_server_test = 0;
+# Random port in the range 10001..19999 for the server test.
+my $serverport = int(rand(9999)) + 10001;
+my $url = "http://localhost:$serverport/RPC2";
+my $startupTest = 0;
+GetOptions("decoder=s" => \$decoder,
+           "test=s"    => \$test_name,
+           "data-dir=s"=> \$data_dir,
+           "test-dir=s"=> \$test_dir,
+           "results-dir=s"=> \$results_dir,
+           "server"=> \$run_server_test,
+           "startuptest"=> \$startupTest
+          ) or exit 1;
+if($run_server_test)
+{
+  # The XML-RPC client is loaded at run time so that it is only required for
+  # server tests.
+  eval {
+    require XMLRPC::Lite;
+    import XMLRPC::Lite;
+  };
+  if ($@) {
+    die "Error: XMLRPC::Lite not installed, moses server regression tests will not be run. $@";
+  }
+  # With --startuptest the script only verifies that the client module loads.
+  exit(0) if($startupTest);
+}
+die "Please specify a decoder with --decoder\n" unless $decoder;
+die "Please specify a test to run with --test\n" unless $test_name;
+die "Please specify the location of the data directory with --data-dir\n" unless $data_dir;
+die "Cannot locate test dir at $test_dir" unless (-d $test_dir);
+# From here on $test_dir is the directory of the selected test.
+$test_dir .= "/$test_name";
+die "Cannot locate test dir at $test_dir" unless (-d $test_dir);
+#### get place to put results
+# Layout: <results-dir>/<test-name>/<timestamp>/ ; missing levels are created
+# one at a time because mkdir does not create parent directories.
+unless (defined $results_dir) { $results_dir = "$data_dir/results"; }
+if (!-d $results_dir) {
+  print STDERR "[WARNING] Results directory not found.\n";
+  mkdir ($results_dir) || die "Failed to create $results_dir";
+}
+$results_dir .= "/$test_name";
+if (!-d $results_dir) {
+  print STDERR "[WARNING] Results directory for test=$test_name could not be found.\n";
+  mkdir ($results_dir) || die "Failed to create $results_dir";
+}
+##########
+# Inputs every test directory must provide.
+my $conf = "$test_dir/moses.ini";
+my $input = "$test_dir/to-translate.txt";
+die "Cannot locate executable called $decoder\n" unless (-x $decoder);
+die "Cannot find $conf\n" unless (-f $conf);
+die "Cannot locate input at $input" unless (-f $input);
+# Temporary copy of moses.ini with its path placeholders expanded.
+my $local_moses_ini = MosesRegressionTesting::get_localized_moses_ini($conf, $data_dir, $results_dir);
+# Only the size is used below; the file name from the ini is ignored because
+# the n-best output always goes to <results>/run.nbest.
+my ($nbestfile,$nbestsize) = MosesRegressionTesting::get_nbestlist($conf);
+if (defined($nbestsize) && $nbestsize > 0) {
+  $NBEST=$nbestsize;
+}
+# The run directory name combines the decoder's modification time, the user
+# name and the current time (see get_timestamp).
+my $ts = get_timestamp($decoder);
+my $results = "$results_dir/$ts";
+mkdir($results) || die "Failed to create results directory: $results\n";
+my $truth = "$test_dir/truth";
+if (!-d $truth) {
+  die "Could not find truth/ in $test_dir!\n";
+}
+print "RESULTS AVAILABLE IN: $results\n\n";
+# ---- Run the decoder, filter its output and compare against the truth ------
+# Exit status of this script: 0 = all comparisons passed, 1 = comparison
+# failed, 2 = decoder terminated by a signal, 3 = decoder exited non-zero.
+my ($o, $elapsed, $ec, $sig);
+if($run_server_test) {
+  ($o, $elapsed, $ec, $sig) = exec_moses_server($decoder, $local_moses_ini, $input, $results);
+}
+else {
+  ($o, $elapsed, $ec, $sig) = exec_moses($decoder, $local_moses_ini, $input, $results);
+}
+my $error = ($sig || $ec > 0);
+if ($error) {
+  # Record the crash both on STDERR and in <results>/Summary. The localized
+  # moses.ini is kept on disk for debugging.
+  # NOTE(review): the result of this open is not checked.
+  open OUT, ">$results/Summary";
+  print STDERR "MOSES CRASHED.\n\texit_code=$ec\n\tsignal=$sig\n";
+  print OUT    "MOSES CRASHED.\n\texit_code=$ec\n\tsignal=$sig\n";
+  print STDERR "FAILURE, for debugging, local moses.ini=$local_moses_ini\n";
+  print OUT    "FAILURE, for debugging, local moses.ini=$local_moses_ini\n";
+  close OUT;
+  exit 2 if $sig;
+  exit 3;
+}
+# Each test supplies filter scripts whose output is collected in results.txt;
+# a failing filter only produces a warning.
+($o, $ec, $sig) = run_command("$test_dir/filter-stdout.pl $results/run.stdout > $results/results.txt");
+warn "filter-stdout failed!" if ($ec > 0 || $sig);
+($o, $ec, $sig) = run_command("$test_dir/filter-stderr.pl $results/run.stderr >> $results/results.txt");
+warn "filter-stderr failed!" if ($ec > 0 || $sig);
+if($NBEST > 0){
+  ($o, $ec, $sig) = run_command("$test_dir/filter-nbest.pl $results/run.nbest >> $results/results.txt");
+  warn "filter-nbest failed!" if ($ec > 0 || $sig);
+}
+# Wall-clock time is appended with the '~' marker.
+open OUT, ">>$results/results.txt";
+print OUT "TOTAL_WALLTIME ~ $elapsed\n";
+close OUT;
+# Compress the raw outputs once the filters have consumed them.
+run_command("gzip $results/run.stdout");
+run_command("gzip $results/run.stderr");
+if($NBEST > 0){
+  run_command("gzip $results/run.nbest");
+}
+($o, $ec, $sig) = run_command("$BIN_TEST/compare-results.perl $results $truth");
+print $o;
+if ($ec) {
+  print STDERR "FAILURE, for debugging, local moses.ini=$local_moses_ini\n";
+  exit 1;
+}
+# The temporary ini is removed only after a fully successful run.
+unlink $local_moses_ini or warn "Couldn't remove $local_moses_ini\n";
+exit 0;
+# Run the decoder once over the input file.
+#   $decoder - decoder executable
+#   $conf    - moses.ini to use
+#   $input   - file to translate
+#   $results - directory receiving run.stdout, run.stderr, cmd_line and, when
+#              the file-level $NBEST is positive, run.nbest
+# Returns (captured output, elapsed seconds, exit code, signal name).
+sub exec_moses {
+  my ($decoder, $conf, $input, $results) = @_;
+  my $began = time;
+  # The n-best switch is only added when the test asks for an n-best list.
+  my $nbest_opt = '';
+  if ($NBEST > 0) {
+    print STDERR "Nbest output file is $results/run.nbest\n";
+    print STDERR "Nbest size is $NBEST\n";
+    $nbest_opt = " -n-best-list $results/run.nbest $NBEST";
+  }
+  my $cmd = "$decoder -f $conf -i $input$nbest_opt 1> $results/run.stdout 2> $results/run.stderr";
+  # Keep the exact command line next to the results for later debugging.
+  open  CMD, ">$results/cmd_line";
+  print CMD "$cmd\n";
+  close CMD;
+  my ($o, $ec, $sig) = run_command($cmd);
+  return ($o, time - $began, $ec, $sig);
+}
+# Run the decoder as an XML-RPC server and translate the input through it.
+#   $decoder - decoder executable
+#   $conf    - moses.ini to use
+#   $input   - file to translate, one sentence per line
+#   $results - directory receiving run.stdout, run.stderr, run.stderr.server
+#              and cmd_line
+# Uses the file-level $serverport and $url.
+# Returns ($o, elapsed seconds, exit code, signal); exit code is 1 when the
+# server process could not be forked and 0 otherwise.
+# Dies when the server exits before it starts listening, or when the input or
+# output file cannot be opened (the server is stopped first in the latter case).
+sub exec_moses_server {
+  my ($decoder, $conf, $input, $results) = @_;
+  my $start_time = time;
+  my ($o, $ec, $sig) = (0, 0, 0);
+  my $pid = fork();
+  if (not defined $pid) {
+      warn "resources not avilable to fork Moses server\n";
+      # No server was started, so there is nothing to wait for: report a
+      # non-zero exit code and let the caller record the failure.
+      return ($o, time - $start_time, 1, $sig);
+  }
+  if ($pid == 0) {
+      # Child: become a process-group leader so that the parent can later kill
+      # the shell and the decoder together.
+      setpgrp(0, 0);
+      warn "Starting Moses server on port $serverport ...\n";
+      my $cmd = "$decoder --server --server-port $serverport -f $conf -verbose 2 --server-log $results/run.stderr.server 2> $results/run.stderr ";
+      open  CMD, ">$results/cmd_line";
+      print CMD "$cmd\n";
+      close CMD;
+      ($o, $ec, $sig) = run_command($cmd);
+      # this should not be reached unless the server fails to start
+      exit;
+  }
+  # Kill the server's whole process group unless waitpid has just reaped the child.
+  my $stop_server = sub {
+    if(waitpid($pid, WNOHANG) <= 0)
+    {
+      warn "Killing process group $pid of the $decoder --server ... \n";
+      kill 9, -$pid;
+    }
+  };
+  while( 1==1 ) # wait until the server is listening for requests
+  {
+      sleep 5;
+      my $res = waitpid($pid, WNOHANG);
+      die "Moses crashed or aborted! Check $results/run.stderr for error messages.\n" if ($res);
+      my $str = `grep "Listening on port $serverport" $results/run.stderr`;
+      last if($str =~ /Listening/);
+  }
+  my $proxy = XMLRPC::Lite->proxy($url);
+  warn "Opening file $input to write to $results\n";
+  my $text_in;
+  unless (open($text_in, '<', $input)) {
+    $stop_server->();
+    die "Can not open the input file to translate with Moses server\n";
+  }
+  binmode $text_in, ':utf8';
+  my $text_out;
+  unless (open($text_out, '>', "$results/run.stdout")) {
+    my $reason = $!;
+    $stop_server->();
+    die "Can not open $results/run.stdout for writing: $reason\n";
+  }
+  binmode $text_out, ':utf8';
+  while (my $line = <$text_in>)
+  {
+    # chomp, not chop: a final line without a trailing newline must keep its
+    # last character.
+    chomp $line;
+    my $encoded = SOAP::Data->type(string => $line); # NOTE: assuming properly encoded UTF-8 input: check tests before adding them!
+    my %param = ("text" => $encoded);
+    my $result = $proxy->call("translate",\%param)->result;
+    print {$text_out} $result->{'text'} . "\n";
+  }
+  close($text_in);
+  close($text_out);
+  my $elapsed = time - $start_time;
+  print STDERR "Finished translating file $input\n";
+  $stop_server->();
+  return ($o, $elapsed, $ec, $sig);
+}
+# Run a shell command with backticks.
+#   $cmd - command line handed to the shell
+# Returns (captured STDOUT, exit code, signal). The signal is 0 when the
+# command was not terminated by a signal and the value of sig_name() otherwise.
+sub run_command {
+  my ($cmd) = @_;
+  my $output = `$cmd`;
+  my $status = $?;
+  # Wait status layout: high byte = exit code, low seven bits = signal number.
+  my $exit_code = $status >> 8;
+  my $signal = $status & 127;
+  $signal = sig_name($signal) if $signal;
+  return ($output, $exit_code, $signal);
+}
+# Translate a signal number into its "SIGxxx" name.
+#   $sig - signal number, as found in the low seven bits of $?
+# The name comes from the platform's own table in %Config, which is indexed by
+# signal number. The hand-written @SIGS list is 0-based while signal numbers
+# start at 1, so indexing it directly misnames signals 1-5.
+# Returns "SIG<number>" for a number the platform has no name for, so that the
+# result is always a true value callers can test.
+sub sig_name {
+  use Config;
+  my $sig = shift;
+  my @names = split ' ', (defined $Config{sig_name} ? $Config{sig_name} : '');
+  return defined $names[$sig] ? "SIG$names[$sig]" : "SIG$sig";
+}
+# Build the name of a run directory.
+#   $file - file whose modification time identifies the build under test
+# Returns "moses.v<mtime>-<user>-at-<now>", where both times are formatted as
+# YYYYMMDD-HHMMSS in UTC and <user> is the output of whoami.
+sub get_timestamp {
+  my ($file) = @_;
+  my $mtime = (stat($file))[9];   # element 9 of stat() is the modification time
+  my $format = "%Y%m%d-%H%M%S";
+  my $built = strftime($format, gmtime $mtime);
+  my $now = strftime($format, gmtime);
+  chomp(my $username = `whoami`);
+  return "moses.v$built-$username-at-$now";
+}

mosesdecoder/regression-testing/run-test-detokenizer.perl ADDED Viewed

	@@ -0,0 +1,309 @@

+#!/usr/bin/env perl
+#
+# Detokenization tests.
+#
+use warnings;
+use strict;
+# This is here to suppress (false) warnings about OLDOUT and OLDERR being used only once.  Maybe there is a less brutish way to suppress that, but I don't know it.
+no warnings 'once';
+use utf8;
+use Cwd ('abs_path');
+use File::Spec::Functions;
+use File::Basename ('dirname');
+use IPC::Run3;
+use Getopt::Long;
+use Test::More;
+# Command-line handling.
+#   --results-dir  (required) existing, writable directory; one sub-directory
+#                  per test case is created inside it
+#   --detokenizer  (optional) detokenizer script; defaults to
+#                  scripts/tokenizer/detokenizer.perl one level above this
+#                  script's directory
+GetOptions("detokenizer=s" => \(my $detokenizer),
+           "results-dir=s"=> \(my $results_dir)
+          ) or exit 1;
+unless (defined $results_dir) {
+    # Report the name this script was actually invoked under rather than a
+    # hard-coded file name.
+    print STDERR "Usage: $0 --results-dir <RESULTS-DIRECTORY> [--detokenizer <DETOKENIZER-SCRIPT>]\n";
+    exit 1;
+}
+die "ERROR: Results directory ".$results_dir." doesn't exist or is not a writable directory. Dying" unless (-d $results_dir && -w $results_dir);
+$detokenizer = catfile(dirname(dirname(abs_path($0))), "scripts", "tokenizer", "detokenizer.perl") unless $detokenizer;
+die "ERROR: Detokenizer script ".$detokenizer." does not exist. Dying" unless -f $detokenizer;
+# Filled by addDetokenizerTest as the test definitions below are evaluated.
+my @testCases = ();
+######################################
+# Definitions of individual test cases
+######################################
+# A simple English test
+&addDetokenizerTest("TEST_ENGLISH_EASY", "en",
+<<'TOK'
+This sentence is really simple , so it should not be hard to detokenize .
+This one is no more difficult , but , hey , it is on a new line .
+TOK
+,
+<<'EXP'
+This sentence is really simple, so it should not be hard to detokenize.
+This one is no more difficult, but, hey, it is on a new line.
+EXP
+);
+# An English test involving double-quotes
+&addDetokenizerTest("TEST_ENGLISH_DOUBLEQUOTES", "en",
+<<'TOK'
+This is a somewhat " less simple " test .
+TOK
+,
+<<'EXP'
+This is a somewhat "less simple" test.
+EXP
+);
+# A simple French test
+&addDetokenizerTest("TEST_FRENCH_EASY", "fr",
+<<'TOK'
+Voici une phrase simple .
+TOK
+,
+<<'EXP'
+Voici une phrase simple.
+EXP
+);
+# A French test involving an apostrophe
+&addDetokenizerTest("TEST_FRENCH_APOSTROPHE", "fr",
+<<'TOK'
+Moi , j' ai une apostrophe .
+TOK
+,
+<<'EXP'
+Moi, j'ai une apostrophe.
+EXP
+);
+# A French test involving an apostrophe on the second-last word
+&addDetokenizerTest("TEST_FRENCH_APOSTROPHE_PENULTIMATE", "fr",
+<<'TOK'
+de musique rap issus de l' immigration
+TOK
+,
+<<'EXP'
+de musique rap issus de l'immigration
+EXP
+);
+# A German test involving non-ASCII characters
+# Note: We don't specify a language because the detokenizer errors if you pass in a language for which it has no special rules, of which German is an example.
+&addDetokenizerTest("TEST_GERMAN_NONASCII", undef,
+<<'TOK'
+Ich hoffe , daß Sie schöne Ferien hatten .
+Frau Präsidentin ! Frau Díez González und ich hatten einige Anfragen
+TOK
+,
+<<'EXP'
+Ich hoffe, daß Sie schöne Ferien hatten.
+Frau Präsidentin! Frau Díez González und ich hatten einige Anfragen
+EXP
+);
+# A simple Chinese test
+&addDetokenizerTest("TEST_CHINESE_EASY", undef,
+<<'TOK'
+这 是 一个 简单 的的 汉语 句子 。
+TOK
+,
+<<'EXP'
+这是一个简单的的汉语句子。
+EXP
+);
+# A simple Japanese test
+&addDetokenizerTest("TEST_JAPANESE_EASY", undef,
+<<'TOK'
+どう しょ う か な 。
+どこ で 食べ たい 。
+TOK
+,
+<<'EXP'
+どうしょうかな。
+どこで食べたい。
+EXP
+);
+######################################
+# Now run those babies ...
+######################################
+# Exactly one assertion is made per registered test case.
+plan tests => scalar(@testCases);
+for my $case (@testCases) {
+    runDetokenizerTest($case);
+}
+############
+## Utilities
+############
+# Creates a new detokenizer test case, adds it to the array of test cases to be run, and returns it.
+#   $testName      - unique name; also used as the name of the output directory
+#   $language      - language code passed to the detokenizer with -l, or undef
+#                    to run the detokenizer without a language option
+#   $tokenizedText - input for the detokenizer
+#   $rightAnswer   - expected detokenized output
+sub addDetokenizerTest {
+    my ($testName, $language, $tokenizedText, $rightAnswer) = @_;
+    # Explicit Class->new call: the indirect-object form ("new Class(...)")
+    # depends on parser guesswork.
+    my $testCase = DetokenizerTestCase->new($testName, $language, $tokenizedText, $rightAnswer);
+    push(@testCases, $testCase);
+    return $testCase;
+}
+# Run one detokenizer test case.
+# Creates <results-dir>/<test name>/, writes the tokenized input and the
+# expected output there as UTF-8, runs the detokenizer on the input and
+# compares its stdout.txt with the expected file.
+# Makes exactly one Test::More assertion: if the directory or either fixture
+# file cannot be written, the test case is reported as failed at that point.
+sub runDetokenizerTest {
+    my ($testCase) = @_;
+    my $testName = $testCase->getName();
+    my $testOutputDir = catfile($results_dir, $testName);
+    my $tokenizedFile = catfile($testOutputDir, "input.txt");
+    my $expectedFile = catfile($testOutputDir, "expected.txt");
+    # Fail if we can't make the test output directory
+    unless (mkdir($testOutputDir)) {
+        return fail($testName.": Failed to create output directory ".$testOutputDir." [".$!."]");
+    }
+    # Write both fixture files; an I/O error fails this test case instead of
+    # silently producing an empty or partial file.
+    my @fixtures = (
+        [$tokenizedFile, $testCase->getTokenizedText()],
+        [$expectedFile,  $testCase->getRightAnswer()],
+    );
+    for my $fixture (@fixtures) {
+        my ($path, $text) = @{$fixture};
+        open(my $fh, '>:utf8', $path)
+            or return fail($testName.": Failed to open ".$path." for writing [".$!."]");
+        print {$fh} $text;
+        close($fh)
+            or return fail($testName.": Failed to write ".$path." [".$!."]");
+    }
+    return runTest($testName, $testOutputDir, $tokenizedFile, sub {
+        # The -l option is only passed when the test case names a language.
+        return defined($testCase->getLanguage()) ? [$detokenizer, "-l", $testCase->getLanguage()] : [$detokenizer];
+    }, sub {
+        verifyIdentical($testName, $expectedFile, catfile($testOutputDir, "stdout.txt"))
+    }, 1, $testCase->getFailureExplanation());
+}
+# $stdinFile, if defined, is a file to send to the command via STDIN
+# $buildCommandRoutineReference is a reference to a zero-argument subroutine that returns the
+#                               system command to run in the form of an array reference
+# $validationRoutineReference is a reference to a zero-argument subroutine that makes exactly one call
+#                             to ok() or similar to validate the contents of the output directory
+# $separateStdoutFromStderr is an optional boolean argument; if omitted or false, the command's
+#                           STDOUT and STDERR are mixed together in one output file called
+#                           stdout-and-stderr.txt; otherwise, they are printed to separate output
+#                           files called stdout.txt and stderr.txt, respectively
+# $failureExplanation is an explanation of why the test is expected to fail.  If the test is expected
+#                     to pass, then this should be left undefined.  Even in the case of a test that
+#                     is expected to fail, the system command is still expected to exit normally --
+#                     only the validation routine is expected to fail.
+# Runs one command and validates its output; the arguments are described in the
+# comment block above. A non-zero status from the command fails the test
+# without calling the validation routine.
+sub runTest {
+    my ($testName, $outputDir, $stdinFile, $buildCommandRoutineReference, $validationRoutineReference, $separateStdoutFromStderr, $failureExplanation) = @_;
+    # Either two separate capture files, or one shared file for both streams.
+    my $stdoutFile = catfile($outputDir, $separateStdoutFromStderr ? "stdout.txt" : "stdout-and-stderr.txt");
+    my $stderrFile = $separateStdoutFromStderr ? catfile($outputDir, "stderr.txt") : $stdoutFile;
+    my $commandRef = $buildCommandRoutineReference->();
+    my $exitStatus = runVerbosely($commandRef, $stdinFile, $stdoutFile, $stderrFile);
+    if ($exitStatus != 0) {
+        return fail($testName.": command exited with status ".$exitStatus);
+    }
+    # Tests that are expected to pass are validated directly.
+    return $validationRoutineReference->() unless defined $failureExplanation;
+    # Known failures are validated inside a TODO block so that Test::More
+    # reports them as expected failures.
+    TODO: {
+        local $TODO = $failureExplanation;
+        $validationRoutineReference->();
+    }
+}
+# Announce that we're going to run the given command, then run it.
+# $stdinFile, if defined, is a file to send to the command via STDIN
+# $stdoutFile and $stderrFile, if defined, are file paths to which the command's standard output
+# and standard error, respectively, are written. They can be the same file.
+# The exit code of the command is returned.
+# Logs the command and its redirections with note(), runs it through run3 and
+# returns the value of $? left behind by run3.
+sub runVerbosely {
+    my ($commandRef, $stdinFile, $stdoutFile, $stderrFile) = @_;
+    note("Executing command:\n  @{$commandRef}\n");
+    if (defined $stdinFile) {
+        note("standard input coming from: ".$stdinFile);
+    }
+    if (defined $stdoutFile) {
+        note("standard output going to: ".$stdoutFile);
+    }
+    if (defined $stderrFile) {
+        note("standard error going to: ".$stderrFile);
+    }
+    run3($commandRef, $stdinFile, $stdoutFile, $stderrFile);
+    my $status = $?;
+    return $status;
+}
+# Verify that the given output file is identical to the given reference file.
+#   $testName      - name used as the prefix of the assertion message
+#   $referenceFile - file holding the expected content
+#   $outputFile    - file produced by the command under test
+# Makes exactly one Test::More assertion: a failure if either file cannot be
+# opened, otherwise a line-by-line comparison with is_deeply.
+sub verifyIdentical {
+    my ($testName, $referenceFile, $outputFile) = @_;
+    # Lexical handles with an explicit read mode: the paths are taken literally
+    # and both handles are closed automatically on the early returns.
+    open(my $ref_fh, '<', $referenceFile) or return fail($testName.": Can't open reference file ".$referenceFile." [".$!."].");
+    open(my $out_fh, '<', $outputFile) or return fail($testName.": Can't open output file ".$outputFile." [".$!."].");
+    my @referenceFileAsArray = <$ref_fh>;
+    my @outputFileAsArray = <$out_fh>;
+    close($ref_fh);
+    close($out_fh);
+    is_deeply(\@outputFileAsArray, \@referenceFileAsArray, $testName.": Output file ".$outputFile." matches reference file ".$referenceFile.".");
+}
+##%%%%%%%%%%%%%%%%%%%%%%%%%%%##
+## DetokenizerTestCase class ##
+package DetokenizerTestCase;
+# Value object describing one detokenizer test: a name, an optional language
+# code, the tokenized input, the expected output and, for known failures, an
+# explanation of why the test is expected to fail.
+# Constructor: new(NAME, LANGUAGE, TOKENIZED_TEXT, RIGHT_ANSWER).
+# A new test case is expected to pass.
+sub new {
+    my ($class, $name, $language, $tokenizedText, $rightAnswer) = @_;
+    my %fields = (
+        _name               => $name,
+        _language           => $language,
+        _tokenizedText      => $tokenizedText,
+        _rightAnswer        => $rightAnswer,
+        _failureExplanation => undef,
+    );
+    return bless \%fields, $class;
+}
+# Name of the test case.
+sub getName {
+    my $self = shift;
+    return $self->{_name};
+}
+# Language code given to the constructor; may be undef.
+sub getLanguage {
+    my $self = shift;
+    return $self->{_language};
+}
+# Tokenized text that serves as the detokenizer's input.
+sub getTokenizedText {
+    my $self = shift;
+    return $self->{_tokenizedText};
+}
+# Expected detokenizer output.
+sub getRightAnswer {
+    my $self = shift;
+    return $self->{_rightAnswer};
+}
+# Call this routine to indicate that this test case is expected to fail.
+# (The detokenizer script is still expected to exit normally, but the output is not expected to
+# match the right answer because of a bug or unimplemented use case.)
+# A generic explanation is stored when none (or a false value) is given.
+sub setExpectedToFail {
+    my $self = shift;
+    my $failureExplanation = shift;
+    $failureExplanation = "This test is expected to fail." unless $failureExplanation;
+    return $self->{_failureExplanation} = $failureExplanation;
+}
+# Returns a string explaining why this test is expected to fail, or undef if this test is expected
+# to pass.
+sub getFailureExplanation {
+    my $self = shift;
+    return $self->{_failureExplanation};
+}

mosesdecoder/regression-testing/run-test-extract.perl ADDED Viewed

	@@ -0,0 +1,95 @@

+#!/usr/bin/env perl
+use warnings;
+use strict;
+BEGIN {
+use Cwd qw/ abs_path /;
+use File::Basename;
+my $script_dir = dirname(abs_path($0));
+print STDERR  "script_dir=$script_dir\n";
+push @INC, $script_dir;
+}
+use FindBin qw($Bin);
+use MosesRegressionTesting;
+use Getopt::Long;
+use File::Temp qw ( tempfile );
+use POSIX qw ( strftime );
+# Run one extractor regression test.
+#   --extractor    extractor executable
+#   --test         name of the test (a sub-directory of --test-dir)
+#   --data-dir     regression data directory; only used for the default
+#                  results location
+#   --test-dir     directory that contains the test directories
+#   --results-dir  output directory (default:
+#                  <data-dir>/results/<test>/<timestamp>)
+# Exit status: 0 when the output directory matches <test>/truth/, 1 otherwise.
+#
+# NOTE: the names of the lexical variables declared here ($outPath, $test_dir,
+# $data_dir, ...) must not change: args.txt is expanded against them by name.
+my $extractorExe;
+my $test_name;
+my $data_dir;
+my $test_dir;
+my $results_dir;
+GetOptions("extractor=s" => \$extractorExe,
+           "test=s"    => \$test_name,
+           "data-dir=s"=> \$data_dir,
+           "test-dir=s"=> \$test_dir,
+           "results-dir=s"=> \$results_dir,
+          ) or exit 1;
+# output dir
+unless (defined $results_dir)
+{
+  my $ts = get_timestamp($extractorExe);
+  $results_dir = "$data_dir/results/$test_name/$ts";
+}
+`mkdir -p $results_dir`;
+my $outPath = "$results_dir";
+my $extractorArgs = `cat $test_dir/$test_name/args.txt`;
+# Every "$name" token in args.txt is evaluated as Perl code (double /e), which
+# replaces it with the value of the variable of that name in this script.
+# NOTE(review): this executes text taken from the test directory; args.txt
+# must be trusted.
+$_ = $extractorArgs;
+s/(\$\w+)/$1/eeg;
+$extractorArgs = $_;
+my $cmdMain = "$extractorExe $extractorArgs \n";
+# Keep the exact command line next to the results for later debugging.
+open  CMD, ">$results_dir/cmd_line";
+print CMD "$cmdMain";
+close CMD;
+`$cmdMain`;
+my $truthPath = "$test_dir/$test_name/truth/";
+if (-e $outPath)
+{
+  # Judge by diff's exit status (0 = no differences, 1 = differences,
+  # 2 = trouble) rather than by counting its output lines: when diff itself
+  # fails, e.g. because the truth directory is missing, it prints nothing on
+  # STDOUT and a line count would report success.
+  my $cmd = "diff --exclude=.DS_Store --exclude=._* --exclude=cmd_line $outPath/ $truthPath/ > /dev/null";
+  my $diffStatus = system($cmd);
+  if ($diffStatus == 0)
+  {
+    #  print STDERR "FAILURE. Ran $cmdMain\n";
+    print STDERR "SUCCESS\n";
+    exit 0;
+  }
+  else
+  {
+    print STDERR "FAILURE. Ran $cmdMain\n";
+    exit 1;
+  }
+}
+else
+{
+  print STDERR "FAILURE. Output does not exists. Ran $cmdMain\n";
+  exit 1;
+}
+###################################
+# Build the name of a run directory.
+#   $file - file whose modification time identifies the build under test
+# Returns "moses.v<mtime>-<user>-at-<now>", where both times are formatted as
+# YYYYMMDD-HHMMSS in UTC and <user> is the output of whoami.
+sub get_timestamp {
+  my ($file) = @_;
+  my $mtime = (stat($file))[9];   # element 9 of stat() is the modification time
+  my $format = "%Y%m%d-%H%M%S";
+  my $built = strftime($format, gmtime $mtime);
+  my $now = strftime($format, gmtime);
+  chomp(my $username = `whoami`);
+  return "moses.v$built-$username-at-$now";
+}

mosesdecoder/regression-testing/run-test-mert.perl ADDED Viewed

	@@ -0,0 +1,130 @@

+#!/usr/bin/env perl
+use warnings;
+use strict;
+my $script_dir; BEGIN { use Cwd qw/ abs_path /; use File::Basename; $script_dir = dirname(abs_path($0)); push @INC, $script_dir; }
+#use MertRegressionTesting;
+use Getopt::Long;
+use File::Temp qw ( tempfile );
+use POSIX qw ( strftime );
+my @SIGS = qw ( SIGHUP SIGINT SIGQUIT SIGILL SIGTRAP SIGABRT SIGIOT SIGBUS SIGFPE SIGKILL SIGUSR1 SIGSEGV SIGUSR2 SIGPIPE SIGALRM SIGTERM SIGSTKFLT SIGCHLD SIGCONT SIGSTOP SIGTSTP SIGTTIN SIGTTOU SIGURG SIGXCPU SIGXFSZ SIGVTALRM SIGPROF SIGWINCH SIGIO SIGPWR SIGSYS SIGUNUSED SIGRTMIN );
+# ---- Command-line handling -------------------------------------------------
+# --test and --data-dir are required. --test-dir defaults to the tests/
+# directory next to this script and --bin-dir to ../bin relative to it;
+# --results-dir defaults to "<data-dir>/results" (set later on).
+my ($decoder, $test_name);
+my $test_dir = "$script_dir/tests";
+my $bin_dir = "$script_dir/../bin";
+my $data_dir;
+my $BIN_TEST = $script_dir;
+my $results_dir;
+GetOptions("test=s"    => \$test_name,
+           "data-dir=s"=> \$data_dir,
+           "bin-dir=s"=> \$bin_dir,
+           "test-dir=s"=> \$test_dir,
+           "results-dir=s"=> \$results_dir,
+          ) or exit 1;
+die "Please specify a test to run with --test\n" unless $test_name;
+die "Please specify the location of the data directory with --data-dir\n" unless $data_dir;
+# The option that sets $bin_dir is --bin-dir; the message must name it.
+die "Please specify the location of the mert directory with --bin-dir\n" unless $bin_dir;
+die "Cannot locate test dir at $test_dir" unless (-d $test_dir);
+# From here on $test_dir is the directory of the selected test.
+$test_dir .= "/$test_name";
+die "Cannot locate test dir at $test_dir" unless (-d $test_dir);
+#### get place to put results
+# Layout: <results-dir>/<test-name>/<timestamp>/ ; missing levels are created
+# one at a time because mkdir does not create parent directories.
+unless (defined $results_dir) { $results_dir = "$data_dir/results"; }
+if (!-d $results_dir) {
+  print STDERR "[WARNING] Results directory not found.\n";
+  mkdir ($results_dir) || die "Failed to create $results_dir";
+}
+$results_dir .= "/$test_name";
+if (!-d $results_dir) {
+  print STDERR "[WARNING] Results directory for test=$test_name could not be found.\n";
+  mkdir ($results_dir) || die "Failed to create $results_dir";
+}
+##########
+# The run directory name is derived from the modification time of the test's
+# "command" file, the user name and the current time (see get_timestamp).
+my $ts = get_timestamp("$test_dir/command");
+my $results = "$results_dir/$ts";
+mkdir($results) || die "Failed to create results directory: $results\n";
+my $truth = "$test_dir/truth";
+if (!-d $truth) {
+  die "Could not find truth/ in $test_dir!\n";
+}
+print "RESULTS AVAILABLE IN: $results\n\n";
+# Exit status of this script: 0 = all comparisons passed, 1 = comparison
+# failed, 2 = test command terminated by a signal, 3 = test command exited
+# non-zero.
+my ($o, $elapsed, $ec, $sig) = exec_test($test_dir, $results);
+my $error = ($sig || $ec > 0);
+if ($error) {
+  # Record the crash both on STDERR and in <results>/Summary.
+  # NOTE(review): the result of this open is not checked.
+  open OUT, ">$results/Summary";
+  print STDERR "$test_name CRASHED.\n\texit_code=$ec\n\tsignal=$sig\n";
+  print OUT    "$test_name CRASHED.\n\texit_code=$ec\n\tsignal=$sig\n";
+  close OUT;
+  exit 2 if $sig;
+  exit 3;
+}
+# Each test supplies filter programs whose output is collected in results.txt;
+# a failing filter only produces a warning.
+($o, $ec, $sig) = run_command("$test_dir/filter-stdout $results/run.stdout > $results/results.txt");
+warn "filter-stdout failed!" if ($ec > 0 || $sig);
+($o, $ec, $sig) = run_command("$test_dir/filter-stderr $results/run.stderr >> $results/results.txt");
+warn "filter-stderr failed!" if ($ec > 0 || $sig);
+# Wall-clock time is appended with the '~' marker.
+open OUT, ">> $results/results.txt";
+print OUT "TOTAL_WALLTIME ~ $elapsed\n";
+close OUT;
+# Compress the raw outputs once the filters have consumed them.
+run_command("gzip $results/run.stdout");
+run_command("gzip $results/run.stderr");
+($o, $ec, $sig) = run_command("$BIN_TEST/compare-results.perl $results $truth");
+print $o;
+if ($ec) {
+  print STDERR "FAILURE, for debugging see  $test_dir\n";
+  exit 1;
+}
+exit 0;
+# Run the test's "command" script with sh.
+#   $test_dir - directory of the test; holds the "command" script
+#   $results  - directory receiving run.stdout, run.stderr and cmd_line
+# The script is given the file-level $bin_dir and $test_dir as arguments.
+# Returns (captured output, elapsed seconds, exit code, signal name).
+sub exec_test {
+  my ($test_dir,$results) = @_;
+  my $began = time;
+  my $redirects = "1> $results/run.stdout 2> $results/run.stderr";
+  my $cmd = "sh $test_dir/command $bin_dir $test_dir $redirects";
+  # Keep the exact command line next to the results for later debugging.
+  open CMD, ">$results/cmd_line";
+  print CMD "$cmd";
+  close CMD;
+  my @outcome = run_command($cmd);
+  my ($o, $ec, $sig) = @outcome;
+  return ($o, time - $began, $ec, $sig);
+}
+# Run a shell command with backticks.
+#   $cmd - command line handed to the shell
+# Returns (captured STDOUT, exit code, signal). The signal is 0 when the
+# command was not terminated by a signal and the value of sig_name() otherwise.
+sub run_command {
+  my ($cmd) = @_;
+  my $output = `$cmd`;
+  my $status = $?;
+  # Wait status layout: high byte = exit code, low seven bits = signal number.
+  my $exit_code = $status >> 8;
+  my $signal = $status & 127;
+  $signal = sig_name($signal) if $signal;
+  return ($output, $exit_code, $signal);
+}
+# Translate a signal number into its "SIGxxx" name.
+#   $sig - signal number, as found in the low seven bits of $?
+# The name comes from the platform's own table in %Config, which is indexed by
+# signal number. The hand-written @SIGS list is 0-based while signal numbers
+# start at 1, so indexing it directly misnames signals 1-5.
+# Returns "SIG<number>" for a number the platform has no name for, so that the
+# result is always a true value callers can test.
+sub sig_name {
+  use Config;
+  my $sig = shift;
+  my @names = split ' ', (defined $Config{sig_name} ? $Config{sig_name} : '');
+  return defined $names[$sig] ? "SIG$names[$sig]" : "SIG$sig";
+}
+# Build the name of a run directory.
+#   $file - file whose modification time identifies the version under test
+# Returns "command.v<mtime>-<user>-at-<now>", where both times are formatted
+# as YYYYMMDD-HHMMSS in UTC and <user> is the output of whoami.
+sub get_timestamp {
+  my ($file) = @_;
+  my $mtime = (stat($file))[9];   # element 9 of stat() is the modification time
+  my $format = "%Y%m%d-%H%M%S";
+  my $built = strftime($format, gmtime $mtime);
+  my $now = strftime($format, gmtime);
+  chomp(my $username = `whoami`);
+  return "command.v$built-$username-at-$now";
+}

mosesdecoder/regression-testing/run-test-misc.perl ADDED Viewed

	@@ -0,0 +1,90 @@

+#!/usr/bin/env perl
+use warnings;
+use strict;
+# At compile time, append the directory holding this script to @INC
+# (presumably so the MosesRegressionTesting module loaded below is found).
+BEGIN {
+  use Cwd qw/ abs_path cwd /;
+  use File::Basename;
+  my $own_path = abs_path($0);
+  my $module_dir = dirname($own_path);
+  print STDERR  "script_dir=$module_dir\n";
+  push @INC, $module_dir;
+}
+use FindBin qw($Bin);
+use MosesRegressionTesting;
+use Getopt::Long;
+use File::Temp qw ( tempfile );
+use POSIX qw ( strftime );
+# Command-line options; they are passed on unchanged to the test's run.perl.
+my ($mosesRoot, $mosesBin, $test_name, $data_dir, $test_dir, $results_dir);
+GetOptions("moses-root=s" => \$mosesRoot,
+           "moses-bin=s" => \$mosesBin,
+           "test=s"    => \$test_name,
+           "data-dir=s"=> \$data_dir,
+           "test-dir=s"=> \$test_dir,
+           "results-dir=s"=> \$results_dir,
+          ) or exit 1;
+# output dir: defaults to a timestamped directory below $data_dir/results
+unless (defined $results_dir)
+{
+  my $ts = get_timestamp($mosesRoot);
+  $results_dir = "$data_dir/results/$test_name/$ts";
+}
+# List-form system keeps the shell from re-interpreting the directory name.
+system('mkdir', '-p', $results_dir) == 0
+  or warn "Could not create $results_dir\n";
+use File::Basename qw/dirname/;
+my $dir = dirname ($0);
+my $cmdMain = "perl -I $dir $test_dir/$test_name/run.perl -moses-root $mosesRoot -moses-bin $mosesBin -test $test_name -data-dir $data_dir -test-dir $test_dir  -results-dir $results_dir\n";
+# cmd_line is a debugging aid; a write failure is reported but not fatal.
+if (open my $cmd_fh, '>', "$results_dir/cmd_line")
+{
+  print $cmd_fh $cmdMain;
+  close $cmd_fh or warn "Could not close $results_dir/cmd_line: $!\n";
+}
+else
+{
+  warn "Could not write $results_dir/cmd_line: $!\n";
+}
+# Backticks are kept so that the test's stdout stays suppressed.
+`$cmdMain`;
+warn "run.perl finished with wait status $?\n" if $? != 0;
+my $outPath = "$results_dir/out";
+my $truthPath = "$test_dir/$test_name/truth/results.txt";
+print STDERR "outPath=$outPath \n truthPath=$truthPath \n";
+if (-e $outPath)
+{
+  # Judge the comparison by diff's exit status (0 = identical, 1 = different,
+  # 2 = trouble).  Counting diff's output lines would treat a diff that failed
+  # without printing to stdout, e.g. because the truth file is missing, as a
+  # match.
+  my $diffStatus = system("diff --exclude=cmd_line $outPath $truthPath > /dev/null");
+  if ($diffStatus == 0)
+  {
+    print STDERR "SUCCESS\n";
+    exit 0;
+  }
+  else
+  {
+    print STDERR "FAILURE. Ran $cmdMain\n";
+    exit 1;
+  }
+}
+else
+{
+  print STDERR "FAILURE. Output does not exists. Ran $cmdMain\n";
+  exit 1;
+}
+###################################
+# Build a label of the form "moses.v<mtime>-<user>-at-<now>", where <mtime>
+# is the modification time of $file, <now> is the current time (both
+# formatted as YYYYmmdd-HHMMSS in UTC) and <user> is the output of `whoami`.
+#   $file - path whose modification time is embedded in the label
+# If $file cannot be stat'ed, a warning naming the file is issued and the
+# epoch (19700101-000000) is used as modification time.
+sub get_timestamp {
+  my ($file) = @_;
+  # Only the modification time (element 9 of stat's result) is needed.
+  my $mtime = defined $file ? (stat($file))[9] : undef;
+  unless (defined $mtime) {
+    my $reason = defined $file ? "'$file': $!" : 'an undefined path';
+    warn "get_timestamp: cannot stat $reason\n";
+    $mtime = 0;
+  }
+  my $timestamp = strftime("%Y%m%d-%H%M%S", gmtime $mtime);
+  my $timestamp2 = strftime("%Y%m%d-%H%M%S", gmtime);
+  my $username = `whoami`; chomp $username;
+  return "moses.v$timestamp-$username-at-$timestamp2";
+}

mosesdecoder/regression-testing/run-test-scorer.perl ADDED Viewed

	@@ -0,0 +1,96 @@

+#!/usr/bin/env perl
+use warnings;
+use strict;
+# At compile time, append the directory holding this script to @INC
+# (presumably so the MosesRegressionTesting module loaded below is found).
+BEGIN {
+  use Cwd qw/ abs_path /;
+  use File::Basename;
+  my $own_path = abs_path($0);
+  my $module_dir = dirname($own_path);
+  print STDERR  "script_dir=$module_dir\n";
+  push @INC, $module_dir;
+}
+use FindBin qw($Bin);
+use MosesRegressionTesting;
+use Getopt::Long;
+use File::Temp qw ( tempfile );
+use POSIX qw ( strftime );
+# Command-line options.
+my $scoreExe;
+my $test_name;
+my $data_dir;
+my $test_dir;
+my $results_dir;
+GetOptions("scorer=s" => \$scoreExe,
+           "test=s"    => \$test_name,
+           "data-dir=s"=> \$data_dir,
+           "test-dir=s"=> \$test_dir,
+           "results-dir=s"=> \$results_dir,
+          ) or exit 1;
+# output dir: defaults to a timestamped directory below $data_dir/results
+unless (defined $results_dir)
+{
+  my $ts = get_timestamp($scoreExe);
+  $results_dir = "$data_dir/results/$test_name/$ts";
+}
+# List-form system keeps the shell from re-interpreting the directory name.
+system('mkdir', '-p', $results_dir) == 0
+  or warn "Could not create $results_dir\n";
+my $outPath = "$results_dir/pt.half";
+# Read the scorer arguments for this test directly instead of spawning cat.
+# A missing file leaves the arguments empty, as before, but is now reported.
+my $argsPath = "$test_dir/$test_name/args.txt";
+my $scorerArgs = '';
+if (open my $args_fh, '<', $argsPath)
+{
+  local $/;
+  $scorerArgs = <$args_fh>;
+  $scorerArgs = '' unless defined $scorerArgs;
+  close $args_fh;
+}
+else
+{
+  warn "Could not read $argsPath: $!\n";
+}
+# Every "$name" token in the arguments is evaluated as a Perl scalar visible
+# at this point (such as $test_dir or $results_dir) and replaced by its value.
+# NOTE(review): the double /e is a string eval on file content.  The pattern
+# restricts it to plain scalar names, but args.txt must still be trusted.
+$scorerArgs =~ s/(\$\w+)/$1/eeg;
+my $cmdMain = "$scoreExe $scorerArgs \n";
+# cmd_line is a debugging aid; a write failure is reported but not fatal.
+if (open my $cmd_fh, '>', "$results_dir/cmd_line")
+{
+  print $cmd_fh "$cmdMain";
+  close $cmd_fh or warn "Could not close $results_dir/cmd_line: $!\n";
+}
+else
+{
+  warn "Could not write $results_dir/cmd_line: $!\n";
+}
+# Backticks are kept so that the scorer's stdout stays suppressed.
+`$cmdMain`;
+warn "scorer command finished with wait status $?\n" if $? != 0;
+my $truthPath = "$test_dir/$test_name/truth/results.txt";
+if (-e $outPath)
+{
+  # Judge the comparison by diff's exit status (0 = identical, 1 = different,
+  # 2 = trouble).  Counting diff's output lines would treat a diff that failed
+  # without printing to stdout, e.g. because the truth file is missing, as a
+  # match.
+  my $diffStatus = system("diff $outPath $truthPath > /dev/null");
+  if ($diffStatus == 0)
+  {
+    print STDERR "SUCCESS\n";
+    exit 0;
+  }
+  else
+  {
+    print STDERR "FAILURE. Ran $cmdMain\n";
+    exit 1;
+  }
+}
+else
+{
+  print STDERR "FAILURE. Output does not exists. Ran $cmdMain\n";
+  exit 1;
+}
+###################################
+# Build a label of the form "moses.v<mtime>-<user>-at-<now>", where <mtime>
+# is the modification time of $file, <now> is the current time (both
+# formatted as YYYYmmdd-HHMMSS in UTC) and <user> is the output of `whoami`.
+#   $file - path whose modification time is embedded in the label
+# If $file cannot be stat'ed, a warning naming the file is issued and the
+# epoch (19700101-000000) is used as modification time.
+sub get_timestamp {
+  my ($file) = @_;
+  # Only the modification time (element 9 of stat's result) is needed.
+  my $mtime = defined $file ? (stat($file))[9] : undef;
+  unless (defined $mtime) {
+    my $reason = defined $file ? "'$file': $!" : 'an undefined path';
+    warn "get_timestamp: cannot stat $reason\n";
+    $mtime = 0;
+  }
+  my $timestamp = strftime("%Y%m%d-%H%M%S", gmtime $mtime);
+  my $timestamp2 = strftime("%Y%m%d-%H%M%S", gmtime);
+  my $username = `whoami`; chomp $username;
+  return "moses.v$timestamp-$username-at-$timestamp2";
+}