suricodes commited on
Commit
a691d7a
·
verified ·
1 Parent(s): b7a24d4

Upload 49 files

Browse files
Files changed (50) hide show
  1. .gitattributes +2 -0
  2. mosesdecoder/probingpt/CreateProbingPT.cpp +113 -0
  3. mosesdecoder/probingpt/InputFileStream.cpp +59 -0
  4. mosesdecoder/probingpt/InputFileStream.h +46 -0
  5. mosesdecoder/probingpt/Jamfile +32 -0
  6. mosesdecoder/probingpt/OutputFileStream.cpp +87 -0
  7. mosesdecoder/probingpt/OutputFileStream.h +81 -0
  8. mosesdecoder/probingpt/StoreTarget.cpp +264 -0
  9. mosesdecoder/probingpt/StoreTarget.h +51 -0
  10. mosesdecoder/probingpt/StoreVocab.cpp +13 -0
  11. mosesdecoder/probingpt/StoreVocab.h +60 -0
  12. mosesdecoder/probingpt/bin/gcc-9/release/link-static/threading-multi/CreateProbingPT +3 -0
  13. mosesdecoder/probingpt/bin/gcc-9/release/link-static/threading-multi/CreateProbingPT.o +0 -0
  14. mosesdecoder/probingpt/bin/gcc-9/release/link-static/threading-multi/InputFileStream.o +0 -0
  15. mosesdecoder/probingpt/bin/gcc-9/release/link-static/threading-multi/OutputFileStream.o +0 -0
  16. mosesdecoder/probingpt/bin/gcc-9/release/link-static/threading-multi/StoreTarget.o +0 -0
  17. mosesdecoder/probingpt/bin/gcc-9/release/link-static/threading-multi/StoreVocab.o +0 -0
  18. mosesdecoder/probingpt/bin/gcc-9/release/link-static/threading-multi/hash.o +0 -0
  19. mosesdecoder/probingpt/bin/gcc-9/release/link-static/threading-multi/libprobingpt.a +3 -0
  20. mosesdecoder/probingpt/bin/gcc-9/release/link-static/threading-multi/line_splitter.o +0 -0
  21. mosesdecoder/probingpt/bin/gcc-9/release/link-static/threading-multi/probing_hash_utils.o +0 -0
  22. mosesdecoder/probingpt/bin/gcc-9/release/link-static/threading-multi/querying.o +0 -0
  23. mosesdecoder/probingpt/bin/gcc-9/release/link-static/threading-multi/storing.o +0 -0
  24. mosesdecoder/probingpt/bin/gcc-9/release/link-static/threading-multi/util.o +0 -0
  25. mosesdecoder/probingpt/bin/gcc-9/release/link-static/threading-multi/vocabid.o +0 -0
  26. mosesdecoder/probingpt/gzfilebuf.h +94 -0
  27. mosesdecoder/probingpt/hash.cpp +44 -0
  28. mosesdecoder/probingpt/hash.h +17 -0
  29. mosesdecoder/probingpt/line_splitter.cpp +103 -0
  30. mosesdecoder/probingpt/line_splitter.h +56 -0
  31. mosesdecoder/probingpt/probing_hash_utils.cpp +40 -0
  32. mosesdecoder/probingpt/probing_hash_utils.h +55 -0
  33. mosesdecoder/probingpt/querying.cpp +179 -0
  34. mosesdecoder/probingpt/querying.h +79 -0
  35. mosesdecoder/probingpt/storing.cpp +302 -0
  36. mosesdecoder/probingpt/storing.h +92 -0
  37. mosesdecoder/probingpt/util.cpp +25 -0
  38. mosesdecoder/probingpt/util.h +24 -0
  39. mosesdecoder/probingpt/vocabid.cpp +59 -0
  40. mosesdecoder/probingpt/vocabid.h +29 -0
  41. mosesdecoder/regression-testing/Jamfile +81 -0
  42. mosesdecoder/regression-testing/MosesRegressionTesting.pm +93 -0
  43. mosesdecoder/regression-testing/compare-results.perl +91 -0
  44. mosesdecoder/regression-testing/ensure-regression-data-here.perl +35 -0
  45. mosesdecoder/regression-testing/run-single-test.perl +247 -0
  46. mosesdecoder/regression-testing/run-test-detokenizer.perl +309 -0
  47. mosesdecoder/regression-testing/run-test-extract.perl +95 -0
  48. mosesdecoder/regression-testing/run-test-mert.perl +130 -0
  49. mosesdecoder/regression-testing/run-test-misc.perl +90 -0
  50. mosesdecoder/regression-testing/run-test-scorer.perl +96 -0
.gitattributes CHANGED
@@ -124,3 +124,5 @@ mosesdecoder/phrase-extract/score-stsg/bin/gcc-9/release/link-static/threading-m
124
  mosesdecoder/phrase-extract/syntax-common/bin/gcc-9/release/link-static/threading-multi/libsyntax_common.a filter=lfs diff=lfs merge=lfs -text
125
  mosesdecoder/phrase-extract/syntax-common/bin/gcc-9/release/link-static/threading-multi/tree_fragment_tokenizer_test filter=lfs diff=lfs merge=lfs -text
126
  mosesdecoder/phrase-extract/syntax-common/bin/gcc-9/release/link-static/threading-multi/tree_test filter=lfs diff=lfs merge=lfs -text
 
 
 
124
  mosesdecoder/phrase-extract/syntax-common/bin/gcc-9/release/link-static/threading-multi/libsyntax_common.a filter=lfs diff=lfs merge=lfs -text
125
  mosesdecoder/phrase-extract/syntax-common/bin/gcc-9/release/link-static/threading-multi/tree_fragment_tokenizer_test filter=lfs diff=lfs merge=lfs -text
126
  mosesdecoder/phrase-extract/syntax-common/bin/gcc-9/release/link-static/threading-multi/tree_test filter=lfs diff=lfs merge=lfs -text
127
+ mosesdecoder/probingpt/bin/gcc-9/release/link-static/threading-multi/CreateProbingPT filter=lfs diff=lfs merge=lfs -text
128
+ mosesdecoder/probingpt/bin/gcc-9/release/link-static/threading-multi/libprobingpt.a filter=lfs diff=lfs merge=lfs -text
mosesdecoder/probingpt/CreateProbingPT.cpp ADDED
@@ -0,0 +1,113 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #include <string>
2
+ #include <boost/program_options.hpp>
3
+ #include "util/usage.hh"
4
+ #include "storing.h"
5
+ #include "InputFileStream.h"
6
+ #include "OutputFileStream.h"
7
+ #include "moses/Util.h"
8
+
9
+ using namespace std;
10
+
11
+ std::string ReformatSCFGFile(const std::string &path);
12
+
13
+ int main(int argc, char* argv[])
14
+ {
15
+ string inPath, outPath;
16
+ int num_scores = 4;
17
+ int num_lex_scores = 0;
18
+ bool log_prob = false;
19
+ bool scfg = false;
20
+ int max_cache_size = 50000;
21
+
22
+ namespace po = boost::program_options;
23
+ po::options_description desc("Options");
24
+ desc.add_options()
25
+ ("help", "Print help messages")
26
+ ("input-pt", po::value<string>()->required(), "Text pt")
27
+ ("output-dir", po::value<string>()->required(), "Directory when binary files will be written")
28
+ ("num-scores", po::value<int>()->default_value(num_scores), "Number of pt scores")
29
+ ("num-lex-scores", po::value<int>()->default_value(num_lex_scores), "Number of lexicalized reordering scores")
30
+ ("log-prob", "log (and floor) probabilities before storing")
31
+ ("max-cache-size", po::value<int>()->default_value(max_cache_size), "Maximum number of high-count source lines to write to cache file. 0=no cache, negative=no limit")
32
+ ("scfg", "Rules are SCFG in Moses format (ie. with non-terms and LHS")
33
+
34
+ ;
35
+
36
+ po::variables_map vm;
37
+ try {
38
+ po::store(po::parse_command_line(argc, argv, desc),
39
+ vm); // can throw
40
+
41
+ /** --help option
42
+ */
43
+ if ( vm.count("help")) {
44
+ std::cout << desc << std::endl;
45
+ return EXIT_SUCCESS;
46
+ }
47
+
48
+ po::notify(vm); // throws on error, so do after help in case
49
+ // there are any problems
50
+ } catch(po::error& e) {
51
+ std::cerr << "ERROR: " << e.what() << std::endl << std::endl;
52
+ std::cerr << desc << std::endl;
53
+ return EXIT_FAILURE;
54
+ }
55
+
56
+ if (vm.count("input-pt")) inPath = vm["input-pt"].as<string>();
57
+ if (vm.count("output-dir")) outPath = vm["output-dir"].as<string>();
58
+ if (vm.count("num-scores")) num_scores = vm["num-scores"].as<int>();
59
+ if (vm.count("num-lex-scores")) num_lex_scores = vm["num-lex-scores"].as<int>();
60
+ if (vm.count("max-cache-size")) max_cache_size = vm["max-cache-size"].as<int>();
61
+ if (vm.count("log-prob")) log_prob = true;
62
+ if (vm.count("scfg")) scfg = true;
63
+
64
+
65
+ if (scfg) {
66
+ inPath = ReformatSCFGFile(inPath);
67
+ }
68
+
69
+ probingpt::createProbingPT(inPath, outPath, num_scores, num_lex_scores, log_prob, max_cache_size, scfg);
70
+
71
+ //util::PrintUsage(std::cout);
72
+ return 0;
73
+ }
74
+
75
+ std::string ReformatSCFGFile(const std::string &path)
76
+ {
77
+ probingpt::InputFileStream inFile(path);
78
+ string reformattedPath = path + ".reformat.gz";
79
+ probingpt::OutputFileStream outFile(reformattedPath);
80
+
81
+ string line;
82
+ while (getline(inFile, line)) {
83
+ vector<string> toks = Moses::TokenizeMultiCharSeparator(line, "|||");
84
+ assert(toks.size() >= 3);
85
+
86
+ // source
87
+ vector<string> sourceToks = Moses::Tokenize(toks[0], " ");
88
+ for (size_t i = 0; i < sourceToks.size() - 1; ++i) {
89
+ outFile << sourceToks[i] << " ";
90
+ }
91
+
92
+ // other columns
93
+ for (size_t i = 1; i < toks.size(); ++i) {
94
+ outFile << "|||" << toks[i];
95
+ }
96
+ outFile << endl;
97
+ }
98
+
99
+ inFile.Close();
100
+ outFile.Close();
101
+
102
+ string sortedPath = path + ".reformat.sorted.gz";
103
+ string tmpPath = path + ".tmp ";
104
+ string cmd = "mkdir " + tmpPath
105
+ + " && gzip -dc " + reformattedPath + " | LC_ALL=C sort -T " + tmpPath + " | gzip -c > " + sortedPath;
106
+ system(cmd.c_str());
107
+
108
+ cmd = "rm -rf " + tmpPath + " " + reformattedPath;
109
+ system(cmd.c_str());
110
+
111
+ return sortedPath;
112
+ }
113
+
mosesdecoder/probingpt/InputFileStream.cpp ADDED
@@ -0,0 +1,59 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ // $Id$
2
+
3
+ /***********************************************************************
4
+ Moses - factored phrase-based language decoder
5
+ Copyright (C) 2006 University of Edinburgh
6
+
7
+ This library is free software; you can redistribute it and/or
8
+ modify it under the terms of the GNU Lesser General Public
9
+ License as published by the Free Software Foundation; either
10
+ version 2.1 of the License, or (at your option) any later version.
11
+
12
+ This library is distributed in the hope that it will be useful,
13
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15
+ Lesser General Public License for more details.
16
+
17
+ You should have received a copy of the GNU Lesser General Public
18
+ License along with this library; if not, write to the Free Software
19
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
20
+ ***********************************************************************/
21
+
22
+ #include "InputFileStream.h"
23
+ #include "gzfilebuf.h"
24
+ #include <iostream>
25
+
26
+ using namespace std;
27
+
28
+ namespace probingpt
29
+ {
30
+
31
+ InputFileStream::InputFileStream(const std::string &filePath) :
32
+ std::istream(NULL), m_streambuf(NULL)
33
+ {
34
+ if (filePath.size() > 3 && filePath.substr(filePath.size() - 3, 3) == ".gz") {
35
+ m_streambuf = new gzfilebuf(filePath.c_str());
36
+ } else {
37
+ std::filebuf* fb = new std::filebuf();
38
+ fb = fb->open(filePath.c_str(), std::ios::in);
39
+ if (!fb) {
40
+ cerr << "Can't read " << filePath.c_str() << endl;
41
+ exit(1);
42
+ }
43
+ m_streambuf = fb;
44
+ }
45
+ this->init(m_streambuf);
46
+ }
47
+
48
+ InputFileStream::~InputFileStream()
49
+ {
50
+ delete m_streambuf;
51
+ m_streambuf = NULL;
52
+ }
53
+
54
+ void InputFileStream::Close()
55
+ {
56
+ }
57
+
58
+ }
59
+
mosesdecoder/probingpt/InputFileStream.h ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ // $Id$
2
+
3
+ /***********************************************************************
4
+ Moses - factored phrase-based language decoder
5
+ Copyright (C) 2006 University of Edinburgh
6
+
7
+ This library is free software; you can redistribute it and/or
8
+ modify it under the terms of the GNU Lesser General Public
9
+ License as published by the Free Software Foundation; either
10
+ version 2.1 of the License, or (at your option) any later version.
11
+
12
+ This library is distributed in the hope that it will be useful,
13
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15
+ Lesser General Public License for more details.
16
+
17
+ You should have received a copy of the GNU Lesser General Public
18
+ License along with this library; if not, write to the Free Software
19
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
20
+ ***********************************************************************/
21
+
22
+ #pragma once
23
+
24
+ #include <cstdlib>
25
+ #include <fstream>
26
+ #include <string>
27
+
28
+ namespace probingpt
29
+ {
30
+
31
/** Drop-in replacement for std::istream that reads from a file path;
 *  files whose name ends in .gz are decompressed while reading.
 */
class InputFileStream: public std::istream
{
public:
  /// Open the file at filePath for reading.
  explicit InputFileStream(const std::string &filePath);

  /// Frees the stream buffer held in m_streambuf.
  ~InputFileStream();

  void Close();

protected:
  /// Buffer the stream reads from; owned by this object.
  std::streambuf *m_streambuf;
};
44
+
45
+ }
46
+
mosesdecoder/probingpt/Jamfile ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ alias deps : ..//z ..//boost_iostreams ..//boost_filesystem ;
2
+
3
+ lib probingpt :
4
+ StoreTarget.cpp
5
+ StoreVocab.cpp
6
+ hash.cpp
7
+ line_splitter.cpp
8
+ probing_hash_utils.cpp
9
+ querying.cpp
10
+ storing.cpp
11
+ vocabid.cpp
12
+ OutputFileStream.cpp
13
+ InputFileStream.cpp
14
+ util.cpp
15
+
16
+ # ../util/string_piece.cc
17
+ # ../util/exception.cc
18
+ # ../util/file.cc
19
+ # ../util/file_piece.cc
20
+ # ../util/murmur_hash.cc
21
+ # ../util/mmap.cc
22
+ # ../util/read_compressed.cc
23
+ # ../util/parallel_read.cc
24
+ # ../util/ersatz_progress.cc
25
+
26
+
27
+ deps
28
+ ;
29
+
30
+ exe CreateProbingPT : CreateProbingPT.cpp probingpt ../util//kenutil ;
31
+
32
+ alias programs : CreateProbingPT ;
mosesdecoder/probingpt/OutputFileStream.cpp ADDED
@@ -0,0 +1,87 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ // $Id: OutputFileStream.cpp 2780 2010-01-29 17:11:17Z bojar $
2
+
3
+ /***********************************************************************
4
+ Moses - factored phrase-based language decoder
5
+ Copyright (C) 2006 University of Edinburgh
6
+
7
+ This library is free software; you can redistribute it and/or
8
+ modify it under the terms of the GNU Lesser General Public
9
+ License as published by the Free Software Foundation; either
10
+ version 2.1 of the License, or (at your option) any later version.
11
+
12
+ This library is distributed in the hope that it will be useful,
13
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15
+ Lesser General Public License for more details.
16
+
17
+ You should have received a copy of the GNU Lesser General Public
18
+ License along with this library; if not, write to the Free Software
19
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
20
+ ***********************************************************************/
21
+
22
+ #include <iostream>
23
+ #include <boost/algorithm/string/predicate.hpp>
24
+ #include <boost/iostreams/filter/gzip.hpp>
25
+ #include "OutputFileStream.h"
26
+ #include "gzfilebuf.h"
27
+
28
+ using namespace std;
29
+ using namespace boost::algorithm;
30
+
31
+ namespace probingpt
32
+ {
33
+ OutputFileStream::OutputFileStream() :
34
+ boost::iostreams::filtering_ostream(), m_outFile(NULL), m_open(false)
35
+ {
36
+ }
37
+
38
+ OutputFileStream::OutputFileStream(const std::string &filePath) :
39
+ m_outFile(NULL), m_open(false)
40
+ {
41
+ Open(filePath);
42
+ }
43
+
44
+ OutputFileStream::~OutputFileStream()
45
+ {
46
+ Close();
47
+ }
48
+
49
+ bool OutputFileStream::Open(const std::string &filePath)
50
+ {
51
+ assert(!m_open);
52
+ if (filePath == std::string("-")) {
53
+ // Write to standard output. Leave m_outFile null.
54
+ this->push(std::cout);
55
+ } else {
56
+ m_outFile = new ofstream(filePath.c_str(),
57
+ ios_base::out | ios_base::binary);
58
+ if (m_outFile->fail()) {
59
+ return false;
60
+ }
61
+
62
+ if (ends_with(filePath, ".gz")) {
63
+ this->push(boost::iostreams::gzip_compressor());
64
+ }
65
+ this->push(*m_outFile);
66
+ }
67
+
68
+ m_open = true;
69
+ return true;
70
+ }
71
+
72
+ void OutputFileStream::Close()
73
+ {
74
+ if (!m_open) return;
75
+ this->flush();
76
+ if (m_outFile) {
77
+ this->pop(); // file
78
+
79
+ m_outFile->close();
80
+ delete m_outFile;
81
+ m_outFile = NULL;
82
+ }
83
+ m_open = false;
84
+ }
85
+
86
+ }
87
+
mosesdecoder/probingpt/OutputFileStream.h ADDED
@@ -0,0 +1,81 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ // $Id: InputFileStream.h 2939 2010-02-24 11:15:44Z jfouet $
2
+
3
+ /***********************************************************************
4
+ Moses - factored phrase-based language decoder
5
+ Copyright (C) 2006 University of Edinburgh
6
+
7
+ This library is free software; you can redistribute it and/or
8
+ modify it under the terms of the GNU Lesser General Public
9
+ License as published by the Free Software Foundation; either
10
+ version 2.1 of the License, or (at your option) any later version.
11
+
12
+ This library is distributed in the hope that it will be useful,
13
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15
+ Lesser General Public License for more details.
16
+
17
+ You should have received a copy of the GNU Lesser General Public
18
+ License along with this library; if not, write to the Free Software
19
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
20
+ ***********************************************************************/
21
+
22
+ #pragma once
23
+
24
+ #include <cstdlib>
25
+ #include <fstream>
26
+ #include <string>
27
+ #include <iostream>
28
+ #include <boost/iostreams/filtering_stream.hpp>
29
+
30
+ namespace probingpt
31
+ {
32
+
33
+ /** Version of std::ostream with transparent compression.
34
+ *
35
+ * Transparently compresses output when writing to a file whose name ends in
36
+ * ".gz". Or, writes to stdout instead of a file when given a filename
37
+ * consisting of just a dash ("-").
38
+ */
39
+ class OutputFileStream: public boost::iostreams::filtering_ostream
40
+ {
41
+ private:
42
+ /** File that needs flushing & closing when we close this stream.
43
+ *
44
+ * Is NULL when no file is opened, e.g. when writing to standard output.
45
+ */
46
+ std::ofstream *m_outFile;
47
+
48
+ /// Is this stream open?
49
+ bool m_open;
50
+
51
+ public:
52
+ /** Create an unopened OutputFileStream.
53
+ *
54
+ * Until it's been opened, nothing can be done with this stream.
55
+ */
56
+ OutputFileStream();
57
+
58
+ /// Create an OutputFileStream, and open it by calling Open().
59
+ OutputFileStream(const std::string &filePath);
60
+ virtual ~OutputFileStream();
61
+
62
+ // TODO: Can we please just always throw an exception when this fails?
63
+ /** Open stream.
64
+ *
65
+ * If filePath is "-" (just a dash), this opens the stream for writing to
66
+ * standard output. Otherwise, it opens the given file. If the filename
67
+ * has the ".gz" suffix, output will be transparently compressed.
68
+ *
69
+ * Call Close() to close the file.
70
+ *
71
+ * Returns whether opening the file was successful. It may also throw an
72
+ * exception on failure.
73
+ */
74
+ bool Open(const std::string &filePath);
75
+
76
+ /// Flush and close stream. After this, the stream can be opened again.
77
+ void Close();
78
+ };
79
+
80
+ }
81
+
mosesdecoder/probingpt/StoreTarget.cpp ADDED
@@ -0,0 +1,264 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+ * StoreTarget.cpp
3
+ *
4
+ * Created on: 19 Jan 2016
5
+ * Author: hieu
6
+ */
7
+ #include <boost/foreach.hpp>
8
+ #include "StoreTarget.h"
9
+ #include "line_splitter.h"
10
+ #include "probing_hash_utils.h"
11
+ #include "OutputFileStream.h"
12
+ #include "moses2/legacy/Util2.h"
13
+
14
+ using namespace std;
15
+
16
+ namespace probingpt
17
+ {
18
+
19
+ StoreTarget::StoreTarget(const std::string &basepath)
20
+ :m_basePath(basepath)
21
+ ,m_vocab(basepath + "/TargetVocab.dat")
22
+ {
23
+ std::string path = basepath + "/TargetColl.dat";
24
+ m_fileTargetColl.open(path.c_str(),
25
+ std::ios::out | std::ios::binary | std::ios::ate | std::ios::trunc);
26
+ if (!m_fileTargetColl.is_open()) {
27
+ throw "can't create file ";
28
+ }
29
+
30
+ }
31
+
32
+ StoreTarget::~StoreTarget()
33
+ {
34
+ assert(m_coll.empty());
35
+ m_fileTargetColl.close();
36
+
37
+ // vocab
38
+ m_vocab.Save();
39
+ }
40
+
41
+ uint64_t StoreTarget::Save()
42
+ {
43
+ uint64_t ret = m_fileTargetColl.tellp();
44
+
45
+ // save to disk
46
+ uint64_t numTP = m_coll.size();
47
+ m_fileTargetColl.write((char*) &numTP, sizeof(uint64_t));
48
+
49
+ for (size_t i = 0; i < m_coll.size(); ++i) {
50
+ Save(*m_coll[i]);
51
+ }
52
+
53
+ // clear coll
54
+ Moses2::RemoveAllInColl(m_coll);
55
+ m_coll.clear();
56
+
57
+ // starting position of coll
58
+ return ret;
59
+ }
60
+
61
+ void StoreTarget::Save(const target_text &rule)
62
+ {
63
+ // metadata for each tp
64
+ TargetPhraseInfo tpInfo;
65
+ tpInfo.alignTerm = GetAlignId(rule.word_align_term);
66
+ tpInfo.alignNonTerm = GetAlignId(rule.word_align_non_term);
67
+ tpInfo.numWords = rule.target_phrase.size();
68
+ tpInfo.propLength = rule.property.size();
69
+
70
+ //cerr << "TPInfo=" << sizeof(TPInfo);
71
+ m_fileTargetColl.write((char*) &tpInfo, sizeof(TargetPhraseInfo));
72
+
73
+ // scores
74
+ for (size_t i = 0; i < rule.prob.size(); ++i) {
75
+ float prob = rule.prob[i];
76
+ m_fileTargetColl.write((char*) &prob, sizeof(prob));
77
+ }
78
+
79
+ // tp
80
+ for (size_t i = 0; i < rule.target_phrase.size(); ++i) {
81
+ uint32_t vocabId = rule.target_phrase[i];
82
+ m_fileTargetColl.write((char*) &vocabId, sizeof(vocabId));
83
+ }
84
+
85
+ // prop TODO
86
+
87
+ }
88
+
89
+ void StoreTarget::SaveAlignment()
90
+ {
91
+ std::string path = m_basePath + "/Alignments.dat";
92
+ probingpt::OutputFileStream file(path);
93
+
94
+ BOOST_FOREACH(Alignments::value_type &valPair, m_aligns) {
95
+ file << valPair.second << "\t";
96
+
97
+ const std::vector<size_t> &aligns = valPair.first;
98
+ BOOST_FOREACH(size_t align, aligns) {
99
+ file << align << " ";
100
+ }
101
+ file << endl;
102
+ }
103
+
104
+ }
105
+
106
+ void StoreTarget::Append(const line_text &line, bool log_prob, bool scfg)
107
+ {
108
+ target_text *rule = new target_text;
109
+ //cerr << "line.target_phrase=" << line.target_phrase << endl;
110
+
111
+ // target_phrase
112
+ vector<bool> nonTerms;
113
+ util::TokenIter<util::SingleCharacter> it;
114
+ it = util::TokenIter<util::SingleCharacter>(line.target_phrase,
115
+ util::SingleCharacter(' '));
116
+ while (it) {
117
+ StringPiece word = *it;
118
+ //cerr << "word=" << word << endl;
119
+
120
+ bool nonTerm = false;
121
+ if (scfg) {
122
+ // not really sure how to handle factored SCFG and NT
123
+ if (scfg && word[0] == '[' && word[word.size() - 1] == ']') {
124
+ //cerr << "NON-TERM=" << tok << " " << nonTerms.size() << endl;
125
+ nonTerm = true;
126
+ }
127
+ nonTerms.push_back(nonTerm);
128
+ }
129
+
130
+ util::TokenIter<util::SingleCharacter> itFactor;
131
+ itFactor = util::TokenIter<util::SingleCharacter>(word,
132
+ util::SingleCharacter('|'));
133
+ while (itFactor) {
134
+ StringPiece factor = *itFactor;
135
+
136
+ string factorStr = factor.as_string();
137
+ uint32_t vocabId = m_vocab.GetVocabId(factorStr);
138
+
139
+ rule->target_phrase.push_back(vocabId);
140
+
141
+ itFactor++;
142
+ }
143
+
144
+ it++;
145
+ }
146
+
147
+ // probs
148
+ it = util::TokenIter<util::SingleCharacter>(line.prob,
149
+ util::SingleCharacter(' '));
150
+ while (it) {
151
+ string tok = it->as_string();
152
+ float prob = Moses2::Scan<float>(tok);
153
+
154
+ if (log_prob) {
155
+ prob = Moses2::FloorScore(log(prob));
156
+ if (prob == 0.0f) prob = 0.0000000001;
157
+ }
158
+
159
+ rule->prob.push_back(prob);
160
+ it++;
161
+ }
162
+
163
+ /*
164
+ cerr << "nonTerms=";
165
+ for (size_t i = 0; i < nonTerms.size(); ++i) {
166
+ cerr << nonTerms[i] << " ";
167
+ }
168
+ cerr << endl;
169
+ */
170
+
171
+ // alignment
172
+ it = util::TokenIter<util::SingleCharacter>(line.word_align,
173
+ util::SingleCharacter(' '));
174
+ while (it) {
175
+ string tokPair = Moses2::Trim(it->as_string());
176
+ if (tokPair.empty()) {
177
+ break;
178
+ }
179
+
180
+ vector<size_t> alignPair = Moses2::Tokenize<size_t>(tokPair, "-");
181
+ assert(alignPair.size() == 2);
182
+
183
+ bool nonTerm = false;
184
+ size_t sourcePos = alignPair[0];
185
+ size_t targetPos = alignPair[1];
186
+ if (scfg) {
187
+ nonTerm = nonTerms[targetPos];
188
+ }
189
+
190
+ //cerr << targetPos << "=" << nonTerm << endl;
191
+
192
+ if (nonTerm) {
193
+ rule->word_align_non_term.push_back(sourcePos);
194
+ rule->word_align_non_term.push_back(targetPos);
195
+ //cerr << (int) rule->word_all1.back() << " ";
196
+ } else {
197
+ rule->word_align_term.push_back(sourcePos);
198
+ rule->word_align_term.push_back(targetPos);
199
+ }
200
+
201
+ it++;
202
+ }
203
+
204
+ // extra scores
205
+ string prop = line.property.as_string();
206
+ AppendLexRO(prop, rule->prob, log_prob);
207
+
208
+ //cerr << "line.property=" << line.property << endl;
209
+ //cerr << "prop=" << prop << endl;
210
+
211
+ // properties
212
+ /*
213
+ for (size_t i = 0; i < prop.size(); ++i) {
214
+ rule->property.push_back(prop[i]);
215
+ }
216
+ */
217
+ m_coll.push_back(rule);
218
+ }
219
+
220
+ uint32_t StoreTarget::GetAlignId(const std::vector<size_t> &align)
221
+ {
222
+ boost::unordered_map<std::vector<size_t>, uint32_t>::iterator iter =
223
+ m_aligns.find(align);
224
+ if (iter == m_aligns.end()) {
225
+ uint32_t ind = m_aligns.size();
226
+ m_aligns[align] = ind;
227
+ return ind;
228
+ } else {
229
+ return iter->second;
230
+ }
231
+ }
232
+
233
+ void StoreTarget::AppendLexRO(std::string &prop, std::vector<float> &retvector,
234
+ bool log_prob) const
235
+ {
236
+ size_t startPos = prop.find("{{LexRO ");
237
+
238
+ if (startPos != string::npos) {
239
+ size_t endPos = prop.find("}}", startPos + 8);
240
+ string lexProb = prop.substr(startPos + 8, endPos - startPos - 8);
241
+ //cerr << "lexProb=" << lexProb << endl;
242
+
243
+ // append lex probs to pt probs
244
+ vector<float> scores = Moses2::Tokenize<float>(lexProb);
245
+
246
+ if (log_prob) {
247
+ for (size_t i = 0; i < scores.size(); ++i) {
248
+ scores[i] = Moses2::FloorScore(log(scores[i]));
249
+ if (scores[i] == 0.0f) scores[i] = 0.0000000001;
250
+ }
251
+ }
252
+
253
+ for (size_t i = 0; i < scores.size(); ++i) {
254
+ retvector.push_back(scores[i]);
255
+ }
256
+
257
+ // exclude LexRO property from property column
258
+ prop = prop.substr(0, startPos)
259
+ + prop.substr(endPos + 2, prop.size() - endPos - 2);
260
+ //cerr << "line.property_to_be_binarized=" << line.property_to_be_binarized << "AAAA" << endl;
261
+ }
262
+ }
263
+
264
+ } /* namespace probingpt */
mosesdecoder/probingpt/StoreTarget.h ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+ * StoreTarget.h
3
+ *
4
+ * Created on: 19 Jan 2016
5
+ * Author: hieu
6
+ */
7
+ #pragma once
8
+ #include <string>
9
+ #include <fstream>
10
+ #include <vector>
11
+ #include <inttypes.h>
12
+ #include <boost/unordered_map.hpp>
13
+ #include <boost/unordered_set.hpp>
14
+ #include "StoreVocab.h"
15
+
16
+ namespace probingpt
17
+ {
18
+
19
+ class line_text;
20
+ class target_text;
21
+
22
+ class StoreTarget
23
+ {
24
+ public:
25
+ StoreTarget(const std::string &basepath);
26
+ virtual ~StoreTarget();
27
+
28
+ uint64_t Save();
29
+ void SaveAlignment();
30
+
31
+ void Append(const line_text &line, bool log_prob, bool scfg);
32
+ protected:
33
+ std::string m_basePath;
34
+ std::fstream m_fileTargetColl;
35
+ StoreVocab<uint32_t> m_vocab;
36
+
37
+ typedef boost::unordered_map<std::vector<size_t>, uint32_t> Alignments;
38
+ Alignments m_aligns;
39
+
40
+ std::vector<target_text*> m_coll;
41
+
42
+ uint32_t GetAlignId(const std::vector<size_t> &align);
43
+ void Save(const target_text &rule);
44
+
45
+ void AppendLexRO(std::string &prop, std::vector<float> &retvector,
46
+ bool log_prob) const;
47
+
48
+ };
49
+
50
+ } /* namespace probingpt */
51
+
mosesdecoder/probingpt/StoreVocab.cpp ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+ * StoreVocab.cpp
3
+ *
4
+ * Created on: 15 Jun 2016
5
+ * Author: hieu
6
+ */
7
+ #include <fstream>
8
+ #include "StoreVocab.h"
9
+
10
+ namespace probingpt
11
+ {
12
+
13
+ } /* namespace probingpt */
mosesdecoder/probingpt/StoreVocab.h ADDED
@@ -0,0 +1,60 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+ * StoreVocab.h
3
+ *
4
+ * Created on: 15 Jun 2016
5
+ * Author: hieu
6
+ */
7
+ #pragma once
8
+ #include <string>
9
+ #include <boost/unordered_map.hpp>
10
+ #include "OutputFileStream.h"
11
+ #include "moses2/legacy/Util2.h"
12
+
13
+ namespace probingpt
14
+ {
15
+
16
+ template<typename VOCABID>
17
+ class StoreVocab
18
+ {
19
+ protected:
20
+ std::string m_path;
21
+
22
+ typedef boost::unordered_map<std::string, VOCABID> Coll;
23
+ Coll m_vocab;
24
+
25
+ public:
26
+ StoreVocab(const std::string &path)
27
+ :m_path(path)
28
+ {}
29
+
30
+ virtual ~StoreVocab() {}
31
+
32
+ VOCABID GetVocabId(const std::string &word) {
33
+ typename Coll::iterator iter = m_vocab.find(word);
34
+ if (iter == m_vocab.end()) {
35
+ VOCABID ind = m_vocab.size() + 1;
36
+ m_vocab[word] = ind;
37
+ return ind;
38
+ } else {
39
+ return iter->second;
40
+ }
41
+ }
42
+
43
+ void Insert(VOCABID id, const std::string &word) {
44
+ m_vocab[word] = id;
45
+ }
46
+
47
+ void Save() {
48
+ OutputFileStream strme(m_path);
49
+
50
+ typename Coll::const_iterator iter;
51
+ for (iter = m_vocab.begin(); iter != m_vocab.end(); ++iter) {
52
+ strme << iter->first << "\t" << iter->second << std::endl;
53
+ }
54
+
55
+ strme.Close();
56
+ }
57
+ };
58
+
59
+ } /* namespace probingpt */
60
+
mosesdecoder/probingpt/bin/gcc-9/release/link-static/threading-multi/CreateProbingPT ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7fc6b9a78300ab10f59601d7e9650d090b3f50f6696b61ccb969f4ca349ae21c
3
+ size 1571424
mosesdecoder/probingpt/bin/gcc-9/release/link-static/threading-multi/CreateProbingPT.o ADDED
Binary file (274 kB). View file
 
mosesdecoder/probingpt/bin/gcc-9/release/link-static/threading-multi/InputFileStream.o ADDED
Binary file (23 kB). View file
 
mosesdecoder/probingpt/bin/gcc-9/release/link-static/threading-multi/OutputFileStream.o ADDED
Binary file (213 kB). View file
 
mosesdecoder/probingpt/bin/gcc-9/release/link-static/threading-multi/StoreTarget.o ADDED
Binary file (170 kB). View file
 
mosesdecoder/probingpt/bin/gcc-9/release/link-static/threading-multi/StoreVocab.o ADDED
Binary file (113 kB). View file
 
mosesdecoder/probingpt/bin/gcc-9/release/link-static/threading-multi/hash.o ADDED
Binary file (15.8 kB). View file
 
mosesdecoder/probingpt/bin/gcc-9/release/link-static/threading-multi/libprobingpt.a ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cca17ced5f31f0998cebf65d6fc9c7508886356fbac3586505cbe916634c9b7c
3
+ size 1227034
mosesdecoder/probingpt/bin/gcc-9/release/link-static/threading-multi/line_splitter.o ADDED
Binary file (25.7 kB). View file
 
mosesdecoder/probingpt/bin/gcc-9/release/link-static/threading-multi/probing_hash_utils.o ADDED
Binary file (7.44 kB). View file
 
mosesdecoder/probingpt/bin/gcc-9/release/link-static/threading-multi/querying.o ADDED
Binary file (213 kB). View file
 
mosesdecoder/probingpt/bin/gcc-9/release/link-static/threading-multi/storing.o ADDED
Binary file (190 kB). View file
 
mosesdecoder/probingpt/bin/gcc-9/release/link-static/threading-multi/util.o ADDED
Binary file (19 kB). View file
 
mosesdecoder/probingpt/bin/gcc-9/release/link-static/threading-multi/vocabid.o ADDED
Binary file (144 kB). View file
 
mosesdecoder/probingpt/gzfilebuf.h ADDED
@@ -0,0 +1,94 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #ifndef moses_gzfile_buf_h
2
+ #define moses_gzfile_buf_h
3
+
4
+ #include <stdexcept>
5
+ #include <streambuf>
6
+ #include <zlib.h>
7
+ #include <cstring>
8
+
9
+ namespace probingpt
10
+ {
11
+
12
+ /** wrapper around gzip input stream. Unknown parentage
13
+ * @todo replace with boost version - output stream already uses it
14
+ */
15
+ class gzfilebuf: public std::streambuf
16
+ {
17
+ public:
18
+ gzfilebuf(const char *filename) {
19
+ _gzf = gzopen(filename, "rb");
20
+ if (!_gzf) throw std::runtime_error(
21
+ "Could not open " + std::string(filename) + ".");
22
+ setg(_buff + sizeof(int), // beginning of putback area
23
+ _buff + sizeof(int), // read position
24
+ _buff + sizeof(int)); // end position
25
+ }
26
+ ~gzfilebuf() {
27
+ gzclose(_gzf);
28
+ }
29
+ protected:
30
+ virtual int_type overflow(int_type /* c */) {
31
+ throw;
32
+ }
33
+
34
+ // write multiple characters
35
+ virtual std::streamsize xsputn(const char* /* s */, std::streamsize /* num */) {
36
+ throw;
37
+ }
38
+
39
+ virtual std::streampos seekpos(std::streampos /* sp */,
40
+ std::ios_base::openmode /* which = std::ios_base::in | std::ios_base::out */) {
41
+ throw;
42
+ }
43
+
44
+ //read one character
45
+ virtual int_type underflow() {
46
+ // is read position before end of _buff?
47
+ if (gptr() < egptr()) {
48
+ return traits_type::to_int_type(*gptr());
49
+ }
50
+
51
+ /* process size of putback area
52
+ * - use number of characters read
53
+ * - but at most four
54
+ */
55
+ unsigned int numPutback = gptr() - eback();
56
+ if (numPutback > sizeof(int)) {
57
+ numPutback = sizeof(int);
58
+ }
59
+
60
+ /* copy up to four characters previously read into
61
+ * the putback _buff (area of first four characters)
62
+ */
63
+ std::memmove(_buff + (sizeof(int) - numPutback), gptr() - numPutback,
64
+ numPutback);
65
+
66
+ // read new characters
67
+ int num = gzread(_gzf, _buff + sizeof(int), _buffsize - sizeof(int));
68
+ if (num <= 0) {
69
+ // ERROR or EOF
70
+ return EOF;
71
+ }
72
+
73
+ // reset _buff pointers
74
+ setg(_buff + (sizeof(int) - numPutback), // beginning of putback area
75
+ _buff + sizeof(int), // read position
76
+ _buff + sizeof(int) + num); // end of buffer
77
+
78
+ // return next character
79
+ return traits_type::to_int_type(*gptr());
80
+ }
81
+
82
+ std::streamsize xsgetn(char* s, std::streamsize num) {
83
+ return gzread(_gzf, s, num);
84
+ }
85
+
86
+ private:
87
+ gzFile _gzf;
88
+ static const unsigned int _buffsize = 1024;
89
+ char _buff[_buffsize];
90
+ };
91
+
92
+ }
93
+
94
+ #endif
mosesdecoder/probingpt/hash.cpp ADDED
@@ -0,0 +1,44 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #include <iostream>
2
+ #include "hash.h"
3
+
4
+ using namespace std;
5
+
6
+ namespace probingpt
7
+ {
8
+
9
+ uint64_t getHash(StringPiece text)
10
+ {
11
+ std::size_t len = text.size();
12
+ uint64_t key = util::MurmurHashNative(text.data(), len);
13
+ return key;
14
+ }
15
+
16
+ std::vector<uint64_t> getVocabIDs(const StringPiece &textin)
17
+ {
18
+ //Tokenize
19
+ std::vector<uint64_t> output;
20
+
21
+ util::TokenIter<util::SingleCharacter> itWord(textin, util::SingleCharacter(' '));
22
+
23
+ while (itWord) {
24
+ StringPiece word = *itWord;
25
+ uint64_t id = 0;
26
+
27
+ util::TokenIter<util::SingleCharacter> itFactor(word, util::SingleCharacter('|'));
28
+ while (itFactor) {
29
+ StringPiece factor = *itFactor;
30
+ //cerr << "factor=" << factor << endl;
31
+
32
+ id += getHash(factor);
33
+ itFactor++;
34
+ }
35
+
36
+ output.push_back(id);
37
+ itWord++;
38
+ }
39
+
40
+ return output;
41
+ }
42
+
43
+ }
44
+
mosesdecoder/probingpt/hash.h ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #pragma once
2
+
3
+ #include "util/string_piece.hh"
4
+ #include "util/murmur_hash.hh"
5
+ #include "util/string_piece.hh" //Tokenization and work with StringPiece
6
+ #include "util/tokenize_piece.hh"
7
+ #include <vector>
8
+
9
+ namespace probingpt
10
+ {
11
+
12
+ //Gets the MurmurHash for the given string
13
+ uint64_t getHash(StringPiece text);
14
+
15
+ std::vector<uint64_t> getVocabIDs(const StringPiece &textin);
16
+
17
+ }
mosesdecoder/probingpt/line_splitter.cpp ADDED
@@ -0,0 +1,103 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #include "line_splitter.h"
2
+
3
+ namespace probingpt
4
+ {
5
+
6
+ line_text splitLine(const StringPiece &textin, bool scfg)
7
+ {
8
+ const char delim[] = "|||";
9
+ line_text output;
10
+
11
+ //Tokenize
12
+ util::TokenIter<util::MultiCharacter> it(textin, util::MultiCharacter(delim));
13
+ //Get source phrase
14
+ output.source_phrase = Trim(*it);
15
+ //std::cerr << "output.source_phrase=" << output.source_phrase << "AAAA" << std::endl;
16
+
17
+ //Get target_phrase
18
+ it++;
19
+ output.target_phrase = Trim(*it);
20
+ //std::cerr << "output.target_phrase=" << output.target_phrase << "AAAA" << std::endl;
21
+
22
+ if (scfg) {
23
+ /*
24
+ std::cerr << "output.source_phrase=" << output.source_phrase << std::endl;
25
+ std::cerr << "output.target_phrase=" << output.target_phrase << std::endl;
26
+ reformatSCFG(output);
27
+ std::cerr << "output.source_phrase=" << output.source_phrase << std::endl;
28
+ std::cerr << "output.target_phrase=" << output.target_phrase << std::endl;
29
+ */
30
+ }
31
+
32
+ //Get probabilities
33
+ it++;
34
+ output.prob = Trim(*it);
35
+ //std::cerr << "output.prob=" << output.prob << "AAAA" << std::endl;
36
+
37
+ //Get WordAllignment
38
+ it++;
39
+ if (it == util::TokenIter<util::MultiCharacter>::end()) return output;
40
+ output.word_align = Trim(*it);
41
+ //std::cerr << "output.word_align=" << output.word_align << "AAAA" << std::endl;
42
+
43
+ //Get count
44
+ it++;
45
+ if (it == util::TokenIter<util::MultiCharacter>::end()) return output;
46
+ output.counts = Trim(*it);
47
+ //std::cerr << "output.counts=" << output.counts << "AAAA" << std::endl;
48
+
49
+ //Get sparse_score
50
+ it++;
51
+ if (it == util::TokenIter<util::MultiCharacter>::end()) return output;
52
+ output.sparse_score = Trim(*it);
53
+ //std::cerr << "output.sparse_score=" << output.sparse_score << "AAAA" << std::endl;
54
+
55
+ //Get property
56
+ it++;
57
+ if (it == util::TokenIter<util::MultiCharacter>::end()) return output;
58
+ output.property = Trim(*it);
59
+ //std::cerr << "output.property=" << output.property << "AAAA" << std::endl;
60
+
61
+ return output;
62
+ }
63
+
64
+ std::vector<unsigned char> splitWordAll1(const StringPiece &textin)
65
+ {
66
+ const char delim[] = " ";
67
+ const char delim2[] = "-";
68
+ std::vector<unsigned char> output;
69
+
70
+ //Case with no word alignments.
71
+ if (textin.size() == 0) {
72
+ return output;
73
+ }
74
+
75
+ //Split on space
76
+ util::TokenIter<util::MultiCharacter> it(textin, util::MultiCharacter(delim));
77
+
78
+ //For each int
79
+ while (it) {
80
+ //Split on dash (-)
81
+ util::TokenIter<util::MultiCharacter> itInner(*it,
82
+ util::MultiCharacter(delim2));
83
+
84
+ //Insert the two entries in the vector. User will read entry 0 and 1 to get the first,
85
+ //2 and 3 for second etc. Use unsigned char instead of int to save space, as
86
+ //word allignments are all very small numbers that fit in a single byte
87
+ output.push_back((unsigned char) (atoi(itInner->data())));
88
+ itInner++;
89
+ output.push_back((unsigned char) (atoi(itInner->data())));
90
+ it++;
91
+ }
92
+
93
+ return output;
94
+
95
+ }
96
+
97
+ void reformatSCFG(line_text &output)
98
+ {
99
+
100
+ }
101
+
102
+ }
103
+
mosesdecoder/probingpt/line_splitter.h ADDED
@@ -0,0 +1,56 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #pragma once
2
+
3
+ #include <vector>
4
+ #include <cstdlib> //atof
5
+ #include "util/string_piece.hh"
6
+ #include "util/tokenize_piece.hh"
7
+ #include "util/file_piece.hh"
8
+ #include "util/string_piece.hh" //Tokenization and work with StringPiece
9
+ #include "util/tokenize_piece.hh"
10
+
11
+ namespace probingpt
12
+ {
13
+
14
+ //Struct for holding processed line
15
+ struct line_text {
16
+ StringPiece source_phrase;
17
+ StringPiece target_phrase;
18
+ StringPiece prob;
19
+ StringPiece word_align;
20
+ StringPiece counts;
21
+ StringPiece sparse_score;
22
+ StringPiece property;
23
+ std::string property_to_be_binarized;
24
+ };
25
+
26
// Holds the parts of one target-side entry as typed vectors.
// NOTE(review): how each vector is filled is decided by code outside this
// header - confirm before relying on the element meanings.
struct target_text {
  std::vector<unsigned int> target_phrase;
  std::vector<float> prob;
  std::vector<size_t> word_align_term;
  std::vector<size_t> word_align_non_term;
  std::vector<char> counts;
  std::vector<char> sparse_score;
  std::vector<char> property;
};
48
+
49
+ //Ask if it's better to have it receive a pointer to a line_text struct
50
+ line_text splitLine(const StringPiece &textin, bool scfg);
51
+ void reformatSCFG(line_text &output);
52
+
53
+ std::vector<unsigned char> splitWordAll1(const StringPiece &textin);
54
+
55
+ }
56
+
mosesdecoder/probingpt/probing_hash_utils.cpp ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #include <iostream>
2
+ #include "probing_hash_utils.h"
3
+ #include "util/file.hh"
4
+
5
+ namespace probingpt
6
+ {
7
+
8
+ //Read table from disk, return memory map location
9
+ char * readTable(const char * filename, util::LoadMethod load_method, util::scoped_fd &file, util::scoped_memory &memory)
10
+ {
11
+ //std::cerr << "filename=" << filename << std::endl;
12
+ file.reset(util::OpenReadOrThrow(filename));
13
+ uint64_t total_size_ = util::SizeFile(file.get());
14
+
15
+ MapRead(load_method, file.get(), 0, total_size_, memory);
16
+
17
+ return (char*) memory.get();
18
+ }
19
+
20
// Writes `size` bytes starting at `mem` to `filename` in binary mode.
// Failures to open or write the file are reported on stderr instead of
// being silently ignored; the function still returns normally.
void serialize_table(char *mem, size_t size, const std::string &filename)
{
  std::ofstream os(filename.c_str(), std::ios::binary);
  if (!os) {
    std::cerr << "Could not open " << filename << " for writing" << std::endl;
    return;
  }

  os.write(mem, static_cast<std::streamsize>(size));
  os.close();

  // close() flushes, so the stream state now reflects the complete write
  if (!os) {
    std::cerr << "Failed to write " << size << " bytes to " << filename << std::endl;
  }
}
27
+
28
// Combines per-word ids into one key for a source phrase: the (wrapping) sum
// of each id shifted left by its position in the phrase.
//
// The shift amount is reduced modulo 64 because shifting a 64-bit value by
// 64 or more is undefined behaviour in C++. Keys for phrases shorter than
// 64 words are unchanged.
// NOTE(review): the reduction mirrors how common 64-bit CPUs treat
// out-of-range variable shifts; confirm against any tables built from
// phrases of 64+ words.
uint64_t getKey(const uint64_t source_phrase[], size_t size)
{
  //TOO SLOW
  //uint64_t key = util::MurmurHashNative(&source_phrase[0], source_phrase.size());
  uint64_t key = 0;
  for (size_t i = 0; i < size; i++) {
    key += (source_phrase[i] << (i & 63));
  }
  return key;
}
38
+
39
+ }
40
+
mosesdecoder/probingpt/probing_hash_utils.h ADDED
@@ -0,0 +1,55 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #pragma once
2
+
3
+ #include "util/probing_hash_table.hh"
4
+
5
+ #if defined(_WIN32) || defined(_WIN64)
6
+ #include <mman.h>
7
+ #else
8
+ #include <sys/mman.h>
9
+ #endif
10
+ #include <boost/functional/hash.hpp>
11
+ #include <fcntl.h>
12
+ #include <fstream>
13
+
14
+ namespace probingpt
15
+ {
16
+
17
+ #define API_VERSION 15
18
+
19
// Hash table entry: a 64-bit key plus a 64-bit payload.
// GetKey()/SetKey() are the accessors the table template calls.
struct Entry {
  typedef uint64_t Key;
  Key key;

  Key GetKey() const { return key; }
  void SetKey(Key to) { key = to; }

  uint64_t value;
};
34
+
35
+ #define NONE std::numeric_limits<uint64_t>::max()
36
+
37
+ //Define table
38
+ typedef util::ProbingHashTable<Entry, boost::hash<uint64_t> > Table;
39
+
40
+ void serialize_table(char *mem, size_t size, const std::string &filename);
41
+
42
+ char * readTable(const char * filename, util::LoadMethod load_method, util::scoped_fd &file, util::scoped_memory &memory);
43
+
44
+ uint64_t getKey(const uint64_t source_phrase[], size_t size);
45
+
46
// Fixed-size header describing one stored target phrase.
// NOTE(review): the field meanings below are inferred from the names only -
// confirm against the code that writes and reads these records. Do not
// reorder or resize the members without checking that code too.
struct TargetPhraseInfo {
  uint32_t alignTerm;     // presumably an index for terminal alignments
  uint32_t alignNonTerm;  // presumably an index for non-terminal alignments
  uint16_t numWords;      // presumably the number of target words
  uint16_t propLength;    // presumably the length of the property data
  uint16_t filler;        // presumably padding
};
53
+
54
+ }
55
+
mosesdecoder/probingpt/querying.cpp ADDED
@@ -0,0 +1,179 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #include "querying.h"
2
+ #include "util/exception.hh"
3
+ #include "moses2/legacy/Util2.h"
4
+
5
+ using namespace std;
6
+
7
+ namespace probingpt
8
+ {
9
+
10
+ QueryEngine::QueryEngine(const char * filepath, util::LoadMethod load_method)
11
+ {
12
+
13
+ //Create filepaths
14
+ std::string basepath(filepath);
15
+ std::string path_to_config = basepath + "/config";
16
+ std::string path_to_hashtable = basepath + "/probing_hash.dat";
17
+ std::string path_to_source_vocabid = basepath + "/source_vocabids";
18
+ std::string alignPath = basepath + "/Alignments.dat";
19
+
20
+ file_exits(basepath);
21
+
22
+ ///Source phrase vocabids
23
+ read_map(source_vocabids, path_to_source_vocabid.c_str());
24
+
25
+ // alignments
26
+ read_alignments(alignPath);
27
+
28
+ // target phrase
29
+ string targetCollPath = basepath + "/TargetColl.dat";
30
+ memTPS = readTable(targetCollPath.c_str(), load_method, fileTPS_, memoryTPS_);
31
+
32
+ //Read config file
33
+ boost::unordered_map<std::string, std::string> keyValue;
34
+
35
+ std::ifstream config(path_to_config.c_str());
36
+ std::string line;
37
+ while (getline(config, line)) {
38
+ std::vector<std::string> toks = Moses2::Tokenize(line, "\t");
39
+ UTIL_THROW_IF2(toks.size() != 2, "Wrong config format:" << line);
40
+ keyValue[ toks[0] ] = toks[1];
41
+ }
42
+
43
+ bool found;
44
+ //Check API version:
45
+ int version;
46
+ found = Get(keyValue, "API_VERSION", version);
47
+ if (!found) {
48
+ std::cerr << "Old or corrupted version of ProbingPT. Please rebinarize your phrase tables." << std::endl;
49
+ } else if (version != API_VERSION) {
50
+ std::cerr << "The ProbingPT API has changed. " << version << "!="
51
+ << API_VERSION << " Please rebinarize your phrase tables." << std::endl;
52
+ exit(EXIT_FAILURE);
53
+ }
54
+
55
+ //Get tablesize.
56
+ int tablesize;
57
+ found = Get(keyValue, "uniq_entries", tablesize);
58
+ if (!found) {
59
+ std::cerr << "uniq_entries not found" << std::endl;
60
+ exit(EXIT_FAILURE);
61
+ }
62
+
63
+ //Number of scores
64
+ found = Get(keyValue, "num_scores", num_scores);
65
+ if (!found) {
66
+ std::cerr << "num_scores not found" << std::endl;
67
+ exit(EXIT_FAILURE);
68
+ }
69
+
70
+ //How may scores from lex reordering models
71
+ found = Get(keyValue, "num_lex_scores", num_lex_scores);
72
+ if (!found) {
73
+ std::cerr << "num_lex_scores not found" << std::endl;
74
+ exit(EXIT_FAILURE);
75
+ }
76
+
77
+ // have the scores been log() and FloorScore()?
78
+ found = Get(keyValue, "log_prob", logProb);
79
+ if (!found) {
80
+ std::cerr << "logProb not found" << std::endl;
81
+ exit(EXIT_FAILURE);
82
+ }
83
+
84
+ config.close();
85
+
86
+ //Read hashtable
87
+ table_filesize = Table::Size(tablesize, 1.2);
88
+ mem = readTable(path_to_hashtable.c_str(), load_method, file_, memory_);
89
+ Table table_init(mem, table_filesize);
90
+ table = table_init;
91
+
92
+ std::cerr << "Initialized successfully! " << std::endl;
93
+ }
94
+
95
+ QueryEngine::~QueryEngine()
96
+ {
97
+ //Clear mmap content from memory.
98
+ //munmap(mem, table_filesize);
99
+
100
+ }
101
+
102
+ uint64_t QueryEngine::getKey(uint64_t source_phrase[], size_t size) const
103
+ {
104
+ //TOO SLOW
105
+ //uint64_t key = util::MurmurHashNative(&source_phrase[0], source_phrase.size());
106
+ return probingpt::getKey(source_phrase, size);
107
+ }
108
+
109
+ std::pair<bool, uint64_t> QueryEngine::query(uint64_t key)
110
+ {
111
+ std::pair<bool, uint64_t> ret;
112
+
113
+ const Entry * entry;
114
+ ret.first = table.Find(key, entry);
115
+ if (ret.first) {
116
+ ret.second = entry->value;
117
+ }
118
+ return ret;
119
+ }
120
+
121
+ void QueryEngine::read_alignments(const std::string &alignPath)
122
+ {
123
+ std::ifstream strm(alignPath.c_str());
124
+
125
+ string line;
126
+ while (getline(strm, line)) {
127
+ vector<string> toks = Moses2::Tokenize(line, "\t ");
128
+ UTIL_THROW_IF2(toks.size() == 0, "Corrupt alignment file");
129
+
130
+ uint32_t alignInd = Moses2::Scan<uint32_t>(toks[0]);
131
+ if (alignInd >= alignColl.size()) {
132
+ alignColl.resize(alignInd + 1);
133
+ }
134
+
135
+ Alignments &aligns = alignColl[alignInd];
136
+ for (size_t i = 1; i < toks.size(); ++i) {
137
+ size_t pos = Moses2::Scan<size_t>(toks[i]);
138
+ aligns.push_back(pos);
139
+ }
140
+ }
141
+ }
142
+
143
+ void QueryEngine::file_exits(const std::string &basePath)
144
+ {
145
+ if (!Moses2::FileExists(basePath + "/Alignments.dat")) {
146
+ UTIL_THROW2("Require file does not exist in: " << basePath << "/Alignments.dat");
147
+ }
148
+ if (!Moses2::FileExists(basePath + "/TargetColl.dat")) {
149
+ UTIL_THROW2("Require file does not exist in: " << basePath << "/TargetColl.dat");
150
+ }
151
+ if (!Moses2::FileExists(basePath + "/TargetVocab.dat")) {
152
+ UTIL_THROW2("Require file does not exist in: " << basePath << "/TargetVocab.dat");
153
+ }
154
+ if (!Moses2::FileExists(basePath + "/cache")) {
155
+ UTIL_THROW2("Require file does not exist in: " << basePath << "/cache");
156
+ }
157
+ if (!Moses2::FileExists(basePath + "/config")) {
158
+ UTIL_THROW2("Require file does not exist in: " << basePath << "/config");
159
+ }
160
+ if (!Moses2::FileExists(basePath + "/probing_hash.dat")) {
161
+ UTIL_THROW2("Require file does not exist in: " << basePath << "/probing_hash.dat");
162
+ }
163
+ if (!Moses2::FileExists(basePath + "/source_vocabids")) {
164
+ UTIL_THROW2("Require file does not exist in: " << basePath << "/source_vocabids");
165
+ }
166
+
167
+ /*
168
+
169
+ if (!FileExists(path_to_config) || !FileExists(path_to_hashtable) ||
170
+ !FileExists(path_to_source_vocabid) || !FileExists(basepath + alignPath) ||
171
+ !FileExists(basepath + "/TargetColl.dat") || !FileExists(basepath + "/TargetVocab.dat") ||
172
+ !FileExists(basepath + "/cache")) {
173
+ UTIL_THROW2("A required table doesn't exist in: " << basepath);
174
+ }
175
+ */
176
+ }
177
+
178
+ }
179
+
mosesdecoder/probingpt/querying.h ADDED
@@ -0,0 +1,79 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #pragma once
2
+
3
+ #include <boost/iostreams/device/mapped_file.hpp>
4
+ #include <boost/unordered_map.hpp>
5
+ #include <sys/stat.h> //For finding size of file
6
+ #include <algorithm> //toLower
7
+ #include <deque>
8
+ #include "vocabid.h"
9
+ #include "probing_hash_utils.h"
10
+ #include "hash.h" //Includes line splitter
11
+ #include "line_splitter.h"
12
+ #include "util.h"
13
+ #include "moses2/legacy/Util2.h"
14
+
15
+ namespace probingpt
16
+ {
17
+
18
+ class QueryEngine
19
+ {
20
+ std::map<uint64_t, std::string> source_vocabids;
21
+
22
+ typedef std::vector<unsigned char> Alignments;
23
+ std::vector<Alignments> alignColl;
24
+
25
+ Table table;
26
+ char *mem; //Memory for the table, necessary so that we can correctly destroy the object
27
+
28
+ size_t table_filesize;
29
+ bool is_reordering;
30
+
31
+ util::scoped_fd file_;
32
+ util::scoped_memory memory_;
33
+
34
+ // target phrases
35
+ boost::iostreams::mapped_file_source file;
36
+
37
+ util::scoped_fd fileTPS_;
38
+ util::scoped_memory memoryTPS_;
39
+
40
+ void read_alignments(const std::string &alignPath);
41
+ void file_exits(const std::string &basePath);
42
+
43
+ public:
44
+ int num_scores;
45
+ int num_lex_scores;
46
+ bool logProb;
47
+ const char *memTPS;
48
+
49
+ QueryEngine(const char *, util::LoadMethod load_method);
50
+ ~QueryEngine();
51
+
52
+ std::pair<bool, uint64_t> query(uint64_t key);
53
+
54
+ const std::map<uint64_t, std::string> &getSourceVocab() const {
55
+ return source_vocabids;
56
+ }
57
+
58
+ const std::vector<Alignments> &getAlignments() const {
59
+ return alignColl;
60
+ }
61
+
62
+ uint64_t getKey(uint64_t source_phrase[], size_t size) const;
63
+
64
+ template<typename T>
65
+ inline bool Get(const boost::unordered_map<std::string, std::string> &keyValue, const std::string &sought, T &found) const {
66
+ boost::unordered_map<std::string, std::string>::const_iterator iter = keyValue.find(sought);
67
+ if (iter == keyValue.end()) {
68
+ return false;
69
+ }
70
+
71
+ const std::string &foundStr = iter->second;
72
+ found = Scan<T>(foundStr);
73
+ return true;
74
+ }
75
+
76
+ };
77
+
78
+ }
79
+
mosesdecoder/probingpt/storing.cpp ADDED
@@ -0,0 +1,302 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #include <sys/stat.h>
2
+ #include <boost/foreach.hpp>
3
+ #include "line_splitter.h"
4
+ #include "storing.h"
5
+ #include "StoreTarget.h"
6
+ #include "StoreVocab.h"
7
+ #include "moses2/legacy/Util2.h"
8
+ #include "InputFileStream.h"
9
+
10
+ using namespace std;
11
+
12
+ namespace probingpt
13
+ {
14
+
15
+ ///////////////////////////////////////////////////////////////////////
16
+ void Node::Add(Table &table, const SourcePhrase &sourcePhrase, size_t pos)
17
+ {
18
+ if (pos < sourcePhrase.size()) {
19
+ uint64_t vocabId = sourcePhrase[pos];
20
+
21
+ Node *child;
22
+ Children::iterator iter = m_children.find(vocabId);
23
+ if (iter == m_children.end()) {
24
+ // New node. Write other children then discard them
25
+ BOOST_FOREACH(Children::value_type &valPair, m_children) {
26
+ Node &otherChild = valPair.second;
27
+ otherChild.Write(table);
28
+ }
29
+ m_children.clear();
30
+
31
+ // create new node
32
+ child = &m_children[vocabId];
33
+ assert(!child->done);
34
+ child->key = key + (vocabId << pos);
35
+ } else {
36
+ child = &iter->second;
37
+ }
38
+
39
+ child->Add(table, sourcePhrase, pos + 1);
40
+ } else {
41
+ // this node was written previously 'cos it has rules
42
+ done = true;
43
+ }
44
+ }
45
+
46
+ void Node::Write(Table &table)
47
+ {
48
+ //cerr << "START write " << done << " " << key << endl;
49
+ BOOST_FOREACH(Children::value_type &valPair, m_children) {
50
+ Node &child = valPair.second;
51
+ child.Write(table);
52
+ }
53
+
54
+ if (!done) {
55
+ // save
56
+ Entry sourceEntry;
57
+ sourceEntry.value = NONE;
58
+ sourceEntry.key = key;
59
+
60
+ //Put into table
61
+ table.Insert(sourceEntry);
62
+ }
63
+ }
64
+
65
+ ///////////////////////////////////////////////////////////////////////
66
+ void createProbingPT(const std::string &phrasetable_path,
67
+ const std::string &basepath, int num_scores, int num_lex_scores,
68
+ bool log_prob, int max_cache_size, bool scfg)
69
+ {
70
+ #if defined(_WIN32) || defined(_WIN64)
71
+ std::cerr << "Create not implemented for Windows" << std::endl;
72
+ #else
73
+ std::cerr << "Starting..." << std::endl;
74
+
75
+ //Get basepath and create directory if missing
76
+ mkdir(basepath.c_str(), S_IRWXU | S_IRWXG | S_IROTH | S_IXOTH);
77
+
78
+ StoreTarget storeTarget(basepath);
79
+
80
+ //Get uniq lines:
81
+ unsigned long uniq_entries = countUniqueSource(phrasetable_path);
82
+
83
+ //Source phrase vocabids
84
+ StoreVocab<uint64_t> sourceVocab(basepath + "/source_vocabids");
85
+
86
+ //Read the file
87
+ util::FilePiece filein(phrasetable_path.c_str());
88
+
89
+ //Init the probing hash table
90
+ size_t size = Table::Size(uniq_entries, 1.2);
91
+ char * mem = new char[size];
92
+ memset(mem, 0, size);
93
+ Table sourceEntries(mem, size);
94
+
95
+ std::priority_queue<CacheItem*, std::vector<CacheItem*>, CacheItemOrderer> cache;
96
+ float totalSourceCount = 0;
97
+
98
+ //Keep track of the size of each group of target phrases
99
+ size_t line_num = 0;
100
+
101
+ //Read everything and processs
102
+ std::string prevSource;
103
+
104
+ Node sourcePhrases;
105
+ sourcePhrases.done = true;
106
+ sourcePhrases.key = 0;
107
+
108
+ while (true) {
109
+ try {
110
+ //Process line read
111
+ line_text line;
112
+ line = splitLine(filein.ReadLine(), scfg);
113
+ //cerr << "line=" << line.source_phrase << endl;
114
+
115
+ ++line_num;
116
+ if (line_num % 1000000 == 0) {
117
+ std::cerr << line_num << " " << std::flush;
118
+ }
119
+
120
+ //Add source phrases to vocabularyIDs
121
+ add_to_map(sourceVocab, line.source_phrase);
122
+
123
+ if (prevSource.empty()) {
124
+ // 1st line
125
+ prevSource = line.source_phrase.as_string();
126
+ storeTarget.Append(line, log_prob, scfg);
127
+ } else if (prevSource == line.source_phrase) {
128
+ //If we still have the same line, just append to it:
129
+ storeTarget.Append(line, log_prob, scfg);
130
+ } else {
131
+ assert(prevSource != line.source_phrase);
132
+
133
+ //Create a new entry even
134
+
135
+ // save
136
+ uint64_t targetInd = storeTarget.Save();
137
+
138
+ // next line
139
+ storeTarget.Append(line, log_prob, scfg);
140
+
141
+ //Create an entry for the previous source phrase:
142
+ Entry sourceEntry;
143
+ sourceEntry.value = targetInd;
144
+ //The key is the sum of hashes of individual words bitshifted by their position in the phrase.
145
+ //Probably not entirerly correct, but fast and seems to work fine in practise.
146
+ std::vector<uint64_t> vocabid_source = getVocabIDs(prevSource);
147
+ if (scfg) {
148
+ // storing prefixes?
149
+ sourcePhrases.Add(sourceEntries, vocabid_source);
150
+ }
151
+ sourceEntry.key = getKey(vocabid_source);
152
+
153
+ /*
154
+ cerr << "prevSource=" << prevSource << flush
155
+ << " vocabids=" << Debug(vocabid_source) << flush
156
+ << " key=" << sourceEntry.key << endl;
157
+ */
158
+ //Put into table
159
+ sourceEntries.Insert(sourceEntry);
160
+
161
+ // update cache - CURRENT source phrase, not prev
162
+ if (max_cache_size) {
163
+ std::string countStr = line.counts.as_string();
164
+ countStr = Moses2::Trim(countStr);
165
+ if (!countStr.empty()) {
166
+ std::vector<float> toks = Moses2::Tokenize<float>(countStr);
167
+ //cerr << "CACHE:" << line.source_phrase << " " << countStr << " " << toks[1] << endl;
168
+
169
+ if (toks.size() >= 2) {
170
+ totalSourceCount += toks[1];
171
+
172
+ // compute key for CURRENT source
173
+ std::vector<uint64_t> currVocabidSource = getVocabIDs(line.source_phrase.as_string());
174
+ uint64_t currKey = getKey(currVocabidSource);
175
+
176
+ CacheItem *item = new CacheItem(
177
+ Moses2::Trim(line.source_phrase.as_string()),
178
+ currKey,
179
+ toks[1]);
180
+ cache.push(item);
181
+
182
+ if (max_cache_size > 0 && cache.size() > max_cache_size) {
183
+ cache.pop();
184
+ }
185
+ }
186
+ }
187
+ }
188
+
189
+ //Set prevLine
190
+ prevSource = line.source_phrase.as_string();
191
+ }
192
+
193
+ } catch (util::EndOfFileException e) {
194
+ std::cerr
195
+ << "Reading phrase table finished, writing remaining files to disk."
196
+ << std::endl;
197
+
198
+ //After the final entry is constructed we need to add it to the phrase_table
199
+ //Create an entry for the previous source phrase:
200
+ uint64_t targetInd = storeTarget.Save();
201
+
202
+ Entry sourceEntry;
203
+ sourceEntry.value = targetInd;
204
+
205
+ //The key is the sum of hashes of individual words. Probably not entirerly correct, but fast
206
+ std::vector<uint64_t> vocabid_source = getVocabIDs(prevSource);
207
+ sourceEntry.key = getKey(vocabid_source);
208
+
209
+ //Put into table
210
+ sourceEntries.Insert(sourceEntry);
211
+
212
+ break;
213
+ }
214
+ }
215
+
216
+ sourcePhrases.Write(sourceEntries);
217
+
218
+ storeTarget.SaveAlignment();
219
+
220
+ serialize_table(mem, size, (basepath + "/probing_hash.dat"));
221
+
222
+ sourceVocab.Save();
223
+
224
+ serialize_cache(cache, (basepath + "/cache"), totalSourceCount);
225
+
226
+ delete[] mem;
227
+
228
+ //Write configfile
229
+ std::ofstream configfile;
230
+ configfile.open((basepath + "/config").c_str());
231
+ configfile << "API_VERSION\t" << API_VERSION << '\n';
232
+ configfile << "uniq_entries\t" << uniq_entries << '\n';
233
+ configfile << "num_scores\t" << num_scores << '\n';
234
+ configfile << "num_lex_scores\t" << num_lex_scores << '\n';
235
+ configfile << "log_prob\t" << log_prob << '\n';
236
+ configfile.close();
237
+ #endif
238
+ }
239
+
240
+ size_t countUniqueSource(const std::string &path)
241
+ {
242
+ size_t ret = 0;
243
+ InputFileStream strme(path);
244
+
245
+ std::string line, prevSource;
246
+ while (std::getline(strme, line)) {
247
+ std::vector<std::string> toks = Moses2::TokenizeMultiCharSeparator(line, "|||");
248
+ assert(toks.size() != 0);
249
+
250
+ if (prevSource != toks[0]) {
251
+ prevSource = toks[0];
252
+ ++ret;
253
+ }
254
+ }
255
+
256
+ return ret;
257
+ }
258
+
259
+ void serialize_cache(
260
+ std::priority_queue<CacheItem*, std::vector<CacheItem*>, CacheItemOrderer> &cache,
261
+ const std::string &path, float totalSourceCount)
262
+ {
263
+ std::vector<const CacheItem*> vec(cache.size());
264
+
265
+ size_t ind = cache.size() - 1;
266
+ while (!cache.empty()) {
267
+ const CacheItem *item = cache.top();
268
+ vec[ind] = item;
269
+ cache.pop();
270
+ --ind;
271
+ }
272
+
273
+ std::ofstream os(path.c_str());
274
+
275
+ os << totalSourceCount << std::endl;
276
+ for (size_t i = 0; i < vec.size(); ++i) {
277
+ const CacheItem *item = vec[i];
278
+ os << item->count << "\t" << item->sourceKey << "\t" << item->source << std::endl;
279
+ delete item;
280
+ }
281
+
282
+ os.close();
283
+ }
284
+
285
+ uint64_t getKey(const std::vector<uint64_t> &vocabid_source)
286
+ {
287
+ return probingpt::getKey(vocabid_source.data(), vocabid_source.size());
288
+ }
289
+
290
// Returns a copy of the first endPos + 1 elements of vocabid_source.
// endPos must be a valid index (checked with assert only).
std::vector<uint64_t> CreatePrefix(const std::vector<uint64_t> &vocabid_source, size_t endPos)
{
  assert(endPos < vocabid_source.size());

  return std::vector<uint64_t>(vocabid_source.begin(),
                               vocabid_source.begin() + (endPos + 1));
}
300
+
301
+ }
302
+
mosesdecoder/probingpt/storing.h ADDED
@@ -0,0 +1,92 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #pragma once
2
+
3
+ #include <boost/unordered_set.hpp>
4
+ #include <boost/unordered_map.hpp>
5
+ #include <cstdio>
6
+ #include <sstream>
7
+ #include <fstream>
8
+ #include <iostream>
9
+ #include <string>
10
+ #include <queue>
11
+ #include <sys/stat.h> //mkdir
12
+
13
+ #include "hash.h" //Includes line_splitter
14
+ #include "probing_hash_utils.h"
15
+ #include "vocabid.h"
16
+
17
+ #include "util/file_piece.hh"
18
+ #include "util/file.hh"
19
+
20
+ namespace probingpt
21
+ {
22
+ typedef std::vector<uint64_t> SourcePhrase;
23
+
24
+
25
+ class Node
26
+ {
27
+ typedef boost::unordered_map<uint64_t, Node> Children;
28
+ Children m_children;
29
+
30
+ public:
31
+ uint64_t key;
32
+ bool done;
33
+
34
+ Node()
35
+ :done(false)
36
+ {}
37
+
38
+ void Add(Table &table, const SourcePhrase &sourcePhrase, size_t pos = 0);
39
+ void Write(Table &table);
40
+ };
41
+
42
+
43
+ void createProbingPT(const std::string &phrasetable_path,
44
+ const std::string &basepath, int num_scores, int num_lex_scores,
45
+ bool log_prob, int max_cache_size, bool scfg);
46
+ uint64_t getKey(const std::vector<uint64_t> &source_phrase);
47
+
48
+ std::vector<uint64_t> CreatePrefix(const std::vector<uint64_t> &vocabid_source, size_t endPos);
49
+
50
// Formats the elements of vec with operator<<, each followed by one space
// (so the result has a trailing space; an empty vector gives "").
template<typename T>
std::string Debug(const std::vector<T> &vec)
{
  std::stringstream strm;
  for (typename std::vector<T>::const_iterator it = vec.begin(); it != vec.end(); ++it) {
    strm << *it << " ";
  }
  return strm.str();
}
59
+
60
+ size_t countUniqueSource(const std::string &path);
61
+
62
// One cache candidate: a source phrase, its hash key, and a count.
class CacheItem
{
public:
  std::string source;
  uint64_t sourceKey;
  float count;

  CacheItem(const std::string &vSource, uint64_t vSourceKey, float vCount)
    : source(vSource), sourceKey(vSourceKey), count(vCount)
  {}

  // Deliberately reversed: an item compares "less" when its count is
  // HIGHER, so a std::priority_queue keeps the lowest count on top.
  bool operator<(const CacheItem &other) const {
    return other.count < count;
  }
};
78
+
79
+ class CacheItemOrderer
80
+ {
81
+ public:
82
+ bool operator()(const CacheItem* a, const CacheItem* b) const {
83
+ return (*a) < (*b);
84
+ }
85
+ };
86
+
87
+ void serialize_cache(
88
+ std::priority_queue<CacheItem*, std::vector<CacheItem*>, CacheItemOrderer> &cache,
89
+ const std::string &path, float totalSourceCount);
90
+
91
+ }
92
+
mosesdecoder/probingpt/util.cpp ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #include <cctype>
2
+ #include "util.h"
3
+ #include "util/exception.hh"
4
+
5
+ namespace probingpt
6
+ {
7
+
8
+ template<>
9
+ bool Scan<bool>(const std::string &input)
10
+ {
11
+ std::string lc = ToLower(input);
12
+ if (lc == "yes" || lc == "y" || lc == "true" || lc == "1") return true;
13
+ if (lc == "no" || lc == "n" || lc == "false" || lc == "0") return false;
14
+ UTIL_THROW2("Could not interpret " << input << " as a boolean. After lowercasing, valid values are yes, y, true, 1, no, n, false, and 0.");
15
+ }
16
+
17
/**
 * Return a copy of \p str with every byte passed through std::tolower.
 *
 * Each char is converted to unsigned char before the call: std::tolower has
 * undefined behaviour for negative arguments other than EOF, which is what
 * plain (signed) char yields for bytes >= 0x80 such as UTF-8 sequences.
 * An explicit loop is used so the function no longer depends on
 * <algorithm>, which this file never included.
 */
const std::string ToLower(const std::string& str)
{
  std::string lc(str);
  for (std::string::size_type i = 0; i < lc.size(); ++i) {
    lc[i] = static_cast<char>(std::tolower(static_cast<unsigned char>(lc[i])));
  }
  return lc;
}
24
+
25
+ }
mosesdecoder/probingpt/util.h ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #pragma once
2
+ #include <string>
3
+ #include <sstream>
4
+
5
+ namespace probingpt
6
+ {
7
+
8
//! Convert a string to a value of type T using stream extraction.
//! Used for reading floats, ints etc. from files.
//! The result is value-initialised first, so input from which nothing can be
//! extracted (e.g. an empty string) yields T() rather than an indeterminate
//! value.
template<typename T>
inline T Scan(const std::string &input)
{
  std::stringstream stream(input);
  T ret = T();
  stream >> ret;
  return ret;
}
17
+
18
+ //! Specialisation to understand yes/no y/n true/false 0/1
19
+ template<>
20
+ bool Scan<bool>(const std::string &input);
21
+
22
+ const std::string ToLower(const std::string& str);
23
+
24
+ }
mosesdecoder/probingpt/vocabid.cpp ADDED
@@ -0,0 +1,59 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #include <boost/foreach.hpp>
2
+ #include "vocabid.h"
3
+ #include "StoreVocab.h"
4
+ #include "moses2/legacy/Util2.h"
5
+
6
+ namespace probingpt
7
+ {
8
+
9
+ void add_to_map(StoreVocab<uint64_t> &sourceVocab,
10
+ const StringPiece &textin)
11
+ {
12
+ //Tokenize
13
+ util::TokenIter<util::SingleCharacter> itWord(textin, util::SingleCharacter(' '));
14
+
15
+ while (itWord) {
16
+ StringPiece word = *itWord;
17
+
18
+ util::TokenIter<util::SingleCharacter> itFactor(word, util::SingleCharacter('|'));
19
+ while (itFactor) {
20
+ StringPiece factor = *itFactor;
21
+
22
+ sourceVocab.Insert(getHash(factor), factor.as_string());
23
+ itFactor++;
24
+ }
25
+ itWord++;
26
+ }
27
+ }
28
+
29
/**
 * Write \p karta to \p filename as text, one entry per line in key order:
 *   <key> TAB <value> NEWLINE
 *
 * Lines are terminated with '\n' rather than std::endl so the stream is not
 * flushed after every entry; the bytes written are the same.
 * A failure to open the file is not reported (unchanged from before).
 */
void serialize_map(const std::map<uint64_t, std::string> &karta,
                   const std::string &filename)
{
  std::ofstream os(filename.c_str());

  std::map<uint64_t, std::string>::const_iterator iter;
  for (iter = karta.begin(); iter != karta.end(); ++iter) {
    os << iter->first << '\t' << iter->second << '\n';
  }

  os.close();
}
41
+
42
+ void read_map(std::map<uint64_t, std::string> &karta, const char* filename)
43
+ {
44
+ std::ifstream is(filename);
45
+
46
+ std::string line;
47
+ while (getline(is, line)) {
48
+ std::vector<std::string> toks = Moses2::Tokenize(line, "\t");
49
+ assert(toks.size() == 2);
50
+ uint64_t ind = Moses2::Scan<uint64_t>(toks[1]);
51
+ karta[ind] = toks[0];
52
+ }
53
+
54
+ //Close the stream after we are done.
55
+ is.close();
56
+ }
57
+
58
+ }
59
+
mosesdecoder/probingpt/vocabid.h ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ //Serialization
2
+ #include <boost/serialization/serialization.hpp>
3
+ #include <boost/serialization/map.hpp>
4
+ #include <boost/archive/text_iarchive.hpp>
5
+ #include <boost/archive/text_oarchive.hpp>
6
+ #include <fstream>
7
+ #include <iostream>
8
+ #include <vector>
9
+
10
+ #include <map> //Container
11
+ #include "hash.h" //Hash of elements
12
+
13
+ #include "util/string_piece.hh" //Tokenization and work with StringPiece
14
+ #include "util/tokenize_piece.hh"
15
+
16
+ namespace probingpt
17
+ {
18
+ template<typename VOCABID>
19
+ class StoreVocab;
20
+
21
+ void add_to_map(StoreVocab<uint64_t> &sourceVocab,
22
+ const StringPiece &textin);
23
+
24
+ void serialize_map(const std::map<uint64_t, std::string> &karta,
25
+ const std::string &filename);
26
+
27
+ void read_map(std::map<uint64_t, std::string> &karta, const char* filename);
28
+
29
+ }
mosesdecoder/regression-testing/Jamfile ADDED
@@ -0,0 +1,81 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Regression-test targets. Three command-line options are read below:
# "with-regtest" (location of the test data), "regtest-skip-compactpt" and
# "with-xmlrpc-c".
+ import option path ;
2
+
3
+ with-regtest = [ option.get "with-regtest" ] ;
4
+ skip-compact = [ option.get "regtest-skip-compactpt" : : "yes" ] ;
5
+ with-xmlrpc = [ option.get "with-xmlrpc-c" ] ;
6
+
7
# If a path was given for with-regtest it is rooted at the current directory;
# if the option was given without a value, the git submodule is initialised
# and updated and $(TOP)/regression-testing/tests is used instead.
+ if $(with-regtest) {
8
+ with-regtest = [ path.root $(with-regtest) [ path.pwd ] ] ;
9
+ } else if [ option.get "with-regtest" : : "yes" ] {
10
+ shell_or_fail "git submodule init" ;
11
+ shell_or_fail "git submodule update" ;
12
+ with-regtest = $(TOP)/regression-testing/tests ;
13
+ }
14
+
15
# Everything below is defined only when test data is available.
+ if $(with-regtest) {
16
+ test-dir = $(with-regtest)/tests ;
17
+
18
# reg_test: for every path in `tests`, declare a `<basename>.passed` target
# made from `programs` by `action`, plus an alias with the test's own name;
# `name` aliases all the .passed targets of the group.
+ rule reg_test ( name : tests * : programs * : action ) {
19
+ alias $(name) : $(tests:D=).passed ;
20
+ for test in $(tests) {
21
+ make $(test:D=).passed : $(programs) : $(action) ;
22
+ alias $(test) : $(test:D=).passed ;
23
+ }
24
+ }
25
+
26
# Each action runs a driver script and touches the .passed target only when
# the script exits successfully.
+ actions reg_test_decode {
27
+ $(TOP)/regression-testing/run-single-test.perl --decoder=$(>) --test=$(<:B) --data-dir=$(with-regtest) --test-dir=$(test-dir) && touch $(<)
28
+ }
29
+
30
+ if $(with-xmlrpc) {
31
+ actions reg_test_decode_server {
32
+ $(TOP)/regression-testing/run-single-test.perl --server --decoder=$(>) --test=$(<:B) --data-dir=$(with-regtest) --test-dir=$(test-dir) && touch $(<)
33
+ }
34
+ reg_test phrase-server : [ glob $(test-dir)/phrase-server.* ] : ../moses-cmd//moses : @reg_test_decode_server ;
35
+ }
36
+
37
# The second argument of glob is an exclusion list: *withDALM tests are
# always excluded here, *compactptable tests only when skip-compact is set.
# NOTE(review): the moses2 group uses ../moses-cmd//moses2 in the first branch
# but ../moses2//moses2 in the second -- confirm which target path is intended.
+ if $(skip-compact) {
38
+ reg_test phrase : [ glob $(test-dir)/phrase.* : $(test-dir)/*withDALM $(test-dir)/*compactptable ] : ../moses-cmd//moses : @reg_test_decode ;
39
+ reg_test chart : [ glob $(test-dir)/chart.* : $(test-dir)/*withDALM $(test-dir)/*compactptable ] : ../moses-cmd//moses : @reg_test_decode ;
40
+ reg_test moses2 : [ glob $(test-dir)/moses2.* : $(test-dir)/*withDALM $(test-dir)/*compactptable ] : ../moses-cmd//moses2 : @reg_test_decode ;
41
+ } else {
42
+ reg_test phrase : [ glob $(test-dir)/phrase.* : $(test-dir)/*withDALM ] : ../moses-cmd//moses : @reg_test_decode ;
43
+ reg_test chart : [ glob $(test-dir)/chart.* : $(test-dir)/*withDALM ] : ../moses-cmd//moses : @reg_test_decode ;
44
+ reg_test moses2 : [ glob $(test-dir)/moses2.* : $(test-dir)/*withDALM ] : ../moses2//moses2 : @reg_test_decode ;
45
+ }
46
+
47
# Without "with-dalm" the dalm group is an empty alias so that `all` below
# still resolves.
+ if [ option.get "with-dalm" : : "yes" ] {
48
+ reg_test dalm : [ glob $(test-dir)/*withDALM ] : ../moses-cmd//moses : @reg_test_decode ;
49
+ } else {
50
+ alias dalm ;
51
+ }
52
+ actions reg_test_score {
53
+ $(TOP)/regression-testing/run-test-scorer.perl --scorer=$(>) --test=$(<:B) --data-dir=$(with-regtest) --test-dir=$(test-dir) && touch $(<)
54
+ }
55
+ reg_test score : [ glob $(test-dir)/score.* : ] : ../phrase-extract//score : @reg_test_score ;
56
+
57
+ actions reg_test_extract {
58
+ $(TOP)/regression-testing/run-test-extract.perl --extractor=$(>) --test=$(<:B) --data-dir=$(with-regtest) --test-dir=$(test-dir) && touch $(<)
59
+ }
60
+ reg_test extract : [ glob $(test-dir)/extract.* ] : ../phrase-extract//extract : @reg_test_extract ;
61
+
62
+
63
+ actions reg_test_extractrules {
64
+ $(TOP)/regression-testing/run-test-extract.perl --extractor=$(>) --test=$(<:B) --data-dir=$(with-regtest) --test-dir=$(test-dir) && touch $(<)
65
+ }
66
+ reg_test extractrules : [ glob $(test-dir)/extract-rules.* : $(with-regtest)/extract-rules.hierarchical ] : ../phrase-extract//extract-rules : @reg_test_extractrules ;
67
+
68
+ pwd = [ path.pwd ] ;
69
+ actions reg_test_mert {
70
+ $(TOP)/regression-testing/run-test-mert.perl --test=$(<:B) --data-dir=$(with-regtest) --test-dir=$(test-dir) --bin-dir=$(pwd)/$(>:D) && touch $(<)
71
+ }
72
+ reg_test mert : [ glob $(test-dir)/mert.* ] : ../mert//mert ../mert//extractor ../mert//pro ../mert//hgdecode : @reg_test_mert ;
73
+
74
+ actions reg_test_misc {
75
+ $(TOP)/regression-testing/run-test-misc.perl --moses-root=$(TOP) --moses-bin=$(BINDIR) --test=$(<:B) --data-dir=$(with-regtest) --test-dir=$(test-dir) && touch $(<)
76
+ }
77
+ reg_test misc : [ glob $(test-dir)/misc.* : $(test-dir)/misc.mml* ] : ..//prefix-bin ..//prefix-lib : @reg_test_misc ;
78
+ reg_test misc-mml : [ glob $(test-dir)/misc.mml* ] : $(TOP)/scripts/ems/support/mml-filter.py $(TOP)/scripts/ems/support/defaultconfig.py : @reg_test_misc ;
79
+
80
# NOTE(review): `all` lists neither the moses2 group nor phrase-server,
# although both are declared above -- confirm whether that is deliberate.
+ alias all : phrase chart mert score extract extractrules misc misc-mml dalm ;
81
+ }
mosesdecoder/regression-testing/MosesRegressionTesting.pm ADDED
@@ -0,0 +1,93 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ package MosesRegressionTesting;
2
+
3
+ use strict;
4
+
5
+ # if your tests need a new version of the test data, increment this
6
+ # and make sure that a moses-regression-tests-vX.Y is available for
7
+ # download from statmt.org (redpony AT umd dot edu for more info)
8
+
9
# Return the first usable regression-data directory.
# Candidates are tried in order: the explicit $data_dir (when defined), then
# "moses-reg-test-data" under the script root, under
# /export/ws06osmt/regression-testing, under /tmp and under /var/tmp.
# A candidate is usable when it exists and contains both models/ and lm/;
# an existing but incomplete candidate is reported on STDERR and skipped.
# When no candidate qualifies, installation instructions are printed and the
# process exits with status 1.
sub find_data_directory
{
  my ($test_script_root, $data_dir) = @_;
  my $dataset = "moses-reg-test-data";

  my @candidates = (
    (defined $data_dir ? ($data_dir) : ()),
    "$test_script_root/$dataset",
    "/export/ws06osmt/regression-testing/$dataset",
    "/tmp/$dataset",
    "/var/tmp/$dataset",
  );

  CANDIDATE:
  for my $dir (@candidates) {
    next CANDIDATE unless -d $dir;
    for my $required (qw(models lm)) {
      next if -d "$dir/$required";
      print STDERR "Found $dir but it is malformed: missing subdir $required/\n";
      next CANDIDATE;
    }
    return $dir;
  }

  print STDERR<<EOT;

You do not appear to have the regression testing data installed. You may
either specify a non-standard location when running the test suite with
the --data-dir option, or, you may install it in any one of the following
standard locations: $test_script_root, /tmp, or /var/tmp with these
commands:

cd <DESIRED_INSTALLATION_DIRECTORY>
git clone https://github.com/hieuhoang/moses-reg-test-data.git

EOT
  exit 1;
}
47
+
48
+
49
# Copy $moses_ini into a fresh temporary ".ini" file, expanding these
# placeholders on every line:
#   ${LM_PATH}      -> "$data_dir/lm"
#   ${MODEL_PATH}   -> "$data_dir/models"
#   ${TEST_PATH}    -> directory containing $moses_ini (absolute)
#   ${RESULTS_PATH} -> $results_dir
# Returns the name of the temporary file. The file is NOT removed
# automatically (UNLINK => 0); the caller deletes it.
#
# The original opened a second handle (MO) on the temp file that was never
# written to, and relied on object destruction to flush the real handle. The
# temp handle is now the only writer and is closed explicitly, so the file is
# complete before its name is returned. The input is opened with the
# three-argument form so unusual characters in the path cannot be
# misinterpreted as an open mode.
sub get_localized_moses_ini
{
  use File::Temp;
  use Cwd qw/ abs_path /;
  use File::Basename;

  my ($moses_ini, $data_dir, $results_dir) = @_;
  my $LM_PATH = "$data_dir/lm";
  my $MODEL_PATH = "$data_dir/models";
  my $TEST_PATH = dirname(abs_path($moses_ini));
  my $local_moses_ini = File::Temp->new( UNLINK => 0, SUFFIX => '.ini' );

  open(my $in, '<', $moses_ini) or die "Couldn't read $moses_ini";
  while (my $l = <$in>) {
    $l =~ s/\$\{LM_PATH\}/$LM_PATH/g;
    $l =~ s/\$\{MODEL_PATH\}/$MODEL_PATH/g;
    $l =~ s/\$\{TEST_PATH\}/$TEST_PATH/g;
    $l =~ s/\$\{RESULTS_PATH\}/$results_dir/g;
    print {$local_moses_ini} $l;
  }
  close($in);
  close($local_moses_ini) or die "Couldn't write $local_moses_ini";

  return $local_moses_ini->filename;
}
72
+
73
# Scan $moses_ini for a "[n-best-list]" section header (case-insensitive) and
# return the two lines that follow it as (file name, size), newline stripped.
# Both values are undef when the file has no such section; when the section
# occurs more than once the last occurrence wins.
#
# Uses a lexical handle with three-argument open (the original used the
# global handle MI with two-argument open) and no longer calls chomp on undef
# when the file ends right after the header.
sub get_nbestlist
{
  my ($moses_ini) = @_;
  my ($nbestfile, $nbestsize);

  open(my $ini, '<', $moses_ini) or die "Couldn't read $moses_ini";
  while (my $l = <$ini>) {
    next unless $l =~ /^\[n-best-list\]/i;
    $nbestfile = <$ini>;
    $nbestsize = <$ini>;
    chomp($nbestfile) if defined $nbestfile;
    chomp($nbestsize) if defined $nbestsize;
  }
  close($ini);

  return ($nbestfile,$nbestsize);
}
90
+
91
+
92
+ 1;
93
+
mosesdecoder/regression-testing/compare-results.perl ADDED
@@ -0,0 +1,91 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env perl
2
+
3
+ # $Id$
4
+
5
+ use warnings;
6
+ use strict;
7
# Usage: compare-results.perl RESULTS_DIR TRUTH_DIR
# Compares RESULTS_DIR/results.txt with TRUTH_DIR/results.txt, prints the
# report to STDOUT and also stores it in RESULTS_DIR/Summary.
# Exit status: 1 when at least one test failed, 0 otherwise.
my ($results, $truth) = @ARGV;

my ($report, $pass, $fail) = compare_results("$results/results.txt", "$truth/results.txt");

# Saving the summary is best-effort: a failure to open it used to go
# unnoticed; it is now reported, but the report is still printed and the exit
# status still reflects the comparison.
if (open(my $summary, '>', "$results/Summary")) {
  print {$summary} $report;
  close($summary);
} else {
  warn "Could not write $results/Summary: $!\n";
}
print $report;

if ($fail > 0) {
  print <<EOT;

There were failures in this test run. Please analyze the results carefully.

EOT
  exit 1;
}
exit 0;
24
+
25
# Compare two results files (as parsed by read_results) and build a report.
# For every key of the truth file:
#   - missing in the test file       -> counted as a failure;
#   - comparison type '='            -> string equality, counted as pass/fail;
#   - any other comparison type      -> numeric delta is reported (with the
#                                       percentage change when the baseline is
#                                       non-zero) but counted as neither.
# Keys that remain only in the test file are counted as failures.
# Returns ($report, $pass, $fail).
sub compare_results {
  my ($testf, $truthf) = @_;
  my $test = read_results($testf);
  my $truth = read_results($truthf);

  # Take the comparison-type tables out so that only real test keys remain.
  my $mode_for = delete $truth->{'COMPARISON_TYPE'};
  delete $test->{'COMPARISON_TYPE'};

  my ($pass, $fail) = (0, 0);
  my $report = '';

  for my $name (sort keys %$truth) {
    $report .= "test-name=$name\tresult=";

    unless (exists $test->{$name}) {
      $report .= "missing from test results\n";
      $fail++;
      next;
    }

    my $expected = defined($truth->{$name}) ? $truth->{$name} : '';
    # A value that has been looked at is removed, so whatever is left in
    # %$test afterwards has no counterpart in the truth file.
    my $actual = defined($test->{$name}) ? delete($test->{$name}) : '';

    if ($mode_for->{$name} eq '=') {
      if ($expected eq $actual) {
        $report .= "pass\n";
        $pass++;
      } else {
        $report .= "fail\n\tTRUTH=$expected\n\t TEST=$actual\n";
        $fail++;
      }
      next;
    }

    # numeric difference
    my $delta = $actual - $expected;
    if ($delta == 0) {
      $report .= "identical\n";
      next;
    }
    $report .= "BASELINE=$expected, TEST=$actual\t DELTA=$delta";
    if ($expected != 0) {
      $report .= sprintf("\t PCT CHANGE=%4.2f", ($delta / $expected) * 100);
    }
    $report .= "\n";
  }

  for my $extra (sort keys %$test) {
    $fail++;
    $report .= "test-name=$extra\tfound in TEST but not in TRUTH.\n";
  }

  $report .= "\nTESTS PASSED=$pass\nTESTS FAILED=$fail\n";
  return ($report, $pass, $fail);
}
76
+
77
# Parse a results file into a hash reference.
# Recognised lines have the form  NAME <op> VALUE  where NAME consists of
# letters, digits and underscores and <op> is '=' or '~'; all other lines are
# ignored. The returned hash maps NAME => VALUE, and its 'COMPARISON_TYPE'
# entry is a hash reference mapping NAME => <op>.
# Dies when the file cannot be opened.
#
# Uses a lexical handle and three-argument open instead of the global handle
# IN, so a path containing mode characters or surrounding whitespace is opened
# literally and no other user of IN is disturbed.
sub read_results {
  my ($file) = @_;
  open(my $in, '<', $file) or die "Could not open $file!";

  my %res;
  while (my $l = <$in>) {
    next unless $l =~ /^([A-Za-z0-9_]+)\s*([=~])\s*(.+)$/;
    my ($key, $comparison_type, $value) = ($1, $2, $3);
    $res{$key} = $value;
    $res{'COMPARISON_TYPE'}->{$key}=$comparison_type;
  }
  close($in);

  return \%res;
}
91
+
mosesdecoder/regression-testing/ensure-regression-data-here.perl ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env perl
2
+ # downloads the regression data
3
+ use warnings;
4
+ use strict;
5
+ use MosesRegressionTesting;
6
+
7
# NOTE(review): the MosesRegressionTesting module added alongside this script
# does not appear to define TESTING_DATA_VERSION; under "use strict" the
# bareword below would then be a compile-time error. Confirm where the
# constant is meant to come from before relying on this script.
+ my $data_version = MosesRegressionTesting::TESTING_DATA_VERSION;
8
+
9
# Nothing to do when the versioned data directory already exists in the
# current directory.
+ exit 0 if -d "moses-reg-test-data-$data_version";
10
+ # data in place
11
+
12
# Otherwise download the tarball, unpack it and delete it; the first two
# steps are fatal on failure, the final rm is not checked.
+ safesystem("wget http://www.statmt.org/moses/reg-testing/moses-reg-test-data-$data_version.tgz")
13
+ or die "wget failed";
14
+ safesystem("tar xzf moses-reg-test-data-$data_version.tgz")
15
+ or die "untar failed";
16
+ safesystem("rm moses-reg-test-data-$data_version.tgz");
17
+
18
# Run a command through system() and interpret the wait status.
#   - command could not be started, or was killed by a signal:
#     a message is printed to STDERR and the whole script exits with status 1;
#   - otherwise returns true when the exit code was 0 and false when it was
#     not (the non-zero code is also printed to STDERR).
sub safesystem {
  system(@_);
  my $status = $?;

  if ($status == -1) {
    print STDERR "Failed to execute: @_\n $!\n";
    exit(1);
  }

  if ($status & 127) {
    printf STDERR "Execution of: @_\n died with signal %d, %s coredump\n",
      ($status & 127), ($status & 128) ? 'with' : 'without';
    exit(1);
  }

  my $exitcode = $status >> 8;
  print STDERR "Exit code: $exitcode\n" if $exitcode;
  return ! $exitcode;
}
mosesdecoder/regression-testing/run-single-test.perl ADDED
@@ -0,0 +1,247 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env perl
2
+
3
+ # $Id$
4
+
5
+ use Encode;
6
+ use utf8;
7
+ use warnings;
8
+ use strict;
9
+ my $script_dir; BEGIN { use Cwd qw/ abs_path /; use File::Basename; $script_dir = dirname(abs_path($0)); push @INC, $script_dir; }
10
+ use MosesRegressionTesting;
11
+ use Getopt::Long;
12
+ use File::Temp qw ( tempfile );
13
+ use POSIX qw ( strftime );
14
+ use POSIX ":sys_wait_h";
15
# Table of signal names used by sig_name().
my @SIGS = qw ( SIGHUP SIGINT SIGQUIT SIGILL SIGTRAP SIGABRT SIGIOT SIGBUS SIGFPE SIGKILL SIGUSR1 SIGSEGV SIGUSR2 SIGPIPE SIGALRM SIGTERM SIGSTKFLT SIGCHLD SIGCONT SIGSTOP SIGTSTP SIGTTIN SIGTTOU SIGURG SIGXCPU SIGXFSZ SIGVTALRM SIGPROF SIGWINCH SIGIO SIGPWR SIGSYS SIGUNUSED SIGRTMIN );
my ($decoder, $test_name);

my $test_dir = "$script_dir/tests";
my $data_dir;
my $BIN_TEST = $script_dir;
my $results_dir;
my $NBEST = 0;
my $run_server_test = 0;
# Random port in 10001..19999 for the server test.
my $serverport = int(rand(9999)) + 10001;
my $url = "http://localhost:$serverport/RPC2";
my $startupTest = 0;
GetOptions("decoder=s" => \$decoder,
           "test=s" => \$test_name,
           "data-dir=s"=> \$data_dir,
           "test-dir=s"=> \$test_dir,
           "results-dir=s"=> \$results_dir,
           "server"=> \$run_server_test,
           "startuptest"=> \$startupTest
          ) or exit 1;

# Server tests need XMLRPC::Lite; with --startuptest the script only checks
# that the module can be loaded and then exits successfully.
if ($run_server_test) {
  eval {
    require XMLRPC::Lite;
    XMLRPC::Lite->import;
  };
  die "Error: XMLRPC::Lite not installed, moses server regression tests will not be run. $@" if $@;
  exit(0) if $startupTest;
}

die "Please specify a decoder with --decoder\n" unless $decoder;
die "Please specify a test to run with --test\n" unless $test_name;

die "Please specify the location of the data directory with --data-dir\n" unless $data_dir;

die "Cannot locate test dir at $test_dir" unless (-d $test_dir);

$test_dir .= "/$test_name";
die "Cannot locate test dir at $test_dir" unless (-d $test_dir);

#### get place to put results
$results_dir = "$data_dir/results" unless defined $results_dir;
if (!-d $results_dir) {
  print STDERR "[WARNING] Results directory not found.\n";
  mkdir ($results_dir) || die "Failed to create $results_dir";
}
$results_dir .= "/$test_name";
if (!-d $results_dir) {
  print STDERR "[WARNING] Results directory for test=$test_name could not be found.\n";
  mkdir ($results_dir) || die "Failed to create $results_dir";
}
##########

my $conf = "$test_dir/moses.ini";
my $input = "$test_dir/to-translate.txt";

die "Cannot locate executable called $decoder\n" unless (-x $decoder);
die "Cannot find $conf\n" unless (-f $conf);
die "Cannot locate input at $input" unless (-f $input);

my $local_moses_ini = MosesRegressionTesting::get_localized_moses_ini($conf, $data_dir, $results_dir);
my ($nbestfile,$nbestsize) = MosesRegressionTesting::get_nbestlist($conf);

if (defined($nbestsize) && $nbestsize > 0) {
  $NBEST=$nbestsize;
}

# Every run gets its own results directory named after the decoder's
# modification time, the user and the current time.
my $ts = get_timestamp($decoder);
my $results = "$results_dir/$ts";
mkdir($results) || die "Failed to create results directory: $results\n";

my $truth = "$test_dir/truth";
if (!-d $truth) {
  die "Could not find truth/ in $test_dir!\n";
}

print "RESULTS AVAILABLE IN: $results\n\n";
my ($o, $elapsed, $ec, $sig);
if ($run_server_test) {
  ($o, $elapsed, $ec, $sig) = exec_moses_server($decoder, $local_moses_ini, $input, $results);
}
else {
  ($o, $elapsed, $ec, $sig) = exec_moses($decoder, $local_moses_ini, $input, $results);
}

# Decoder failure: exit 2 when it was killed by a signal, 3 for a non-zero
# exit code. The localized moses.ini is kept for debugging.
my $error = ($sig || $ec > 0);
if ($error) {
  my $crash = "MOSES CRASHED.\n\texit_code=$ec\n\tsignal=$sig\n";
  my $hint = "FAILURE, for debugging, local moses.ini=$local_moses_ini\n";
  print STDERR $crash;
  print STDERR $hint;
  # The summary file is best-effort, but a failure to write it is no longer
  # silent.
  if (open(my $summary, '>', "$results/Summary")) {
    print {$summary} $crash;
    print {$summary} $hint;
    close($summary);
  } else {
    warn "Could not write $results/Summary: $!\n";
  }
  exit 2 if $sig;
  exit 3;
}

# Per-test filter scripts turn the raw decoder output into results.txt.
($o, $ec, $sig) = run_command("$test_dir/filter-stdout.pl $results/run.stdout > $results/results.txt");
warn "filter-stdout failed!" if ($ec > 0 || $sig);
($o, $ec, $sig) = run_command("$test_dir/filter-stderr.pl $results/run.stderr >> $results/results.txt");
warn "filter-stderr failed!" if ($ec > 0 || $sig);

if ($NBEST > 0) {
  ($o, $ec, $sig) = run_command("$test_dir/filter-nbest.pl $results/run.nbest >> $results/results.txt");
  warn "filter-nbest failed!" if ($ec > 0 || $sig);
}

# Previously an unchecked open: when results.txt could not be appended to,
# the wall-time entry was dropped without any message.
if (open(my $out, '>>', "$results/results.txt")) {
  print {$out} "TOTAL_WALLTIME ~ $elapsed\n";
  close($out);
} else {
  warn "Could not append to $results/results.txt: $!\n";
}

run_command("gzip $results/run.stdout");
run_command("gzip $results/run.stderr");
if ($NBEST > 0) {
  run_command("gzip $results/run.nbest");
}

($o, $ec, $sig) = run_command("$BIN_TEST/compare-results.perl $results $truth");
print $o;
if ($ec) {
  print STDERR "FAILURE, for debugging, local moses.ini=$local_moses_ini\n";
  exit 1;
}

unlink $local_moses_ini or warn "Couldn't remove $local_moses_ini\n";
exit 0;
143
+
144
# Run the decoder once on $input with configuration $conf.
# stdout and stderr are redirected to run.stdout / run.stderr inside
# $results, and the exact command line is saved to $results/cmd_line.
# When the file-level $NBEST is positive an n-best list of that size is
# requested in $results/run.nbest.
# Returns ($output, $elapsed_seconds, $exit_code, $signal).
sub exec_moses {
  my ($decoder, $conf, $input, $results) = @_;
  my $start_time = time;

  my $nbest_args = '';
  if ($NBEST > 0) {
    print STDERR "Nbest output file is $results/run.nbest\n";
    print STDERR "Nbest size is $NBEST\n";
    $nbest_args = " -n-best-list $results/run.nbest $NBEST";
  }
  my $cmd = "$decoder -f $conf -i $input$nbest_args 1> $results/run.stdout 2> $results/run.stderr";

  open CMD, ">$results/cmd_line";
  print CMD "$cmd\n";
  close CMD;

  my ($o, $ec, $sig) = run_command($cmd);
  my $elapsed = time - $start_time;
  return ($o, $elapsed, $ec, $sig);
}
166
+
167
# Start the decoder as an XML-RPC server in a child process, wait until it
# reports that it is listening, send every line of $input to it through the
# "translate" call and write the translations to $results/run.stdout.
# The server's process group is killed afterwards if it is still running.
# Returns ($output, $elapsed_seconds, $exit_code, $signal).
#
# Changes from the previous version:
#   - a failed fork() now returns immediately with exit code 1 (the value the
#     old code set "to generate error") instead of falling through to
#     waitpid() on an undefined pid and dying;
#   - input lines are chomp'ed rather than chop'ped, so the last character of
#     a final line without a trailing newline is no longer lost;
#   - the output file is opened with an error check.
sub exec_moses_server {
  my ($decoder, $conf, $input, $results) = @_;
  my $start_time = time;
  my ($o, $ec, $sig);
  $ec = 0; $sig = 0; $o = 0;

  my $pid = fork();
  if (not defined $pid) {
    warn "resources not avilable to fork Moses server\n";
    return ($o, time - $start_time, 1, $sig);
  } elsif ($pid == 0) {
    # Child: own process group, so the parent can kill the server together
    # with the shell that runs it.
    setpgrp(0, 0);
    warn "Starting Moses server on port $serverport ...\n";
    my $cmd = "$decoder --server --server-port $serverport -f $conf -verbose 2 --server-log $results/run.stderr.server 2> $results/run.stderr ";
    if (open(my $cmdfile, '>', "$results/cmd_line")) {
      print {$cmdfile} "$cmd\n";
      close($cmdfile);
    }
    ($o, $ec, $sig) = run_command($cmd);
    exit;
    # this should not be reached unless the server fails to start
  }

  # Parent: poll every 5 seconds until the server is listening for requests.
  while (1) {
    sleep 5;
    my $res = waitpid($pid, WNOHANG);
    die "Moses crashed or aborted! Check $results/run.stderr for error messages.\n" if ($res);
    my $str = `grep "Listening on port $serverport" $results/run.stderr`;
    last if ($str =~ /Listening/);
  }

  my $proxy = XMLRPC::Lite->proxy($url);
  warn "Opening file $input to write to $results\n";
  open(my $textin, '<:utf8', $input) or die "Can not open the input file to translate with Moses server\n";
  open(my $textout, '>:utf8', "$results/run.stdout") or die "Can not open $results/run.stdout for writing\n";
  while (my $line = <$textin>) {
    chomp $line;
    my $encoded = SOAP::Data->type(string => $line); # NOTE: assuming properly encoded UTF-8 input: check tests before adding them!
    my %param = ("text" => $encoded);
    my $result = $proxy->call("translate",\%param)->result;
    print {$textout} $result->{'text'} . "\n";
  }
  close($textin);
  close($textout);

  my $elapsed = time - $start_time;
  print STDERR "Finished translating file $input\n";
  if (waitpid($pid, WNOHANG) <= 0) {
    warn "Killing process group $pid of the $decoder --server ... \n";
    kill 9, -$pid;
  }
  return ($o, $elapsed, $ec, $sig);
}
220
+
221
# Run $cmd through backticks and decode the wait status.
# Returns ($captured_stdout, $exit_code, $signal) where $signal is 0 when the
# command was not killed by a signal and the signal's name otherwise.
#
# A status of -1 (the command could not be started at all) used to be decoded
# as a huge exit code plus a bogus signal number; it is now reported as exit
# code 127 with no signal. The unused core-dump flag was dropped.
sub run_command {
  my ($cmd) = @_;
  my $o = `$cmd`;
  my $status = $?;

  return ($o, 127, 0) if $status == -1;

  my $exit_code = $status >> 8;
  my $signal = $status & 127;
  $signal = sig_name($signal) if $signal;
  return ($o, $exit_code, $signal);
}
231
+
232
# Translate a signal number into its name, e.g. 9 -> "SIGKILL".
#
# The previous implementation indexed a hand-written, 0-based list of names
# with the 1-based signal number, so low-numbered signals were reported under
# the name of the next entry (2 came out as SIGQUIT). The names are now taken
# from Perl's own per-platform table, $Config{sig_name}, whose position N
# holds the name of signal N. Numbers outside that table yield "SIG<number>",
# so the result is always a true value.
sub sig_name {
  my $sig = shift;
  require Config;
  my @names = split ' ', $Config::Config{sig_name};
  return defined $names[$sig] ? "SIG$names[$sig]" : "SIG$sig";
}
236
+
237
# Build the name of a results directory:
#   moses.v<mtime of $file>-<user>-at-<current time>
# Both times are formatted as YYYYMMDD-HHMMSS in UTC; the user name is the
# output of `whoami`.
sub get_timestamp {
  my ($file) = @_;
  my $mtime = (stat($file))[9];   # element 9 of stat() is the modification time
  my $built = strftime("%Y%m%d-%H%M%S", gmtime $mtime);
  my $now = strftime("%Y%m%d-%H%M%S", gmtime);
  chomp(my $username = `whoami`);
  return "moses.v$built-$username-at-$now";
}
247
+
mosesdecoder/regression-testing/run-test-detokenizer.perl ADDED
@@ -0,0 +1,309 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env perl
2
+ #
3
+ # Detokenization tests.
4
+ #
5
+
6
+ use warnings;
7
+ use strict;
8
+ # This is here to suppress (false) warnings about OLDOUT and OLDERR being used only once. Maybe there is a less brutish way to suppress that, but I don't know it.
9
+ no warnings 'once';
10
+ use utf8;
11
+
12
+ use Cwd ('abs_path');
13
+ use File::Spec::Functions;
14
+ use File::Basename ('dirname');
15
+ use IPC::Run3;
16
+ use Getopt::Long;
17
+ use Test::More;
18
+
19
# Command line: --results-dir is mandatory; --detokenizer defaults to
# scripts/tokenizer/detokenizer.perl one directory above this script's own.
my ($detokenizer, $results_dir);
GetOptions("detokenizer=s" => \$detokenizer,
           "results-dir=s"=> \$results_dir
          ) or exit 1;

if (!defined $results_dir) {
  print STDERR "Usage: run-test-detokenizer.t --results-dir <RESULTS-DIRECTORY> [--detokenizer <DETOKENIZER-SCRIPT>]\n";
  exit 1;
}

die "ERROR: Results directory $results_dir doesn't exist or is not a writable directory. Dying" unless (-d $results_dir && -w $results_dir);

$detokenizer = catfile(dirname(dirname(abs_path($0))), "scripts", "tokenizer", "detokenizer.perl") unless $detokenizer;
die "ERROR: Detokenizer script $detokenizer does not exist. Dying" unless -f $detokenizer;


my @testCases = ();

######################################
# Definitions of individual test cases
######################################
# Each call passes: test name, language (or undef), tokenized input (TOK)
# and the expected detokenized output (EXP).

# A simple English test
addDetokenizerTest("TEST_ENGLISH_EASY", "en", <<'TOK', <<'EXP');
This sentence is really simple , so it should not be hard to detokenize .
This one is no more difficult , but , hey , it is on a new line .
TOK
This sentence is really simple, so it should not be hard to detokenize.
This one is no more difficult, but, hey, it is on a new line.
EXP

# An English test involving double-quotes
addDetokenizerTest("TEST_ENGLISH_DOUBLEQUOTES", "en", <<'TOK', <<'EXP');
This is a somewhat " less simple " test .
TOK
This is a somewhat "less simple" test.
EXP

# A simple French test
addDetokenizerTest("TEST_FRENCH_EASY", "fr", <<'TOK', <<'EXP');
Voici une phrase simple .
TOK
Voici une phrase simple.
EXP

# A French test involving an apostrophe
addDetokenizerTest("TEST_FRENCH_APOSTROPHE", "fr", <<'TOK', <<'EXP');
Moi , j' ai une apostrophe .
TOK
Moi, j'ai une apostrophe.
EXP

# A French test involving an apostrophe on the second-last word
addDetokenizerTest("TEST_FRENCH_APOSTROPHE_PENULTIMATE", "fr", <<'TOK', <<'EXP');
de musique rap issus de l' immigration
TOK
de musique rap issus de l'immigration
EXP

# A German test involving non-ASCII characters
# Note: We don't specify a language because the detokenizer errors if you pass in a language for which it has no special rules, of which German is an example.
addDetokenizerTest("TEST_GERMAN_NONASCII", undef, <<'TOK', <<'EXP');
Ich hoffe , daß Sie schöne Ferien hatten .
Frau Präsidentin ! Frau Díez González und ich hatten einige Anfragen
TOK
Ich hoffe, daß Sie schöne Ferien hatten.
Frau Präsidentin! Frau Díez González und ich hatten einige Anfragen
EXP

# A simple Chinese test
addDetokenizerTest("TEST_CHINESE_EASY", undef, <<'TOK', <<'EXP');
这 是 一个 简单 的的 汉语 句子 。
TOK
这是一个简单的的汉语句子。
EXP

# A simple Japanese test
addDetokenizerTest("TEST_JAPANESE_EASY", undef, <<'TOK', <<'EXP');
どう しょ う か な 。
どこ で 食べ たい 。
TOK
どうしょうかな。
どこで食べたい。
EXP


######################################
# Now run those babies ...
######################################

plan tests => scalar(@testCases);

foreach my $testCase (@testCases) {
  runDetokenizerTest($testCase);
}
145
+
146
+ ############
147
+ ## Utilities
148
+ ############
149
+
150
# Build a DetokenizerTestCase from the four arguments (name, language,
# tokenized text, expected answer), append it to @testCases and return it.
sub addDetokenizerTest {
  my ($testName, $language, $tokenizedText, $rightAnswer) = @_;

  my $testCase = DetokenizerTestCase->new($testName, $language, $tokenizedText, $rightAnswer);
  push @testCases, $testCase;
  return $testCase;
}
158
+
159
# Run one detokenizer test case.
# A directory named after the test is created under $results_dir and receives
# input.txt (the tokenized text) and expected.txt (the right answer). The
# detokenizer is then run on input.txt -- with "-l <language>" when the test
# case has a language -- and its stdout.txt is compared with expected.txt.
#
# The two fixture files used to be opened without any check; a failure is now
# reported as a failed test, in the same way as a failure to create the
# directory.
sub runDetokenizerTest {
  my ($testCase) = @_;

  my $testOutputDir = catfile($results_dir, $testCase->getName());
  my $tokenizedFile = catfile($testOutputDir, "input.txt");
  my $expectedFile = catfile($testOutputDir, "expected.txt");

  # Fail if we can't make the test output directory
  unless (mkdir($testOutputDir)) {
    return fail($testCase->getName().": Failed to create output directory ".$testOutputDir." [".$!."]");
  }

  open(my $tok, '>:utf8', $tokenizedFile)
    or return fail($testCase->getName().": Failed to write ".$tokenizedFile." [".$!."]");
  print {$tok} $testCase->getTokenizedText();
  close($tok);

  open(my $expected, '>:utf8', $expectedFile)
    or return fail($testCase->getName().": Failed to write ".$expectedFile." [".$!."]");
  print {$expected} $testCase->getRightAnswer();
  close($expected);

  &runTest($testCase->getName(), $testOutputDir, $tokenizedFile, sub {
    return defined($testCase->getLanguage()) ? [$detokenizer, "-l", $testCase->getLanguage()] : [$detokenizer];
  }, sub {
    &verifyIdentical($testCase->getName(), $expectedFile, catfile($testOutputDir, "stdout.txt"))
  }, 1, $testCase->getFailureExplanation());
}
187
+
188
# Runs one command and validates what it produced.
#
#   $testName                     - name used in failure messages
#   $outputDir                    - directory in which the output files are created
#   $stdinFile                    - if defined, a file to send to the command via STDIN
#   $buildCommandRoutineReference - reference to a zero-argument subroutine that returns
#                                   the system command to run, as an array reference
#   $validationRoutineReference   - reference to a zero-argument subroutine that makes
#                                   exactly one call to ok() or similar to validate the
#                                   contents of the output directory
#   $separateStdoutFromStderr     - optional boolean; if omitted or false, the command's
#                                   STDOUT and STDERR are mixed together in one output file
#                                   called stdout-and-stderr.txt; otherwise they go to
#                                   separate files called stdout.txt and stderr.txt
#   $failureExplanation           - why the test is expected to fail, or undef if it is
#                                   expected to pass. Even for an expected failure the
#                                   command itself must exit normally; only the validation
#                                   routine is expected to fail.
sub runTest {
    my ($testName, $outputDir, $stdinFile, $buildCommandRoutineReference, $validationRoutineReference, $separateStdoutFromStderr, $failureExplanation) = @_;

    # Decide where the command's output streams end up.
    my $stdoutFile = catfile($outputDir, $separateStdoutFromStderr ? "stdout.txt" : "stdout-and-stderr.txt");
    my $stderrFile = $separateStdoutFromStderr ? catfile($outputDir, "stderr.txt") : $stdoutFile;

    # Build and run the command; a non-zero status is always a hard failure.
    my $exitStatus = runVerbosely($buildCommandRoutineReference->(), $stdinFile, $stdoutFile, $stderrFile);
    if ($exitStatus != 0) {
        return fail($testName.": command exited with status ".$exitStatus);
    }

    # Tests expected to pass are validated directly.
    unless (defined $failureExplanation) {
        return $validationRoutineReference->();
    }

    # Tests expected to fail are validated inside a TODO block.
    TODO: {
        local $TODO = $failureExplanation;
        $validationRoutineReference->();
    }
}
226
+
227
# Logs the command that is about to be executed, then executes it.
#
#   $commandRef  - array reference holding the command and its arguments
#   $stdinFile   - if defined, a file to send to the command via STDIN
#   $stdoutFile  - if defined, file path receiving the command's standard output
#   $stderrFile  - if defined, file path receiving the command's standard error
#                  (may be the same file as $stdoutFile)
#
# Returns the value of $? after the command has run.
sub runVerbosely {
    my ($commandRef, $stdinFile, $stdoutFile, $stderrFile) = @_;

    note("Executing command:\n @{$commandRef}\n");

    # Announce each redirection that is actually in effect.
    my @redirections = (
        ["standard input coming from: ", $stdinFile],
        ["standard output going to: ",   $stdoutFile],
        ["standard error going to: ",    $stderrFile],
    );
    foreach my $redirection (@redirections) {
        my ($label, $path) = @{$redirection};
        next unless defined $path;
        note($label.$path);
    }

    run3($commandRef, $stdinFile, $stdoutFile, $stderrFile);
    return $?;
}
242
+
243
# Verify that the given output file is identical to the given reference file.
#
#   $testName      - name used as a prefix in the test description / failure text
#   $referenceFile - path of the file holding the expected content
#   $outputFile    - path of the file produced by the command under test
#
# Makes exactly one test assertion: fail() if either file cannot be opened,
# otherwise is_deeply() on the two files' lines.
sub verifyIdentical {
    my ($testName, $referenceFile, $outputFile) = @_;

    # Lexical handles with three-argument open: the handles are released on
    # every return path, and a path is never interpreted as an open mode.
    open(my $referenceHandle, '<', $referenceFile)
        or return fail($testName.": Can't open reference file ".$referenceFile." [".$!."].");
    open(my $outputHandle, '<', $outputFile)
        or return fail($testName.": Can't open output file ".$outputFile." [".$!."].");

    my @referenceFileAsArray = <$referenceHandle>;
    my @outputFileAsArray = <$outputHandle>;
    close($referenceHandle);
    close($outputHandle);

    is_deeply(\@outputFileAsArray, \@referenceFileAsArray, $testName.": Output file ".$outputFile." matches reference file ".$referenceFile.".");
}
255
+
256
+
257
+ ##%%%%%%%%%%%%%%%%%%%%%%%%%%%##
258
+ ## DetokenizerTestCase class ##
259
+
260
package DetokenizerTestCase;

# Constructor.
# Arguments (after the class name): test name, language, tokenized text and
# the right answer. A new test case starts out with no failure explanation.
sub new {
    my ($class, $name, $language, $tokenizedText, $rightAnswer) = @_;
    return bless {
        _name               => $name,
        _language           => $language,
        _tokenizedText      => $tokenizedText,
        _rightAnswer        => $rightAnswer,

        _failureExplanation => undef,
    }, $class;
}

# Returns the name given to the constructor.
sub getName {
    my $self = shift;
    return $self->{_name};
}

# Returns the language given to the constructor.
sub getLanguage {
    my $self = shift;
    return $self->{_language};
}

# Returns the tokenized text given to the constructor.
sub getTokenizedText {
    my $self = shift;
    return $self->{_tokenizedText};
}

# Returns the right answer given to the constructor.
sub getRightAnswer {
    my $self = shift;
    return $self->{_rightAnswer};
}

# Call this routine to indicate that this test case is expected to fail.
# (The detokenizer script is still expected to exit normally, but the output is not expected to
# match the right answer because of a bug or unimplemented use case.)
# A generic explanation is stored when none (or a false value) is supplied.
sub setExpectedToFail {
    my $self = shift;
    my $failureExplanation = shift;
    $self->{_failureExplanation} = $failureExplanation || "This test is expected to fail.";
}

# Returns a string explaining why this test is expected to fail, or undef if this test is expected
# to pass.
sub getFailureExplanation {
    my $self = shift;
    return $self->{_failureExplanation};
}
mosesdecoder/regression-testing/run-test-extract.perl ADDED
@@ -0,0 +1,95 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env perl
2
+
3
+ use warnings;
4
+ use strict;
5
+
6
+ BEGIN {
7
+ use Cwd qw/ abs_path /;
8
+ use File::Basename;
9
+ my $script_dir = dirname(abs_path($0));
10
+ print STDERR "script_dir=$script_dir\n";
11
+ push @INC, $script_dir;
12
+ }
13
+
14
+ use FindBin qw($Bin);
15
+ use MosesRegressionTesting;
16
+ use Getopt::Long;
17
+ use File::Temp qw ( tempfile );
18
+ use POSIX qw ( strftime );
19
+
20
# Regression-test driver for the extractor: runs the extractor with the
# arguments stored in <test-dir>/<test>/args.txt and compares the results
# directory with <test-dir>/<test>/truth/.
# Exits 0 and prints SUCCESS when they match, exits 1 otherwise.

my $extractorExe;   # --extractor
my $test_name;      # --test
my $data_dir;       # --data-dir
my $test_dir;       # --test-dir
my $results_dir;    # --results-dir

GetOptions("extractor=s" => \$extractorExe,
           "test=s" => \$test_name,
           "data-dir=s"=> \$data_dir,
           "test-dir=s"=> \$test_dir,
           "results-dir=s"=> \$results_dir,
          ) or exit 1;

# output dir: defaults to a timestamped directory below the data directory
unless (defined $results_dir)
{
  my $ts = get_timestamp($extractorExe);
  $results_dir = "$data_dir/results/$test_name/$ts";
}

# Create the directory tree without going through a shell, so paths with
# spaces or shell metacharacters are handled correctly.
use File::Path qw( make_path );
eval { make_path($results_dir); 1 }
  or print STDERR "WARNING: could not create $results_dir: $@\n";

my $outPath = "$results_dir";

# Read the argument template. A missing file is only warned about; the
# extractor is then run without extra arguments.
my $argsFile = "$test_dir/$test_name/args.txt";
my $extractorArgs = "";
if (open(my $argsHandle, '<', $argsFile))
{
  local $/;
  my $content = <$argsHandle>;
  $extractorArgs = $content if defined $content;
  close $argsHandle;
}
else
{
  print STDERR "WARNING: cannot read $argsFile: $!\n";
}

# Expand $name tokens in the template to the values of this script's
# variables of the same name (e.g. $outPath, $test_dir, $results_dir).
# NOTE(review): /ee string-evaluates text taken from args.txt; the file
# must come from a trusted test suite.
$extractorArgs =~ s/(\$\w+)/$1/eeg;

my $cmdMain = "$extractorExe $extractorArgs \n";

# Record the command line next to the results for later debugging.
if (open(my $cmdHandle, '>', "$results_dir/cmd_line"))
{
  print {$cmdHandle} "$cmdMain";
  close $cmdHandle;
}
else
{
  print STDERR "WARNING: cannot write $results_dir/cmd_line: $!\n";
}

`$cmdMain`;

my $truthPath = "$test_dir/$test_name/truth/";

if (-e $outPath)
{
  # Judge by diff's exit status (0 = identical, 1 = differences, 2 = trouble)
  # rather than by counting its stdout lines: when diff itself fails, e.g.
  # because the truth directory is missing, it prints nothing on stdout and a
  # line count of 0 would wrongly be reported as SUCCESS.
  my $cmd = "diff --exclude=.DS_Store '--exclude=._*' --exclude=cmd_line $outPath/ $truthPath/";
  my $diffOutput = `$cmd 2>&1`;

  if ($? == 0)
  {
    print STDERR "SUCCESS\n";
    exit 0;
  }
  else
  {
    print STDERR "FAILURE. Ran $cmdMain\n";
    exit 1;
  }
}
else
{
  print STDERR "FAILURE. Output does not exists. Ran $cmdMain\n";
  exit 1;
}
82
+
83
+ ###################################
84
# Builds a label for a results directory.
#
#   $file - path whose modification time is embedded in the label
#
# Returns "moses.v<mtime>-<user>-at-<now>", where both times are formatted as
# YYYYMMDD-HHMMSS in UTC and <user> is the output of `whoami`. If $file is
# undefined or cannot be stat'ed, the epoch (19700101-000000) is used for
# <mtime>.
sub get_timestamp {
  my ($file) = @_;

  # Only the modification time (field 9 of stat) is needed.
  my $mtime = defined $file ? (stat($file))[9] : undef;
  $mtime = 0 unless defined $mtime;

  my $timestamp = strftime("%Y%m%d-%H%M%S", gmtime($mtime));
  my $timestamp2 = strftime("%Y%m%d-%H%M%S", gmtime());

  my $username = `whoami`;
  $username = "unknown" unless defined $username && length $username;
  chomp $username;

  return "moses.v$timestamp-$username-at-$timestamp2";
}
94
+
95
+
mosesdecoder/regression-testing/run-test-mert.perl ADDED
@@ -0,0 +1,130 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env perl
2
+
3
+ use warnings;
4
+ use strict;
5
+ my $script_dir; BEGIN { use Cwd qw/ abs_path /; use File::Basename; $script_dir = dirname(abs_path($0)); push @INC, $script_dir; }
6
+ #use MertRegressionTesting;
7
+ use Getopt::Long;
8
+ use File::Temp qw ( tempfile );
9
+ use POSIX qw ( strftime );
10
# Regression-test driver for mert: runs <test-dir>/<test>/command, filters
# its stdout/stderr into results.txt and compares the results with the
# test's truth/ directory via compare-results.perl.
# Exit codes: 0 success, 1 results differ, 2 command killed by a signal,
# 3 command exited with a non-zero status.

# Table of signal names.
my @SIGS = qw ( SIGHUP SIGINT SIGQUIT SIGILL SIGTRAP SIGABRT SIGIOT SIGBUS SIGFPE SIGKILL SIGUSR1 SIGSEGV SIGUSR2 SIGPIPE SIGALRM SIGTERM SIGSTKFLT SIGCHLD SIGCONT SIGSTOP SIGTSTP SIGTTIN SIGTTOU SIGURG SIGXCPU SIGXFSZ SIGVTALRM SIGPROF SIGWINCH SIGIO SIGPWR SIGSYS SIGUNUSED SIGRTMIN );
my ($decoder, $test_name);

my $test_dir = "$script_dir/tests";
my $bin_dir = "$script_dir/../bin";
my $data_dir;
my $BIN_TEST = $script_dir;
my $results_dir;

GetOptions("test=s" => \$test_name,
           "data-dir=s"=> \$data_dir,
           "bin-dir=s"=> \$bin_dir,
           "test-dir=s"=> \$test_dir,
           "results-dir=s"=> \$results_dir,
          ) or exit 1;

die "Please specify a test to run with --test\n" unless $test_name;

die "Please specify the location of the data directory with --data-dir\n" unless $data_dir;

# The option that sets $bin_dir is --bin-dir; name it correctly in the message.
die "Please specify the location of the mert directory with --bin-dir\n" unless $bin_dir;

die "Cannot locate test dir at $test_dir" unless (-d $test_dir);

$test_dir .= "/$test_name";
die "Cannot locate test dir at $test_dir" unless (-d $test_dir);

#### get place to put results
unless (defined $results_dir) { $results_dir = "$data_dir/results"; }
if (!-d $results_dir) {
  print STDERR "[WARNING] Results directory not found.\n";
  mkdir ($results_dir) || die "Failed to create $results_dir";
}
$results_dir .= "/$test_name";
if (!-d $results_dir) {
  print STDERR "[WARNING] Results directory for test=$test_name could not be found.\n";
  mkdir ($results_dir) || die "Failed to create $results_dir";
}
##########

# Each run gets its own timestamped directory below the test's results dir.
my $ts = get_timestamp("$test_dir/command");
my $results = "$results_dir/$ts";
mkdir($results) || die "Failed to create results directory: $results\n";

my $truth = "$test_dir/truth";
if (!-d $truth) {
  die "Could not find truth/ in $test_dir!\n";
}

print "RESULTS AVAILABLE IN: $results\n\n";

my ($o, $elapsed, $ec, $sig) = exec_test($test_dir, $results);
my $error = ($sig || $ec > 0);
if ($error) {
  my $summary = "$test_name CRASHED.\n\texit_code=$ec\n\tsignal=$sig\n";
  print STDERR $summary;
  # Writing the Summary file is best effort; the crash is reported on STDERR
  # and through the exit code either way.
  if (open(my $summaryHandle, '>', "$results/Summary")) {
    print {$summaryHandle} $summary;
    close $summaryHandle;
  } else {
    print STDERR "[WARNING] Cannot write $results/Summary: $!\n";
  }
  exit 2 if $sig;
  exit 3;
}

($o, $ec, $sig) = run_command("$test_dir/filter-stdout $results/run.stdout > $results/results.txt");
warn "filter-stdout failed!" if ($ec > 0 || $sig);
($o, $ec, $sig) = run_command("$test_dir/filter-stderr $results/run.stderr >> $results/results.txt");
warn "filter-stderr failed!" if ($ec > 0 || $sig);

# Append the wall-clock time to the filtered results.
if (open(my $resultsHandle, '>>', "$results/results.txt")) {
  print {$resultsHandle} "TOTAL_WALLTIME ~ $elapsed\n";
  close $resultsHandle;
} else {
  print STDERR "[WARNING] Cannot append to $results/results.txt: $!\n";
}

run_command("gzip $results/run.stdout");
run_command("gzip $results/run.stderr");

($o, $ec, $sig) = run_command("$BIN_TEST/compare-results.perl $results $truth");
print $o;
if ($ec) {
  print STDERR "FAILURE, for debugging see $test_dir\n";
  exit 1;
}

exit 0;
92
+
93
# Runs the test's "command" script through sh, passing $bin_dir and the test
# directory as arguments, with stdout and stderr redirected to run.stdout and
# run.stderr in the results directory. The command line is also saved to
# <results>/cmd_line.
#
#   $test_dir - directory of the test (contains the "command" script)
#   $results  - directory receiving run.stdout, run.stderr and cmd_line
#
# Returns ($output, $elapsed_seconds, $exit_code, $signal), where the first,
# third and fourth values come from run_command().
sub exec_test {
  my ($test_dir,$results) = @_;
  my $start_time = time;
  my $cmd = "sh $test_dir/command $bin_dir $test_dir 1> $results/run.stdout 2> $results/run.stderr";

  # Saving the command line is best effort: warn, but still run the test.
  if (open(my $cmdHandle, '>', "$results/cmd_line")) {
    print {$cmdHandle} "$cmd";
    close $cmdHandle;
  } else {
    print STDERR "[WARNING] Cannot write $results/cmd_line: $!\n";
  }

  my ($o, $ec, $sig) = run_command($cmd);
  my $elapsed = time - $start_time;
  return ($o, $elapsed, $ec, $sig);
}
105
+
106
# Runs a shell command with backticks and decodes its wait status.
#
#   $cmd - the command line to run
#
# Returns ($output, $exit_code, $signal):
#   $output    - the command's captured standard output
#   $exit_code - the high byte of the wait status
#   $signal    - 0 if the command was not killed by a signal, otherwise the
#                value returned by sig_name() for the signal number
# If the command could not be started at all ($? is -1), the exit code is
# reported as 255 and the signal as 0.
sub run_command {
  my ($cmd) = @_;
  my $o = `$cmd`;
  my $status = $?;

  # -1 means the command could not be started; shifting or masking that value
  # would yield a meaningless exit code and a bogus signal number.
  if ($status == -1) {
    print STDERR "Failed to execute: $cmd [$!]\n";
    return $o, 255, 0;
  }

  my $exit_code = $status >> 8;
  my $signal = $status & 127;
  if ($signal) { $signal = sig_name($signal); }
  return $o, $exit_code, $signal;
}
116
+
117
# Translates a signal number into its name.
#
#   $sig - signal number (e.g. 2)
#
# Returns the name with a "SIG" prefix (e.g. "SIGINT"), taken from the
# signal table of the running perl's Config module. A number that is not in
# that table is returned as "SIG#<number>", so the result is always a true
# value.
sub sig_name {
  my $sig = shift;

  # Config lists names and numbers in parallel; aliases repeat a number, so
  # the first name seen for each number is kept.
  require Config;
  my @names = split ' ', (defined $Config::Config{sig_name} ? $Config::Config{sig_name} : '');
  my @numbers = split ' ', (defined $Config::Config{sig_num} ? $Config::Config{sig_num} : '');

  my %nameOf;
  for my $i (0 .. $#names) {
    next unless defined $numbers[$i];
    $nameOf{$numbers[$i]} = $names[$i] unless exists $nameOf{$numbers[$i]};
  }

  return exists $nameOf{$sig} ? "SIG$nameOf{$sig}" : "SIG#$sig";
}
121
+
122
# Builds a label for a results directory.
#
#   $file - path whose modification time is embedded in the label
#
# Returns "command.v<mtime>-<user>-at-<now>", where both times are formatted
# as YYYYMMDD-HHMMSS in UTC and <user> is the output of `whoami`. If $file is
# undefined or cannot be stat'ed, the epoch (19700101-000000) is used for
# <mtime>.
sub get_timestamp {
  my ($file) = @_;

  # Only the modification time (field 9 of stat) is needed.
  my $mtime = defined $file ? (stat($file))[9] : undef;
  $mtime = 0 unless defined $mtime;

  my $timestamp = strftime("%Y%m%d-%H%M%S", gmtime($mtime));
  my $timestamp2 = strftime("%Y%m%d-%H%M%S", gmtime());

  my $username = `whoami`;
  $username = "unknown" unless defined $username && length $username;
  chomp $username;

  return "command.v$timestamp-$username-at-$timestamp2";
}
130
+
mosesdecoder/regression-testing/run-test-misc.perl ADDED
@@ -0,0 +1,90 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env perl
2
+
3
+ use warnings;
4
+ use strict;
5
+
6
+ BEGIN {
7
+ use Cwd qw/ abs_path cwd /;
8
+ use File::Basename;
9
+ my $script_dir = dirname(abs_path($0));
10
+ print STDERR "script_dir=$script_dir\n";
11
+ push @INC, $script_dir;
12
+ }
13
+
14
+ use FindBin qw($Bin);
15
+ use MosesRegressionTesting;
16
+ use Getopt::Long;
17
+ use File::Temp qw ( tempfile );
18
+ use POSIX qw ( strftime );
19
+
20
# Regression-test driver for miscellaneous tests: runs the test's own
# run.perl script and compares <results-dir>/out with the test's
# truth/results.txt.
# Exits 0 and prints SUCCESS when they match, exits 1 otherwise.

my ($mosesRoot, $mosesBin, $test_name, $data_dir, $test_dir, $results_dir);

GetOptions("moses-root=s" => \$mosesRoot,
           "moses-bin=s" => \$mosesBin,
           "test=s" => \$test_name,
           "data-dir=s"=> \$data_dir,
           "test-dir=s"=> \$test_dir,
           "results-dir=s"=> \$results_dir,
          ) or exit 1;

# output dir: defaults to a timestamped directory below the data directory
unless (defined $results_dir)
{
  my $ts = get_timestamp($mosesRoot);
  $results_dir = "$data_dir/results/$test_name/$ts";
}

# Create the directory tree without going through a shell, so paths with
# spaces or shell metacharacters are handled correctly.
use File::Path qw( make_path );
eval { make_path($results_dir); 1 }
  or print STDERR "WARNING: could not create $results_dir: $@\n";

# The test's run.perl is started with this script's directory on its
# include path (-I).
use File::Basename qw/dirname/;
my $dir = dirname ($0);
my $cmdMain = "perl -I $dir $test_dir/$test_name/run.perl -moses-root $mosesRoot -moses-bin $mosesBin -test $test_name -data-dir $data_dir -test-dir $test_dir -results-dir $results_dir\n";

# Record the command line next to the results for later debugging.
if (open(my $cmdHandle, '>', "$results_dir/cmd_line"))
{
  print {$cmdHandle} $cmdMain;
  close $cmdHandle;
}
else
{
  print STDERR "WARNING: cannot write $results_dir/cmd_line: $!\n";
}

`$cmdMain`;

my $outPath = "$results_dir/out";
my $truthPath = "$test_dir/$test_name/truth/results.txt";

print STDERR "outPath=$outPath \n truthPath=$truthPath \n";

if (-e $outPath)
{
  # Judge by diff's exit status (0 = identical, 1 = differences, 2 = trouble)
  # rather than by counting its stdout lines: when diff itself fails, e.g.
  # because the truth file is missing, it prints nothing on stdout and a line
  # count of 0 would wrongly be reported as SUCCESS.
  my $cmd = "diff --exclude=cmd_line $outPath $truthPath";
  my $diffOutput = `$cmd 2>&1`;

  if ($? == 0)
  {
    print STDERR "SUCCESS\n";
    exit 0;
  }
  else
  {
    print STDERR "FAILURE. Ran $cmdMain\n";
    exit 1;
  }
}
else
{
  print STDERR "FAILURE. Output does not exists. Ran $cmdMain\n";
  exit 1;
}
77
+
78
+ ###################################
79
# Builds a label for a results directory.
#
#   $file - path whose modification time is embedded in the label
#
# Returns "moses.v<mtime>-<user>-at-<now>", where both times are formatted as
# YYYYMMDD-HHMMSS in UTC and <user> is the output of `whoami`. If $file is
# undefined or cannot be stat'ed, the epoch (19700101-000000) is used for
# <mtime>.
sub get_timestamp {
  my ($file) = @_;

  # Only the modification time (field 9 of stat) is needed.
  my $mtime = defined $file ? (stat($file))[9] : undef;
  $mtime = 0 unless defined $mtime;

  my $timestamp = strftime("%Y%m%d-%H%M%S", gmtime($mtime));
  my $timestamp2 = strftime("%Y%m%d-%H%M%S", gmtime());

  my $username = `whoami`;
  $username = "unknown" unless defined $username && length $username;
  chomp $username;

  return "moses.v$timestamp-$username-at-$timestamp2";
}
89
+
90
+
mosesdecoder/regression-testing/run-test-scorer.perl ADDED
@@ -0,0 +1,96 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env perl
2
+
3
+ use warnings;
4
+ use strict;
5
+
6
+ BEGIN {
7
+ use Cwd qw/ abs_path /;
8
+ use File::Basename;
9
+ my $script_dir = dirname(abs_path($0));
10
+ print STDERR "script_dir=$script_dir\n";
11
+ push @INC, $script_dir;
12
+ }
13
+
14
+ use FindBin qw($Bin);
15
+ use MosesRegressionTesting;
16
+ use Getopt::Long;
17
+ use File::Temp qw ( tempfile );
18
+ use POSIX qw ( strftime );
19
+
20
# Regression-test driver for the scorer: runs the scorer with the arguments
# stored in <test-dir>/<test>/args.txt and compares <results-dir>/pt.half
# with the test's truth/results.txt.
# Exits 0 and prints SUCCESS when they match, exits 1 otherwise.

my $scoreExe;       # --scorer
my $test_name;      # --test
my $data_dir;       # --data-dir
my $test_dir;       # --test-dir
my $results_dir;    # --results-dir

GetOptions("scorer=s" => \$scoreExe,
           "test=s" => \$test_name,
           "data-dir=s"=> \$data_dir,
           "test-dir=s"=> \$test_dir,
           "results-dir=s"=> \$results_dir,
          ) or exit 1;

# output dir: defaults to a timestamped directory below the data directory
unless (defined $results_dir)
{
  my $ts = get_timestamp($scoreExe);
  $results_dir = "$data_dir/results/$test_name/$ts";
}

# Create the directory tree without going through a shell, so paths with
# spaces or shell metacharacters are handled correctly.
use File::Path qw( make_path );
eval { make_path($results_dir); 1 }
  or print STDERR "WARNING: could not create $results_dir: $@\n";

my $outPath = "$results_dir/pt.half";

# Read the argument template. A missing file is only warned about; the
# scorer is then run without extra arguments.
my $argsFile = "$test_dir/$test_name/args.txt";
my $scorerArgs = "";
if (open(my $argsHandle, '<', $argsFile))
{
  local $/;
  my $content = <$argsHandle>;
  $scorerArgs = $content if defined $content;
  close $argsHandle;
}
else
{
  print STDERR "WARNING: cannot read $argsFile: $!\n";
}

# Expand $name tokens in the template to the values of this script's
# variables of the same name (e.g. $outPath, $test_dir, $results_dir).
# NOTE(review): /ee string-evaluates text taken from args.txt; the file
# must come from a trusted test suite.
$scorerArgs =~ s/(\$\w+)/$1/eeg;

my $cmdMain = "$scoreExe $scorerArgs \n";

# Record the command line next to the results for later debugging.
if (open(my $cmdHandle, '>', "$results_dir/cmd_line"))
{
  print {$cmdHandle} "$cmdMain";
  close $cmdHandle;
}
else
{
  print STDERR "WARNING: cannot write $results_dir/cmd_line: $!\n";
}

`$cmdMain`;

my $truthPath = "$test_dir/$test_name/truth/results.txt";

if (-e $outPath)
{
  # Judge by diff's exit status (0 = identical, 1 = differences, 2 = trouble)
  # rather than by counting its stdout lines: when diff itself fails, e.g.
  # because the truth file is missing, it prints nothing on stdout and a line
  # count of 0 would wrongly be reported as SUCCESS.
  my $cmd = "diff $outPath $truthPath";
  my $diffOutput = `$cmd 2>&1`;

  if ($? == 0)
  {
    print STDERR "SUCCESS\n";
    exit 0;
  }
  else
  {
    print STDERR "FAILURE. Ran $cmdMain\n";
    exit 1;
  }
}
else
{
  print STDERR "FAILURE. Output does not exists. Ran $cmdMain\n";
  exit 1;
}
83
+
84
+ ###################################
85
# Builds a label for a results directory.
#
#   $file - path whose modification time is embedded in the label
#
# Returns "moses.v<mtime>-<user>-at-<now>", where both times are formatted as
# YYYYMMDD-HHMMSS in UTC and <user> is the output of `whoami`. If $file is
# undefined or cannot be stat'ed, the epoch (19700101-000000) is used for
# <mtime>.
sub get_timestamp {
  my ($file) = @_;

  # Only the modification time (field 9 of stat) is needed.
  my $mtime = defined $file ? (stat($file))[9] : undef;
  $mtime = 0 unless defined $mtime;

  my $timestamp = strftime("%Y%m%d-%H%M%S", gmtime($mtime));
  my $timestamp2 = strftime("%Y%m%d-%H%M%S", gmtime());

  my $username = `whoami`;
  $username = "unknown" unless defined $username && length $username;
  chomp $username;

  return "moses.v$timestamp-$username-at-$timestamp2";
}
95
+
96
+