sleepyhead111 committed on
Commit
7221d50
·
verified ·
1 Parent(s): 4b25173

Add files using upload-large-folder tool

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .gitattributes +1 -0
  2. mosesdecoder/defer/Joint.h +139 -0
  3. mosesdecoder/defer/PhraseDictionaryInterpolated.cpp +186 -0
  4. mosesdecoder/defer/PhraseLengthFeatureTest.cpp +104 -0
  5. mosesdecoder/lm/builder/corpus_count.hh +53 -0
  6. mosesdecoder/lm/builder/dump_counts_main.cc +36 -0
  7. mosesdecoder/lm/builder/lmplz_main.cc +220 -0
  8. mosesdecoder/lm/common/CMakeLists.txt +40 -0
  9. mosesdecoder/lm/common/Jamfile +2 -0
  10. mosesdecoder/lm/common/joint_order.hh +71 -0
  11. mosesdecoder/lm/common/ngram.hh +77 -0
  12. mosesdecoder/lm/common/print.cc +62 -0
  13. mosesdecoder/lm/common/renumber.cc +17 -0
  14. mosesdecoder/lm/common/renumber.hh +30 -0
  15. mosesdecoder/mert/ReferenceTest.cpp +123 -0
  16. mosesdecoder/mert/ScoreArray.cpp +169 -0
  17. mosesdecoder/mert/ScoreArray.h +113 -0
  18. mosesdecoder/mert/Util.h +149 -0
  19. mosesdecoder/moses/TranslationModel/UG/util/ibm1-align +3 -0
  20. mosesdecoder/scripts/Jamfile +23 -0
  21. mosesdecoder/scripts/README +15 -0
  22. mosesdecoder/scripts/generic/binarize4moses2.perl +88 -0
  23. mosesdecoder/scripts/generic/bsbleu.py +179 -0
  24. mosesdecoder/scripts/generic/compound-splitter.perl +295 -0
  25. mosesdecoder/scripts/generic/extract-factors.pl +24 -0
  26. mosesdecoder/scripts/generic/extract-parallel.perl +385 -0
  27. mosesdecoder/scripts/generic/fsa-sample.fsa +10 -0
  28. mosesdecoder/scripts/generic/fsa2fsal.pl +53 -0
  29. mosesdecoder/scripts/generic/fsa2plf.pl +182 -0
  30. mosesdecoder/scripts/generic/generic-parallel.perl +119 -0
  31. mosesdecoder/scripts/generic/giza-parallel.perl +134 -0
  32. mosesdecoder/scripts/generic/lopar2pos.pl +20 -0
  33. mosesdecoder/scripts/generic/moses_sim_pe.py +452 -0
  34. mosesdecoder/scripts/generic/mteval-v11b.pl +761 -0
  35. mosesdecoder/scripts/generic/mteval-v12.pl +784 -0
  36. mosesdecoder/scripts/generic/mteval-v13a.pl +1170 -0
  37. mosesdecoder/scripts/generic/mteval-v14.pl +1179 -0
  38. mosesdecoder/scripts/generic/multi-bleu-detok.perl +214 -0
  39. mosesdecoder/scripts/generic/multi-bleu.perl +177 -0
  40. mosesdecoder/scripts/generic/multi_moses.py +332 -0
  41. mosesdecoder/scripts/generic/ph_numbers.perl +106 -0
  42. mosesdecoder/scripts/generic/reverse-alignment.perl +24 -0
  43. mosesdecoder/scripts/generic/score-parallel.perl +428 -0
  44. mosesdecoder/scripts/generic/score_parallel.py +776 -0
  45. mosesdecoder/scripts/generic/strip-xml.perl +48 -0
  46. mosesdecoder/scripts/generic/trainlm-irst2.perl +72 -0
  47. mosesdecoder/scripts/share/nonbreaking_prefixes/README.txt +8 -0
  48. mosesdecoder/scripts/share/nonbreaking_prefixes/nonbreaking_prefix.as +65 -0
  49. mosesdecoder/scripts/share/nonbreaking_prefixes/nonbreaking_prefix.ca +75 -0
  50. mosesdecoder/scripts/share/nonbreaking_prefixes/nonbreaking_prefix.cs +390 -0
.gitattributes CHANGED
@@ -37,3 +37,4 @@ fairseq-0.10.2/fairseq/libbleu.cpython-310-x86_64-linux-gnu.so filter=lfs diff=l
37
  fairseq-0.10.2/fairseq/data/data_utils_fast.cpython-310-x86_64-linux-gnu.so filter=lfs diff=lfs merge=lfs -text
38
  fairseq-0.10.2/fairseq/data/token_block_utils_fast.cpython-310-x86_64-linux-gnu.so filter=lfs diff=lfs merge=lfs -text
39
  fairseq-0.10.2/docs/fairseq.gif filter=lfs diff=lfs merge=lfs -text
 
 
37
  fairseq-0.10.2/fairseq/data/data_utils_fast.cpython-310-x86_64-linux-gnu.so filter=lfs diff=lfs merge=lfs -text
38
  fairseq-0.10.2/fairseq/data/token_block_utils_fast.cpython-310-x86_64-linux-gnu.so filter=lfs diff=lfs merge=lfs -text
39
  fairseq-0.10.2/docs/fairseq.gif filter=lfs diff=lfs merge=lfs -text
40
+ mosesdecoder/moses/TranslationModel/UG/util/ibm1-align filter=lfs diff=lfs merge=lfs -text
mosesdecoder/defer/Joint.h ADDED
@@ -0,0 +1,139 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ // $Id$
2
+
3
+ /***********************************************************************
4
+ Moses - factored phrase-based language decoder
5
+ Copyright (C) 2006 University of Edinburgh
6
+
7
+ This library is free software; you can redistribute it and/or
8
+ modify it under the terms of the GNU Lesser General Public
9
+ License as published by the Free Software Foundation; either
10
+ version 2.1 of the License, or (at your option) any later version.
11
+
12
+ This library is distributed in the hope that it will be useful,
13
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15
+ Lesser General Public License for more details.
16
+
17
+ You should have received a copy of the GNU Lesser General Public
18
+ License along with this library; if not, write to the Free Software
19
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
20
+ ***********************************************************************/
21
+
22
+ #ifndef moses_LanguageModelJoint_h
23
+ #define moses_LanguageModelJoint_h
24
+
25
+ #include <vector>
26
+ #include <string>
27
+ #include <sstream>
28
+ #include "SingleFactor.h"
29
+ #include "MultiFactor.h"
30
+ #include "moses/Word.h"
31
+ #include "moses/FactorTypeSet.h"
32
+ #include "moses/FactorCollection.h"
33
+
34
+ namespace Moses
35
+ {
36
+
37
+ class Phrase;
38
+ class FactorCollection;
39
+
40
+ /** LM of multiple factors. A simple extension of single factor LM - factors backoff together.
41
+ * Rather slow as this uses string concatenation/split.
42
+ * Not used for a long time
43
+ */
44
+ class LanguageModelJoint : public LanguageModelMultiFactor
45
+ {
46
+ protected:
47
+ LanguageModelSingleFactor *m_lmImpl;
48
+ std::vector<FactorType> m_factorTypesOrdered;
49
+
50
+ size_t m_implFactor;
51
+ public:
52
+ LanguageModelJoint(const std::string &line, LanguageModelSingleFactor *lmImpl)
53
+ :LanguageModelMultiFactor(line) {
54
+ m_lmImpl = lmImpl;
55
+ }
56
+
57
+ ~LanguageModelJoint() {
58
+ delete m_lmImpl;
59
+ }
60
+
61
+ bool Load(AllOptions const& opts, const std::string &filePath
62
+ , const std::vector<FactorType> &factorTypes
63
+ , size_t nGramOrder) {
64
+ m_factorTypes = FactorMask(factorTypes);
65
+ m_filePath = filePath;
66
+ m_nGramOrder = nGramOrder;
67
+
68
+ m_factorTypesOrdered= factorTypes;
69
+ m_implFactor = 0;
70
+
71
+ FactorCollection &factorCollection = FactorCollection::Instance();
72
+
73
+ // sentence markers
74
+ for (size_t index = 0 ; index < factorTypes.size() ; ++index) {
75
+ FactorType factorType = factorTypes[index];
76
+ m_sentenceStartWord[factorType] = factorCollection.AddFactor(Output, factorType, BOS_);
77
+ m_sentenceEndWord[factorType] = factorCollection.AddFactor(Output, factorType, EOS_);
78
+ }
79
+
80
+ m_lmImpl->Load(AllOptions const& opts);
81
+ }
82
+
83
+ LMResult GetValueForgotState(const std::vector<const Word*> &contextFactor, FFState &outState) const {
84
+ if (contextFactor.size() == 0) {
85
+ LMResult ret;
86
+ ret.score = 0.0;
87
+ ret.unknown = false;
88
+ return ret;
89
+ }
90
+
91
+ // joint context for internal LM
92
+ std::vector<const Word*> jointContext;
93
+
94
+ for (size_t currPos = 0 ; currPos < m_nGramOrder ; ++currPos ) {
95
+ const Word &word = *contextFactor[currPos];
96
+
97
+ // add word to chunked context
98
+ std::stringstream stream("");
99
+
100
+ const Factor *factor = word[ m_factorTypesOrdered[0] ];
101
+ stream << factor->GetString();
102
+
103
+ for (size_t index = 1 ; index < m_factorTypesOrdered.size() ; ++index) {
104
+ FactorType factorType = m_factorTypesOrdered[index];
105
+ const Factor *factor = word[factorType];
106
+ stream << "|" << factor->GetString();
107
+ }
108
+
109
+ factor = FactorCollection::Instance().AddFactor(Output, m_implFactor, stream.str());
110
+
111
+ Word* jointWord = new Word;
112
+ jointWord->SetFactor(m_implFactor, factor);
113
+ jointContext.push_back(jointWord);
114
+ }
115
+
116
+ // calc score on chunked phrase
117
+ LMResult ret = m_lmImpl->GetValueForgotState(jointContext, outState);
118
+
119
+ RemoveAllInColl(jointContext);
120
+
121
+ return ret;
122
+ }
123
+
124
+ const FFState *GetNullContextState() const {
125
+ return m_lmImpl->GetNullContextState();
126
+ }
127
+
128
+ const FFState *GetBeginSentenceState() const {
129
+ return m_lmImpl->GetBeginSentenceState();
130
+ }
131
+
132
+ FFState *NewState(const FFState *from) const {
133
+ return m_lmImpl->NewState(from);
134
+ }
135
+
136
+ };
137
+
138
+ }
139
+ #endif
mosesdecoder/defer/PhraseDictionaryInterpolated.cpp ADDED
@@ -0,0 +1,186 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /***********************************************************************
2
+ Moses - factored phrase-based language decoder
3
+ Copyright (C) 2013- University of Edinburgh
4
+
5
+ This library is free software; you can redistribute it and/or
6
+ modify it under the terms of the GNU Lesser General Public
7
+ License as published by the Free Software Foundation; either
8
+ version 2.1 of the License, or (at your option) any later version.
9
+
10
+ This library is distributed in the hope that it will be useful,
11
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
12
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13
+ Lesser General Public License for more details.
14
+
15
+ You should have received a copy of the GNU Lesser General Public
16
+ License along with this library; if not, write to the Free Software
17
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
18
+ ***********************************************************************/
19
+
20
+ #include <boost/lexical_cast.hpp>
21
+ #include <boost/unordered_set.hpp>
22
+
23
+ #include "util/exception.hh"
24
+ #include "util/tokenize_piece.hh"
25
+ #include "moses/TranslationModel/PhraseDictionaryInterpolated.h"
26
+
27
+ using namespace std;
28
+
29
+ namespace Moses
30
+ {
31
+
32
+ PhraseDictionaryInterpolated::PhraseDictionaryInterpolated
33
+ (size_t numScoreComponent,size_t numInputScores,const PhraseDictionaryFeature* feature):
34
+ PhraseDictionary(numScoreComponent,feature),
35
+ m_targetPhrases(NULL),
36
+ m_languageModels(NULL) {}
37
+
38
+ bool PhraseDictionaryInterpolated::Load(
39
+ const std::vector<FactorType> &input
40
+ , const std::vector<FactorType> &output
41
+ , const std::vector<std::string>& config
42
+ , const std::vector<float> &weightT
43
+ , size_t tableLimit
44
+ , const LMList &languageModels
45
+ , float weightWP)
46
+ {
47
+
48
+ m_languageModels = &languageModels;
49
+ m_weightT = weightT;
50
+ m_tableLimit = tableLimit;
51
+ m_weightWP = weightWP;
52
+
53
+ //The config should be as follows:
54
+ //0-3: type factor factor num-components (as usual)
55
+ //4: combination mode (e.g. naive)
56
+ //5-(length-2): List of phrase-table files
57
+ //length-1: Weight string, in the same format as used for tmcombine
58
+
59
+ UTIL_THROW_IF(config.size() < 7, util::Exception, "Missing fields from phrase table configuration: expected at least 7");
60
+ UTIL_THROW_IF(config[4] != "naive", util::Exception, "Unsupported combination mode: '" << config[4] << "'");
61
+
62
+ // Create the dictionaries
63
+ for (size_t i = 5; i < config.size()-1; ++i) {
64
+ m_dictionaries.push_back(DictionaryHandle(new PhraseDictionaryTreeAdaptor(
65
+ GetFeature()->GetNumScoreComponents(),
66
+ GetFeature()->GetNumInputScores(),
67
+ GetFeature())));
68
+ bool ret = m_dictionaries.back()->Load(
69
+ input,
70
+ output,
71
+ config[i],
72
+ weightT,
73
+ 0,
74
+ languageModels,
75
+ weightWP);
76
+ if (!ret) return ret;
77
+ }
78
+
79
+ //Parse the weight strings
80
+ for (util::TokenIter<util::SingleCharacter, false> featureWeights(config.back(), util::SingleCharacter(';')); featureWeights; ++featureWeights) {
81
+ m_weights.push_back(vector<float>());
82
+ float sum = 0;
83
+ for (util::TokenIter<util::SingleCharacter, false> tableWeights(*featureWeights, util::SingleCharacter(',')); tableWeights; ++tableWeights) {
84
+ const float weight = boost::lexical_cast<float>(*tableWeights);
85
+ m_weights.back().push_back(weight);
86
+ sum += weight;
87
+ }
88
+ UTIL_THROW_IF(m_weights.back().size() != m_dictionaries.size(), util::Exception,
89
+ "Number of weights (" << m_weights.back().size() <<
90
+ ") does not match number of dictionaries to combine (" << m_dictionaries.size() << ")");
91
+ UTIL_THROW_IF(abs(sum - 1) > 0.01, util::Exception, "Weights not normalised");
92
+
93
+ }
94
+
95
+ //check number of weight sets. Make sure there is a weight for every score component
96
+ //except for the last - which is assumed to be the phrase penalty.
97
+ UTIL_THROW_IF(m_weights.size() != 1 && m_weights.size() != GetFeature()->GetNumScoreComponents()-1, util::Exception, "Unexpected number of weight sets");
98
+ //if 1 weight set, then repeat
99
+ if (m_weights.size() == 1) {
100
+ while(m_weights.size() < GetFeature()->GetNumScoreComponents()-1) {
101
+ m_weights.push_back(m_weights[0]);
102
+ }
103
+ }
104
+
105
+ return true;
106
+ }
107
+
108
+ void PhraseDictionaryInterpolated::InitializeForInput(ttasksptr const& ttask)
109
+ {
110
+ for (size_t i = 0; i < m_dictionaries.size(); ++i) {
111
+ m_dictionaries[i]->InitializeForInput(ttask);
112
+ }
113
+ }
114
+
115
+ typedef
116
+ boost::unordered_set<TargetPhrase*,PhrasePtrHasher,PhrasePtrComparator> PhraseSet;
117
+
118
+
119
+ TargetPhraseCollection::shared_ptr
120
+ PhraseDictionaryInterpolated::GetTargetPhraseCollection(const Phrase& src) const
121
+ {
122
+
123
+ delete m_targetPhrases;
124
+ m_targetPhrases = new TargetPhraseCollection();
125
+ PhraseSet allPhrases;
126
+ vector<PhraseSet> phrasesByTable(m_dictionaries.size());
127
+ for (size_t i = 0; i < m_dictionaries.size(); ++i) {
128
+ TargetPhraseCollection::shared_ptr phrases = m_dictionaries[i]->GetTargetPhraseCollection(src);
129
+ if (phrases) {
130
+ for (TargetPhraseCollection::const_iterator j = phrases->begin();
131
+ j != phrases->end(); ++j) {
132
+ allPhrases.insert(*j);
133
+ phrasesByTable[i].insert(*j);
134
+ }
135
+ }
136
+ }
137
+ ScoreComponentCollection sparseVector;
138
+ for (PhraseSet::const_iterator i = allPhrases.begin(); i != allPhrases.end(); ++i) {
139
+ TargetPhrase* combinedPhrase = new TargetPhrase((Phrase)**i);
140
+ //combinedPhrase->ResetScore();
141
+ //cerr << *combinedPhrase << " " << combinedPhrase->GetScoreBreakdown() << endl;
142
+ combinedPhrase->SetSourcePhrase((*i)->GetSourcePhrase());
143
+ combinedPhrase->SetAlignTerm(&((*i)->GetAlignTerm()));
144
+ combinedPhrase->SetAlignNonTerm(&((*i)->GetAlignTerm()));
145
+ Scores combinedScores(GetFeature()->GetNumScoreComponents());
146
+ for (size_t j = 0; j < phrasesByTable.size(); ++j) {
147
+ PhraseSet::const_iterator tablePhrase = phrasesByTable[j].find(combinedPhrase);
148
+ if (tablePhrase != phrasesByTable[j].end()) {
149
+ Scores tableScores = (*tablePhrase)->GetScoreBreakdown()
150
+ .GetScoresForProducer(GetFeature());
151
+ //cerr << "Scores from " << j << " table: ";
152
+ for (size_t k = 0; k < tableScores.size()-1; ++k) {
153
+ //cerr << tableScores[k] << "(" << exp(tableScores[k]) << ") ";
154
+ combinedScores[k] += m_weights[k][j] * exp(tableScores[k]);
155
+ //cerr << m_weights[k][j] * exp(tableScores[k]) << " ";
156
+ }
157
+ //cerr << endl;
158
+ }
159
+ }
160
+ //map back to log space
161
+ //cerr << "Combined ";
162
+ for (size_t k = 0; k < combinedScores.size()-1; ++k) {
163
+ //cerr << combinedScores[k] << " ";
164
+ combinedScores[k] = log(combinedScores[k]);
165
+ //cerr << combinedScores[k] << " ";
166
+ }
167
+ //cerr << endl;
168
+ combinedScores.back() = 1; //assume last is penalty
169
+ combinedPhrase->SetScore(
170
+ GetFeature(),
171
+ combinedScores,
172
+ sparseVector,
173
+ m_weightT,
174
+ m_weightWP,
175
+ *m_languageModels);
176
+ //cerr << *combinedPhrase << " " << combinedPhrase->GetScoreBreakdown() << endl;
177
+ m_targetPhrases->Add(combinedPhrase);
178
+ }
179
+
180
+ m_targetPhrases->Prune(true,m_tableLimit);
181
+
182
+
183
+ return m_targetPhrases;
184
+ }
185
+
186
+ }
mosesdecoder/defer/PhraseLengthFeatureTest.cpp ADDED
@@ -0,0 +1,104 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
/***********************************************************************
Moses - factored phrase-based language decoder
Copyright (C) 2010 University of Edinburgh

This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.

This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.

You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
***********************************************************************/
// Unit test for PhraseLengthFeature: checks that evaluating a translation
// option fires exactly the expected sparse features for source length,
// target length, and the (source,target) length pair.
#include <boost/test/unit_test.hpp>

#include "moses/FF/PhraseLengthFeature.h"
#include "moses/FactorCollection.h"
#include "moses/Sentence.h"
#include "moses/TargetPhrase.h"
#include "moses/TranslationOption.h"

using namespace Moses;
using namespace std;

BOOST_AUTO_TEST_SUITE(phrase_length_feature)

//TODO: Factor out setup code so that it can be reused

// Build a single-factor Word whose factor 0 (Input direction) is `text`.
static Word MakeWord(string text)
{
  FactorCollection &factorCollection = FactorCollection::Instance();
  const Factor* f = factorCollection.AddFactor(Input,0,text);
  Word w;
  w.SetFactor(0,f);
  return w;
}


BOOST_AUTO_TEST_CASE(evaluate)
{
  Word w1 = MakeWord("w1");
  Word w2 = MakeWord("y2");
  Word w3 = MakeWord("x3");
  Word w4 = MakeWord("w4");

  // Target phrases of lengths 3, 2, and 4 respectively.
  Phrase p1;
  p1.AddWord(w1);
  p1.AddWord(w3);
  p1.AddWord(w4);

  Phrase p2;
  p2.AddWord(w1);
  p2.AddWord(w2);

  Phrase p3;
  p3.AddWord(w2);
  p3.AddWord(w1);
  p3.AddWord(w4);
  p3.AddWord(w4);

  TargetPhrase tp1(p1);
  TargetPhrase tp2(p2);
  TargetPhrase tp3(p3);

  // 6-word input sentence read with a single (surface) factor.
  Sentence sentence;
  vector<FactorType> order;
  order.push_back(0);
  stringstream in("the input sentence has 6 words");
  sentence.Read(in, order);

  // Source spans of lengths 1, 3, and 2 respectively.
  TranslationOption topt1(WordsRange(0,0), tp1);
  TranslationOption topt2(WordsRange(1,3), tp2);
  TranslationOption topt3(WordsRange(2,3), tp3);

  PhraseBasedFeatureContext context1(topt1,sentence);
  PhraseBasedFeatureContext context2(topt2,sentence);
  PhraseBasedFeatureContext context3(topt3,sentence);

  PhraseLengthFeature plf;

  ScoreComponentCollection acc1,acc2,acc3;

  // Expected feature names: "s<srcLen>", "t<tgtLen>", "<srcLen>,<tgtLen>"
  // (assumption from the asserted names — confirm against
  // PhraseLengthFeature's implementation).
  plf.Evaluate(context1, &acc1);
  BOOST_CHECK_EQUAL(acc1.GetScoreForProducer(&plf, "s1"),1);
  BOOST_CHECK_EQUAL(acc1.GetScoreForProducer(&plf, "t3"),1);
  BOOST_CHECK_EQUAL(acc1.GetScoreForProducer(&plf, "1,3"),1);

  plf.Evaluate(context2, &acc2);
  BOOST_CHECK_EQUAL(acc2.GetScoreForProducer(&plf, "s3"),1);
  BOOST_CHECK_EQUAL(acc2.GetScoreForProducer(&plf, "t2"),1);
  BOOST_CHECK_EQUAL(acc2.GetScoreForProducer(&plf, "3,2"),1);

  plf.Evaluate(context3, &acc3);
  BOOST_CHECK_EQUAL(acc3.GetScoreForProducer(&plf, "s2"),1);
  BOOST_CHECK_EQUAL(acc3.GetScoreForProducer(&plf, "t4"),1);
  BOOST_CHECK_EQUAL(acc3.GetScoreForProducer(&plf, "2,4"),1);
}

BOOST_AUTO_TEST_SUITE_END()
mosesdecoder/lm/builder/corpus_count.hh ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#ifndef LM_BUILDER_CORPUS_COUNT_H
#define LM_BUILDER_CORPUS_COUNT_H

#include "lm/lm_exception.hh"
#include "lm/word_index.hh"
#include "util/scoped.hh"

#include <cstddef>
#include <string>
#include <stdint.h>
#include <vector>

namespace util {
class FilePiece;
namespace stream {
class ChainPosition;
} // namespace stream
} // namespace util

namespace lm {
namespace builder {

// First pipeline step of lmplz: reads a tokenized corpus, builds the
// vocabulary, and emits deduplicated n-gram counts into a stream chain.
class CorpusCount {
public:
  // Memory usage will be DedupeMultipler(order) * block_size + total_chain_size + unknown vocab_hash_size
  static float DedupeMultiplier(std::size_t order);

  // How much memory vocabulary will use based on estimated size of the vocab.
  static std::size_t VocabUsage(std::size_t vocab_estimate);

  // token_count: out.
  // type_count aka vocabulary size. Initialize to an estimate. It is set to the exact value.
  // vocab_write: fd the vocabulary strings are written to.
  // disallowed_symbol: what to do when <s>, </s>, or <unk> appear in the text.
  CorpusCount(util::FilePiece &from, int vocab_write, uint64_t &token_count, WordIndex &type_count, std::vector<bool> &prune_words, const std::string& prune_vocab_filename, std::size_t entries_per_block, WarningAction disallowed_symbol);

  // Stream worker entry point: consumes the corpus and fills `position`.
  void Run(const util::stream::ChainPosition &position);

private:
  // References below alias caller-owned state; the caller must keep them
  // alive for the lifetime of this object.
  util::FilePiece &from_;
  int vocab_write_;
  uint64_t &token_count_;
  WordIndex &type_count_;
  std::vector<bool>& prune_words_;
  const std::string& prune_vocab_filename_;

  // Scratch buffer for the dedupe hash table, sized by entries_per_block.
  std::size_t dedupe_mem_size_;
  util::scoped_malloc dedupe_mem_;

  WarningAction disallowed_symbol_action_;
};

} // namespace builder
} // namespace lm
#endif // LM_BUILDER_CORPUS_COUNT_H
mosesdecoder/lm/builder/dump_counts_main.cc ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#include "lm/common/print.hh"
#include "lm/word_index.hh"
#include "util/file.hh"
#include "util/read_compressed.hh"

#include <boost/lexical_cast.hpp>

#include <iostream>
#include <vector>

// Debug utility: print a binary n-gram count file as text, resolving
// vocabulary ids to words via the NULL-delimited vocab file.
int main(int argc, char *argv[]) {
  if (argc != 4) {
    std::cerr << "Usage: " << argv[0] << " counts vocabulary order\n"
              "The counts file contains records with 4-byte vocabulary ids followed by 8-byte\n"
              "counts. Each record has order many vocabulary ids.\n"
              "The vocabulary file contains the words delimited by NULL in order of id.\n"
              "The vocabulary file may not be compressed because it is mmapped but the counts\n"
              "file can be compressed.\n";
    return 1;
  }
  util::ReadCompressed counts(util::OpenReadOrThrow(argv[1]));
  util::scoped_fd vocab_file(util::OpenReadOrThrow(argv[2]));
  lm::VocabReconstitute vocab(vocab_file.get());
  unsigned int order = boost::lexical_cast<unsigned int>(argv[3]);
  // One record = `order` 4-byte word ids followed by one 8-byte count.
  std::vector<char> record(sizeof(uint32_t) * order + sizeof(uint64_t));
  while (std::size_t got = counts.ReadOrEOF(&*record.begin(), record.size())) {
    // A short read means the file was truncated mid-record.
    UTIL_THROW_IF(got != record.size(), util::Exception, "Read " << got << " bytes at the end of file, which is not a complete record of length " << record.size());
    const lm::WordIndex *words = reinterpret_cast<const lm::WordIndex*>(&*record.begin());
    for (const lm::WordIndex *i = words; i != words + order; ++i) {
      UTIL_THROW_IF(*i >= vocab.Size(), util::Exception, "Vocab ID " << *i << " is larger than the vocab file's maximum of " << vocab.Size() << ". Are you sure you have the right order and vocab file for these counts?");
      std::cout << vocab.Lookup(*i) << ' ';
    }
    // TODO don't use std::cout because it is slow. Add fast uint64_t printing support to FileStream.
    std::cout << *reinterpret_cast<const uint64_t*>(words + order) << '\n';
  }
}
mosesdecoder/lm/builder/lmplz_main.cc ADDED
@@ -0,0 +1,220 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #include "lm/builder/output.hh"
2
+ #include "lm/builder/pipeline.hh"
3
+ #include "lm/common/size_option.hh"
4
+ #include "lm/lm_exception.hh"
5
+ #include "util/file.hh"
6
+ #include "util/file_piece.hh"
7
+ #include "util/usage.hh"
8
+
9
+ #include <iostream>
10
+
11
+ #include <boost/program_options.hpp>
12
+ #include <boost/version.hpp>
13
+ #include <vector>
14
+
15
+ namespace {
16
+
17
+ // Parse and validate pruning thresholds then return vector of threshold counts
18
+ // for each n-grams order.
19
// Parse and validate pruning thresholds then return vector of threshold counts
// for each n-grams order.
// @param param  user-supplied threshold strings, one per order (may be shorter
//               than `order`; the last value is repeated for missing orders).
// @param order  model order; the result always has exactly this many entries.
// @throws util::Exception on non-numeric input, too many thresholds, or
//         thresholds that decrease with order.
std::vector<uint64_t> ParsePruning(const std::vector<std::string> &param, std::size_t order) {
  // convert to vector of integers
  std::vector<uint64_t> prune_thresholds;
  prune_thresholds.reserve(order);
  for (std::vector<std::string>::const_iterator it(param.begin()); it != param.end(); ++it) {
    try {
      prune_thresholds.push_back(boost::lexical_cast<uint64_t>(*it));
    } catch(const boost::bad_lexical_cast &) {
      UTIL_THROW(util::Exception, "Bad pruning threshold " << *it);
    }
  }

  // Fill with zeros by default (threshold 0 means no pruning).
  if (prune_thresholds.empty()) {
    prune_thresholds.resize(order, 0);
    return prune_thresholds;
  }

  // validate pruning threshold if specified
  // throw if each n-gram order has not threshold specified
  UTIL_THROW_IF(prune_thresholds.size() > order, util::Exception, "You specified pruning thresholds for orders 1 through " << prune_thresholds.size() << " but the model only has order " << order);
  // threshold for unigram can only be 0 (no pruning)

  // check if threshold are not in decreasing order
  uint64_t lower_threshold = 0;
  for (std::vector<uint64_t>::iterator it = prune_thresholds.begin(); it != prune_thresholds.end(); ++it) {
    UTIL_THROW_IF(lower_threshold > *it, util::Exception, "Pruning thresholds should be in non-decreasing order.  Otherwise substrings would be removed, which is bad for query-time data structures.");
    lower_threshold = *it;
  }

  // Pad to all orders using the last value.
  prune_thresholds.resize(order, prune_thresholds.back());
  return prune_thresholds;
}
53
+
54
// Build the fallback Kneser-Ney discounts from up to three user-supplied
// values (for adjusted counts 1, 2, and 3+). If fewer than three are given,
// the last value is repeated. amount[0] (count 0) is always 0.
// @throws util::Exception if param is empty, has more than three entries, or
//         any discount is outside [0, count].
lm::builder::Discount ParseDiscountFallback(const std::vector<std::string> &param) {
  lm::builder::Discount ret;
  UTIL_THROW_IF(param.size() > 3, util::Exception, "Specify at most three fallback discounts: 1, 2, and 3+");
  UTIL_THROW_IF(param.empty(), util::Exception, "Fallback discounting enabled, but no discount specified");
  ret.amount[0] = 0.0;
  for (unsigned i = 0; i < 3; ++i) {
    // Reuse the last provided value when fewer than three were given.
    float discount = boost::lexical_cast<float>(param[i < param.size() ? i : (param.size() - 1)]);
    UTIL_THROW_IF(discount < 0.0 || discount > static_cast<float>(i+1), util::Exception, "The discount for count " << (i+1) << " was parsed as " << discount << " which is not in the range [0, " << (i+1) << "].");
    ret.amount[i + 1] = discount;
  }
  return ret;
}
66
+
67
+ } // namespace
68
+
69
+ int main(int argc, char *argv[]) {
70
+ try {
71
+ namespace po = boost::program_options;
72
+ po::options_description options("Language model building options");
73
+ lm::builder::PipelineConfig pipeline;
74
+
75
+ std::string text, intermediate, arpa;
76
+ std::vector<std::string> pruning;
77
+ std::vector<std::string> discount_fallback;
78
+ std::vector<std::string> discount_fallback_default;
79
+ discount_fallback_default.push_back("0.5");
80
+ discount_fallback_default.push_back("1");
81
+ discount_fallback_default.push_back("1.5");
82
+ bool verbose_header;
83
+
84
+ options.add_options()
85
+ ("help,h", po::bool_switch(), "Show this help message")
86
+ ("order,o", po::value<std::size_t>(&pipeline.order)
87
+ #if BOOST_VERSION >= 104200
88
+ ->required()
89
+ #endif
90
+ , "Order of the model")
91
+ ("interpolate_unigrams", po::value<bool>(&pipeline.initial_probs.interpolate_unigrams)->default_value(true)->implicit_value(true), "Interpolate the unigrams (default) as opposed to giving lots of mass to <unk> like SRI. If you want SRI's behavior with a large <unk> and the old lmplz default, use --interpolate_unigrams 0.")
92
+ ("skip_symbols", po::bool_switch(), "Treat <s>, </s>, and <unk> as whitespace instead of throwing an exception")
93
+ ("temp_prefix,T", po::value<std::string>(&pipeline.sort.temp_prefix)->default_value("/tmp/lm"), "Temporary file prefix")
94
+ ("memory,S", lm:: SizeOption(pipeline.sort.total_memory, util::GuessPhysicalMemory() ? "80%" : "1G"), "Sorting memory")
95
+ ("minimum_block", lm::SizeOption(pipeline.minimum_block, "8K"), "Minimum block size to allow")
96
+ ("sort_block", lm::SizeOption(pipeline.sort.buffer_size, "64M"), "Size of IO operations for sort (determines arity)")
97
+ ("block_count", po::value<std::size_t>(&pipeline.block_count)->default_value(2), "Block count (per order)")
98
+ ("vocab_estimate", po::value<lm::WordIndex>(&pipeline.vocab_estimate)->default_value(1000000), "Assume this vocabulary size for purposes of calculating memory in step 1 (corpus count) and pre-sizing the hash table")
99
+ ("vocab_pad", po::value<uint64_t>(&pipeline.vocab_size_for_unk)->default_value(0), "If the vocabulary is smaller than this value, pad with <unk> to reach this size. Requires --interpolate_unigrams")
100
+ ("verbose_header", po::bool_switch(&verbose_header), "Add a verbose header to the ARPA file that includes information such as token count, smoothing type, etc.")
101
+ ("text", po::value<std::string>(&text), "Read text from a file instead of stdin")
102
+ ("arpa", po::value<std::string>(&arpa), "Write ARPA to a file instead of stdout")
103
+ ("intermediate", po::value<std::string>(&intermediate), "Write ngrams to intermediate files. Turns off ARPA output (which can be reactivated by --arpa file). Forces --renumber on.")
104
+ ("renumber", po::bool_switch(&pipeline.renumber_vocabulary), "Rrenumber the vocabulary identifiers so that they are monotone with the hash of each string. This is consistent with the ordering used by the trie data structure.")
105
+ ("collapse_values", po::bool_switch(&pipeline.output_q), "Collapse probability and backoff into a single value, q that yields the same sentence-level probabilities. See http://kheafield.com/professional/edinburgh/rest_paper.pdf for more details, including a proof.")
106
+ ("prune", po::value<std::vector<std::string> >(&pruning)->multitoken(), "Prune n-grams with count less than or equal to the given threshold. Specify one value for each order i.e. 0 0 1 to prune singleton trigrams and above. The sequence of values must be non-decreasing and the last value applies to any remaining orders. Default is to not prune, which is equivalent to --prune 0.")
107
+ ("limit_vocab_file", po::value<std::string>(&pipeline.prune_vocab_file)->default_value(""), "Read allowed vocabulary separated by whitespace. N-grams that contain vocabulary items not in this list will be pruned. Can be combined with --prune arg")
108
+ ("discount_fallback", po::value<std::vector<std::string> >(&discount_fallback)->multitoken()->implicit_value(discount_fallback_default, "0.5 1 1.5"), "The closed-form estimate for Kneser-Ney discounts does not work without singletons or doubletons. It can also fail if these values are out of range. This option falls back to user-specified discounts when the closed-form estimate fails. Note that this option is generally a bad idea: you should deduplicate your corpus instead. However, class-based models need custom discounts because they lack singleton unigrams. Provide up to three discounts (for adjusted counts 1, 2, and 3+), which will be applied to all orders where the closed-form estimates fail.");
109
+ po::variables_map vm;
110
+ po::store(po::parse_command_line(argc, argv, options), vm);
111
+
112
+ if (argc == 1 || vm["help"].as<bool>()) {
113
+ std::cerr <<
114
+ "Builds unpruned language models with modified Kneser-Ney smoothing.\n\n"
115
+ "Please cite:\n"
116
+ "@inproceedings{Heafield-estimate,\n"
117
+ " author = {Kenneth Heafield and Ivan Pouzyrevsky and Jonathan H. Clark and Philipp Koehn},\n"
118
+ " title = {Scalable Modified {Kneser-Ney} Language Model Estimation},\n"
119
+ " year = {2013},\n"
120
+ " month = {8},\n"
121
+ " booktitle = {Proceedings of the 51st Annual Meeting of the Association for Computational Linguistics},\n"
122
+ " address = {Sofia, Bulgaria},\n"
123
+ " url = {http://kheafield.com/professional/edinburgh/estimate\\_paper.pdf},\n"
124
+ "}\n\n"
125
+ "Provide the corpus on stdin. The ARPA file will be written to stdout. Order of\n"
126
+ "the model (-o) is the only mandatory option. As this is an on-disk program,\n"
127
+ "setting the temporary file location (-T) and sorting memory (-S) is recommended.\n\n"
128
+ "Memory sizes are specified like GNU sort: a number followed by a unit character.\n"
129
+ "Valid units are \% for percentage of memory (supported platforms only) and (in\n"
130
+ "increasing powers of 1024): b, K, M, G, T, P, E, Z, Y. Default is K (*1024).\n";
131
+ uint64_t mem = util::GuessPhysicalMemory();
132
+ if (mem) {
133
+ std::cerr << "This machine has " << mem << " bytes of memory.\n\n";
134
+ } else {
135
+ std::cerr << "Unable to determine the amount of memory on this machine.\n\n";
136
+ }
137
+ std::cerr << options << std::endl;
138
+ return 1;
139
+ }
140
+
141
+ po::notify(vm);
142
+
143
+ // required() appeared in Boost 1.42.0.
144
+ #if BOOST_VERSION < 104200
145
+ if (!vm.count("order")) {
146
+ std::cerr << "the option '--order' is required but missing" << std::endl;
147
+ return 1;
148
+ }
149
+ #endif
150
+
151
+ if (pipeline.vocab_size_for_unk && !pipeline.initial_probs.interpolate_unigrams) {
152
+ std::cerr << "--vocab_pad requires --interpolate_unigrams be on" << std::endl;
153
+ return 1;
154
+ }
155
+
156
+ if (vm["skip_symbols"].as<bool>()) {
157
+ pipeline.disallowed_symbol_action = lm::COMPLAIN;
158
+ } else {
159
+ pipeline.disallowed_symbol_action = lm::THROW_UP;
160
+ }
161
+
162
+ if (vm.count("discount_fallback")) {
163
+ pipeline.discount.fallback = ParseDiscountFallback(discount_fallback);
164
+ pipeline.discount.bad_action = lm::COMPLAIN;
165
+ } else {
166
+ // Unused, just here to prevent the compiler from complaining about uninitialized.
167
+ pipeline.discount.fallback = lm::builder::Discount();
168
+ pipeline.discount.bad_action = lm::THROW_UP;
169
+ }
170
+
171
+ // parse pruning thresholds. These depend on order, so it is not done as a notifier.
172
+ pipeline.prune_thresholds = ParsePruning(pruning, pipeline.order);
173
+
174
+ if (!vm["limit_vocab_file"].as<std::string>().empty()) {
175
+ pipeline.prune_vocab = true;
176
+ }
177
+ else {
178
+ pipeline.prune_vocab = false;
179
+ }
180
+
181
+ util::NormalizeTempPrefix(pipeline.sort.temp_prefix);
182
+
183
+ lm::builder::InitialProbabilitiesConfig &initial = pipeline.initial_probs;
184
+ // TODO: evaluate options for these.
185
+ initial.adder_in.total_memory = 32768;
186
+ initial.adder_in.block_count = 2;
187
+ initial.adder_out.total_memory = 32768;
188
+ initial.adder_out.block_count = 2;
189
+ pipeline.read_backoffs = initial.adder_out;
190
+
191
+ // Read from stdin, write to stdout by default
192
+ util::scoped_fd in(0), out(1);
193
+ if (vm.count("text")) {
194
+ in.reset(util::OpenReadOrThrow(text.c_str()));
195
+ }
196
+ if (vm.count("arpa")) {
197
+ out.reset(util::CreateOrThrow(arpa.c_str()));
198
+ }
199
+
200
+ try {
201
+ bool writing_intermediate = vm.count("intermediate");
202
+ if (writing_intermediate) {
203
+ pipeline.renumber_vocabulary = true;
204
+ }
205
+ lm::builder::Output output(writing_intermediate ? intermediate : pipeline.sort.temp_prefix, writing_intermediate, pipeline.output_q);
206
+ if (!writing_intermediate || vm.count("arpa")) {
207
+ output.Add(new lm::builder::PrintHook(out.release(), verbose_header));
208
+ }
209
+ lm::builder::Pipeline(pipeline, in.release(), output);
210
+ } catch (const util::MallocException &e) {
211
+ std::cerr << e.what() << std::endl;
212
+ std::cerr << "Try rerunning with a more conservative -S setting than " << vm["memory"].as<std::string>() << std::endl;
213
+ return 1;
214
+ }
215
+ util::PrintUsage(std::cerr);
216
+ } catch (const std::exception &e) {
217
+ std::cerr << e.what() << std::endl;
218
+ return 1;
219
+ }
220
+ }
mosesdecoder/lm/common/CMakeLists.txt ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# This CMake file was created by Lane Schwartz <dowobeha@gmail.com>
#
# add_library(... OBJECT ...), used at the bottom of this file to group the
# common KenLM sources into a "fake" library, was only introduced in
# CMake 2.8.8.  See http://www.cmake.org/Wiki/CMake/Tutorials/Object_Library
cmake_minimum_required(VERSION 2.8.8)

# Source files of this subdirectory that belong in the kenlm library
# (unit test files are deliberately excluded).  Add any new non-test
# source file to this list.
#
# Every path is prefixed with ${CMAKE_CURRENT_SOURCE_DIR} so that the
# variable remains valid if it is referenced by CMake files in the
# parent directory.
set(KENLM_COMMON_SOURCE
    ${CMAKE_CURRENT_SOURCE_DIR}/model_buffer.cc
    ${CMAKE_CURRENT_SOURCE_DIR}/print.cc
    ${CMAKE_CURRENT_SOURCE_DIR}/renumber.cc
    ${CMAKE_CURRENT_SOURCE_DIR}/size_option.cc
)

# Group these objects together for later use: given
# add_library(foo OBJECT ${my_foo_sources}), refer to the resulting
# objects as $<TARGET_OBJECTS:foo>.
add_library(kenlm_common OBJECT ${KENLM_COMMON_SOURCE})
40
+
mosesdecoder/lm/common/Jamfile ADDED
@@ -0,0 +1,2 @@
 
 
 
1
# Build the shared KenLM support code in this directory as a static
# convenience library, excluding unit tests (*test.cc) and standalone
# programs (*main.cc); link against the util/stream/kenlm dependencies.
fakelib common : [ glob *.cc : *test.cc *main.cc ]
  ../../util//kenutil ../../util/stream//stream ../../util/double-conversion//double-conversion ..//kenlm /top//boost_program_options ;
mosesdecoder/lm/common/joint_order.hh ADDED
@@ -0,0 +1,71 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#ifndef LM_COMMON_JOINT_ORDER_H
#define LM_COMMON_JOINT_ORDER_H

#include "lm/common/ngram_stream.hh"
#include "lm/lm_exception.hh"

#ifdef DEBUG
#include "util/fixed_array.hh"
#include <iostream>
#endif

#include <cstring>

namespace lm {

/* Walk several sorted n-gram streams (positions[i] holds the (i+1)-grams)
 * "jointly": whenever the current lower-order entry matches the context of
 * the current higher-order entry, descend one order.  The callback receives
 * Enter(order_minus_1, data) on the way down and Exit(order_minus_1, data)
 * on the way back up, i.e. a depth-first traversal of the implicit
 * context tree.
 *
 * Compare provides the sort order of the streams and kMatchOffset, the word
 * offset at which an (n-1)-gram lines up with an n-gram's context.
 */
template <class Callback, class Compare> void JointOrder(const util::stream::ChainPositions &positions, Callback &callback) {
  // Allow matching to reference streams[-1].
  util::FixedArray<ProxyStream<NGramHeader> > streams_with_dummy(positions.size() + 1);
  // A bogus stream for [-1]; its NULL base makes the memcmp below trivially
  // succeed for the zero-length compare at current == 0.
  streams_with_dummy.push_back();
  for (std::size_t i = 0; i < positions.size(); ++i) {
    streams_with_dummy.push_back(positions[i], NGramHeader(NULL, i + 1));
  }
  ProxyStream<NGramHeader> *streams = streams_with_dummy.begin() + 1;

  // Highest order that actually contains data.
  std::size_t order;
  for (order = 0; order < positions.size() && streams[order]; ++order) {}
  assert(order); // should always have <unk>.

  // Debugging only: call comparison function to sanity check order.
#ifdef DEBUG
  util::FixedArray<Compare> less_compare(order);
  for (unsigned i = 0; i < order; ++i)
    less_compare.push_back(i + 1);
#endif // DEBUG

  std::size_t current = 0;
  while (true) {
    // Does the context match the lower one?  (current words compared; a
    // zero-length compare at current == 0 always matches.)
    if (!memcmp(streams[static_cast<int>(current) - 1]->begin(), streams[current]->begin() + Compare::kMatchOffset, sizeof(WordIndex) * current)) {
      callback.Enter(current, streams[current].Get());
      // Transition to looking for extensions.
      if (++current < order) continue;
    }
#ifdef DEBUG
    // match_check[current - 1] matches current-grams
    // The lower-order stream (which skips fewer current-grams) should always be <= the higher order-stream (which can skip current-grams).
    else if (!less_compare[current - 1](streams[static_cast<int>(current) - 1]->begin(), streams[current]->begin() + Compare::kMatchOffset)) {
      std::cerr << "Stream out of order detected" << std::endl;
      abort();
    }
#endif // DEBUG
    // No extension left: unwind, emitting Exit() and advancing streams.
    while(true) {
      assert(current > 0);
      --current;
      callback.Exit(current, streams[current].Get());

      if (++streams[current]) break;

      // A stream may only run dry from the highest active order downward.
      UTIL_THROW_IF(order != current + 1, FormatLoadException, "Detected n-gram without matching suffix");

      order = current;
      if (!order) return;
    }
  }
}

} // namespaces

#endif // LM_COMMON_JOINT_ORDER_H
mosesdecoder/lm/common/ngram.hh ADDED
@@ -0,0 +1,77 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#ifndef LM_COMMON_NGRAM_H
#define LM_COMMON_NGRAM_H

#include "lm/weights.hh"
#include "lm/word_index.hh"

#include <cstddef>
#include <cassert>
#include <stdint.h>
#include <cstring>

namespace lm {

// Non-owning view over the word-id portion of an n-gram record stored in a
// flat buffer: Order() consecutive WordIndex values.  Derived classes may
// place a payload immediately after the words.
class NGramHeader {
  public:
    // View over `order` WordIndex values starting at `begin`.
    NGramHeader(void *begin, std::size_t order)
      : begin_(static_cast<WordIndex*>(begin)), end_(begin_ + order) {}

    // Empty view (order 0, NULL base); point it somewhere with ReBase.
    NGramHeader() : begin_(NULL), end_(NULL) {}

    // Raw byte access to the underlying record.
    const uint8_t *Base() const { return reinterpret_cast<const uint8_t*>(begin_); }
    uint8_t *Base() { return reinterpret_cast<uint8_t*>(begin_); }

    // Retarget the view at a different record; the order is preserved.
    void ReBase(void *to) {
      std::size_t difference = end_ - begin_;
      begin_ = reinterpret_cast<WordIndex*>(to);
      end_ = begin_ + difference;
    }

    // These are for the vocab index.
    // Lower-case in deference to STL.
    const WordIndex *begin() const { return begin_; }
    WordIndex *begin() { return begin_; }
    const WordIndex *end() const { return end_; }
    WordIndex *end() { return end_; }

    std::size_t size() const { return end_ - begin_; }
    std::size_t Order() const { return end_ - begin_; }

  private:
    WordIndex *begin_, *end_;
};

// N-gram record view with a typed payload (e.g. Prob or ProbBackoff) stored
// immediately after the word ids.
template <class PayloadT> class NGram : public NGramHeader {
  public:
    typedef PayloadT Payload;

    NGram() : NGramHeader(NULL, 0) {}

    NGram(void *begin, std::size_t order) : NGramHeader(begin, order) {}

    // Advance the view to the next packed record in the buffer.
    // Would do operator++ but that can get confusing for a stream.
    void NextInMemory() {
      ReBase(&Value() + 1);
    }

    // Bytes occupied by one record of the given order: words plus payload.
    static std::size_t TotalSize(std::size_t order) {
      return order * sizeof(WordIndex) + sizeof(Payload);
    }
    std::size_t TotalSize() const {
      // Compiler should optimize this.
      return TotalSize(Order());
    }

    // Inverse of TotalSize: recover the order from a record's byte size.
    static std::size_t OrderFromSize(std::size_t size) {
      std::size_t ret = (size - sizeof(Payload)) / sizeof(WordIndex);
      assert(size == TotalSize(ret));
      return ret;
    }

    // The payload lives directly after the last word id.
    const Payload &Value() const { return *reinterpret_cast<const Payload *>(end()); }
    Payload &Value() { return *reinterpret_cast<Payload *>(end()); }
};

} // namespace lm

#endif // LM_COMMON_NGRAM_H
mosesdecoder/lm/common/print.cc ADDED
@@ -0,0 +1,62 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#include "lm/common/print.hh"

#include "lm/common/ngram_stream.hh"
#include "util/file_stream.hh"
#include "util/file.hh"
#include "util/mmap.hh"
#include "util/scoped.hh"

#include <sstream>
#include <cstring>

namespace lm {

// Memory-map the vocabulary file (a sequence of NUL-terminated strings) and
// build an id -> string table over the mapped region.
VocabReconstitute::VocabReconstitute(int fd) {
  uint64_t size = util::SizeOrThrow(fd);
  util::MapRead(util::POPULATE_OR_READ, fd, 0, size, memory_);
  const char *const start = static_cast<const char*>(memory_.get());
  const char *i;
  for (i = start; i != start + size; i += strlen(i) + 1) {
    map_.push_back(i);
  }
  // Last one for LookupPiece.
  map_.push_back(i);
}

namespace {
// Emit the leading part of one ARPA line: probability, tab, then the words
// separated by single spaces (no trailing newline).
template <class Payload> void PrintLead(const VocabReconstitute &vocab, ProxyStream<Payload> &stream, util::FileStream &out) {
  out << stream->Value().prob << '\t' << vocab.Lookup(*stream->begin());
  for (const WordIndex *i = stream->begin() + 1; i != stream->end(); ++i) {
    out << ' ' << vocab.Lookup(*i);
  }
}
} // namespace

// Write the model to out_fd_ in ARPA format: the \data\ header with per-order
// counts, one section per order, then \end\.  Orders below the maximum carry
// a backoff weight after the n-gram; the highest order has none.
void PrintARPA::Run(const util::stream::ChainPositions &positions) {
  VocabReconstitute vocab(vocab_fd_);
  util::FileStream out(out_fd_);
  out << "\\data\\\n";
  for (size_t i = 0; i < positions.size(); ++i) {
    out << "ngram " << (i+1) << '=' << counts_[i] << '\n';
  }
  out << '\n';

  // Orders 1 .. N-1: prob, n-gram, backoff (ProbBackoff payload).
  for (unsigned order = 1; order < positions.size(); ++order) {
    out << "\\" << order << "-grams:" << '\n';
    for (ProxyStream<NGram<ProbBackoff> > stream(positions[order - 1], NGram<ProbBackoff>(NULL, order)); stream; ++stream) {
      PrintLead(vocab, stream, out);
      out << '\t' << stream->Value().backoff << '\n';
    }
    out << '\n';
  }

  // Highest order: prob and n-gram only (Prob payload).
  out << "\\" << positions.size() << "-grams:" << '\n';
  for (ProxyStream<NGram<Prob> > stream(positions.back(), NGram<Prob>(NULL, positions.size())); stream; ++stream) {
    PrintLead(vocab, stream, out);
    out << '\n';
  }
  out << '\n';
  out << "\\end\\\n";
}

} // namespace lm
mosesdecoder/lm/common/renumber.cc ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #include "lm/common/renumber.hh"
2
+ #include "lm/common/ngram.hh"
3
+
4
+ #include "util/stream/stream.hh"
5
+
6
+ namespace lm {
7
+
8
+ void Renumber::Run(const util::stream::ChainPosition &position) {
9
+ for (util::stream::Stream stream(position); stream; ++stream) {
10
+ NGramHeader gram(stream.Get(), order_);
11
+ for (WordIndex *w = gram.begin(); w != gram.end(); ++w) {
12
+ *w = new_numbers_[*w];
13
+ }
14
+ }
15
+ }
16
+
17
+ } // namespace lm
mosesdecoder/lm/common/renumber.hh ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
/* Map vocab ids. This is useful to merge independently collected counts or
 * change the vocab ids to the order used by the trie.
 */
#ifndef LM_COMMON_RENUMBER_H
#define LM_COMMON_RENUMBER_H

#include "lm/word_index.hh"

#include <cstddef>

namespace util { namespace stream { class ChainPosition; }}

namespace lm {

// Stream worker that rewrites every word id of the n-gram records passing
// through a chain, using new_numbers as an old-id -> new-id lookup table.
class Renumber {
  public:
    // Assumes the array is large enough to map all words and stays alive while
    // the thread is active.
    Renumber(const WordIndex *new_numbers, std::size_t order)
      : new_numbers_(new_numbers), order_(order) {}

    // Process every record at `position`, remapping ids in place.
    void Run(const util::stream::ChainPosition &position);

  private:
    const WordIndex *new_numbers_;  // lookup table; not owned
    std::size_t order_;             // word ids per record
};

} // namespace lm
#endif // LM_COMMON_RENUMBER_H
mosesdecoder/mert/ReferenceTest.cpp ADDED
@@ -0,0 +1,123 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #include "Reference.h"
2
+
3
+ #define BOOST_TEST_MODULE MertReference
4
+ #include <boost/test/unit_test.hpp>
5
+
6
+ using namespace MosesTuning;
7
+
8
+ BOOST_AUTO_TEST_CASE(refernece_count)
9
+ {
10
+ Reference ref;
11
+ BOOST_CHECK(ref.get_counts() != NULL);
12
+ }
13
+
14
+ BOOST_AUTO_TEST_CASE(refernece_length_iterator)
15
+ {
16
+ Reference ref;
17
+ ref.push_back(4);
18
+ ref.push_back(2);
19
+ BOOST_REQUIRE(ref.num_references() == 2);
20
+
21
+ Reference::iterator it = ref.begin();
22
+ BOOST_CHECK_EQUAL(*it, 4);
23
+ ++it;
24
+ BOOST_CHECK_EQUAL(*it, 2);
25
+ ++it;
26
+ BOOST_CHECK(it == ref.end());
27
+ }
28
+
29
+ BOOST_AUTO_TEST_CASE(refernece_length_average)
30
+ {
31
+ {
32
+ Reference ref;
33
+ ref.push_back(4);
34
+ ref.push_back(1);
35
+ BOOST_CHECK_EQUAL(2, ref.CalcAverage());
36
+ }
37
+
38
+ {
39
+ Reference ref;
40
+ ref.push_back(4);
41
+ ref.push_back(3);
42
+ BOOST_CHECK_EQUAL(3, ref.CalcAverage());
43
+ }
44
+
45
+ {
46
+ Reference ref;
47
+ ref.push_back(4);
48
+ ref.push_back(3);
49
+ ref.push_back(4);
50
+ ref.push_back(5);
51
+ BOOST_CHECK_EQUAL(4, ref.CalcAverage());
52
+ }
53
+ }
54
+
55
+ BOOST_AUTO_TEST_CASE(refernece_length_closest)
56
+ {
57
+ {
58
+ Reference ref;
59
+ ref.push_back(4);
60
+ ref.push_back(1);
61
+ BOOST_REQUIRE(ref.num_references() == 2);
62
+
63
+ BOOST_CHECK_EQUAL(1, ref.CalcClosest(2));
64
+ BOOST_CHECK_EQUAL(1, ref.CalcClosest(1));
65
+ BOOST_CHECK_EQUAL(4, ref.CalcClosest(3));
66
+ BOOST_CHECK_EQUAL(4, ref.CalcClosest(4));
67
+ BOOST_CHECK_EQUAL(4, ref.CalcClosest(5));
68
+ }
69
+
70
+ {
71
+ Reference ref;
72
+ ref.push_back(4);
73
+ ref.push_back(3);
74
+ BOOST_REQUIRE(ref.num_references() == 2);
75
+
76
+ BOOST_CHECK_EQUAL(3, ref.CalcClosest(1));
77
+ BOOST_CHECK_EQUAL(3, ref.CalcClosest(2));
78
+ BOOST_CHECK_EQUAL(3, ref.CalcClosest(3));
79
+ BOOST_CHECK_EQUAL(4, ref.CalcClosest(4));
80
+ BOOST_CHECK_EQUAL(4, ref.CalcClosest(5));
81
+ }
82
+
83
+ {
84
+ Reference ref;
85
+ ref.push_back(4);
86
+ ref.push_back(3);
87
+ ref.push_back(4);
88
+ ref.push_back(5);
89
+ BOOST_REQUIRE(ref.num_references() == 4);
90
+
91
+ BOOST_CHECK_EQUAL(3, ref.CalcClosest(1));
92
+ BOOST_CHECK_EQUAL(3, ref.CalcClosest(2));
93
+ BOOST_CHECK_EQUAL(3, ref.CalcClosest(3));
94
+ BOOST_CHECK_EQUAL(4, ref.CalcClosest(4));
95
+ BOOST_CHECK_EQUAL(5, ref.CalcClosest(5));
96
+ }
97
+ }
98
+
99
+ BOOST_AUTO_TEST_CASE(refernece_length_shortest)
100
+ {
101
+ {
102
+ Reference ref;
103
+ ref.push_back(4);
104
+ ref.push_back(1);
105
+ BOOST_CHECK_EQUAL(1, ref.CalcShortest());
106
+ }
107
+
108
+ {
109
+ Reference ref;
110
+ ref.push_back(4);
111
+ ref.push_back(3);
112
+ BOOST_CHECK_EQUAL(3, ref.CalcShortest());
113
+ }
114
+
115
+ {
116
+ Reference ref;
117
+ ref.push_back(4);
118
+ ref.push_back(3);
119
+ ref.push_back(4);
120
+ ref.push_back(5);
121
+ BOOST_CHECK_EQUAL(3, ref.CalcShortest());
122
+ }
123
+ }
mosesdecoder/mert/ScoreArray.cpp ADDED
@@ -0,0 +1,169 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
/*
 * ScoreArray.cpp
 * mert - Minimum Error Rate Training
 *
 * Created by Nicola Bertoldi on 13/05/08.
 *
 */

#include "ScoreArray.h"
#include "Util.h"
#include "FileStream.h"

using namespace std;

namespace MosesTuning
{


ScoreArray::ScoreArray()
  : m_num_scores(0), m_index(0) {}

// Text serialization: header line "<BEGIN> index count num_scores type",
// one line per ScoreStats entry, then a footer marker line.
void ScoreArray::savetxt(ostream* os, const string& sctype)
{
  *os << SCORES_TXT_BEGIN << " " << m_index << " " << m_array.size()
      << " " << m_num_scores << " " << sctype << endl;
  for (scorearray_t::iterator i = m_array.begin(); i !=m_array.end(); i++) {
    i->savetxt(os);
    *os << endl;
  }
  *os << SCORES_TXT_END << endl;
}

// Binary serialization: same textual header/footer markers, but each entry
// is written by ScoreStats::savebin (no per-entry newline).
void ScoreArray::savebin(ostream* os, const string& score_type)
{
  *os << SCORES_BIN_BEGIN << " " << m_index << " " << m_array.size()
      << " " << m_num_scores << " " << score_type << endl;
  for (scorearray_t::iterator i = m_array.begin();
       i != m_array.end(); i++) {
    i->savebin(os);
  }
  *os << SCORES_BIN_END << endl;
}

// Dispatch to the binary or text writer; empty arrays are not written.
void ScoreArray::save(ostream* os, const string& score_type, bool bin)
{
  if (size() <= 0) return;
  if (bin) {
    savebin(os, score_type);
  } else {
    savetxt(os, score_type);
  }
}

// Save to a named file; exits the process if the file cannot be opened.
void ScoreArray::save(const string &file, const string& score_type, bool bin)
{
  ofstream ofs(file.c_str(), ios::out);
  if (!ofs) {
    cerr << "Failed to open " << file << endl;
    exit(1);
  }
  ostream* os = &ofs;
  save(os, score_type, bin);
  ofs.close();
}

// Save to standard output.
void ScoreArray::save(const string& score_type, bool bin)
{
  save(&cout, score_type, bin);
}

// Read n binary entries (each with m_num_scores statistics) and append them.
void ScoreArray::loadbin(istream* is, size_t n)
{
  ScoreStats entry(m_num_scores);
  for (size_t i = 0; i < n; i++) {
    entry.loadbin(is);
    add(entry);
  }
}

// Read n text entries (each with m_num_scores statistics) and append them.
void ScoreArray::loadtxt(istream* is, size_t n)
{
  ScoreStats entry(m_num_scores);
  for (size_t i = 0; i < n; i++) {
    entry.loadtxt(is);
    add(entry);
  }
}

// Parse a serialized array: the header line determines text vs binary mode
// and supplies index, entry count, score count and score type; then the
// entries are read and the footer marker is verified.
void ScoreArray::load(istream* is)
{
  size_t number_of_entries = 0;
  bool binmode = false;

  string substring, stringBuf;
  string::size_type loc;

  getline(*is, stringBuf);
  if (!is->good()) {
    return;
  }

  if (!stringBuf.empty()) {
    if ((loc = stringBuf.find(SCORES_TXT_BEGIN)) == 0) {
      binmode=false;
    } else if ((loc = stringBuf.find(SCORES_BIN_BEGIN)) == 0) {
      binmode=true;
    } else {
      TRACE_ERR("ERROR: ScoreArray::load(): Wrong header");
      return;
    }
    // First call consumes the begin marker; subsequent calls pull the
    // space-separated header fields.
    getNextPound(stringBuf, substring);
    getNextPound(stringBuf, substring);
    m_index = atoi(substring.c_str());
    getNextPound(stringBuf, substring);
    number_of_entries = atoi(substring.c_str());
    getNextPound(stringBuf, substring);
    m_num_scores = atoi(substring.c_str());
    getNextPound(stringBuf, substring);
    m_score_type = substring;
  }

  if (binmode) {
    loadbin(is, number_of_entries);
  } else {
    loadtxt(is, number_of_entries);
  }

  // Verify the footer marker (either variant is accepted here).
  getline(*is, stringBuf);
  if (!stringBuf.empty()) {
    if ((loc = stringBuf.find(SCORES_TXT_END)) != 0 &&
        (loc = stringBuf.find(SCORES_BIN_END)) != 0) {
      TRACE_ERR("ERROR: ScoreArray::load(): Wrong footer");
      return;
    }
  }
}

// Load from a named file.
void ScoreArray::load(const string &file)
{
  TRACE_ERR("loading data from " << file << endl);
  inputfilestream input_stream(file); // matches a stream with a file. Opens the file
  istream* is = &input_stream;
  load(is);
  input_stream.close();
}


// Append all entries of e to this array.
void ScoreArray::merge(ScoreArray& e)
{
  //dummy implementation
  for (size_t i=0; i<e.size(); i++)
    add(e.get(i));
}

// True iff every entry carries exactly NumberOfScores() statistics
// (vacuously true when NumberOfScores() is zero).
bool ScoreArray::check_consistency() const
{
  const size_t sz = NumberOfScores();
  if (sz == 0)
    return true;

  for (scorearray_t::const_iterator i = m_array.begin();
       i != m_array.end(); ++i) {
    if (i->size() != sz)
      return false;
  }
  return true;
}

}
mosesdecoder/mert/ScoreArray.h ADDED
@@ -0,0 +1,113 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+ * ScoreArray.h
3
+ * mert - Minimum Error Rate Training
4
+ *
5
+ * Created by Nicola Bertoldi on 13/05/08.
6
+ *
7
+ */
8
+
9
+ #ifndef MERT_SCORE_ARRAY_H_
10
+ #define MERT_SCORE_ARRAY_H_
11
+
12
+ #include <vector>
13
+ #include <iostream>
14
+ #include <string>
15
+
16
+ #include "ScoreStats.h"
17
+
18
+ namespace MosesTuning
19
+ {
20
+
21
+ const char SCORES_TXT_BEGIN[] = "SCORES_TXT_BEGIN_0";
22
+ const char SCORES_TXT_END[] = "SCORES_TXT_END_0";
23
+ const char SCORES_BIN_BEGIN[] = "SCORES_BIN_BEGIN_0";
24
+ const char SCORES_BIN_END[] = "SCORES_BIN_END_0";
25
+
26
+ class ScoreArray
27
+ {
28
+ private:
29
+ scorearray_t m_array;
30
+ std::string m_score_type;
31
+ std::size_t m_num_scores;
32
+
33
+ // indexx to identify the utterance.
34
+ // It can differ from the index inside the vector.
35
+ int m_index;
36
+
37
+ public:
38
+ ScoreArray();
39
+ ~ScoreArray() {}
40
+
41
+ void clear() {
42
+ m_array.clear();
43
+ }
44
+
45
+ int getIndex() const {
46
+ return m_index;
47
+ }
48
+
49
+ void setIndex(int value) {
50
+ m_index = value;
51
+ }
52
+
53
+ ScoreStats& get(std::size_t i) {
54
+ return m_array.at(i);
55
+ }
56
+
57
+ const ScoreStats& get(std::size_t i) const {
58
+ return m_array.at(i);
59
+ }
60
+
61
+ void add(const ScoreStats& e) {
62
+ m_array.push_back(e);
63
+ }
64
+
65
+ //ADDED BY TS
66
+ void swap(std::size_t i, std::size_t j) {
67
+ std::swap(m_array[i], m_array[j]);
68
+ }
69
+
70
+ void resize(std::size_t new_size) {
71
+ m_array.resize(std::min(new_size, m_array.size()));
72
+ }
73
+ //END_ADDED
74
+
75
+ void merge(ScoreArray& e);
76
+
77
+ std::string name() const {
78
+ return m_score_type;
79
+ }
80
+
81
+ void name(std::string &score_type) {
82
+ m_score_type = score_type;
83
+ }
84
+
85
+ std::size_t size() const {
86
+ return m_array.size();
87
+ }
88
+
89
+ std::size_t NumberOfScores() const {
90
+ return m_num_scores;
91
+ }
92
+
93
+ void NumberOfScores(std::size_t v) {
94
+ m_num_scores = v;
95
+ }
96
+
97
+ void savetxt(std::ostream* os, const std::string& score_type);
98
+ void savebin(std::ostream* os, const std::string& score_type);
99
+ void save(std::ostream* os, const std::string& score_type, bool bin=false);
100
+ void save(const std::string &file, const std::string& score_type, bool bin=false);
101
+ void save(const std::string& score_type, bool bin=false);
102
+
103
+ void loadtxt(std::istream* is, std::size_t n);
104
+ void loadbin(std::istream* is, std::size_t n);
105
+ void load(std::istream* is);
106
+ void load(const std::string &file);
107
+
108
+ bool check_consistency() const;
109
+ };
110
+
111
+ }
112
+
113
+ #endif // MERT_SCORE_ARRAY_H_
mosesdecoder/mert/Util.h ADDED
@@ -0,0 +1,149 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+ * Util.h
3
+ * mert - Minimum Error Rate Training
4
+ *
5
+ * Created by Nicola Bertoldi on 13/05/08.
6
+ *
7
+ */
8
+
9
+ #ifndef MERT_UTIL_H_
10
+ #define MERT_UTIL_H_
11
+
12
+ #include <cmath>
13
+ #include <cstdlib>
14
+ #include <stdexcept>
15
+ #include <limits>
16
+ #include <vector>
17
+ #include <map>
18
+ #include <iostream>
19
+ #include <sstream>
20
+ #include <string>
21
+ #include <cstring>
22
+
23
+ #include "Types.h"
24
+
25
+ namespace MosesTuning
26
+ {
27
+
28
+ #ifdef TRACE_ENABLE
29
+ #define TRACE_ERR(str) { std::cerr << str; }
30
+ #else
31
+ #define TRACE_ERR(str) { }
32
+ #endif
33
+
34
+ #if __GNUC__ == 4 && __GNUC_MINOR__ == 8 && (__GNUC_PATCHLEVEL__ == 1 || __GNUC_PATCHLEVEL__ == 2)
35
+ // gcc nth_element() bug
36
+ #define NTH_ELEMENT3(begin, middle, end) std::sort(begin, end)
37
+ #define NTH_ELEMENT4(begin, middle, end, orderer) std::sort(begin, end, orderer)
38
+ #else
39
+ #define NTH_ELEMENT3(begin, middle, end) std::nth_element(begin, middle, end)
40
+ #define NTH_ELEMENT4(begin, middle, end, orderer) std::nth_element(begin, middle, end, orderer)
41
+ #endif
42
+
43
+ const char kDefaultDelimiterSymbol[] = " ";
44
+
45
+ int verboselevel();
46
+ int setverboselevel(int v);
47
+
48
+
49
+ const float kEPS = 0.0001f;
50
+
51
+ template <typename T>
52
+ bool IsAlmostEqual(T expected, T actual, float round=kEPS)
53
+ {
54
+ if (std::abs(expected - actual) < round) {
55
+ return true;
56
+ } else {
57
+ std::cerr << "Fail: expected = " << expected
58
+ << " (actual = " << actual << ")" << std::endl;
59
+ return false;
60
+ }
61
+ }
62
+
63
+ /**
64
+ * Find the specified delimiter for the string 'str', and 'str' is assigned
65
+ * to a substring object that starts at the position of first occurrence of
66
+ * the delimiter in 'str'. 'substr' is copied from 'str' ranging from
67
+ * the start position of 'str' to the position of first occurrence of
68
+ * the delimiter.
69
+ *
70
+ * It returns the position of first occurrence in the queried string.
71
+ * If the content is not found, std::string::npos is returned.
72
+ */
73
+ size_t getNextPound(std::string &str, std::string &substr,
74
+ const std::string &delimiter = kDefaultDelimiterSymbol);
75
+
76
+ void split(const std::string &s, char delim, std::vector<std::string> &elems);
77
+
78
+ /**
79
+ * Split the string 'str' with specified delimitter 'delim' into tokens.
80
+ * The resulting tokens are set to 'res'.
81
+ *
82
+ * ex. "a,b,c" => {"a", "b", "c"}.
83
+ */
84
+ void Tokenize(const char *str, const char delim, std::vector<std::string> *res);
85
+
86
// Parses `input` into a value of type T via stream extraction.
template<typename T>
inline T Scan(const std::string &input)
{
  T parsed;
  std::stringstream reader(input);
  reader >> parsed;
  return parsed;
}
94
+
95
/**
 * Returns true iff "str" ends with "suffix".
 * e.g., Given str = "abc:" and suffix = ":", this function returns true.
 *
 * Fix: the previous implementation used find_last_of(), which matches any
 * single character drawn from "suffix" — it misreported multi-character
 * suffixes (e.g. str = "xbc", suffix = "abc" returned true) and returned
 * true for an empty "str" because npos == str.size() - 1 when size() == 0.
 * This version performs a genuine suffix comparison.
 */
inline bool EndsWith(const std::string& str, const char* suffix)
{
  const std::size_t suffix_len = std::strlen(suffix);
  return str.size() >= suffix_len &&
         str.compare(str.size() - suffix_len, suffix_len, suffix) == 0;
}
103
+
104
// Renders x as a string via stream insertion; throws std::runtime_error if
// the value cannot be formatted.
template<typename T>
inline std::string stringify(T x)
{
  std::ostringstream buffer;
  if (!(buffer << x))
    throw std::runtime_error("stringify(template<typename T>)");
  return buffer.str();
}
112
+
113
// Parse a score statistic from a C string.
// NOTE(review): ScoreStatsType comes from Types.h; the use of std::atoi
// implies it is an integral type -- confirm against Types.h.
inline ScoreStatsType ConvertCharToScoreStatsType(const char *str)
{
  return std::atoi(str);
}

// Convenience overload of the above for std::string input.
inline ScoreStatsType ConvertStringToScoreStatsType(const std::string& str)
{
  return ConvertCharToScoreStatsType(str.c_str());
}

// Parse a feature statistic (floating point, per std::atof) from a C string.
inline FeatureStatsType ConvertCharToFeatureStatsType(const char *str)
{
  return static_cast<FeatureStatsType>(std::atof(str));
}

// Convenience overload of the above for std::string input.
inline FeatureStatsType ConvertStringToFeatureStatsType(const std::string &str)
{
  return ConvertCharToFeatureStatsType(str.c_str());
}
132
+
133
// Strips leading and trailing characters from `c` (default: space, CR, LF)
// off `Src`; returns the empty string when nothing else remains.
inline std::string trimStr(const std::string& Src, const std::string& c = " \r\n")
{
  const std::string::size_type first = Src.find_first_not_of(c);
  if (first == std::string::npos) return std::string();
  const std::string::size_type last = Src.find_last_not_of(c);
  return Src.substr(first, last - first + 1);
}
141
+
142
+ // Utilities to measure decoding time
143
+ void ResetUserTime();
144
+ void PrintUserTime(const std::string &message);
145
+ double GetUserTime();
146
+
147
+ }
148
+
149
+ #endif // MERT_UTIL_H_
mosesdecoder/moses/TranslationModel/UG/util/ibm1-align ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:67f9b51b84f1b18fefcfe58feba9a9879648529fed29fbfb90ec0cec4f42a80e
3
+ size 1062799
mosesdecoder/scripts/Jamfile ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#See ../Jamroot for options.
import option path ;

build-project training ;

# Work out where scripts should be installed: --prefix gives a root under
# which a (possibly git-tagged) scripts directory is created, while
# --install-scripts overrides the location directly.
prefix = [ option.get "prefix" ] ;
if $(prefix) {
  prefix = [ path.root $(prefix) [ path.pwd ] ] ;
  location = [ option.get "install-scripts" : : $(prefix)$(GITTAG)/scripts ] ;
} else {
  location = [ option.get "install-scripts" ] ;
}

# Copy the script tree and selected data files into the install location,
# excluding tests, regression data, binaries and web javascripts.
if $(location) {
  location = [ path.root $(location) [ path.pwd ] ] ;
  install scripts :
    [ glob-tree README *.js *.pl *.perl *.pm *.py *.sh *.php : tests regression-testing other bin ]
    [ glob share/nonbreaking_prefixes/* ems/example/*.* ems/example/data/* ems/web/* analysis/smtgui/* : ems/web/javascripts ]
    generic/fsa-sample.fsa
    ems/experiment.machines
    ems/experiment.meta
    : <install-source-root>. <location>$(location) ;
}
mosesdecoder/scripts/README ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 2006-07-29
2
+
3
+ This directory should contain all multi-purpose scripts for:
4
+
5
+ - training ... training moses (including BLEU evaluation needed for MERT)
6
+ - analysis ... analyzing MT output (for human analysis)
7
+ - generic ... script for handling generic issues (parallelization)
8
+ - lib ... perl modules used by various scripts
9
+
10
+
11
+ The Jamfile then takes care of proper 'release' from your git directory to
12
+ the shared directories.
13
+
14
+ The released scripts should remain in the *same directory structure*.
15
+
mosesdecoder/scripts/generic/binarize4moses2.perl ADDED
@@ -0,0 +1,88 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env perl
2
+
3
+ use strict;
4
+
5
+ use Getopt::Long;
6
+ use File::Basename;
7
+ use FindBin qw($RealBin);
8
+
9
+ sub systemCheck($);
10
+
11
+ my $mosesDir = "$RealBin/../..";
12
+ my $ptPath;
13
+ my $lexRoPath;
14
+ my $outPath;
15
+ my $numScores = 4;
16
+ my $numLexScores;
17
+ my $pruneNum = 100;
18
+ my $scfg = 0;
19
+
20
+ GetOptions("phrase-table=s" => \$ptPath,
21
+ "lex-ro=s" => \$lexRoPath,
22
+ "output-dir=s" => \$outPath,
23
+ "num-scores=s" => \$numScores,
24
+ "num-lex-scores=i" => \$numLexScores,
25
+ "prune=i" => \$pruneNum,
26
+ "scfg" => \$scfg
27
+ ) or exit 1;
28
+
29
+ #print STDERR "scfg=$scfg \n";
30
+ die("ERROR: please set --phrase-table") unless defined($ptPath);
31
+ #die("ERROR: please set --lex-ro") unless defined($lexRoPath);
32
+ die("ERROR: please set --output-dir") unless defined($outPath);
33
+ #die("ERROR: please set --num-lex-scores") unless defined($numLexScores);
34
+ die("ERROR: compile contrib/sigtest-filter") if (!-X "$mosesDir/contrib/sigtest-filter/filter-pt");
35
+ die("ERROR: compile with bjam --with-cmph") if (!-X "$mosesDir/bin/processLexicalTableMin");
36
+ die("ERROR: compile with bjam --with-xmlrpc-c") if (!-X "$mosesDir/bin/CreateProbingPT");
37
+
38
+ my $cmd;
39
+
40
+ my $tempPath = dirname($outPath) ."/tmp.$$";
41
+ `mkdir -p $tempPath`;
42
+
43
+ $cmd = "gzip -dc $ptPath | $mosesDir/contrib/sigtest-filter/filter-pt -n $pruneNum | gzip -c > $tempPath/pt.gz";
44
+ systemCheck($cmd);
45
+
46
+ if (defined($lexRoPath)) {
47
+ die("ERROR: please set --num-lex-scores") unless defined($numLexScores);
48
+
49
+ $cmd = "$mosesDir/bin/processLexicalTableMin -in $lexRoPath -out $tempPath/lex-ro -T . -threads all";
50
+ systemCheck($cmd);
51
+
52
+ $cmd = "$mosesDir/bin/addLexROtoPT $tempPath/pt.gz $tempPath/lex-ro.minlexr | gzip -c > $tempPath/pt.withLexRO.gz";
53
+ systemCheck($cmd);
54
+
55
+ $cmd = "ln -s pt.withLexRO.gz $tempPath/pt.txt.gz";
56
+ systemCheck($cmd);
57
+ }
58
+ else {
59
+ $cmd = "ln -s pt.gz $tempPath/pt.txt.gz";
60
+ systemCheck($cmd);
61
+ }
62
+
63
+ $cmd = "$mosesDir/bin/CreateProbingPT2 --num-scores $numScores --log-prob --input-pt $tempPath/pt.txt.gz --output-dir $outPath";
64
+
65
+ if (defined($lexRoPath)) {
66
+ $cmd .= " --num-lex-scores $numLexScores";
67
+ }
68
+
69
+ if ($scfg) {
70
+ $cmd .= " --scfg";
71
+ }
72
+
73
+ systemCheck($cmd);
74
+
75
+ exit(0);
76
+
77
+ #####################################################
78
+ sub systemCheck($)
79
+ {
80
+ my $cmd = shift;
81
+ print STDERR "Executing: $cmd\n";
82
+
83
+ my $retVal = system($cmd);
84
+ if ($retVal != 0)
85
+ {
86
+ exit(1);
87
+ }
88
+ }
mosesdecoder/scripts/generic/bsbleu.py ADDED
@@ -0,0 +1,179 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python
2
+ # compute Bleu scores with confidence intervals via boostrap resampling
3
+ # written by Ulrich Germann
4
+ #
5
+ # This file is part of moses. Its use is licensed under the GNU Lesser General
6
+ # Public License version 2.1 or, at your option, any later version.
7
+
8
+ from argparse import ArgumentParser
9
+ import math
10
+ import os
11
+ from random import randint
12
+ import sys, gzip
13
+
14
+
15
+ def count_ngrams(snt, max_n):
16
+ """
17
+ Return a dictionary of ngram counts (up to length /max_n/)
18
+ for sentence (list of words) /snt/.
19
+ """
20
+ ret = {}
21
+ for i in xrange(len(snt)):
22
+ for k in xrange(i + 1, min(i + max_n + 1, len(snt) + 1)):
23
+ key = tuple(snt[i:k])
24
+ ret[key] = ret.get(key, 0) + 1
25
+ return ret
26
+
27
+
28
+ def max_counts(ng1, ng2):
29
+ """
30
+ Return a dicitonary of ngram counts such that
31
+ each count is the greater of the two individual counts
32
+ for each ngram in the input ngram count dictionaries
33
+ /ng1/ and /ng2/.
34
+ """
35
+ ret = ng1.copy()
36
+ for k, v in ng2.items():
37
+ ret[k] = max(ret.get(k, 0), v)
38
+ return ret
39
+
40
+
41
+ def ng_hits(hyp, ref, max_n):
42
+ """
43
+ Return a list of ngram counts such that each ngram count
44
+ is the minimum of the counts in hyp and ref, up to ngram
45
+ length /max_n/.
46
+ """
47
+ ret = [0 for i in xrange(max_n)]
48
+ for ng, cnt in hyp.items():
49
+ k = ng
50
+ if len(k) <= max_n:
51
+ ret[len(k) - 1] += min(cnt, ref.get(ng, 0))
52
+ return ret
53
+
54
+
55
+ class BleuScore:
56
+ def __init__(self, hyp, ref, max_n=4, bootstrap=1000):
57
+ # print len(hyp.ngrams), len(ref.ngrams), "X"
58
+ self.hits = [
59
+ ng_hits(hyp.ngrams[i], ref.ngrams[i], max_n)
60
+ for i in xrange(len(hyp.ngrams))]
61
+ self.max_n = max_n
62
+ self.hyp = hyp
63
+ self.ref = ref
64
+ self.lower = None
65
+ self.upper = None
66
+ self.median = None
67
+ self.actual = self.score([i for i in xrange(len(hyp.snt))])
68
+ if bootstrap:
69
+ self.bootstrap = [self.score([randint(0, len(hyp.snt) - 1)
70
+ for s in hyp.snt])
71
+ for i in xrange(bootstrap)]
72
+ self.bootstrap.sort()
73
+ else:
74
+ self.bootstrap = [self.actual]
75
+ pass
76
+
77
+ def score(self, sample):
78
+ hits = [0 for i in xrange(self.max_n)]
79
+ self.hyplen = 0
80
+ self.reflen = 0
81
+ self.total = [0 for i in hits]
82
+ for i in sample:
83
+ self.hyplen += len(self.hyp.snt[i])
84
+ self.reflen += len(self.ref.snt[i])
85
+ for n in xrange(self.max_n):
86
+ hits[n] += self.hits[i][n]
87
+ self.total[n] += max(len(self.hyp.snt[i]) - n, 0)
88
+ pass
89
+ self.prec = [float(hits[n]) / self.total[n]
90
+ for n in xrange(self.max_n)]
91
+ ret = sum([math.log(x) for x in self.prec]) / self.max_n
92
+ self.BP = min(
93
+ 1, math.exp(1. - float(self.reflen) / float(self.hyplen)))
94
+ ret += math.log(self.BP)
95
+ return math.exp(ret)
96
+
97
+
98
+ class Document:
99
+ def __init__(self, fname=None):
100
+ self.fname = fname
101
+ if fname:
102
+ if fname[-3:] == ".gz":
103
+ self.snt = [line.strip().split() for line in gzip.open(fname).readlines()]
104
+ else:
105
+ self.snt = [line.strip().split() for line in open(fname)]
106
+ pass
107
+ self.ngrams = [count_ngrams(snt, 4) for snt in self.snt]
108
+ # print self.snt
109
+ else:
110
+ self.snt = None
111
+ self.ngrams = None
112
+
113
+ def merge(self, R):
114
+ self.fname = "multi-ref"
115
+ self.ngrams = [x for x in R[0].ngrams]
116
+ self.snt = [x for x in R[0].snt]
117
+ for i in xrange(len(R[0].ngrams)):
118
+ for k in xrange(1, len(R)):
119
+ self.ngrams[i] = max_counts(self.ngrams[i], R[k].ngrams[i])
120
+
121
+ def update(self, hyp, R):
122
+ for i, hyp_snt in enumerate(hyp.snt):
123
+ clen = len(hyp_snt)
124
+ K = 0
125
+ for k in xrange(1, len(R)):
126
+ k_snt = R[k].snt[i]
127
+ assert len(R[k].snt) == len(hyp.snt), (
128
+ "Mismatch in number of sentences " +
129
+ "between reference and candidate")
130
+ if abs(len(k_snt) - clen) == abs(len(R[K].snt[i]) - clen):
131
+ if len(k_snt) < len(R[K].snt[i]):
132
+ K = k
133
+ elif abs(len(k_snt) - clen) < abs(len(R[K].snt[i]) - clen):
134
+ K = k
135
+ self.snt[i] = R[K].snt[i]
136
+
137
+
138
+ if __name__ == "__main__":
139
+ argparser = ArgumentParser()
140
+ argparser.add_argument(
141
+ "-r", "--ref", nargs='+', help="Reference translation(s).")
142
+ argparser.add_argument(
143
+ "-c", "--cand", nargs='+', help="Candidate translations.")
144
+ argparser.add_argument(
145
+ "-i", "--individual", action='store_true',
146
+ help="Compute BLEU scores for individual references.")
147
+ argparser.add_argument(
148
+ "-b", "--bootstrap", type=int, default=1000,
149
+ help="Sample size for bootstrap resampling.")
150
+ argparser.add_argument(
151
+ "-a", "--alpha", type=float, default=.05,
152
+ help="1-alpha = confidence interval.")
153
+ args = argparser.parse_args(sys.argv[1:])
154
+ R = [Document(fname) for fname in args.ref]
155
+ C = [Document(fname) for fname in args.cand]
156
+ Rx = Document() # for multi-reference BLEU
157
+ Rx.merge(R)
158
+ for c in C:
159
+ # compute multi-reference BLEU
160
+ Rx.update(c, R)
161
+ bleu = BleuScore(c, Rx, bootstrap=args.bootstrap)
162
+ print "%5.2f %s [%5.2f-%5.2f; %5.2f] %s" % (
163
+ 100 * bleu.actual,
164
+ os.path.basename(Rx.fname),
165
+ 100 * bleu.bootstrap[int((args.alpha / 2) * args.bootstrap)],
166
+ 100 * bleu.bootstrap[int((1 - (args.alpha / 2)) * args.bootstrap)],
167
+ 100 * bleu.bootstrap[int(.5 * args.bootstrap)],
168
+ c.fname) # os.path.basename(c.fname))
169
+
170
+ if args.individual:
171
+ for r in R:
172
+ bleu = BleuScore(c, r, bootstrap=args.bootstrap)
173
+ print " %5.2f %s" % (
174
+ 100 * bleu.actual, os.path.basename(r.fname))
175
+ # print bleu.prec, bleu.hyplen, bleu.reflen, bleu.BP
176
+
177
+ # print [
178
+ # sum([bleu.hits[i][n] for i in xrange(len(bleu.hits))])
179
+ # for n in xrange(4)]
mosesdecoder/scripts/generic/compound-splitter.perl ADDED
@@ -0,0 +1,295 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env perl
2
+ #
3
+ # This file is part of moses. Its use is licensed under the GNU Lesser General
4
+ # Public License version 2.1 or, at your option, any later version.
5
+
6
+ use warnings;
7
+ use strict;
8
+ use Getopt::Long "GetOptions";
9
+
10
+ my ($CORPUS,$MODEL,$TRAIN,$HELP,$VERBOSE);
11
+ my $FILLER = ":s:es";
12
+ my $MIN_SIZE = 3;
13
+ my $MIN_COUNT = 5;
14
+ my $MAX_COUNT = 5;
15
+ my $FACTORED = 0;
16
+ my $SYNTAX = 0;
17
+ my $MARK_SPLIT = 0;
18
+ my $BINARIZE = 0;
19
+ $HELP = 1
20
+ unless &GetOptions('corpus=s' => \$CORPUS,
21
+ 'model=s' => \$MODEL,
22
+ 'filler=s' => \$FILLER,
23
+ 'factored' => \$FACTORED,
24
+ 'min-size=i' => \$MIN_SIZE,
25
+ 'min-count=i' => \$MIN_COUNT,
26
+ 'max-count=i' => \$MAX_COUNT,
27
+ 'help' => \$HELP,
28
+ 'verbose' => \$VERBOSE,
29
+ 'syntax' => \$SYNTAX,
30
+ 'binarize' => \$BINARIZE,
31
+ 'mark-split' => \$MARK_SPLIT,
32
+ 'train' => \$TRAIN);
33
+
34
+ if ($HELP ||
35
+ ( $TRAIN && !$CORPUS) ||
36
+ (!$TRAIN && !$MODEL)) {
37
+ print "Compound splitter\n";
38
+ print "-----------------\n\n";
39
+ print "train: compound-splitter -train -corpus txt-file -model new-model\n";
40
+ print "apply: compound-splitter -model trained-model < in > out\n";
41
+ print "options: -min-size: minimum word size (default $MIN_SIZE)\n";
42
+ print " -min-count: minimum word count (default $MIN_COUNT)\n";
43
+ print " -filler: filler letters between words (default $FILLER)\n";
44
+ print " -factor: factored data, assuming factor 0 as surface (default $FACTORED)\n";
45
+ print " -syntax: syntactically parsed data (default $SYNTAX)\n";
46
+ print " -mark-split: mark non-terminal label of split words (default $MARK_SPLIT)\n";
47
+ print " -binarize: binarize subtree for split word (default $BINARIZE)\n";
48
+ exit;
49
+ }
50
+
51
+ if ($TRAIN) {
52
+ if ($SYNTAX) { &train_syntax(); }
53
+ elsif ($FACTORED) { &train_factored(); }
54
+ else { &train(); }
55
+ }
56
+ else {
57
+ &apply();
58
+ }
59
+
60
+ sub train {
61
+ my %COUNT;
62
+ open(CORPUS,$CORPUS) || die("ERROR: could not open corpus '$CORPUS'");
63
+ while(<CORPUS>) {
64
+ chop; s/\s+/ /g; s/^ //; s/ $//;
65
+ foreach (split) {
66
+ $COUNT{$_}++;
67
+ }
68
+ }
69
+ close(CORPUS);
70
+ &save_trained_model(\%COUNT);
71
+ }
72
+
73
+ sub save_trained_model {
74
+ my ($COUNT) = @_;
75
+ my $id = 0;
76
+ open(MODEL,">".$MODEL);
77
+ foreach my $word (keys %$COUNT) {
78
+ print MODEL "".(++$id)."\t".$word."\t".$$COUNT{$word}."\n";
79
+ }
80
+ close(MODEL);
81
+ print STDERR "written model file with ".(scalar keys %$COUNT)." words.\n";
82
+ }
83
+
84
+ sub train_factored {
85
+ my (%COUNT,%FACTORED_COUNT);
86
+ # collect counts for interpretations for each surface word
87
+ open(CORPUS,$CORPUS) || die("ERROR: could not open corpus '$CORPUS'");
88
+ while(<CORPUS>) {
89
+ chop; s/\s+/ /g; s/^ //; s/ $//;
90
+ foreach my $factored_word (split) {
91
+ my $word = $factored_word;
92
+ $word =~ s/\|.+//g; # just first factor
93
+ $FACTORED_COUNT{$word}{$factored_word}++;
94
+ }
95
+ }
96
+ close(CORPUS);
97
+ # only preserve most frequent interpretation, assign sum of counts
98
+ foreach my $word (keys %FACTORED_COUNT) {
99
+ my ($max,$best,$total) = (0,"",0);
100
+ foreach my $factored_word (keys %{$FACTORED_COUNT{$word}}) {
101
+ my $count = $FACTORED_COUNT{$word}{$factored_word};
102
+ $total += $count;
103
+ if ($count > $max) {
104
+ $max = $count;
105
+ $best = $factored_word;
106
+ }
107
+ }
108
+ $COUNT{$best} = $total;
109
+ }
110
+ &save_trained_model(\%COUNT);
111
+ }
112
+
113
+ sub train_syntax {
114
+ my (%COUNT,%LABELED_COUNT);
115
+ # collect counts for interpretations for each surface word
116
+ open(CORPUS,$CORPUS) || die("ERROR: could not open corpus '$CORPUS'");
117
+ while(<CORPUS>) {
118
+ chop; s/\s+/ /g; s/^ //; s/ $//;
119
+ my $label;
120
+ foreach (split) {
121
+ if (/^label="([^\"]+)"/) {
122
+ $label = $1;
123
+ }
124
+ elsif (! /^</) {
125
+ $LABELED_COUNT{$_}{$label}++;
126
+ }
127
+ }
128
+ }
129
+ close(CORPUS);
130
+
131
+ # only preserve most frequent label, assign sum of counts
132
+ foreach my $word (keys %LABELED_COUNT) {
133
+ my ($max,$best,$total) = (0,"",0);
134
+ foreach my $label (keys %{$LABELED_COUNT{$word}}) {
135
+ my $count = $LABELED_COUNT{$word}{$label};
136
+ $total += $count;
137
+ if ($count > $max) {
138
+ $max = $count;
139
+ $best = "$word $label";
140
+ }
141
+ }
142
+ $COUNT{$best} = $total;
143
+ }
144
+ &save_trained_model(\%COUNT);
145
+ }
146
+
147
+ sub apply {
148
+ my (%COUNT,%TRUECASE,%LABEL);
149
+ open(MODEL,$MODEL) || die("ERROR: could not open model '$MODEL'");
150
+ while(<MODEL>) {
151
+ chomp;
152
+ my ($id,$factored_word,$count) = split(/\t/);
153
+ my $label;
154
+ ($factored_word,$label) = split(/ /,$factored_word);
155
+ my $word = $factored_word;
156
+ $word =~ s/\|.+//g; # just first factor
157
+ my $lc = lc($word);
158
+ # if word exists with multipe casings, only record most frequent
159
+ next if defined($COUNT{$lc}) && $COUNT{$lc} > $count;
160
+ $COUNT{$lc} = $count;
161
+ $TRUECASE{$lc} = $factored_word;
162
+ $LABEL{$lc} = $label if $SYNTAX;
163
+ }
164
+ close(MODEL);
165
+
166
+ while(<STDIN>) {
167
+ my $first = 1;
168
+ chop; s/\s+/ /g; s/^ //; s/ $//;
169
+ my @BUFFER; # for xml tags
170
+ foreach my $factored_word (split) {
171
+ print " " unless $first;
172
+ $first = 0;
173
+
174
+ # syntax: don't split xml
175
+ if ($SYNTAX && ($factored_word =~ /^</ || $factored_word =~ />$/)) {
176
+ push @BUFFER,$factored_word;
177
+ $first = 1;
178
+ next;
179
+ }
180
+
181
+ # get case class
182
+ my $word = $factored_word;
183
+ $word =~ s/\|.+//g; # just first factor
184
+ my $lc = lc($word);
185
+
186
+ print STDERR "considering $word ($lc)...\n" if $VERBOSE;
187
+ # don't split frequent words
188
+ if ((defined($COUNT{$lc}) && $COUNT{$lc}>=$MAX_COUNT) ||
189
+ $lc !~ /[a-zA-Z]/) {; # has to have at least one letter
190
+ print join(" ",@BUFFER)." " if scalar(@BUFFER); @BUFFER = (); # clear buffer
191
+ print $factored_word;
192
+ print STDERR "\tfrequent word ($COUNT{$lc}>=$MAX_COUNT), skipping\n" if $VERBOSE;
193
+ next;
194
+ }
195
+
196
+ # consider possible splits
197
+ my $final = length($word)-1;
198
+ my %REACHABLE;
199
+ for(my $i=0;$i<=$final;$i++) { $REACHABLE{$i} = (); }
200
+
201
+ print STDERR "splitting $word:\n" if $VERBOSE;
202
+ for(my $end=$MIN_SIZE;$end<length($word);$end++) {
203
+ for(my $start=0;$start<=$end-$MIN_SIZE;$start++) {
204
+ next unless $start == 0 || defined($REACHABLE{$start-1});
205
+ foreach my $filler (split(/:/,$FILLER)) {
206
+ next if $start == 0 && $filler ne "";
207
+ next if lc(substr($word,$start,length($filler))) ne $filler;
208
+ my $subword = lc(substr($word,
209
+ $start+length($filler),
210
+ $end-$start+1-length($filler)));
211
+ next unless defined($COUNT{$subword});
212
+ next unless $COUNT{$subword} >= $MIN_COUNT;
213
+ print STDERR "\tmatching word $start .. $end ($filler)$subword $COUNT{$subword}\n" if $VERBOSE;
214
+ push @{$REACHABLE{$end}},"$start $TRUECASE{$subword} $COUNT{$subword}";
215
+ }
216
+ }
217
+ }
218
+
219
+ # no matches at all?
220
+ if (!defined($REACHABLE{$final})) {
221
+ print join(" ",@BUFFER)." " if scalar(@BUFFER); @BUFFER = (); # clear buffer
222
+ print $factored_word;
223
+ next;
224
+ }
225
+
226
+ my ($best_split,$best_score) = ("",0);
227
+
228
+ my %ITERATOR;
229
+ for(my $i=0;$i<=$final;$i++) { $ITERATOR{$i}=0; }
230
+ my $done = 0;
231
+ while(1) {
232
+ # read off word
233
+ my ($pos,$decomp,$score,$num,@INDEX) = ($final,"",1,0);
234
+ while($pos>0) {
235
+ last unless scalar @{$REACHABLE{$pos}} > $ITERATOR{$pos}; # dead end?
236
+ my ($nextpos,$subword,$count)
237
+ = split(/ /,$REACHABLE{$pos}[ $ITERATOR{$pos} ]);
238
+ $decomp = $subword." ".$decomp;
239
+ $score *= $count;
240
+ $num++;
241
+ push @INDEX,$pos;
242
+ # print STDERR "($nextpos-$pos,$decomp,$score,$num)\n";
243
+ $pos = $nextpos-1;
244
+ }
245
+
246
+ chop($decomp);
247
+ print STDERR "\tsplit: $decomp ($score ** 1/$num) = ".($score ** (1/$num))."\n" if $VERBOSE;
248
+ $score **= 1/$num;
249
+ if ($score>$best_score) {
250
+ $best_score = $score;
251
+ $best_split = $decomp;
252
+ }
253
+
254
+ # increase iterator
255
+ my $increase = -1;
256
+ while($increase<$final) {
257
+ $increase = pop @INDEX;
258
+ $ITERATOR{$increase}++;
259
+ last if scalar @{$REACHABLE{$increase}} > $ITERATOR{$increase};
260
+ }
261
+ last unless scalar @{$REACHABLE{$final}} > $ITERATOR{$final};
262
+ for(my $i=0;$i<$increase;$i++) { $ITERATOR{$i}=0; }
263
+ }
264
+ if ($best_split !~ / /) {
265
+ print join(" ",@BUFFER)." " if scalar(@BUFFER); @BUFFER = (); # clear buffer
266
+ print $factored_word; # do not change case for unsplit words
267
+ next;
268
+ }
269
+ if (!$SYNTAX) {
270
+ print $best_split;
271
+ }
272
+ else {
273
+ $BUFFER[$#BUFFER] =~ s/label=\"/label=\"SPLIT-/ if $MARK_SPLIT;
274
+ $BUFFER[$#BUFFER] =~ /label=\"([^\"]+)\"/ || die("ERROR: $BUFFER[$#BUFFER]\n");
275
+ my $pos = $1;
276
+ print join(" ",@BUFFER)." " if scalar(@BUFFER); @BUFFER = (); # clear buffer
277
+
278
+ my @SPLIT = split(/ /,$best_split);
279
+ my @OUT = ();
280
+ if ($BINARIZE) {
281
+ for(my $w=0;$w<scalar(@SPLIT)-2;$w++) {
282
+ push @OUT,"<tree label=\"\@$pos\">";
283
+ }
284
+ }
285
+ for(my $w=0;$w<scalar(@SPLIT);$w++) {
286
+ if ($BINARIZE && $w>=2) { push @OUT, "</tree>"; }
287
+ push @OUT,"<tree label=\"".$LABEL{lc($SPLIT[$w])}."\"> $SPLIT[$w] </tree>";
288
+ }
289
+ print join(" ",@OUT);
290
+ }
291
+ }
292
+ print " ".join(" ",@BUFFER) if scalar(@BUFFER); @BUFFER = (); # clear buffer
293
+ print "\n";
294
+ }
295
+ }
mosesdecoder/scripts/generic/extract-factors.pl ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env perl
2
+ #
3
+ # This file is part of moses. Its use is licensed under the GNU Lesser General
4
+ # Public License version 2.1 or, at your option, any later version.
5
+
6
+ # $Id$
7
+ #extract-factors.pl: extract only the desired factors from a factored corpus
8
+ #usage: extract-factors corpusfile factor-index factor-index ... > outfile
9
+ #factor indices start at 0
10
+ #factor indices too large ought to be ignored
11
+
12
+ use warnings;
13
+ use strict;
14
+
15
+ my ($filename, @factors) = @ARGV;
16
+ my %indices = map {$_ => 1} @factors;
17
+
18
+ open(INFILE, "<$filename") or die "couldn't open '$filename' for read: $!\n";
19
+ while(my $line = <INFILE>)
20
+ {
21
+ chop $line;
22
+ print join(' ', map {my $i = 0; join('|', grep($indices{$i++}, split(/\|/, $_)))} split(/\s+/, $line)) . "\n";
23
+ }
24
+ close(INFILE);
mosesdecoder/scripts/generic/extract-parallel.perl ADDED
@@ -0,0 +1,385 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env perl
2
+ #
3
+ # This file is part of moses. Its use is licensed under the GNU Lesser General
4
+ # Public License version 2.1 or, at your option, any later version.
5
+
6
+ # example
7
+ # ./extract-parallel.perl 8 ./coreutils-8.9/src/split "./coreutils-8.9/src/sort --batch-size=253" ./extract ./corpus.5.en ./corpus.5.ar ./align.ar-en.grow-diag-final-and ./extracted 7 --NoFileLimit orientation --GZOutput
8
+
9
+ use warnings;
10
+ use strict;
11
+ use File::Basename;
12
+
13
+ sub RunFork($);
14
+ sub systemCheck($);
15
+ sub NumStr($);
16
+ sub DigitStr($);
17
+ sub CharStr($);
18
+ sub GetSplitVersion($);
19
+
20
+ my $alph = "abcdefghijklmnopqrstuvwxyz";
21
+ my @alph = (split(//,$alph));
22
+
23
+ print "Started ".localtime() ."\n";
24
+
25
+ my $numParallel= $ARGV[0];
26
+ $numParallel = 1 if $numParallel < 1;
27
+
28
+ my $splitCmd= $ARGV[1];
29
+ my $sortCmd= $ARGV[2];
30
+ my $extractCmd= $ARGV[3];
31
+
32
+ my $target = $ARGV[4]; # 1st arg of extract argument
33
+ my $source = $ARGV[5]; # 2nd arg of extract argument
34
+ my $align = $ARGV[6]; # 3rd arg of extract argument
35
+ my $extract = $ARGV[7]; # 4th arg of extract argument
36
+
37
+ my $makeTTable = 1; # whether to build the ttable extract files
38
+ my $otherExtractArgs= "";
39
+ my $weights = "";
40
+ my $baselineExtract;
41
+ my $glueFile;
42
+ my $phraseOrientation = 0;
43
+ my $phraseOrientationPriorsFile;
44
+ my $splitCmdOption = "";
45
+
46
+ my $GZIP_EXEC;
47
+ if(`which pigz 2> /dev/null`) {
48
+ $GZIP_EXEC = 'pigz';
49
+ }
50
+ else {
51
+ $GZIP_EXEC = 'gzip';
52
+ }
53
+ print STDERR "using $GZIP_EXEC \n";
54
+
55
+ my $isBSDSplit = GetSplitVersion($splitCmd);
56
+ print STDERR "isBSDSplit=$isBSDSplit \n";
57
+
58
+ if ($isBSDSplit == 0) {
59
+ $splitCmdOption .= "-d";
60
+ }
61
+
62
+ my $gzOut = 0;
63
+
64
+ for (my $i = 8; $i < $#ARGV + 1; ++$i)
65
+ {
66
+ $makeTTable = 0 if $ARGV[$i] eq "--NoTTable";
67
+ if ($ARGV[$i] eq '--BaselineExtract') {
68
+ $baselineExtract = $ARGV[++$i];
69
+ next;
70
+ }
71
+ if ($ARGV[$i] eq '--InstanceWeights') {
72
+ $weights = $ARGV[++$i];
73
+ next;
74
+ }
75
+ if ($ARGV[$i] eq '--GlueGrammar') {
76
+ $glueFile = $ARGV[++$i];
77
+ next;
78
+ }
79
+ $phraseOrientation = 1 if $ARGV[$i] eq "--PhraseOrientation";
80
+ if ($ARGV[$i] eq '--PhraseOrientationPriors') {
81
+ $phraseOrientationPriorsFile = $ARGV[++$i];
82
+ next;
83
+ }
84
+ if ($ARGV[$i] eq '--GZOutput') {
85
+ $gzOut = 1;
86
+ }
87
+
88
+ $otherExtractArgs .= $ARGV[$i] ." ";
89
+ }
90
+
91
+ die("Need to specify --GZOutput for parallel extract") if ($gzOut == 0);
92
+
93
+ my $cmd;
94
+ my $TMPDIR=dirname($extract) ."/tmp.$$";
95
+ $cmd = "mkdir -p $TMPDIR; ls -l $TMPDIR";
96
+ print STDERR "Executing: $cmd \n";
97
+ `$cmd`;
98
+
99
+ my $totalLines = int(`cat $align | wc -l`);
100
+ my $linesPerSplit = int($totalLines / $numParallel) + 1;
101
+
102
+ print "total=$totalLines line-per-split=$linesPerSplit \n";
103
+
104
+ my @children;
105
+ my $pid;
106
+
107
+ if ($numParallel > 1)
108
+ {
109
+ $cmd = "$splitCmd $splitCmdOption -l $linesPerSplit -a 7 $target $TMPDIR/target.";
110
+ $pid = RunFork($cmd);
111
+ push(@children, $pid);
112
+
113
+ $cmd = "$splitCmd $splitCmdOption -l $linesPerSplit -a 7 $source $TMPDIR/source.";
114
+ $pid = RunFork($cmd);
115
+ push(@children, $pid);
116
+
117
+ $cmd = "$splitCmd $splitCmdOption -l $linesPerSplit -a 7 $align $TMPDIR/align.";
118
+ $pid = RunFork($cmd);
119
+ push(@children, $pid);
120
+
121
+ if ($weights) {
122
+ $cmd = "$splitCmd $splitCmdOption -l $linesPerSplit -a 7 $weights $TMPDIR/weights.";
123
+ $pid = RunFork($cmd);
124
+ push(@children, $pid);
125
+ }
126
+
127
+ # wait for everything is finished
128
+ foreach (@children) {
129
+ waitpid($_, 0);
130
+ }
131
+
132
+ }
133
+ else
134
+ {
135
+ my $numStr = NumStr(0);
136
+
137
+ $cmd = "ln -s $target $TMPDIR/target.$numStr";
138
+ `$cmd`;
139
+
140
+ $cmd = "ln -s $source $TMPDIR/source.$numStr";
141
+ `$cmd`;
142
+
143
+ $cmd = "ln -s $align $TMPDIR/align.$numStr";
144
+ `$cmd`;
145
+
146
+ if ($weights) {
147
+ $cmd = "ln -s $weights $TMPDIR/weights.$numStr";
148
+ `$cmd`;
149
+ }
150
+ }
151
+
152
+ # run extract
153
+ @children = ();
154
+ for (my $i = 0; $i < $numParallel; ++$i)
155
+ {
156
+ my $pid = fork();
157
+
158
+ if ($pid == 0)
159
+ { # child
160
+ my $numStr = NumStr($i);
161
+ my $weightsCmd = "";
162
+ if ($weights) {
163
+ $weightsCmd = "--InstanceWeights $TMPDIR/weights.$numStr";
164
+ }
165
+
166
+ my $glueArg = "";
167
+ if (defined($glueFile)) {
168
+ $glueArg = "--GlueGrammar $TMPDIR/glue.$numStr";
169
+ }
170
+ #print STDERR "glueArg=$glueArg \n";
171
+
172
+ my $cmd = "$extractCmd $TMPDIR/target.$numStr $TMPDIR/source.$numStr $TMPDIR/align.$numStr $TMPDIR/extract.$numStr $glueArg $otherExtractArgs $weightsCmd --SentenceOffset ".($i*$linesPerSplit)." 2>> /dev/stderr \n";
173
+ `$cmd`;
174
+
175
+ exit();
176
+ }
177
+ else
178
+ { # parent
179
+ push(@children, $pid);
180
+ }
181
+ }
182
+
183
+ # wait for everything is finished
184
+ foreach (@children) {
185
+ waitpid($_, 0);
186
+ }
187
+
188
+ # merge
189
+ my $catCmd = "gunzip -c ";
190
+ my $catInvCmd = $catCmd;
191
+ my $catOCmd = $catCmd;
192
+ my $catContextCmd = $catCmd;
193
+ my $catContextInvCmd = $catCmd;
194
+
195
+ for (my $i = 0; $i < $numParallel; ++$i)
196
+ {
197
+ my $numStr = NumStr($i);
198
+ $catCmd .= "$TMPDIR/extract.$numStr.gz ";
199
+ $catInvCmd .= "$TMPDIR/extract.$numStr.inv.gz ";
200
+ $catOCmd .= "$TMPDIR/extract.$numStr.o.gz ";
201
+ $catContextCmd .= "$TMPDIR/extract.$numStr.context ";
202
+ $catContextInvCmd .= "$TMPDIR/extract.$numStr.context.inv ";
203
+ }
204
+ if (defined($baselineExtract)) {
205
+ my $sorted = -e "$baselineExtract.sorted.gz" ? ".sorted" : "";
206
+ $catCmd .= "$baselineExtract$sorted.gz ";
207
+ $catInvCmd .= "$baselineExtract.inv$sorted.gz ";
208
+ $catOCmd .= "$baselineExtract.o$sorted.gz ";
209
+ }
210
+
211
+ $catCmd .= " | LC_ALL=C $sortCmd -T $TMPDIR 2>> /dev/stderr | $GZIP_EXEC -c > $extract.sorted.gz 2>> /dev/stderr \n";
212
+ $catInvCmd .= " | LC_ALL=C $sortCmd -T $TMPDIR 2>> /dev/stderr | $GZIP_EXEC -c > $extract.inv.sorted.gz 2>> /dev/stderr \n";
213
+ $catOCmd .= " | LC_ALL=C $sortCmd -T $TMPDIR 2>> /dev/stderr | $GZIP_EXEC -c > $extract.o.sorted.gz 2>> /dev/stderr \n";
214
+ $catContextCmd .= " | LC_ALL=C $sortCmd -T $TMPDIR 2>> /dev/stderr | uniq | $GZIP_EXEC -c > $extract.context.sorted.gz 2>> /dev/stderr \n";
215
+ $catContextInvCmd .= " | LC_ALL=C $sortCmd -T $TMPDIR 2>> /dev/stderr | uniq | $GZIP_EXEC -c > $extract.context.inv.sorted.gz 2>> /dev/stderr \n";
216
+
217
+
218
+ @children = ();
219
+ if ($makeTTable)
220
+ {
221
+ print STDERR "merging extract / extract.inv\n";
222
+ $pid = RunFork($catCmd);
223
+ push(@children, $pid);
224
+
225
+ $pid = RunFork($catInvCmd);
226
+ push(@children, $pid);
227
+ }
228
+ else {
229
+ print STDERR "skipping extract, doing only extract.o\n";
230
+ }
231
+
232
+ if ($otherExtractArgs =~ /--FlexibilityScore/) {
233
+ $pid = RunFork($catContextCmd);
234
+ push(@children, $pid);
235
+
236
+ $pid = RunFork($catContextInvCmd);
237
+ push(@children, $pid);
238
+ }
239
+
240
+ my $numStr = NumStr(0);
241
+ if (-e "$TMPDIR/extract.$numStr.o.gz")
242
+ {
243
+ $pid = RunFork($catOCmd);
244
+ push(@children, $pid);
245
+ }
246
+
247
+ # wait for all sorting to finish
248
+ foreach (@children) {
249
+ waitpid($_, 0);
250
+ }
251
+
252
+ # merge glue rules
253
+ if (defined($glueFile)) {
254
+ my $cmd = "cat $TMPDIR/glue.* | LC_ALL=C sort | uniq > $glueFile";
255
+ print STDERR "Merging glue rules: $cmd \n";
256
+ print STDERR `$cmd`;
257
+ }
258
+
259
+ # merge phrase orientation priors (GHKM extraction)
260
+ if ($phraseOrientation && defined($phraseOrientationPriorsFile)) {
261
+ print STDERR "Merging phrase orientation priors\n";
262
+
263
+ my @orientationPriorsCountFiles = glob("$TMPDIR/*.phraseOrientationPriors");
264
+ my %priorCounts;
265
+
266
+ foreach my $filenamePhraseOrientationPriors (@orientationPriorsCountFiles) {
267
+ if (-f $filenamePhraseOrientationPriors) {
268
+ open my $infilePhraseOrientationPriors, '<', $filenamePhraseOrientationPriors or die "cannot open $filenamePhraseOrientationPriors: $!";
269
+ while (my $line = <$infilePhraseOrientationPriors>) {
270
+ print $line;
271
+ my ($key, $value) = split / /, $line;
272
+ $priorCounts{$key} += $value;
273
+ }
274
+ close $infilePhraseOrientationPriors;
275
+ }
276
+ }
277
+
278
+ open my $outPhraseOrientationPriors, '>', $phraseOrientationPriorsFile or die "cannot open $phraseOrientationPriorsFile: $!";
279
+ foreach my $key (sort keys %priorCounts) {
280
+ print $outPhraseOrientationPriors $key." ".$priorCounts{$key}."\n";
281
+ }
282
+ close($outPhraseOrientationPriors);
283
+ }
284
+
285
+ # delete temporary files
286
+ $cmd = "rm -rf $TMPDIR \n";
287
+ systemCheck($cmd);
288
+
289
+ print STDERR "Finished ".localtime() ."\n";
290
+
291
+ # -----------------------------------------
292
+ # -----------------------------------------
293
+
294
+ sub RunFork($)
295
+ {
296
+ my $cmd = shift;
297
+
298
+ my $pid = fork();
299
+
300
+ if ($pid == 0)
301
+ { # child
302
+ print STDERR $cmd;
303
+ systemCheck($cmd);
304
+ exit();
305
+ }
306
+ return $pid;
307
+ }
308
+
309
+ sub systemCheck($)
310
+ {
311
+ my $cmd = shift;
312
+ my $retVal = system($cmd);
313
+ if ($retVal != 0)
314
+ {
315
+ exit(1);
316
+ }
317
+ }
318
+
319
+ sub DigitStr($)
320
+ {
321
+ my $i = shift;
322
+ my $numStr;
323
+ if ($i < 10) {
324
+ $numStr = "000000$i";
325
+ }
326
+ elsif ($i < 100) {
327
+ $numStr = "00000$i";
328
+ }
329
+ elsif ($i < 1000) {
330
+ $numStr = "0000$i";
331
+ }
332
+ elsif ($i < 10000) {
333
+ $numStr = "000$i";
334
+ }
335
+ elsif ($i < 100000) {
336
+ $numStr = "00$i";
337
+ }
338
+ elsif ($i < 1000000) {
339
+ $numStr = "0$i";
340
+ }
341
+ else {
342
+ $numStr = $i;
343
+ }
344
+ return $numStr;
345
+ }
346
+
347
+ sub CharStr($)
348
+ {
349
+ my $i = shift;
350
+ my $charStr;
351
+ my @bit=();
352
+
353
+ while ($i>0){
354
+ push @bit, $i%26;
355
+ $i=int($i/26);
356
+ }
357
+ my $offset=scalar(@bit);
358
+ my $h;
359
+ for ($h=6;$h>=$offset;--$h) { $charStr.="a"; }
360
+ for ($h=$offset-1;$h>=0;--$h) { $charStr.="$alph[$bit[$h]]"; }
361
+ return $charStr;
362
+ }
363
+
364
+ sub NumStr($)
365
+ {
366
+ my $i = shift;
367
+ if ($isBSDSplit){
368
+ return CharStr($i);
369
+ }else{
370
+ return DigitStr($i);
371
+ }
372
+ }
373
+
374
+ sub GetSplitVersion($)
375
+ {
376
+ my $splitCmd = shift;
377
+ my $retVal = system("$splitCmd --help > /dev/null");
378
+ if ($retVal != 0) {
379
+ return 1;
380
+ }
381
+ else {
382
+ return 0;
383
+ }
384
+ }
385
+
mosesdecoder/scripts/generic/fsa-sample.fsa ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ 0 1 Prague 0.5
2
+ 1 2 Stock 1
3
+ 2 6 Market 1
4
+ 0 3 New 0.5
5
+ 3 4 York 1
6
+ 4 5 Stock 1
7
+ 5 6 Exchange 1
8
+ 6 7 falls 0.5
9
+ 6 7 drops 0.5
10
+ 7 8 . 1
mosesdecoder/scripts/generic/fsa2fsal.pl ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env perl
2
+ # A very simple script that converts fsa format (openfst lattices) to the same
3
+ # thing represented one sentence per line. It uses '|||' to delimit columns and
4
+ # ' ' to delimit nodes (i.e. original lines).
5
+ # Some rudimentary sanity checks are done on the fly.
6
+ # Ondrej Bojar, bojar@ufal.mff.cuni.cz
7
+ #
8
+ # This file is part of moses. Its use is licensed under the GNU Lesser General
9
+ # Public License version 2.1 or, at your option, any later version.
10
+
11
+ use warnings;
12
+ use strict;
13
+
14
+ my $errs = 0;
15
+ sub err {
16
+ my $nr = shift;
17
+ my $msg = shift;
18
+ print STDERR "$nr:$msg\n";
19
+ $errs++;
20
+ }
21
+
22
+ my $onr = 0;
23
+ my @lines = ();
24
+ sub flush {
25
+ return if 0 == scalar @lines;
26
+ print join(" ", @lines);
27
+ print "\n";
28
+ $onr++;
29
+ @lines = ();
30
+ }
31
+
32
+ my $nr = 0;
33
+ my $numscores = undef;
34
+ while (<>) {
35
+ chomp;
36
+ if ($_ eq "") {
37
+ flush();
38
+ next;
39
+ }
40
+ my ($a, $b, $label, $scores, $rest) = split /\s+/, $_, 5;
41
+ err($nr, "The delimiter '|||' can't appear in the input!") if /\|\|\|/;
42
+ err($nr, "Node id not numeric: $a") if $a !~ /^\d+$/;
43
+ err($nr, "Node id not numeric: $b") if $b !~ /^\d+$/;
44
+ err($nr, "Unexpected tail: '$rest'") if defined $rest && $rest !~ /^\s*$/;
45
+ my $thisnumscores = ($scores =~ tr/,/,/);
46
+ $numscores = $thisnumscores if !defined $numscores;
47
+ err($nr, "Incompatible number of arc scores, previous lines had ".($numscores+1).", now ".($thisnumscores+1))
48
+ if $numscores != $thisnumscores;
49
+ push @lines, join("|||", ($a,$b,$label,$scores));
50
+ }
51
+ flush();
52
+
53
+ exit 1 if $errs;
mosesdecoder/scripts/generic/fsa2plf.pl ADDED
@@ -0,0 +1,182 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env perl
2
+ # Converts AT&T FSA format to 'python lattice format'.
3
+ # Note that the input FSA needs to be epsilon-free and topologically sorted.
4
+ # This script checks for topological sortedness.
5
+ # The start node has to have the index 0.
6
+ # All path ends are assumed to be final nodes, not just the explicitly stated
7
+ # final nodes.
8
+ # Note that the output format may not contain any spaces.
9
+ # Ondrej Bojar, bojar@ufal.mff.cuni.cz
10
+ #
11
+ # This file is part of moses. Its use is licensed under the GNU Lesser General
12
+ # Public License version 2.1 or, at your option, any later version.
13
+
14
+ use warnings;
15
+ use strict;
16
+ use Getopt::Long;
17
+
18
+ binmode(STDIN, ":utf8");
19
+ binmode(STDOUT, ":utf8");
20
+ binmode(STDERR, ":utf8");
21
+
22
+ my $filelist;
23
+ my $ignore_final_state_cost = 0;
24
+ my $mangle_weights = undef;
25
+ GetOptions(
26
+ "ignore-final-state-cost" => \$ignore_final_state_cost,
27
+ # sometimes, final states have a cost (e.g. "45 0.05\n")
28
+ # instead of dying there, ignore the problem
29
+ "filelist|fl=s" => \$filelist,
30
+ "mangle-weights=s" => \$mangle_weights,
31
+ ) or exit 1;
32
+
33
+ my @infiles;
34
+ if (defined $filelist) {
35
+ my $fh = my_open($filelist);
36
+ while (<$fh>) {
37
+ chomp;
38
+ push @infiles, $_;
39
+ }
40
+ close $fh;
41
+ }
42
+ push @infiles, @ARGV;
43
+ @ARGV = ();
44
+ if (0 == scalar(@infiles)) {
45
+ print STDERR "Reading input from stdin\n";
46
+ push @infiles, "-";
47
+ }
48
+
49
+ my $err = 0;
50
+ foreach my $inf (@infiles) {
51
+ my $nr = 0;
52
+ NEXTLATTICE:
53
+ my %usedids = (); # collect all used ids for densification
54
+ my %usedtgtids = (); # collect all used ids for densification
55
+ my @outnodes = ();
56
+ my $fh = my_open($inf);
57
+ my %is_final; # remember which nodes were final
58
+ while (<$fh>) {
59
+ chomp;
60
+ $nr++;
61
+ last if $_ eq ""; # assume a blank line delimits lattices
62
+ my ($src, $tgt, $label, $weight) = split /\s+/;
63
+ die "$inf:$nr:Bad src node index: $src" if $src !~ /^[0-9]+$/;
64
+
65
+ if (!defined $label && !defined $weight) {
66
+ # explicit final node, warn at the end if there are any intermed. final
67
+ # nodes
68
+ $is_final{$src};
69
+ # final nodes can have a cost
70
+ die "$inf:$nr:Final state $src has cost $tgt. Unsupported, use --ignore-final-state-cost"
71
+ if defined $tgt && !$ignore_final_state_cost;
72
+
73
+ next;
74
+ }
75
+ $weight = 0 if !defined $weight;
76
+
77
+ $usedids{$src} = 1;
78
+ $usedtgtids{$tgt} = 1;
79
+
80
+ # process the weight
81
+ # when reading RWTH FSA output, the weights are negated natural logarithms
82
+ # we need to negate them back
83
+ if (defined $mangle_weights) {
84
+ if ($mangle_weights eq "expneg") {
85
+ $weight = join(",", map {exp(-$_)} split /,/, $weight);
86
+ } else {
87
+ die "Bad weights mangling: $mangle_weights";
88
+ }
89
+ }
90
+ # remember the node
91
+ my $targetnode = $tgt-$src;
92
+ die "$inf:$nr:Not topologically sorted, got arc from $src to $tgt"
93
+ if $targetnode <= 0;
94
+ push @{$outnodes[$src]}, [ $label, $weight, $tgt ];
95
+ }
96
+ if (eof($fh)) {
97
+ close $fh;
98
+ $fh = undef;
99
+ }
100
+
101
+ # Assign our dense IDs: source node ids are assigned first
102
+ my %denseids = (); # maps node ids from the file to dense ids
103
+ my $nextid = 0;
104
+ foreach my $id (sort {$a<=>$b} keys %usedids) {
105
+ $denseids{$id} = $nextid;
106
+ $nextid++;
107
+ }
108
+ # All unseen target nodes then get the same next id, the final node id
109
+ foreach my $id (keys %usedtgtids) {
110
+ next if defined $denseids{$id};
111
+ $denseids{$id} = $nextid;
112
+ }
113
+
114
+ foreach my $f (keys %is_final) {
115
+ if (defined $outnodes[$f]) {
116
+ print STDERR "$inf:Node $f is final but it has outgoing edges!\n";
117
+ $err = 1;
118
+ }
119
+ }
120
+ # # Verbose: print original to dense IDs mapping
121
+ # foreach my $src (sort {$a<=>$b} keys %denseids) {
122
+ # print STDERR "$src ...> $denseids{$src}\n";
123
+ # }
124
+
125
+ print "(";
126
+ for(my $origsrc = 0; $origsrc < @outnodes; $origsrc++) {
127
+ my $src = $denseids{$origsrc};
128
+ next if !defined $src; # this original node ID is not used at all
129
+ next if $src == $nextid; # this is the ultimate merged final node
130
+ my $outnode = $outnodes[$origsrc];
131
+ print "(";
132
+ foreach my $arc (@$outnode) {
133
+ my $origtgt = $arc->[2];
134
+ my $tgt = $denseids{$origtgt};
135
+ if (!defined $tgt) {
136
+ # this was a final node only
137
+ $tgt = $denseids{$origtgt} = $nextid;
138
+ $nextid++;
139
+ }
140
+ my $step_to_target = $tgt - $src;
141
+ die "$inf:Bug, I damaged top-sortedness (orig $origsrc .. $origtgt; curr $src .. $tgt)." if $step_to_target <= 0;
142
+ print "('".apo($arc->[0])."',$arc->[1],$step_to_target),";
143
+ }
144
+ print "),";
145
+ }
146
+ print ")\n";
147
+ goto NEXTLATTICE if defined $fh && ! eof($fh);
148
+ }
149
+ die "There were errors." if $err;
150
+
151
+ sub apo {
152
+ my $s = shift;
153
+ # protects apostrophy and backslash
154
+ $s =~ s/\\/\\\\/g;
155
+ $s =~ s/(['])/\\$1/g;
156
+ return $s;
157
+ }
158
+
159
+ sub my_open {
160
+ my $f = shift;
161
+ if ($f eq "-") {
162
+ binmode(STDIN, ":utf8");
163
+ return *STDIN;
164
+ }
165
+
166
+ die "Not found: $f" if ! -e $f;
167
+
168
+ my $opn;
169
+ my $hdl;
170
+ my $ft = `file '$f'`;
171
+ # file might not recognize some files!
172
+ if ($f =~ /\.gz$/ || $ft =~ /gzip compressed data/) {
173
+ $opn = "zcat '$f' |";
174
+ } elsif ($f =~ /\.bz2$/ || $ft =~ /bzip2 compressed data/) {
175
+ $opn = "bzcat '$f' |";
176
+ } else {
177
+ $opn = "$f";
178
+ }
179
+ open $hdl, $opn or die "Can't open '$opn': $!";
180
+ binmode $hdl, ":utf8";
181
+ return $hdl;
182
+ }
mosesdecoder/scripts/generic/generic-parallel.perl ADDED
@@ -0,0 +1,119 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env perl
2
+ #
3
+ # This file is part of moses. Its use is licensed under the GNU Lesser General
4
+ # Public License version 2.1 or, at your option, any later version.
5
+
6
+ use warnings;
7
+ use strict;
8
+ use utf8;
9
+
10
+ binmode STDIN, ":utf8";
11
+ binmode STDOUT, ":utf8";
12
+ binmode STDERR, ":utf8";
13
+
14
+ sub NumStr($);
15
+
16
+ my $NUM_SPLIT_LINES = $ARGV[0];
17
+
18
+ my $TMPDIR = $ARGV[1];
19
+ $TMPDIR = "$TMPDIR/tmp.$$";
20
+ mkdir $TMPDIR;
21
+ print STDERR "TMPDIR=$TMPDIR \n";
22
+
23
+ my $cmd = "";
24
+ for (my $i = 2; $i < scalar(@ARGV); ++$i)
25
+ {
26
+ $cmd .= $ARGV[$i] ." ";
27
+ }
28
+
29
+ # split input file
30
+ open (INPUT_ALL, "> $TMPDIR/input.all");
31
+ binmode INPUT_ALL, ":utf8";
32
+ while (my $line = <STDIN>)
33
+ {
34
+ chomp($line);
35
+ print INPUT_ALL $line."\n";
36
+ }
37
+ close(INPUT_ALL);
38
+
39
+ my $cmd2 = "split -l $NUM_SPLIT_LINES -a 5 -d $TMPDIR/input.all $TMPDIR/x";
40
+ `$cmd2`;
41
+
42
+ # create exec file
43
+ open (EXEC, "> $TMPDIR/exec");
44
+ binmode EXEC, ":utf8";
45
+
46
+ # execute in parallel
47
+ print STDERR "executing\n";
48
+
49
+ my $i = 0;
50
+ my $filePath = "$TMPDIR/x" .NumStr($i);
51
+ while (-f $filePath)
52
+ {
53
+ print EXEC "$cmd < $filePath > $filePath.out\n";
54
+
55
+ ++$i;
56
+ $filePath = "$TMPDIR/x" .NumStr($i);
57
+ }
58
+ close (EXEC);
59
+
60
+ $cmd2 = "parallel < $TMPDIR/exec";
61
+ `$cmd2`;
62
+
63
+ # concatenate
64
+ print STDERR "concatenating\n";
65
+
66
+ $i = 1;
67
+ my $firstPath = "$TMPDIR/x" .NumStr(0) .".out";
68
+ $filePath = "$TMPDIR/x" .NumStr($i) .".out";
69
+ while (-f $filePath)
70
+ {
71
+ $cmd = "cat $filePath >> $firstPath";
72
+ `$cmd`;
73
+
74
+ ++$i;
75
+ $filePath = "$TMPDIR/x" .NumStr($i) .".out";
76
+ }
77
+
78
+ # output
79
+ open (OUTPUT_ALL, "$firstPath");
80
+ binmode OUTPUT_ALL, ":utf8";
81
+ while (my $line = <OUTPUT_ALL>)
82
+ {
83
+ chomp($line);
84
+ print "$line\n";
85
+ }
86
+ close(OUTPUT_ALL);
87
+
88
+ $cmd = "rm -rf $TMPDIR/";
89
+ `$cmd`;
90
+
91
+ ###########################################
92
+ sub NumStr($)
93
+ {
94
+ my $i = shift;
95
+ my $numStr;
96
+ if ($i < 10) {
97
+ $numStr = "000000$i";
98
+ }
99
+ elsif ($i < 100) {
100
+ $numStr = "00000$i";
101
+ }
102
+ elsif ($i < 1000) {
103
+ $numStr = "0000$i";
104
+ }
105
+ elsif ($i < 10000) {
106
+ $numStr = "000$i";
107
+ }
108
+ elsif ($i < 100000) {
109
+ $numStr = "00$i";
110
+ }
111
+ elsif ($i < 1000000) {
112
+ $numStr = "0$i";
113
+ }
114
+ else {
115
+ $numStr = $i;
116
+ }
117
+ return $numStr;
118
+ }
119
+
mosesdecoder/scripts/generic/giza-parallel.perl ADDED
@@ -0,0 +1,134 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env perl
2
+ #
3
+ # This file is part of moses. Its use is licensed under the GNU Lesser General
4
+ # Public License version 2.1 or, at your option, any later version.
5
+
6
+ # example
7
+ # ~/giza-parallel.perl 10 split ~/workspace/sourceforge/trunk/scripts/training/train-model.perl ar en train align
8
+
9
+ use warnings;
10
+ use strict;
11
+ use File::Basename;
12
+
13
+ sub NumStr($);
14
+
15
+ print "Started ".localtime() ."\n";
16
+
17
+ my $numParallel = $ARGV[0];
18
+ my $splitCmd = $ARGV[1];
19
+ my $trainCmd = $ARGV[2];
20
+ my $inputExt = $ARGV[3];
21
+ my $outputExt = $ARGV[4];
22
+ my $corpus = $ARGV[5];
23
+ my $align = $ARGV[6];
24
+
25
+ my $TMPDIR=dirname($align) ."/tmp.$$";
26
+ mkdir $TMPDIR;
27
+
28
+ my $scriptDir=dirname($trainCmd) ."/..";
29
+
30
+ # split corpus file
31
+ my $totalLines = int(`wc -l $corpus.$inputExt`);
32
+ my $linesPerSplit = int($totalLines / $numParallel) + 1;
33
+
34
+ my $cmd = "$splitCmd -d -l $linesPerSplit -a 5 $corpus.$inputExt $TMPDIR/source.";
35
+ `$cmd`;
36
+
37
+ $cmd = "$splitCmd -d -l $linesPerSplit -a 5 $corpus.$outputExt $TMPDIR/target.";
38
+ `$cmd`;
39
+
40
+ for (my $i = 0; $i < $numParallel; ++$i)
41
+ {
42
+ my $numStr = NumStr($i);
43
+ rename("$TMPDIR/source.$numStr", "$TMPDIR/$numStr.source");
44
+ rename("$TMPDIR/target.$numStr", "$TMPDIR/$numStr.target");
45
+ }
46
+
47
+ #fork & run giza & friends
48
+ my $isParent = 1;
49
+ my @childs;
50
+ for (my $i = 0; $i < $numParallel; ++$i)
51
+ {
52
+ my $pid = fork();
53
+
54
+ if ($pid == 0)
55
+ { # child
56
+ $isParent = 0;
57
+
58
+ my $numStr = NumStr($i);
59
+ my $cmd = "$trainCmd -dont-zip -last-step 1 -scripts-root-dir $scriptDir -f source -e target -alignment grow-diag-final-and -parts 3 -reordering msd-bidirectional-fe -corpus $TMPDIR/$numStr -corpus-dir $TMPDIR/prepared.$numStr \n";
60
+ print $cmd;
61
+ `$cmd`;
62
+
63
+ $cmd = "$trainCmd -dont-zip -first-step 2 -last-step 2 -scripts-root-dir $scriptDir -f source -e target -alignment grow-diag-final-and -parts 3 -reordering msd-bidirectional-fe -corpus-dir $TMPDIR/prepared.$numStr -giza-e2f $TMPDIR/giza.$numStr -direction 2 \n";
64
+ print $cmd;
65
+ `$cmd`;
66
+
67
+ $cmd = "$trainCmd -dont-zip -first-step 2 -last-step 2 -scripts-root-dir $scriptDir -f source -e target -alignment grow-diag-final-and -parts 3 -reordering msd-bidirectional-fe -corpus-dir $TMPDIR/prepared.$numStr -giza-f2e $TMPDIR/giza-inverse.$numStr -direction 1 \n";
68
+ print $cmd;
69
+ `$cmd`;
70
+
71
+ $cmd = "$trainCmd -dont-zip -first-step 3 -last-step 3 -scripts-root-dir $scriptDir -f source -e target -alignment grow-diag-final-and -parts 3 -reordering msd-bidirectional-fe -giza-e2f $TMPDIR/giza.$numStr -giza-f2e $TMPDIR/giza-inverse.$numStr -alignment-file $TMPDIR/aligned.$numStr -alignment grow-diag-final-and \n";
72
+ print $cmd;
73
+ `$cmd`;
74
+
75
+ exit();
76
+ }
77
+ else
78
+ { # parent
79
+ push(@childs, $pid);
80
+ }
81
+
82
+ }
83
+
84
+ # wait for everything is finished
85
+ if ($isParent)
86
+ {
87
+ foreach (@childs) {
88
+ waitpid($_, 0);
89
+ }
90
+ }
91
+ else
92
+ {
93
+ die "shouldn't be here";
94
+ }
95
+
96
+ # cat all aligned files together. Voila
97
+ my $cmd = "cat ";
98
+ for (my $i = 0; $i < $numParallel; ++$i)
99
+ {
100
+ my $numStr = NumStr($i);
101
+ $cmd .= "$TMPDIR/aligned.$numStr.grow-diag-final-and ";
102
+ }
103
+ $cmd .= " > $align \n";
104
+ print $cmd;
105
+ `$cmd`;
106
+
107
+ sub NumStr($)
108
+ {
109
+ my $i = shift;
110
+ my $numStr;
111
+ if ($i < 10) {
112
+ $numStr = "000000$i";
113
+ }
114
+ elsif ($i < 100) {
115
+ $numStr = "00000$i";
116
+ }
117
+ elsif ($i < 1000) {
118
+ $numStr = "0000$i";
119
+ }
120
+ elsif ($i < 10000) {
121
+ $numStr = "000$i";
122
+ }
123
+ elsif ($i < 100000) {
124
+ $numStr = "00$i";
125
+ }
126
+ elsif ($i < 1000000) {
127
+ $numStr = "0$i";
128
+ }
129
+ else {
130
+ $numStr = $i;
131
+ }
132
+ return $numStr;
133
+ }
134
+
mosesdecoder/scripts/generic/lopar2pos.pl ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env perl
2
+ #
3
+ # This file is part of moses. Its use is licensed under the GNU Lesser General
4
+ # Public License version 2.1 or, at your option, any later version.
5
+
6
+ # $Id$
7
+ #lopar2pos: extract POSs from LOPAR output
8
+ #usage: lopar2pos.pl CORPUS.lopar > CORPUS.pos
9
+
10
+ use warnings;
11
+
12
+ my $infilename = shift @ARGV;
13
+ open(INFILE, "<$infilename") or die "couldn't open '$infilename' for read: $!\n";
14
+ while(my $line = <INFILE>)
15
+ {
16
+ my @words = split(/\s+/, $line);
17
+ my @tags = map {$_ =~ /^[^_]*_([A-Z]+)/; $1} @words;
18
+ print join(' ', @tags) . "\n";
19
+ }
20
+ close(INFILE);
mosesdecoder/scripts/generic/moses_sim_pe.py ADDED
@@ -0,0 +1,452 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python
2
+
3
+ # Written by Michael Denkowski
4
+ #
5
+ # This file is part of moses. Its use is licensed under the GNU Lesser General
6
+ # Public License version 2.1 or, at your option, any later version.
7
+
8
+ """Parallelize decoding with simulated post-editing via moses XML input.
9
+
10
+ (XML entities need to be escaped in tokenization). Memory mapped
11
+ dynamic phrase tables (Ulrich Germann,
12
+ www.statmt.org/moses/?n=Moses.AdvancedFeatures#ntoc40) and language models
13
+ (Kenneth Heafield,
14
+ http://www.statmt.org/moses/?n=FactoredTraining.BuildingLanguageModel#ntoc19)
15
+ facilitate memory efficient multi process decoding. Input is divided into
16
+ batches, each of which is decoded sequentially. Each batch pre-loads the
17
+ data from previous batches.
18
+
19
+ To use in tuning, run mert-moses.pl with --sim-pe=SYMAL where SYMAL is the
20
+ alignment from input to references. Specify the number of jobs with
21
+ --decoder-flags="-threads N".
22
+ """
23
+
24
+ import gzip
25
+ import itertools
26
+ import math
27
+ import os
28
+ import shutil
29
+ import subprocess
30
+ import sys
31
+ import tempfile
32
+ import threading
33
+
34
+ HELP = '''Moses with simulated post-editing
35
+
36
+ Usage:
37
+ {} moses-cmd -config moses.ini -input-file text.src -ref text.tgt \
38
+ -symal text.src-tgt.symal [options] [decoder flags]
39
+
40
+ Options:
41
+ -threads N: number of decoders to run in parallel \
42
+ (default read from moses.ini, 1 if not present)
43
+ -n-best-list nbest.out N [distinct]: location and size of N-best list
44
+ -show-weights: for mert-moses.pl, just call moses and exit
45
+ -tmp: location of temp directory (default /tmp)
46
+
47
+ Other options (decoder flags) are passed through to moses-cmd\n'''
48
+
49
+
50
+ class ProgramFailure(Exception):
51
+ """Known kind of failure, with a known presentation to the user.
52
+
53
+ Error message will be printed, and the program will return an error,
54
+ but no traceback will be shown to the user.
55
+ """
56
+
57
+
58
+ class Progress:
59
+ """Provides progress bar."""
60
+
61
+ def __init__(self):
62
+ self.i = 0
63
+ self.lock = threading.Lock()
64
+
65
+ def inc(self):
66
+ self.lock.acquire()
67
+ self.i += 1
68
+ if self.i % 100 == 0:
69
+ sys.stderr.write('.')
70
+ if self.i % 1000 == 0:
71
+ sys.stderr.write(' [{}]\n'.format(self.i))
72
+ sys.stderr.flush()
73
+ self.lock.release()
74
+
75
+ def done(self):
76
+ self.lock.acquire()
77
+ if self.i % 1000 != 0:
78
+ sys.stderr.write('\n')
79
+ self.lock.release()
80
+
81
+
82
+ def atomic_io(cmd, in_file, out_file, err_file, prog=None):
83
+ """Run with atomic (synchronous) I/O."""
84
+ with open(in_file, 'r') as inp, open(out_file, 'w') as out, open(err_file, 'w') as err:
85
+ p = subprocess.Popen(
86
+ cmd, stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=err)
87
+ while True:
88
+ line = inp.readline()
89
+ if not line:
90
+ break
91
+ p.stdin.write(line)
92
+ out.write(p.stdout.readline())
93
+ out.flush()
94
+ if prog:
95
+ prog.inc()
96
+ p.stdin.close()
97
+ p.wait()
98
+
99
+
100
+ def gzopen(f):
101
+ """Open plain or gzipped text."""
102
+ return gzip.open(f, 'rb') if f.endswith('.gz') else open(f, 'r')
103
+
104
+
105
+ def wc(f):
106
+ """Word count."""
107
+ i = 0
108
+ for line in gzopen(f):
109
+ i += 1
110
+ return i
111
+
112
+
113
+ def write_gzfile(lines, f):
114
+ """Write lines to gzipped file."""
115
+ out = gzip.open(f, 'wb')
116
+ for line in lines:
117
+ out.write('{}\n'.format(line))
118
+ out.close()
119
+
120
+
121
+ def main(argv):
122
+ # Defaults
123
+ moses_ini = None
124
+ moses_ini_lines = None
125
+ text_src = None
126
+ text_tgt = None
127
+ text_symal = None
128
+ text_len = None
129
+ threads_found = False
130
+ threads = 1
131
+ n_best_out = None
132
+ n_best_size = None
133
+ n_best_distinct = False
134
+ hg_ext = None
135
+ hg_dir = None
136
+ tmp_dir = '/tmp'
137
+ xml_found = False
138
+ xml_input = 'exclusive'
139
+ show_weights = False
140
+ mmsapt_dynamic = []
141
+ mmsapt_static = []
142
+ mmsapt_l1 = None
143
+ mmsapt_l2 = None
144
+
145
+ # Decoder command
146
+ cmd = argv[1:]
147
+
148
+ # Parse special options and remove from cmd
149
+ i = 1
150
+ while i < len(cmd):
151
+ if cmd[i] in ('-f', '-config'):
152
+ moses_ini = cmd[i + 1]
153
+ cmd = cmd[:i] + cmd[i + 2:]
154
+ elif cmd[i] in ('-i', '-input-file'):
155
+ text_src = cmd[i + 1]
156
+ cmd = cmd[:i] + cmd[i + 2:]
157
+ elif cmd[i] == '-ref':
158
+ text_tgt = cmd[i + 1]
159
+ cmd = cmd[:i] + cmd[i + 2:]
160
+ elif cmd[i] == '-symal':
161
+ text_symal = cmd[i + 1]
162
+ cmd = cmd[:i] + cmd[i + 2:]
163
+ elif cmd[i] in ('-th', '-threads'):
164
+ threads_found = True
165
+ threads = int(cmd[i + 1])
166
+ cmd = cmd[:i] + cmd[i + 2:]
167
+ elif cmd[i] == '-n-best-list':
168
+ n_best_out = cmd[i + 1]
169
+ n_best_size = cmd[i + 2]
170
+ # Optional "distinct"
171
+ if i + 3 < len(cmd) and cmd[i + 3] == 'distinct':
172
+ n_best_distinct = True
173
+ cmd = cmd[:i] + cmd[i + 4:]
174
+ else:
175
+ cmd = cmd[:i] + cmd[i + 3:]
176
+ elif cmd[i] == '-output-search-graph-hypergraph':
177
+ # cmd[i + 1] == true
178
+ hg_ext = cmd[i + 2]
179
+ if i + 3 < len(cmd) and cmd[i + 3][0] != '-':
180
+ hg_dir = cmd[i + 3]
181
+ cmd = cmd[:i] + cmd[i + 4:]
182
+ else:
183
+ hg_dir = 'hypergraph'
184
+ cmd = cmd[:i] + cmd[i + 3:]
185
+ elif cmd[i] == '-tmp':
186
+ tmp_dir = cmd[i + 1]
187
+ cmd = cmd[:i] + cmd[i + 2:]
188
+ # Handled specially to make sure XML input is turned on somewhere
189
+ elif cmd[i] in ('-xi', '-xml-input'):
190
+ xml_found = True
191
+ xml_input = cmd[i + 1]
192
+ cmd = cmd[:i] + cmd[i + 2:]
193
+ # Handled specially for mert-moses.pl
194
+ elif cmd[i] == '-show-weights':
195
+ show_weights = True
196
+ # Do not remove from cmd
197
+ i += 1
198
+ else:
199
+ i += 1
200
+
201
+ # Read moses.ini
202
+ if moses_ini:
203
+ moses_ini_lines = [line.strip() for line in open(moses_ini, 'r')]
204
+ i = 0
205
+ while i < len(moses_ini_lines):
206
+ # PhraseDictionaryBitextSampling name=TranslationModel0
207
+ # output-factor=0 num-features=7 path=corpus. L1=src L2=tgt
208
+ # pfwd=g pbwd=g smooth=0 sample=1000 workers=1
209
+ if moses_ini_lines[i].startswith('PhraseDictionaryBitextSampling'):
210
+ for (k, v) in (pair.split('=') for pair in moses_ini_lines[i].split()[1:]):
211
+ if k == 'name':
212
+ # Dynamic means update this model
213
+ if v.startswith('Dynamic'):
214
+ mmsapt_dynamic.append(v)
215
+ moses_ini_lines[i] += '{mmsapt_extra}'
216
+ else:
217
+ mmsapt_static.append(v)
218
+ elif k == 'L1':
219
+ if mmsapt_l1 and v != mmsapt_l1:
220
+ raise ProgramFailure(
221
+ 'Error: All PhraseDictionaryBitextSampling '
222
+ 'entries should have same L1: '
223
+ '{} != {}\n'.format(v, mmsapt_l1))
224
+ mmsapt_l1 = v
225
+ elif k == 'L2':
226
+ if mmsapt_l2 and v != mmsapt_l2:
227
+ raise ProgramFailure(
228
+ 'Error: All PhraseDictionaryBitextSampling '
229
+ 'entries should have same L2: '
230
+ '{} != {}\n'.format(v, mmsapt_l2))
231
+ mmsapt_l2 = v
232
+ # [threads]
233
+ # 8
234
+ elif moses_ini_lines[i] == '[threads]':
235
+ # Prefer command line over moses.ini
236
+ if not threads_found:
237
+ threads = int(moses_ini_lines[i + 1])
238
+ i += 1
239
+ # [xml-input]
240
+ # exclusive
241
+ elif moses_ini_lines[i] == '[xml-input]':
242
+ # Prefer command line over moses.ini
243
+ if not xml_found:
244
+ xml_found = True
245
+ xml_input = moses_ini_lines[i + 1]
246
+ i += 1
247
+ i += 1
248
+
249
+ # If mert-moses.pl passes -show-weights, just call moses
250
+ if show_weights:
251
+ # re-append original moses.ini
252
+ cmd.append('-config')
253
+ cmd.append(moses_ini)
254
+ sys.stdout.write(subprocess.check_output(cmd))
255
+ sys.stdout.flush()
256
+ sys.exit(0)
257
+
258
+ # Input length
259
+ if text_src:
260
+ text_len = wc(text_src)
261
+
262
+ # Check inputs
263
+ if not (len(cmd) > 0 and all((moses_ini, text_src, text_tgt, text_symal))):
264
+ sys.stderr.write(HELP.format(argv[0]))
265
+ sys.exit(2)
266
+ if not (os.path.isfile(cmd[0]) and os.access(cmd[0], os.X_OK)):
267
+ raise ProgramFailure(
268
+ 'Error: moses-cmd "{}" is not executable\n'.format(cmd[0]))
269
+ if not mmsapt_dynamic:
270
+ raise ProgramFailure((
271
+ 'Error: no PhraseDictionaryBitextSampling entries named '
272
+ '"Dynamic..." found in {}. See '
273
+ 'http://www.statmt.org/moses/?n=Moses.AdvancedFeatures#ntoc40\n'
274
+ ).format(moses_ini))
275
+ if wc(text_tgt) != text_len or wc(text_symal) != text_len:
276
+ raise ProgramFailure(
277
+ 'Error: length mismatch between "{}", "{}", and "{}"\n'.format(
278
+ text_src, text_tgt, text_symal))
279
+
280
+ # Setup
281
+ work_dir = tempfile.mkdtemp(prefix='moses.', dir=os.path.abspath(tmp_dir))
282
+ threads = min(threads, text_len)
283
+ batch_size = int(math.ceil(float(text_len) / threads))
284
+
285
+ # Report settings
286
+ sys.stderr.write(
287
+ 'Moses flags: {}\n'.format(
288
+ ' '.join('\'{}\''.format(s) if ' ' in s else s for s in cmd[1:])))
289
+ for (i, n) in enumerate(mmsapt_dynamic):
290
+ sys.stderr.write(
291
+ 'Dynamic mmsapt {}: {} {} {}\n'.format(
292
+ i, n, mmsapt_l1, mmsapt_l2))
293
+ for (i, n) in enumerate(mmsapt_static):
294
+ sys.stderr.write(
295
+ 'Static mmsapt {}: {} {} {}\n'.format(i, n, mmsapt_l1, mmsapt_l2))
296
+ sys.stderr.write('XML mode: {}\n'.format(xml_input))
297
+ sys.stderr.write(
298
+ 'Inputs: {} {} {} ({})\n'.format(
299
+ text_src, text_tgt, text_symal, text_len))
300
+ sys.stderr.write('Jobs: {}\n'.format(threads))
301
+ sys.stderr.write('Batch size: {}\n'.format(batch_size))
302
+ if n_best_out:
303
+ sys.stderr.write(
304
+ 'N-best list: {} ({}{})\n'.format(
305
+ n_best_out, n_best_size,
306
+ ', distinct' if n_best_distinct else ''))
307
+ if hg_dir:
308
+ sys.stderr.write('Hypergraph dir: {} ({})\n'.format(hg_dir, hg_ext))
309
+ sys.stderr.write('Temp dir: {}\n'.format(work_dir))
310
+
311
+ # Accumulate seen lines
312
+ src_lines = []
313
+ tgt_lines = []
314
+ symal_lines = []
315
+
316
+ # Current XML source file
317
+ xml_out = None
318
+
319
+ # Split into batches. Each batch after 0 gets extra files with data from
320
+ # previous batches.
321
+ # Data from previous lines in the current batch is added using XML input.
322
+ job = -1
323
+ lc = -1
324
+ lines = itertools.izip(
325
+ gzopen(text_src), gzopen(text_tgt), gzopen(text_symal))
326
+ for (src, tgt, symal) in lines:
327
+ (src, tgt, symal) = (src.strip(), tgt.strip(), symal.strip())
328
+ lc += 1
329
+ if lc % batch_size == 0:
330
+ job += 1
331
+ xml_file = os.path.join(work_dir, 'input.{}.xml'.format(job))
332
+ extra_src_file = os.path.join(
333
+ work_dir, 'extra.{}.{}.txt.gz'.format(job, mmsapt_l1))
334
+ extra_tgt_file = os.path.join(
335
+ work_dir, 'extra.{}.{}.txt.gz'.format(job, mmsapt_l2))
336
+ extra_symal_file = os.path.join(
337
+ work_dir, 'extra.{}.{}-{}.symal.gz'.format(
338
+ job, mmsapt_l1, mmsapt_l2))
339
+ if job > 0:
340
+ xml_out.close()
341
+ write_gzfile(src_lines, extra_src_file)
342
+ write_gzfile(tgt_lines, extra_tgt_file)
343
+ write_gzfile(symal_lines, extra_symal_file)
344
+ xml_out = open(xml_file, 'w')
345
+ ini_file = os.path.join(work_dir, 'moses.{}.ini'.format(job))
346
+ with open(ini_file, 'w') as moses_ini_out:
347
+ if job == 0:
348
+ extra = ''
349
+ else:
350
+ extra = ' extra={}'.format(
351
+ os.path.join(work_dir, 'extra.{}.'.format(job)))
352
+ moses_ini_out.write(
353
+ '{}\n'.format(
354
+ '\n'.join(moses_ini_lines).format(mmsapt_extra=extra)))
355
+ src_lines.append(src)
356
+ tgt_lines.append(tgt)
357
+ symal_lines.append(symal)
358
+ # Lines after first start with update tag including previous
359
+ # translation.
360
+ # Translation of last line of each batch is included in extra for
361
+ # next batch.
362
+ xml_tags = []
363
+ if lc % batch_size != 0:
364
+ tag_template = (
365
+ '<update '
366
+ 'name="{}" source="{}" target="{}" alignment="{}" /> ')
367
+ for n in mmsapt_dynamic:
368
+ # Note: space after tag.
369
+ xml_tags.append(
370
+ tag_template.format(
371
+ n, src_lines[-2], tgt_lines[-2], symal_lines[-2]))
372
+ xml_out.write('{}{}\n'.format(''.join(xml_tags), src))
373
+ xml_out.close()
374
+
375
+ # Run decoders in parallel
376
+ workers = []
377
+ prog = Progress()
378
+ for i in range(threads):
379
+ work_cmd = cmd[:]
380
+ work_cmd.append('-config')
381
+ work_cmd.append(os.path.join(work_dir, 'moses.{}.ini'.format(i)))
382
+ # Workers use 1 CPU each
383
+ work_cmd.append('-threads')
384
+ work_cmd.append('1')
385
+ if not xml_found:
386
+ work_cmd.append('-xml-input')
387
+ work_cmd.append(xml_input)
388
+ if n_best_out:
389
+ work_cmd.append('-n-best-list')
390
+ work_cmd.append(os.path.join(work_dir, 'nbest.{}'.format(i)))
391
+ work_cmd.append(str(n_best_size))
392
+ if n_best_distinct:
393
+ work_cmd.append('distinct')
394
+ if hg_dir:
395
+ work_cmd.append('-output-search-graph-hypergraph')
396
+ work_cmd.append('true')
397
+ work_cmd.append(hg_ext)
398
+ work_cmd.append(os.path.join(work_dir, 'hg.{}'.format(i)))
399
+ in_file = os.path.join(work_dir, 'input.{}.xml'.format(i))
400
+ out_file = os.path.join(work_dir, 'out.{}'.format(i))
401
+ err_file = os.path.join(work_dir, 'err.{}'.format(i))
402
+ t = threading.Thread(
403
+ target=atomic_io,
404
+ args=(work_cmd, in_file, out_file, err_file, prog))
405
+ workers.append(t)
406
+ t.start()
407
+ # Wait for all to finish
408
+ for t in workers:
409
+ t.join()
410
+ prog.done()
411
+
412
+ # Gather N-best lists
413
+ if n_best_out:
414
+ with open(n_best_out, 'w') as out:
415
+ for i in range(threads):
416
+ path = os.path.join(work_dir, 'nbest.{}'.format(i))
417
+ for line in open(path, 'r'):
418
+ entry = line.partition(' ')
419
+ out.write(
420
+ '{} {}'.format(
421
+ int(entry[0]) + (i * batch_size), entry[2]))
422
+
423
+ # Gather hypergraphs
424
+ if hg_dir:
425
+ if not os.path.exists(hg_dir):
426
+ os.mkdir(hg_dir)
427
+ shutil.copy(
428
+ os.path.join(work_dir, 'hg.0', 'weights'),
429
+ os.path.join(hg_dir, 'weights'))
430
+ for i in range(threads):
431
+ for j in range(batch_size):
432
+ shutil.copy(
433
+ os.path.join(
434
+ work_dir, 'hg.{}'.format(i),
435
+ '{}.{}'.format(j, hg_ext)),
436
+ os.path.join(
437
+ hg_dir, '{}.{}'.format((i * batch_size) + j, hg_ext)))
438
+
439
+ # Gather stdout
440
+ for i in range(threads):
441
+ for line in open(os.path.join(work_dir, 'out.{}'.format(i)), 'r'):
442
+ sys.stdout.write(line)
443
+
444
+ # Cleanup
445
+ shutil.rmtree(work_dir)
446
+
447
+ if __name__ == '__main__':
448
+ try:
449
+ main(sys.argv)
450
+ except ProgramFailure as error:
451
+ sys.stderr.write("%s\n" % error)
452
+ sys.exit(1)
mosesdecoder/scripts/generic/mteval-v11b.pl ADDED
@@ -0,0 +1,761 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/usr/bin/perl -w

use strict;

#################################
# History:
#
# version 11b -- text normalization modified:
#    * take out the join digit line because it joins digits
#      when it shouldn't have
#      $norm_text =~ s/(\d)\s+(?=\d)/$1/g; #join digits
#
# version 11a -- corrected output of individual n-gram precision values
#
# version 11 -- bug fixes:
#    * make filehandle operate in binary mode to prevent Perl from operating
#      (by default in Red Hat 9) in UTF-8
#    * fix failure on joining digits
# version 10 -- updated output to include more details of n-gram scoring.
#    Defaults to generate both NIST and BLEU scores. Use -b for BLEU
#    only, use -n for NIST only
#
# version 09d -- bug fix (for BLEU scoring, ngrams were fixed at 4
#    being the max, regardless what was entered on the command line.)
#
# version 09c -- bug fix (During the calculation of ngram information,
#    each ngram was being counted only once for each segment. This has
#    been fixed so that each ngram is counted correctly in each segment.)
#
# version 09b -- text normalization modified:
#    * option flag added to preserve upper case
#    * non-ASCII characters left in place.
#
# version 09a -- text normalization modified:
#    * &quot; and &amp; converted to "" and &, respectively
#    * non-ASCII characters kept together (bug fix)
#
# version 09 -- modified to accommodate sgml tag and attribute
#    names revised to conform to default SGML conventions.
#
# version 08 -- modifies the NIST metric in accordance with the
#    findings on the 2001 Chinese-English dry run corpus. Also
#    incorporates the BLEU metric as an option and supports the
#    output of ngram detail.
#
# version 07 -- in response to the MT meeting on 28 Jan 2002 at ISI
#    Keep strings of non-ASCII characters together as one word
#    (rather than splitting them into one-character words).
#    Change length penalty so that translations that are longer than
#    the average reference translation are not penalized.
#
# version 06
#    Prevent divide-by-zero when a segment has no evaluation N-grams.
#    Correct segment index for level 3 debug output.
#
# version 05
#    improve diagnostic error messages
#
# version 04
#    tag segments
#
# version 03
#    add detailed output option (intermediate document and segment scores)
#
# version 02
#    accommodation of modified sgml tags and attributes
#
# version 01
#    same as bleu version 15, but modified to provide formal score output.
#
# original IBM version
#    Author: Kishore Papineni
#    Date: 06/10/2001
#################################

######
# Intro
my ($date, $time) = date_time_stamp();
print "MT evaluation scorer began on $date at $time\n";
print "command line: ", $0, " ", join(" ", @ARGV), "\n";
my $usage = "\n\nUsage: $0 [-h] -r <ref_file> -s src_file -t <tst_file>\n\n".
    "Description: This Perl script evaluates MT system performance.\n".
    "\n".
    "Required arguments:\n".
    " -r <ref_file> is a file containing the reference translations for\n".
    " the documents to be evaluated.\n".
    " -s <src_file> is a file containing the source documents for which\n".
    " translations are to be evaluated\n".
    " -t <tst_file> is a file containing the translations to be evaluated\n".
    "\n".
    "Optional arguments:\n".
    " -c preserves upper-case alphabetic characters\n".
    " -b generate BLEU scores only\n".
    " -n generate NIST scores only\n".
    " -d detailed output flag used in conjunction with \"-b\" or \"-n\" flags:\n".
    " 0 (default) for system-level score only\n".
    " 1 to include document-level scores\n".
    " 2 to include segment-level scores\n".
    " 3 to include ngram-level scores\n".
    " -h prints this help message to STDOUT\n".
    "\n";

# Command-line options (package globals populated by getopts).
use vars qw ($opt_r $opt_s $opt_t $opt_d $opt_h $opt_b $opt_n $opt_c $opt_x);
use Getopt::Std;
getopts ('r:s:t:d:hbncx:');
die $usage if defined($opt_h);
die "Error in command line: ref_file not defined$usage" unless defined $opt_r;
die "Error in command line: src_file not defined$usage" unless defined $opt_s;
die "Error in command line: tst_file not defined$usage" unless defined $opt_t;
my $max_Ngram = 9;                           # score n-grams of order 1..9
my $detail = defined $opt_d ? $opt_d : 0;    # verbosity level 0..3
my $preserve_case = defined $opt_c ? 1 : 0;  # -c: skip lower-casing

# Which metric(s) to compute: -b => BLEU only, -n => NIST only.
my $METHOD = "BOTH";
if (defined $opt_b) { $METHOD = "BLEU"; }
if (defined $opt_n) { $METHOD = "NIST"; }
my $method;   # metric currently being scored (set inside the loop below)

my ($ref_file) = $opt_r;
my ($src_file) = $opt_s;
my ($tst_file) = $opt_t;

######
# Global variables
my ($src_lang, $tgt_lang, @tst_sys, @ref_sys); # evaluation parameters
my (%tst_data, %ref_data); # the data -- with structure: {system}{document}[segments]
my ($src_id, $ref_id, $tst_id); # unique identifiers for ref and tst translation sets
my %eval_docs; # document information for the evaluation data set
my %ngram_info; # the information obtained from (the last word in) the ngram

######
# Get source document ID's
($src_id) = get_source_info ($src_file);

######
# Get reference translations
($ref_id) = get_MT_data (\%ref_data, "RefSet", $ref_file);

# N-gram information values must be computed from the references before
# any system is scored.
compute_ngram_info ();

######
# Get translations to evaluate
($tst_id) = get_MT_data (\%tst_data, "TstSet", $tst_file);

######
# Check data for completeness and correctness
check_MT_data ();

######
# Score accumulators, keyed {ngram_order}{system}{cum|ind}.
my %NISTmt = ();
my %BLEUmt = ();

######
# Evaluate
print " Evaluation of $src_lang-to-$tgt_lang translation using:\n";
my $cum_seg = 0;
foreach my $doc (sort keys %eval_docs) {
    $cum_seg += @{$eval_docs{$doc}{SEGS}};
}
print " src set \"$src_id\" (", scalar keys %eval_docs, " docs, $cum_seg segs)\n";
print " ref set \"$ref_id\" (", scalar keys %ref_data, " refs)\n";
print " tst set \"$tst_id\" (", scalar keys %tst_data, " systems)\n\n";

foreach my $sys (sort @tst_sys) {
    # Pre-create the nested hashrefs; score_system receives the hash
    # flattened but writes through these shared inner references.
    for (my $n=1; $n<=$max_Ngram; $n++) {
        $NISTmt{$n}{$sys}{cum} = 0;
        $NISTmt{$n}{$sys}{ind} = 0;
        $BLEUmt{$n}{$sys}{cum} = 0;
        $BLEUmt{$n}{$sys}{ind} = 0;
    }

    if (($METHOD eq "BOTH") || ($METHOD eq "NIST")) {
        $method="NIST";
        score_system ($sys, %NISTmt);
    }
    if (($METHOD eq "BOTH") || ($METHOD eq "BLEU")) {
        $method="BLEU";
        score_system ($sys, %BLEUmt);
    }
}

######
printout_report ();

($date, $time) = date_time_stamp();
print "MT evaluation scorer ended on $date at $time\n";

exit 0;

#################################
193
# Parse the source SGML file: record the set ID, the source language
# (checked for consistency with any previously seen value), and the
# normalized segments of every document into the global %eval_docs.
# Returns the source set ID.  Dies with a diagnostic on malformed input.
sub get_source_info {

    my ($file) = @_;
    my ($name, $id, $src, $doc);
    my ($data, $tag, $span);

    #read data from file
    open (FILE, $file) or die "\nUnable to open translation data file '$file'", $usage;
    binmode FILE;   # avoid implicit UTF-8 layers (see version 11 history note)
    $data .= $_ while <FILE>;
    close (FILE);

    #get source set info
    die "\n\nFATAL INPUT ERROR: no 'src_set' tag in src_file '$file'\n\n"
        unless ($tag, $span, $data) = extract_sgml_tag_and_span ("SrcSet", $data);

    die "\n\nFATAL INPUT ERROR: no tag attribute '$name' in file '$file'\n\n"
        unless ($id) = extract_sgml_tag_attribute ($name="SetID", $tag);

    die "\n\nFATAL INPUT ERROR: no tag attribute '$name' in file '$file'\n\n"
        unless ($src) = extract_sgml_tag_attribute ($name="SrcLang", $tag);
    die "\n\nFATAL INPUT ERROR: $name ('$src') in file '$file' inconsistent\n"
        ." with $name in previous input data ('$src_lang')\n\n"
        unless (not defined $src_lang or $src eq $src_lang);
    $src_lang = $src;

    #get doc info -- ID and # of segs
    $data = $span;
    while (($tag, $span, $data) = extract_sgml_tag_and_span ("Doc", $data)) {
        die "\n\nFATAL INPUT ERROR: no tag attribute '$name' in file '$file'\n\n"
            unless ($doc) = extract_sgml_tag_attribute ($name="DocID", $tag);
        die "\n\nFATAL INPUT ERROR: duplicate '$name' in file '$file'\n\n"
            if defined $eval_docs{$doc};

        $span =~ s/[\s\n\r]+/ /g; # concatenate records
        # Comma operator: two `my` declarations on one line.
        my $jseg=0, my $seg_data = $span;
        while (($tag, $span, $seg_data) = extract_sgml_tag_and_span ("Seg", $seg_data)) {
            ($eval_docs{$doc}{SEGS}[$jseg++]) = NormalizeText ($span);
        }
        die "\n\nFATAL INPUT ERROR: no segments in document '$doc' in file '$file'\n\n"
            if $jseg == 0;
    }
    die "\n\nFATAL INPUT ERROR: no documents in file '$file'\n\n"
        unless keys %eval_docs > 0;
    return $id;
}
239
+
240
+ #################################
241
+
242
# Parse a translation SGML file (reference or test set, selected by
# $set_tag) into $docs with structure {system}{document}{SEGS}[segments].
# Checks language attributes against the source/evaluation languages and
# rejects duplicate documents.  Returns the (last seen) set ID.
sub get_MT_data {

    my ($docs, $set_tag, $file) = @_;
    my ($name, $id, $src, $tgt, $sys, $doc);
    my ($tag, $span, $data);

    #read data from file
    open (FILE, $file) or die "\nUnable to open translation data file '$file'", $usage;
    binmode FILE;   # avoid implicit UTF-8 layers (see version 11 history note)
    $data .= $_ while <FILE>;
    close (FILE);

    #get tag info
    while (($tag, $span, $data) = extract_sgml_tag_and_span ($set_tag, $data)) {
        die "\n\nFATAL INPUT ERROR: no tag attribute '$name' in file '$file'\n\n" unless
            ($id) = extract_sgml_tag_attribute ($name="SetID", $tag);

        die "\n\nFATAL INPUT ERROR: no tag attribute '$name' in file '$file'\n\n" unless
            ($src) = extract_sgml_tag_attribute ($name="SrcLang", $tag);
        die "\n\nFATAL INPUT ERROR: $name ('$src') in file '$file' inconsistent\n"
            ." with $name of source ('$src_lang')\n\n"
            unless $src eq $src_lang;

        die "\n\nFATAL INPUT ERROR: no tag attribute '$name' in file '$file'\n\n" unless
            ($tgt) = extract_sgml_tag_attribute ($name="TrgLang", $tag);
        die "\n\nFATAL INPUT ERROR: $name ('$tgt') in file '$file' inconsistent\n"
            ." with $name of the evaluation ('$tgt_lang')\n\n"
            unless (not defined $tgt_lang or $tgt eq $tgt_lang);
        $tgt_lang = $tgt;

        my $mtdata = $span;
        while (($tag, $span, $mtdata) = extract_sgml_tag_and_span ("Doc", $mtdata)) {
            # Note: `my $sys` here shadows the $sys declared at the top
            # of this sub; the inner lexical is the one used below.
            die "\n\nFATAL INPUT ERROR: no tag attribute '$name' in file '$file'\n\n" unless
                (my $sys) = extract_sgml_tag_attribute ($name="SysID", $tag);

            die "\n\nFATAL INPUT ERROR: no tag attribute '$name' in file '$file'\n\n" unless
                $doc = extract_sgml_tag_attribute ($name="DocID", $tag);

            die "\n\nFATAL INPUT ERROR: document '$doc' for system '$sys' in file '$file'\n"
                ." previously loaded from file '$docs->{$sys}{$doc}{FILE}'\n\n"
                unless (not defined $docs->{$sys}{$doc});

            $span =~ s/[\s\n\r]+/ /g; # concatenate records
            # Comma operator: two `my` declarations on one line.
            my $jseg=0, my $seg_data = $span;
            while (($tag, $span, $seg_data) = extract_sgml_tag_and_span ("Seg", $seg_data)) {
                ($docs->{$sys}{$doc}{SEGS}[$jseg++]) = NormalizeText ($span);
            }
            die "\n\nFATAL INPUT ERROR: no segments in document '$doc' in file '$file'\n\n"
                if $jseg == 0;
            $docs->{$sys}{$doc}{FILE} = $file;
        }
    }
    return $id;
}
296
+
297
+ #################################
298
+
299
# Verify that every source document appears, with a matching segment
# count, in every system output and in every reference translation.
# Also populates the globals @tst_sys and @ref_sys as a side effect.
sub check_MT_data {

    @tst_sys = sort keys %tst_data;
    @ref_sys = sort keys %ref_data;

    # Every evaluation document must be represented for every system
    # and for every reference.
    foreach my $doc (sort keys %eval_docs) {
        my $nseg_source = @{$eval_docs{$doc}{SEGS}};

        foreach my $sys (@tst_sys) {
            defined $tst_data{$sys}{$doc}
                or die "\n\nFATAL ERROR: no document '$doc' for system '$sys'\n\n";
            my $nseg = @{$tst_data{$sys}{$doc}{SEGS}};
            $nseg == $nseg_source
                or die "\n\nFATAL ERROR: translated documents must contain the same # of segments as the source, but\n"
                     ." document '$doc' for system '$sys' contains $nseg segments, while\n"
                     ." the source document contains $nseg_source segments.\n\n";
        }

        foreach my $sys (@ref_sys) {
            defined $ref_data{$sys}{$doc}
                or die "\n\nFATAL ERROR: no document '$doc' for reference '$sys'\n\n";
            my $nseg = @{$ref_data{$sys}{$doc}{SEGS}};
            $nseg == $nseg_source
                or die "\n\nFATAL ERROR: translated documents must contain the same # of segments as the source, but\n"
                     ." document '$doc' for system '$sys' contains $nseg segments, while\n"
                     ." the source document contains $nseg_source segments.\n\n";
        }
    }
}
328
+
329
+ #################################
330
+
331
# Build the global %ngram_info table from all reference translations.
# The information value of an n-gram is -log2(count(ngram)/count(prefix))
# where "prefix" is its (n-1)-gram; unigrams use the total word count as
# the denominator.  Used by the NIST metric.
sub compute_ngram_info {

    my ($ref, $doc, $seg);
    my (@wrds, $tot_wrds, %ngrams, $ngram, $mgram);
    my (%ngram_count, @tot_ngrams);

    # Tally every n-gram (orders 1..$max_Ngram) over all references.
    foreach $ref (keys %ref_data) {
        foreach $doc (keys %{$ref_data{$ref}}) {
            foreach $seg (@{$ref_data{$ref}{$doc}{SEGS}}) {
                @wrds = split /\s+/, $seg;
                $tot_wrds += @wrds;
                %ngrams = %{Words2Ngrams (@wrds)};
                foreach $ngram (keys %ngrams) {
                    $ngram_count{$ngram} += $ngrams{$ngram};
                }
            }
        }
    }

    foreach $ngram (keys %ngram_count) {
        @wrds = split / /, $ngram;
        # Comma operator: drop the last word, then join what remains
        # into the (n-1)-gram prefix ("" for unigrams, which is falsy).
        pop @wrds, $mgram = join " ", @wrds;
        $ngram_info{$ngram} = - log
            ($mgram ? $ngram_count{$ngram}/$ngram_count{$mgram}
                    : $ngram_count{$ngram}/$tot_wrds) / log 2;
        if (defined $opt_x and $opt_x eq "ngram info") {
            @wrds = split / /, $ngram;
            printf "ngram info:%9.4f%6d%6d%8d%3d %s\n", $ngram_info{$ngram}, $ngram_count{$ngram},
                $mgram ? $ngram_count{$mgram} : $tot_wrds, $tot_wrds, scalar @wrds, $ngram;
        }
    }
}
363
+
364
+ #################################
365
+
366
# Score one system over all evaluation documents, accumulating per-order
# n-gram statistics and printing per-document scores when -d >= 1.
# NOTE(review): %SCOREmt arrives flattened, so this sub works on a
# shallow copy; the per-order hashrefs were pre-created by the caller,
# however, so writes to $SCOREmt{$j}{$sys}{...} go through those shared
# references and are visible to the caller.
sub score_system {

    my ($sys, $ref, $doc, %SCOREmt);
    ($sys, %SCOREmt) = @_;
    my ($shortest_ref_length, $match_cnt, $tst_cnt, $ref_cnt, $tst_info, $ref_info);
    my ($cum_ref_length, @cum_match, @cum_tst_cnt, @cum_ref_cnt, @cum_tst_info, @cum_ref_info);

    $cum_ref_length = 0;
    for (my $j=1; $j<=$max_Ngram; $j++) {
        $cum_match[$j] = $cum_tst_cnt[$j] = $cum_ref_cnt[$j] = $cum_tst_info[$j] = $cum_ref_info[$j] = 0;
    }

    foreach $doc (sort keys %eval_docs) {
        ($shortest_ref_length, $match_cnt, $tst_cnt, $ref_cnt, $tst_info, $ref_info) = score_document ($sys, $doc);

        #output document summary score
        if (($detail >= 1 ) && ($METHOD eq "NIST")) {
            my %DOCmt = ();
            printf "$method score using 5-grams = %.4f for system \"$sys\" on document \"$doc\" (%d segments, %d words)\n",
                nist_score (scalar @ref_sys, $match_cnt, $tst_cnt, $ref_cnt, $tst_info, $ref_info, $sys, %DOCmt),
                scalar @{$tst_data{$sys}{$doc}{SEGS}}, $tst_cnt->[1];
        }
        if (($detail >= 1 ) && ($METHOD eq "BLEU")) {
            my %DOCmt = ();
            printf "$method score using 4-grams = %.4f for system \"$sys\" on document \"$doc\" (%d segments, %d words)\n",
                bleu_score($shortest_ref_length, $match_cnt, $tst_cnt, $sys, %DOCmt),
                scalar @{$tst_data{$sys}{$doc}{SEGS}}, $tst_cnt->[1];
        }

        # Fold this document's statistics into the system totals.
        $cum_ref_length += $shortest_ref_length;
        for (my $j=1; $j<=$max_Ngram; $j++) {
            $cum_match[$j] += $match_cnt->[$j];
            $cum_tst_cnt[$j] += $tst_cnt->[$j];
            $cum_ref_cnt[$j] += $ref_cnt->[$j];
            $cum_tst_info[$j] += $tst_info->[$j];
            $cum_ref_info[$j] += $ref_info->[$j];
            printf "document info: $sys $doc %d-gram %d %d %d %9.4f %9.4f\n", $j, $match_cnt->[$j],
                $tst_cnt->[$j], $ref_cnt->[$j], $tst_info->[$j], $ref_info->[$j]
                if (defined $opt_x and $opt_x eq "document info");
        }
    }

    #x #output system summary score
    #x printf "$method score = %.4f for system \"$sys\"\n",
    #x $method eq "BLEU" ? bleu_score($cum_ref_length, \@cum_match, \@cum_tst_cnt) :
    #x nist_score (scalar @ref_sys, \@cum_match, \@cum_tst_cnt, \@cum_ref_cnt, \@cum_tst_info, \@cum_ref_info, $sys, %SCOREmt);
    if ($method eq "BLEU") {
        bleu_score($cum_ref_length, \@cum_match, \@cum_tst_cnt, $sys, %SCOREmt);
    }
    if ($method eq "NIST") {
        nist_score (scalar @ref_sys, \@cum_match, \@cum_tst_cnt, \@cum_ref_cnt, \@cum_tst_info, \@cum_ref_info, $sys, %SCOREmt);
    }
}
419
+
420
+ #################################
421
+
422
# Score every segment of one document for one system, summing segment
# statistics.  Returns (cumulative shortest-reference length, then five
# arrayrefs indexed by n-gram order: matches, test counts, reference
# counts, test info, reference info).
sub score_document {

    my ($sys, $ref, $doc);
    ($sys, $doc) = @_;
    my ($shortest_ref_length, $match_cnt, $tst_cnt, $ref_cnt, $tst_info, $ref_info);
    my ($cum_ref_length, @cum_match, @cum_tst_cnt, @cum_ref_cnt, @cum_tst_info, @cum_ref_info);

    $cum_ref_length = 0;
    for (my $j=1; $j<=$max_Ngram; $j++) {
        $cum_match[$j] = $cum_tst_cnt[$j] = $cum_ref_cnt[$j] = $cum_tst_info[$j] = $cum_ref_info[$j] = 0;
    }

    #score each segment
    for (my $jseg=0; $jseg<@{$tst_data{$sys}{$doc}{SEGS}}; $jseg++) {
        my @ref_segments = ();
        foreach $ref (@ref_sys) {
            push @ref_segments, $ref_data{$ref}{$doc}{SEGS}[$jseg];
            printf "ref '$ref', seg %d: %s\n", $jseg+1, $ref_data{$ref}{$doc}{SEGS}[$jseg]
                if $detail >= 3;
        }
        printf "sys '$sys', seg %d: %s\n", $jseg+1, $tst_data{$sys}{$doc}{SEGS}[$jseg]
            if $detail >= 3;
        ($shortest_ref_length, $match_cnt, $tst_cnt, $ref_cnt, $tst_info, $ref_info) =
            score_segment ($tst_data{$sys}{$doc}{SEGS}[$jseg], @ref_segments);

        #output segment summary score
        #x printf "$method score = %.4f for system \"$sys\" on segment %d of document \"$doc\" (%d words)\n",
        #x $method eq "BLEU" ? bleu_score($shortest_ref_length, $match_cnt, $tst_cnt) :
        #x nist_score (scalar @ref_sys, $match_cnt, $tst_cnt, $ref_cnt, $tst_info, $ref_info),
        #x $jseg+1, $tst_cnt->[1]
        #x if $detail >= 2;
        if (($detail >=2) && ($METHOD eq "BLEU")) {
            my %DOCmt = ();
            printf " $method score using 4-grams = %.4f for system \"$sys\" on segment %d of document \"$doc\" (%d words)\n",
                bleu_score($shortest_ref_length, $match_cnt, $tst_cnt, $sys, %DOCmt), $jseg+1, $tst_cnt->[1];
        }
        if (($detail >=2) && ($METHOD eq "NIST")) {
            my %DOCmt = ();
            printf " $method score using 5-grams = %.4f for system \"$sys\" on segment %d of document \"$doc\" (%d words)\n",
                nist_score (scalar @ref_sys, $match_cnt, $tst_cnt, $ref_cnt, $tst_info, $ref_info, $sys, %DOCmt), $jseg+1, $tst_cnt->[1];
        }


        # Fold this segment's statistics into the document totals.
        $cum_ref_length += $shortest_ref_length;
        for (my $j=1; $j<=$max_Ngram; $j++) {
            $cum_match[$j] += $match_cnt->[$j];
            $cum_tst_cnt[$j] += $tst_cnt->[$j];
            $cum_ref_cnt[$j] += $ref_cnt->[$j];
            $cum_tst_info[$j] += $tst_info->[$j];
            $cum_ref_info[$j] += $ref_info->[$j];
        }
    }
    return ($cum_ref_length, [@cum_match], [@cum_tst_cnt], [@cum_ref_cnt], [@cum_tst_info], [@cum_ref_info]);
}
476
+
477
+ #################################
478
+
479
# Score a single test segment against its reference segments.  Returns
# (shortest reference length, then five arrayrefs indexed by n-gram
# order: clipped match counts, test n-gram counts, reference n-gram
# counts, matched info sum, reference info sum).
sub score_segment {

    my ($tst_seg, @ref_segs) = @_;
    my (@tst_wrds, %tst_ngrams, @match_count, @tst_count, @tst_info);
    my (@ref_wrds, $ref_seg, %ref_ngrams, %ref_ngrams_max, @ref_count, @ref_info);
    my ($ngram);
    my (@nwrds_ref);
    my $shortest_ref_length;

    for (my $j=1; $j<= $max_Ngram; $j++) {
        $match_count[$j] = $tst_count[$j] = $ref_count[$j] = $tst_info[$j] = $ref_info[$j] = 0;
    }

    # get the ngram counts for the test segment
    @tst_wrds = split /\s+/, $tst_seg;
    %tst_ngrams = %{Words2Ngrams (@tst_wrds)};
    for (my $j=1; $j<=$max_Ngram; $j++) { # compute ngram counts
        $tst_count[$j] = $j<=@tst_wrds ? (@tst_wrds - $j + 1) : 0;
    }

    # get the ngram counts for the reference segments
    foreach $ref_seg (@ref_segs) {
        @ref_wrds = split /\s+/, $ref_seg;
        %ref_ngrams = %{Words2Ngrams (@ref_wrds)};
        foreach $ngram (keys %ref_ngrams) { # find the maximum # of occurrences
            # @wrds in numeric context is the n-gram's order, used as
            # the array index below.
            my @wrds = split / /, $ngram;
            $ref_info[@wrds] += $ngram_info{$ngram};
            $ref_ngrams_max{$ngram} = defined $ref_ngrams_max{$ngram} ?
                max ($ref_ngrams_max{$ngram}, $ref_ngrams{$ngram}) :
                $ref_ngrams{$ngram};
        }
        for (my $j=1; $j<=$max_Ngram; $j++) { # update ngram counts
            $ref_count[$j] += $j<=@ref_wrds ? (@ref_wrds - $j + 1) : 0;
        }
        $shortest_ref_length = scalar @ref_wrds # find the shortest reference segment
            if (not defined $shortest_ref_length) or @ref_wrds < $shortest_ref_length;
    }

    # accumulate scoring stats for tst_seg ngrams that match ref_seg ngrams
    foreach $ngram (keys %tst_ngrams) {
        next unless defined $ref_ngrams_max{$ngram};
        my @wrds = split / /, $ngram;
        # Clip each match at the maximum reference occurrence count.
        $tst_info[@wrds] += $ngram_info{$ngram} * min($tst_ngrams{$ngram},$ref_ngrams_max{$ngram});
        $match_count[@wrds] += my $count = min($tst_ngrams{$ngram},$ref_ngrams_max{$ngram});
        printf "%.2f info for each of $count %d-grams = '%s'\n", $ngram_info{$ngram}, scalar @wrds, $ngram
            if $detail >= 3;
    }

    return ($shortest_ref_length, [@match_count], [@tst_count], [@ref_count], [@tst_info], [@ref_info]);
}
529
+
530
+ #################################
531
+
532
# Compute BLEU from clipped-match and test n-gram counts, writing both
# cumulative and individual n-gram precisions into %SCOREmt (whose inner
# hashrefs are shared with the caller).  Returns the cumulative 4-gram
# BLEU score.
# NOTE(review): divides by $tst_ngrams->[1]; a zero-length test segment
# would trigger a divide-by-zero here.
sub bleu_score {

    my ($shortest_ref_length, $matching_ngrams, $tst_ngrams, $sys, %SCOREmt) = @_;

    my $score = 0;
    my $iscore = 0;
    # Brevity penalty exponent (<= 0): only translations shorter than
    # the shortest reference are penalized.
    my $len_score = min (0, 1-$shortest_ref_length/$tst_ngrams->[1]);

    for (my $j=1; $j<=$max_Ngram; $j++) {
        if ($matching_ngrams->[$j] == 0) {
            $SCOREmt{$j}{$sys}{cum}=0;
        } else {
            # Cumulative N-Gram score
            $score += log ($matching_ngrams->[$j]/$tst_ngrams->[$j]);
            $SCOREmt{$j}{$sys}{cum} = exp($score/$j + $len_score);
            # Individual N-Gram score
            $iscore = log ($matching_ngrams->[$j]/$tst_ngrams->[$j]);
            $SCOREmt{$j}{$sys}{ind} = exp($iscore);
        }
    }
    return $SCOREmt{4}{$sys}{cum};
}
554
+
555
+ #################################
556
+
557
# Compute the NIST score: cumulative information gain per test n-gram,
# scaled by the NIST length penalty, for each order up to $max_Ngram.
# Writes cumulative and per-order scores into %SCOREmt (shared inner
# hashrefs) and returns the cumulative 5-gram score.
sub nist_score {

    my ($nsys, $matching_ngrams, $tst_ngrams, $ref_ngrams, $tst_info, $ref_info, $sys, %SCOREmt) = @_;

    my $score = 0;
    my $iscore = 0;


    for (my $n=1; $n<=$max_Ngram; $n++) {
        # max(...,1) guards against dividing by zero for empty counts.
        $score += $tst_info->[$n]/max($tst_ngrams->[$n],1);
        # Length ratio compares test length with the average reference
        # length ($ref_ngrams->[1]/$nsys references).
        $SCOREmt{$n}{$sys}{cum} = $score * nist_length_penalty($tst_ngrams->[1]/($ref_ngrams->[1]/$nsys));

        $iscore = $tst_info->[$n]/max($tst_ngrams->[$n],1);
        $SCOREmt{$n}{$sys}{ind} = $iscore * nist_length_penalty($tst_ngrams->[1]/($ref_ngrams->[1]/$nsys));
    }
    return $SCOREmt{5}{$sys}{cum};
}
574
+
575
+ #################################
576
+
577
# Convert a word list into a hashref mapping every n-gram (orders 1 up
# to $max_Ngram, words joined by single spaces) to its occurrence count.
sub Words2Ngrams {

    my @words = @_;
    my %tally = ();

    for (my $start = 0; $start < @words; $start++) {
        my $stop = $start + $max_Ngram - 1;
        $stop = $#words if $stop > $#words;
        my $gram;
        for (my $end = $start; $end <= $stop; $end++) {
            # Extend the current n-gram by one word and count it.
            $gram = defined $gram ? "$gram $words[$end]" : $words[$end];
            $tally{$gram}++;
        }
    }
    return {%tally};
}
590
+
591
+ #################################
592
+
593
# Normalize and tokenize one segment for scoring: strip layout tags,
# unescape a few SGML entities, lower-case (unless -c), and split
# punctuation into separate tokens.
sub NormalizeText {
    my ($norm_text) = @_;

# language-independent part:
    $norm_text =~ s/<skipped>//g; # strip "skipped" tags
    $norm_text =~ s/-\n//g; # strip end-of-line hyphenation and join lines
    $norm_text =~ s/\n/ /g; # join lines
    $norm_text =~ s/&quot;/"/g; # convert SGML entity for quote to "
    $norm_text =~ s/&amp;/&/g; # convert SGML entity for ampersand to &
    $norm_text =~ s/&lt;/</g; # convert SGML entity for less-than to <
    $norm_text =~ s/&gt;/>/g; # convert SGML entity for greater-than to >

# language-dependent part (assuming Western languages):
    $norm_text = " $norm_text ";
    # NB: the brackets are part of the tr/// sets; they map to themselves,
    # so this is effectively tr/A-Z/a-z/.
    $norm_text =~ tr/[A-Z]/[a-z]/ unless $preserve_case;
    $norm_text =~ s/([\{-\~\[-\` -\&\(-\+\:-\@\/])/ $1 /g; # tokenize punctuation
    $norm_text =~ s/([^0-9])([\.,])/$1 $2 /g; # tokenize period and comma unless preceded by a digit
    $norm_text =~ s/([\.,])([^0-9])/ $1 $2/g; # tokenize period and comma unless followed by a digit
    $norm_text =~ s/([0-9])(-)/$1 $2 /g; # tokenize dash when preceded by a digit
    $norm_text =~ s/\s+/ /g; # one space only between words
    $norm_text =~ s/^\s+//; # no leading space
    $norm_text =~ s/\s+$//; # no trailing space

    return $norm_text;
}
618
+
619
+ #################################
620
+
621
# NIST length penalty for a test/reference length ratio: 1 when the
# translation is at least as long as the reference average, 0 at ratio
# zero, and exp(-beta * ln(ratio)^2) in between.  beta is calibrated so
# that a ratio of 1/1.5 scores 0.5.
sub nist_length_penalty {

    my ($ratio) = @_;
    return 1 if $ratio >= 1;
    return 0 if $ratio <= 0;
    my $anchor_ratio = 1.5;
    my $anchor_score = 0.5;
    my $beta = -log($anchor_score) / (log($anchor_ratio) ** 2);
    my $log_ratio = log($ratio);
    return exp(-$beta * $log_ratio * $log_ratio);
}
631
+
632
+ #################################
633
+
634
# Return the current local date ("YYYY Mon D") and time ("HH:MM:SS")
# as a two-element list.
sub date_time_stamp {

    my @now = localtime();
    my @month_names = qw(Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec);

    # localtime order: sec, min, hour, mday, mon, year(-1900), ...
    my $time = sprintf "%2.2d:%2.2d:%2.2d", @now[2, 1, 0];
    my $date = sprintf "%4.4s %3.3s %s", $now[5] + 1900, $month_names[$now[4]], $now[3];
    return ($date, $time);
}
644
+
645
+ #################################
646
+
647
# Find the first <$name ...>...</$name> element in $data (case-
# insensitive, non-greedy, '.' matches newlines).  Returns the tag's
# attribute string, the element content, and the remaining text after
# the close tag -- or the empty list when no such element exists.
sub extract_sgml_tag_and_span {

    my ($name, $data) = @_;

    ($data =~ m|<$name\s*([^>]*)>(.*?)</$name\s*>(.*)|si) ? ($1, $2, $3) : ();
}
653
+
654
+ #################################
655
+
656
# Extract the double-quoted value of attribute $name from a tag's
# attribute string (case-insensitive).  Returns the value as a
# one-element list, or the empty list when the attribute is absent.
sub extract_sgml_tag_attribute {

    my ($name, $data) = @_;

    ($data =~ m|$name\s*=\s*\"([^\"]*)\"|si) ? ($1) : ();
}
662
+
663
+ #################################
664
+
665
# Numeric maximum of the argument list; undef for an empty list.
sub max {

    return unless @_;
    my $best = shift;
    foreach my $candidate (@_) {
        $best = $candidate if $candidate > $best;
    }
    return $best;
}
675
+
676
+ #################################
677
+
678
# Numeric minimum of the argument list; undef for an empty list.
sub min {

    return unless @_;
    my $best = shift;
    foreach my $candidate (@_) {
        $best = $candidate if $candidate < $best;
    }
    return $best;
}
688
+
689
+ #################################
690
+
691
# Print the final report: summary NIST/BLEU system scores followed by
# tables of individual and cumulative n-gram scores, according to the
# metric(s) selected in $METHOD.
sub printout_report
{

    if ( $METHOD eq "BOTH" ) {
        foreach my $sys (sort @tst_sys) {
            printf "NIST score = %2.4f BLEU score = %.4f for system \"$sys\"\n",$NISTmt{5}{$sys}{cum},$BLEUmt{4}{$sys}{cum};
        }
    } elsif ($METHOD eq "NIST" ) {
        foreach my $sys (sort @tst_sys) {
            printf "NIST score = %2.4f for system \"$sys\"\n",$NISTmt{5}{$sys}{cum};
        }
    } elsif ($METHOD eq "BLEU" ) {
        foreach my $sys (sort @tst_sys) {
            printf "\nBLEU score = %.4f for system \"$sys\"\n",$BLEUmt{4}{$sys}{cum};
        }
    }


    # Individual n-gram score table (one row per system and metric).
    printf "\n# ------------------------------------------------------------------------\n\n";
    printf "Individual N-gram scoring\n";
    printf " 1-gram 2-gram 3-gram 4-gram 5-gram 6-gram 7-gram 8-gram 9-gram\n";
    printf " ------ ------ ------ ------ ------ ------ ------ ------ ------\n";

    if (( $METHOD eq "BOTH" ) || ($METHOD eq "NIST")) {
        foreach my $sys (sort @tst_sys) {
            printf " NIST:";
            for (my $i=1; $i<=$max_Ngram; $i++) {
                printf " %2.4f ",$NISTmt{$i}{$sys}{ind}
            }
            printf " \"$sys\"\n";
        }
        printf "\n";
    }

    if (( $METHOD eq "BOTH" ) || ($METHOD eq "BLEU")) {
        foreach my $sys (sort @tst_sys) {
            printf " BLEU:";
            for (my $i=1; $i<=$max_Ngram; $i++) {
                printf " %2.4f ",$BLEUmt{$i}{$sys}{ind}
            }
            printf " \"$sys\"\n";
        }
    }

    # Cumulative n-gram score table.
    printf "\n# ------------------------------------------------------------------------\n";
    printf "Cumulative N-gram scoring\n";
    printf " 1-gram 2-gram 3-gram 4-gram 5-gram 6-gram 7-gram 8-gram 9-gram\n";
    printf " ------ ------ ------ ------ ------ ------ ------ ------ ------\n";

    if (( $METHOD eq "BOTH" ) || ($METHOD eq "NIST")) {
        foreach my $sys (sort @tst_sys) {
            printf " NIST:";
            for (my $i=1; $i<=$max_Ngram; $i++) {
                printf " %2.4f ",$NISTmt{$i}{$sys}{cum}
            }
            printf " \"$sys\"\n";
        }
    }
    printf "\n";


    if (( $METHOD eq "BOTH" ) || ($METHOD eq "BLEU")) {
        foreach my $sys (sort @tst_sys) {
            printf " BLEU:";
            for (my $i=1; $i<=$max_Ngram; $i++) {
                printf " %2.4f ",$BLEUmt{$i}{$sys}{cum}
            }
            printf " \"$sys\"\n";
        }
    }
}
mosesdecoder/scripts/generic/mteval-v12.pl ADDED
@@ -0,0 +1,784 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env perl
2
+
3
+ use warnings;
4
+ use strict;
5
+ use utf8;
6
+ use Encode;
7
+
8
+ binmode STDOUT, ":utf8";
9
+ binmode STDERR, ":utf8";
10
+
11
+ #################################
12
+ # History:
13
+ #
14
+ # version 12
15
+ # * Text normalization changes:
16
+ # * convert entity references (only the entities declared in the DTD)
17
+ # * now uses unicode categories
18
+ # * tokenize punctuation unless followed AND preceded by digits
19
+ # * tokenize symbols
20
+ # * UTF-8 handling:
21
+ # * files are now read using utf8 mode
22
+ # * Added the '-e' command-line option to enclose non-ASCII characters between spaces
23
+ #
24
+ # version 11b -- text normalization modified:
25
+ # * take out the join digit line because it joins digits
26
+ # when it shouldn't have
27
+ # $norm_text =~ s/(\d)\s+(?=\d)/$1/g; #join digits
28
+ #
29
+ # version 11a -- corrected output of individual n-gram precision values
30
+ #
31
+ # version 11 -- bug fixes:
32
+ # * make filehandle operate in binary mode to prevent Perl from operating
33
+ # (by default in Red Hat 9) in UTF-8
34
+ # * fix failure on joining digits
35
+ # version 10 -- updated output to include more details of n-gram scoring.
36
+ # Defaults to generate both NIST and BLEU scores. Use -b for BLEU
37
+ # only, use -n for NIST only
38
+ #
39
+ # version 09d -- bug fix (for BLEU scoring, ngrams were fixed at 4
40
+ # being the max, regardless what was entered on the command line.)
41
+ #
42
+ # version 09c -- bug fix (During the calculation of ngram information,
43
+ # each ngram was being counted only once for each segment. This has
44
+ # been fixed so that each ngram is counted correctly in each segment.)
45
+ #
46
+ # version 09b -- text normalization modified:
47
+ # * option flag added to preserve upper case
48
+ # * non-ASCII characters left in place.
49
+ #
50
+ # version 09a -- text normalization modified:
51
+ # * &quot; and &amp; converted to "" and &, respectively
52
+ # * non-ASCII characters kept together (bug fix)
53
+ #
54
+ # version 09 -- modified to accommodate sgml tag and attribute
55
+ # names revised to conform to default SGML conventions.
56
+ #
57
+ # version 08 -- modifies the NIST metric in accordance with the
58
+ # findings on the 2001 Chinese-English dry run corpus. Also
59
+ # incorporates the BLEU metric as an option and supports the
60
+ # output of ngram detail.
61
+ #
62
+ # version 07 -- in response to the MT meeting on 28 Jan 2002 at ISI
63
+ # Keep strings of non-ASCII characters together as one word
64
+ # (rather than splitting them into one-character words).
65
+ # Change length penalty so that translations that are longer than
66
+ # the average reference translation are not penalized.
67
+ #
68
+ # version 06
69
+ # Prevent divide-by-zero when a segment has no evaluation N-grams.
70
+ # Correct segment index for level 3 debug output.
71
+ #
72
+ # version 05
73
+ # improve diagnostic error messages
74
+ #
75
+ # version 04
76
+ # tag segments
77
+ #
78
+ # version 03
79
+ # add detailed output option (intermediate document and segment scores)
80
+ #
81
+ # version 02
82
+ # accommodation of modified sgml tags and attributes
83
+ #
84
+ # version 01
85
+ # same as bleu version 15, but modified to provide formal score output.
86
+ #
87
+ # original IBM version
88
+ # Author: Kishore Papineni
89
+ # Date: 06/10/2001
90
+ #################################
91
+
92
+ ######
93
+ # Intro
94
+ my ($date, $time) = date_time_stamp();
95
+ print "MT evaluation scorer began on $date at $time\n";
96
+ print "command line: ", $0, " ", join(" ", @ARGV), "\n";
97
+ my $usage = "\n\nUsage: $0 [-h] -r <ref_file> -s <src_file> -t <tst_file>\n\n".
98
+ "Description: This Perl script evaluates MT system performance.\n".
99
+ "\n".
100
+ "Required arguments:\n".
101
+ " -r <ref_file> is a file containing the reference translations for\n".
102
+ " the documents to be evaluated.\n".
103
+ " -s <src_file> is a file containing the source documents for which\n".
104
+ " translations are to be evaluated\n".
105
+ " -t <tst_file> is a file containing the translations to be evaluated\n".
106
+ "\n".
107
+ "Optional arguments:\n".
108
+ " -c preserves upper-case alphabetic characters\n".
109
+ " -b generate BLEU scores only\n".
110
+ " -n generate NIST scores only\n".
111
+ " -d detailed output flag used in conjunction with \"-b\" or \"-n\" flags:\n".
112
+ " 0 (default) for system-level score only\n".
113
+ " 1 to include document-level scores\n".
114
+ " 2 to include segment-level scores\n".
115
+ " 3 to include ngram-level scores\n".
116
+ " -e enclose non-ASCII characters between spaces\n".
117
+ " -h prints this help message to STDOUT\n".
118
+ "\n";
119
+
120
+ use vars qw ($opt_r $opt_s $opt_t $opt_d $opt_h $opt_b $opt_n $opt_c $opt_x $opt_e);
121
+ use Getopt::Std;
122
+ getopts ('r:s:t:d:hbncx:e');
123
+ die $usage if defined($opt_h);
124
+ die "Error in command line: ref_file not defined$usage" unless defined $opt_r;
125
+ die "Error in command line: src_file not defined$usage" unless defined $opt_s;
126
+ die "Error in command line: tst_file not defined$usage" unless defined $opt_t;
127
+ my $max_Ngram = 9;
128
+ my $detail = defined $opt_d ? $opt_d : 0;
129
+ my $preserve_case = defined $opt_c ? 1 : 0;
130
+ my $split_non_ASCII = defined $opt_e ? 1 : 0;
131
+
132
+ my $METHOD = "BOTH";
133
+ if (defined $opt_b) { $METHOD = "BLEU"; }
134
+ if (defined $opt_n) { $METHOD = "NIST"; }
135
+ my $method;
136
+
137
+ my ($ref_file) = $opt_r;
138
+ my ($src_file) = $opt_s;
139
+ my ($tst_file) = $opt_t;
140
+
141
+ ######
142
+ # Global variables
143
+ my ($src_lang, $tgt_lang, @tst_sys, @ref_sys); # evaluation parameters
144
+ my (%tst_data, %ref_data); # the data -- with structure: {system}{document}[segments]
145
+ my ($src_id, $ref_id, $tst_id); # unique identifiers for ref and tst translation sets
146
+ my %eval_docs; # document information for the evaluation data set
147
+ my %ngram_info; # the information obtained from (the last word in) the ngram
148
+
149
+ ######
150
+ # Get source document ID's
151
+ ($src_id) = get_source_info ($src_file);
152
+
153
+ ######
154
+ # Get reference translations
155
+ ($ref_id) = get_MT_data (\%ref_data, "RefSet", $ref_file);
156
+
157
+ compute_ngram_info ();
158
+
159
+ ######
160
+ # Get translations to evaluate
161
+ ($tst_id) = get_MT_data (\%tst_data, "TstSet", $tst_file);
162
+
163
+ ######
164
+ # Check data for completeness and correctness
165
+ check_MT_data ();
166
+
167
+ ######
168
+ #
169
+ my %NISTmt = ();
170
+ my %BLEUmt = ();
171
+
172
+ ######
173
+ # Evaluate
174
+ print " Evaluation of $src_lang-to-$tgt_lang translation using:\n";
175
+ my $cum_seg = 0;
176
+ foreach my $doc (sort keys %eval_docs) {
177
+ $cum_seg += @{$eval_docs{$doc}{SEGS}};
178
+ }
179
+ print " src set \"$src_id\" (", scalar keys %eval_docs, " docs, $cum_seg segs)\n";
180
+ print " ref set \"$ref_id\" (", scalar keys %ref_data, " refs)\n";
181
+ print " tst set \"$tst_id\" (", scalar keys %tst_data, " systems)\n\n";
182
+
183
+ foreach my $sys (sort @tst_sys) {
184
+ for (my $n=1; $n<=$max_Ngram; $n++) {
185
+ $NISTmt{$n}{$sys}{cum} = 0;
186
+ $NISTmt{$n}{$sys}{ind} = 0;
187
+ $BLEUmt{$n}{$sys}{cum} = 0;
188
+ $BLEUmt{$n}{$sys}{ind} = 0;
189
+ }
190
+
191
+ if (($METHOD eq "BOTH") || ($METHOD eq "NIST")) {
192
+ $method="NIST";
193
+ score_system ($sys, %NISTmt);
194
+ }
195
+ if (($METHOD eq "BOTH") || ($METHOD eq "BLEU")) {
196
+ $method="BLEU";
197
+ score_system ($sys, %BLEUmt);
198
+ }
199
+ }
200
+
201
+ ######
202
+ printout_report ();
203
+
204
+ ($date, $time) = date_time_stamp();
205
+ print "MT evaluation scorer ended on $date at $time\n";
206
+
207
+ exit 0;
208
+
209
+ #################################
210
+
211
+ sub get_source_info {
212
+
213
+ my ($file) = @_;
214
+ my ($name, $id, $src, $doc);
215
+ my ($data, $tag, $span);
216
+
217
+
218
+ #read data from file
219
+ open (FILE, $file) or die "\nUnable to open translation data file '$file'", $usage;
220
+ binmode FILE, ":utf8";
221
+ $data .= $_ while <FILE>;
222
+ close (FILE);
223
+
224
+ #get source set info
225
+ die "\n\nFATAL INPUT ERROR: no 'src_set' tag in src_file '$file'\n\n"
226
+ unless ($tag, $span, $data) = extract_sgml_tag_and_span ("SrcSet", $data);
227
+
228
+ die "\n\nFATAL INPUT ERROR: no tag attribute '$name' in file '$file'\n\n"
229
+ unless ($id) = extract_sgml_tag_attribute ($name="SetID", $tag);
230
+
231
+ die "\n\nFATAL INPUT ERROR: no tag attribute '$name' in file '$file'\n\n"
232
+ unless ($src) = extract_sgml_tag_attribute ($name="SrcLang", $tag);
233
+ die "\n\nFATAL INPUT ERROR: $name ('$src') in file '$file' inconsistent\n"
234
+ ." with $name in previous input data ('$src_lang')\n\n"
235
+ unless (not defined $src_lang or $src eq $src_lang);
236
+ $src_lang = $src;
237
+
238
+ #get doc info -- ID and # of segs
239
+ $data = $span;
240
+ while (($tag, $span, $data) = extract_sgml_tag_and_span ("Doc", $data)) {
241
+ die "\n\nFATAL INPUT ERROR: no tag attribute '$name' in file '$file'\n\n"
242
+ unless ($doc) = extract_sgml_tag_attribute ($name="DocID", $tag);
243
+ die "\n\nFATAL INPUT ERROR: duplicate '$name' in file '$file'\n\n"
244
+ if defined $eval_docs{$doc};
245
+ $span =~ s/[\s\n\r]+/ /g; # concatenate records
246
+ my $jseg=0, my $seg_data = $span;
247
+ while (($tag, $span, $seg_data) = extract_sgml_tag_and_span ("Seg", $seg_data)) {
248
+ ($eval_docs{$doc}{SEGS}[$jseg++]) = NormalizeText ($span);
249
+ }
250
+ die "\n\nFATAL INPUT ERROR: no segments in document '$doc' in file '$file'\n\n"
251
+ if $jseg == 0;
252
+ }
253
+ die "\n\nFATAL INPUT ERROR: no documents in file '$file'\n\n"
254
+ unless keys %eval_docs > 0;
255
+ return $id;
256
+ }
257
+
258
+ #################################
259
+
260
+ sub get_MT_data {
261
+
262
+ my ($docs, $set_tag, $file) = @_;
263
+ my ($name, $id, $src, $tgt, $sys, $doc);
264
+ my ($tag, $span, $data);
265
+
266
+ #read data from file
267
+ open (FILE, $file) or die "\nUnable to open translation data file '$file'", $usage;
268
+ binmode FILE, ":utf8";
269
+ $data .= $_ while <FILE>;
270
+ close (FILE);
271
+
272
+ #get tag info
273
+ while (($tag, $span, $data) = extract_sgml_tag_and_span ($set_tag, $data)) {
274
+ die "\n\nFATAL INPUT ERROR: no tag attribute '$name' in file '$file'\n\n" unless
275
+ ($id) = extract_sgml_tag_attribute ($name="SetID", $tag);
276
+
277
+ die "\n\nFATAL INPUT ERROR: no tag attribute '$name' in file '$file'\n\n" unless
278
+ ($src) = extract_sgml_tag_attribute ($name="SrcLang", $tag);
279
+ die "\n\nFATAL INPUT ERROR: $name ('$src') in file '$file' inconsistent\n"
280
+ ." with $name of source ('$src_lang')\n\n"
281
+ unless $src eq $src_lang;
282
+
283
+ die "\n\nFATAL INPUT ERROR: no tag attribute '$name' in file '$file'\n\n" unless
284
+ ($tgt) = extract_sgml_tag_attribute ($name="TrgLang", $tag);
285
+ die "\n\nFATAL INPUT ERROR: $name ('$tgt') in file '$file' inconsistent\n"
286
+ ." with $name of the evaluation ('$tgt_lang')\n\n"
287
+ unless (not defined $tgt_lang or $tgt eq $tgt_lang);
288
+ $tgt_lang = $tgt;
289
+
290
+ my $mtdata = $span;
291
+ while (($tag, $span, $mtdata) = extract_sgml_tag_and_span ("Doc", $mtdata)) {
292
+ die "\n\nFATAL INPUT ERROR: no tag attribute '$name' in file '$file'\n\n" unless
293
+ (my $sys) = extract_sgml_tag_attribute ($name="SysID", $tag);
294
+
295
+ die "\n\nFATAL INPUT ERROR: no tag attribute '$name' in file '$file'\n\n" unless
296
+ $doc = extract_sgml_tag_attribute ($name="DocID", $tag);
297
+
298
+ die "\n\nFATAL INPUT ERROR: document '$doc' for system '$sys' in file '$file'\n"
299
+ ." previously loaded from file '$docs->{$sys}{$doc}{FILE}'\n\n"
300
+ unless (not defined $docs->{$sys}{$doc});
301
+
302
+ $span =~ s/[\s\n\r]+/ /g; # concatenate records
303
+ my $jseg=0, my $seg_data = $span;
304
+ while (($tag, $span, $seg_data) = extract_sgml_tag_and_span ("Seg", $seg_data)) {
305
+ ($docs->{$sys}{$doc}{SEGS}[$jseg++]) = NormalizeText ($span);
306
+ }
307
+ die "\n\nFATAL INPUT ERROR: no segments in document '$doc' in file '$file'\n\n"
308
+ if $jseg == 0;
309
+ $docs->{$sys}{$doc}{FILE} = $file;
310
+ }
311
+ }
312
+ return $id;
313
+ }
314
+
315
+ #################################
316
+
317
+ sub check_MT_data {
318
+
319
+ @tst_sys = sort keys %tst_data;
320
+ @ref_sys = sort keys %ref_data;
321
+
322
+ #every evaluation document must be represented for every system and every reference
323
+ foreach my $doc (sort keys %eval_docs) {
324
+ my $nseg_source = @{$eval_docs{$doc}{SEGS}};
325
+ foreach my $sys (@tst_sys) {
326
+ die "\n\nFATAL ERROR: no document '$doc' for system '$sys'\n\n"
327
+ unless defined $tst_data{$sys}{$doc};
328
+ my $nseg = @{$tst_data{$sys}{$doc}{SEGS}};
329
+ die "\n\nFATAL ERROR: translated documents must contain the same # of segments as the source, but\n"
330
+ ." document '$doc' for system '$sys' contains $nseg segments, while\n"
331
+ ." the source document contains $nseg_source segments.\n\n"
332
+ unless $nseg == $nseg_source;
333
+ }
334
+
335
+ foreach my $sys (@ref_sys) {
336
+ die "\n\nFATAL ERROR: no document '$doc' for reference '$sys'\n\n"
337
+ unless defined $ref_data{$sys}{$doc};
338
+ my $nseg = @{$ref_data{$sys}{$doc}{SEGS}};
339
+ die "\n\nFATAL ERROR: translated documents must contain the same # of segments as the source, but\n"
340
+ ." document '$doc' for system '$sys' contains $nseg segments, while\n"
341
+ ." the source document contains $nseg_source segments.\n\n"
342
+ unless $nseg == $nseg_source;
343
+ }
344
+ }
345
+ }
346
+
347
+ #################################
348
+
349
+ sub compute_ngram_info {
350
+
351
+ my ($ref, $doc, $seg);
352
+ my (@wrds, $tot_wrds, %ngrams, $ngram, $mgram);
353
+ my (%ngram_count, @tot_ngrams);
354
+
355
+ foreach $ref (keys %ref_data) {
356
+ foreach $doc (keys %{$ref_data{$ref}}) {
357
+ foreach $seg (@{$ref_data{$ref}{$doc}{SEGS}}) {
358
+ @wrds = split /\s+/, $seg;
359
+ $tot_wrds += @wrds;
360
+ %ngrams = %{Words2Ngrams (@wrds)};
361
+ foreach $ngram (keys %ngrams) {
362
+ $ngram_count{$ngram} += $ngrams{$ngram};
363
+ }
364
+ }
365
+ }
366
+ }
367
+
368
+ foreach $ngram (keys %ngram_count) {
369
+ @wrds = split / /, $ngram;
370
+ pop @wrds, $mgram = join " ", @wrds;
371
+ $ngram_info{$ngram} = - log
372
+ ($mgram ? $ngram_count{$ngram}/$ngram_count{$mgram}
373
+ : $ngram_count{$ngram}/$tot_wrds) / log 2;
374
+ if (defined $opt_x and $opt_x eq "ngram info") {
375
+ @wrds = split / /, $ngram;
376
+ printf "ngram info:%9.4f%6d%6d%8d%3d %s\n", $ngram_info{$ngram}, $ngram_count{$ngram},
377
+ $mgram ? $ngram_count{$mgram} : $tot_wrds, $tot_wrds, scalar @wrds, $ngram;
378
+ }
379
+ }
380
+ }
381
+
382
+ #################################
383
+
384
+ sub score_system {
385
+
386
+ my ($sys, $ref, $doc, %SCOREmt);
387
+ ($sys, %SCOREmt) = @_;
388
+ my ($shortest_ref_length, $match_cnt, $tst_cnt, $ref_cnt, $tst_info, $ref_info);
389
+ my ($cum_ref_length, @cum_match, @cum_tst_cnt, @cum_ref_cnt, @cum_tst_info, @cum_ref_info);
390
+
391
+ $cum_ref_length = 0;
392
+ for (my $j=1; $j<=$max_Ngram; $j++) {
393
+ $cum_match[$j] = $cum_tst_cnt[$j] = $cum_ref_cnt[$j] = $cum_tst_info[$j] = $cum_ref_info[$j] = 0;
394
+ }
395
+
396
+ foreach $doc (sort keys %eval_docs) {
397
+ ($shortest_ref_length, $match_cnt, $tst_cnt, $ref_cnt, $tst_info, $ref_info) = score_document ($sys, $doc);
398
+
399
+ #output document summary score
400
+ if (($detail >= 1 ) && ($METHOD eq "NIST")) {
401
+ my %DOCmt = ();
402
+ printf "$method score using 5-grams = %.4f for system \"$sys\" on document \"$doc\" (%d segments, %d words)\n",
403
+ nist_score (scalar @ref_sys, $match_cnt, $tst_cnt, $ref_cnt, $tst_info, $ref_info, $sys, %DOCmt),
404
+ scalar @{$tst_data{$sys}{$doc}{SEGS}}, $tst_cnt->[1];
405
+ }
406
+ if (($detail >= 1 ) && ($METHOD eq "BLEU")) {
407
+ my %DOCmt = ();
408
+ printf "$method score using 4-grams = %.4f for system \"$sys\" on document \"$doc\" (%d segments, %d words)\n",
409
+ bleu_score($shortest_ref_length, $match_cnt, $tst_cnt, $sys, %DOCmt),
410
+ scalar @{$tst_data{$sys}{$doc}{SEGS}}, $tst_cnt->[1];
411
+ }
412
+
413
+ $cum_ref_length += $shortest_ref_length;
414
+ for (my $j=1; $j<=$max_Ngram; $j++) {
415
+ $cum_match[$j] += $match_cnt->[$j];
416
+ $cum_tst_cnt[$j] += $tst_cnt->[$j];
417
+ $cum_ref_cnt[$j] += $ref_cnt->[$j];
418
+ $cum_tst_info[$j] += $tst_info->[$j];
419
+ $cum_ref_info[$j] += $ref_info->[$j];
420
+ printf "document info: $sys $doc %d-gram %d %d %d %9.4f %9.4f\n", $j, $match_cnt->[$j],
421
+ $tst_cnt->[$j], $ref_cnt->[$j], $tst_info->[$j], $ref_info->[$j]
422
+ if (defined $opt_x and $opt_x eq "document info");
423
+ }
424
+ }
425
+
426
+ #x #output system summary score
427
+ #x printf "$method score = %.4f for system \"$sys\"\n",
428
+ #x $method eq "BLEU" ? bleu_score($cum_ref_length, \@cum_match, \@cum_tst_cnt) :
429
+ #x nist_score (scalar @ref_sys, \@cum_match, \@cum_tst_cnt, \@cum_ref_cnt, \@cum_tst_info, \@cum_ref_info, $sys, %SCOREmt);
430
+ if ($method eq "BLEU") {
431
+ bleu_score($cum_ref_length, \@cum_match, \@cum_tst_cnt, $sys, %SCOREmt);
432
+ }
433
+ if ($method eq "NIST") {
434
+ nist_score (scalar @ref_sys, \@cum_match, \@cum_tst_cnt, \@cum_ref_cnt, \@cum_tst_info, \@cum_ref_info, $sys, %SCOREmt);
435
+ }
436
+ }
437
+
438
+ #################################
439
+
440
+ sub score_document {
441
+
442
+ my ($sys, $ref, $doc);
443
+ ($sys, $doc) = @_;
444
+ my ($shortest_ref_length, $match_cnt, $tst_cnt, $ref_cnt, $tst_info, $ref_info);
445
+ my ($cum_ref_length, @cum_match, @cum_tst_cnt, @cum_ref_cnt, @cum_tst_info, @cum_ref_info);
446
+
447
+ $cum_ref_length = 0;
448
+ for (my $j=1; $j<=$max_Ngram; $j++) {
449
+ $cum_match[$j] = $cum_tst_cnt[$j] = $cum_ref_cnt[$j] = $cum_tst_info[$j] = $cum_ref_info[$j] = 0;
450
+ }
451
+
452
+ #score each segment
453
+ for (my $jseg=0; $jseg<@{$tst_data{$sys}{$doc}{SEGS}}; $jseg++) {
454
+ my @ref_segments = ();
455
+ foreach $ref (@ref_sys) {
456
+ push @ref_segments, $ref_data{$ref}{$doc}{SEGS}[$jseg];
457
+ printf "ref '$ref', seg %d: %s\n", $jseg+1, $ref_data{$ref}{$doc}{SEGS}[$jseg]
458
+ if $detail >= 3;
459
+ }
460
+ printf "sys '$sys', seg %d: %s\n", $jseg+1, $tst_data{$sys}{$doc}{SEGS}[$jseg]
461
+ if $detail >= 3;
462
+ ($shortest_ref_length, $match_cnt, $tst_cnt, $ref_cnt, $tst_info, $ref_info) =
463
+ score_segment ($tst_data{$sys}{$doc}{SEGS}[$jseg], @ref_segments);
464
+
465
+ #output segment summary score
466
+ #x printf "$method score = %.4f for system \"$sys\" on segment %d of document \"$doc\" (%d words)\n",
467
+ #x $method eq "BLEU" ? bleu_score($shortest_ref_length, $match_cnt, $tst_cnt) :
468
+ #x nist_score (scalar @ref_sys, $match_cnt, $tst_cnt, $ref_cnt, $tst_info, $ref_info),
469
+ #x $jseg+1, $tst_cnt->[1]
470
+ #x if $detail >= 2;
471
+ if (($detail >=2) && ($METHOD eq "BLEU")) {
472
+ my %DOCmt = ();
473
+ printf " $method score using 4-grams = %.4f for system \"$sys\" on segment %d of document \"$doc\" (%d words)\n",
474
+ bleu_score($shortest_ref_length, $match_cnt, $tst_cnt, $sys, %DOCmt), $jseg+1, $tst_cnt->[1];
475
+ }
476
+ if (($detail >=2) && ($METHOD eq "NIST")) {
477
+ my %DOCmt = ();
478
+ printf " $method score using 5-grams = %.4f for system \"$sys\" on segment %d of document \"$doc\" (%d words)\n",
479
+ nist_score (scalar @ref_sys, $match_cnt, $tst_cnt, $ref_cnt, $tst_info, $ref_info, $sys, %DOCmt), $jseg+1, $tst_cnt->[1];
480
+ }
481
+
482
+
483
+ $cum_ref_length += $shortest_ref_length;
484
+ for (my $j=1; $j<=$max_Ngram; $j++) {
485
+ $cum_match[$j] += $match_cnt->[$j];
486
+ $cum_tst_cnt[$j] += $tst_cnt->[$j];
487
+ $cum_ref_cnt[$j] += $ref_cnt->[$j];
488
+ $cum_tst_info[$j] += $tst_info->[$j];
489
+ $cum_ref_info[$j] += $ref_info->[$j];
490
+ }
491
+ }
492
+ return ($cum_ref_length, [@cum_match], [@cum_tst_cnt], [@cum_ref_cnt], [@cum_tst_info], [@cum_ref_info]);
493
+ }
494
+
495
+ #################################
496
+
497
+ sub score_segment {
498
+
499
+ my ($tst_seg, @ref_segs) = @_;
500
+ my (@tst_wrds, %tst_ngrams, @match_count, @tst_count, @tst_info);
501
+ my (@ref_wrds, $ref_seg, %ref_ngrams, %ref_ngrams_max, @ref_count, @ref_info);
502
+ my ($ngram);
503
+ my (@nwrds_ref);
504
+ my $shortest_ref_length;
505
+
506
+ for (my $j=1; $j<= $max_Ngram; $j++) {
507
+ $match_count[$j] = $tst_count[$j] = $ref_count[$j] = $tst_info[$j] = $ref_info[$j] = 0;
508
+ }
509
+
510
+ # get the ngram counts for the test segment
511
+ @tst_wrds = split /\s+/, $tst_seg;
512
+ %tst_ngrams = %{Words2Ngrams (@tst_wrds)};
513
+ for (my $j=1; $j<=$max_Ngram; $j++) { # compute ngram counts
514
+ $tst_count[$j] = $j<=@tst_wrds ? (@tst_wrds - $j + 1) : 0;
515
+ }
516
+
517
+ # get the ngram counts for the reference segments
518
+ foreach $ref_seg (@ref_segs) {
519
+ @ref_wrds = split /\s+/, $ref_seg;
520
+ %ref_ngrams = %{Words2Ngrams (@ref_wrds)};
521
+ foreach $ngram (keys %ref_ngrams) { # find the maximum # of occurrences
522
+ my @wrds = split / /, $ngram;
523
+ $ref_info[@wrds] += $ngram_info{$ngram};
524
+ $ref_ngrams_max{$ngram} = defined $ref_ngrams_max{$ngram} ?
525
+ max ($ref_ngrams_max{$ngram}, $ref_ngrams{$ngram}) :
526
+ $ref_ngrams{$ngram};
527
+ }
528
+ for (my $j=1; $j<=$max_Ngram; $j++) { # update ngram counts
529
+ $ref_count[$j] += $j<=@ref_wrds ? (@ref_wrds - $j + 1) : 0;
530
+ }
531
+ $shortest_ref_length = scalar @ref_wrds # find the shortest reference segment
532
+ if (not defined $shortest_ref_length) or @ref_wrds < $shortest_ref_length;
533
+ }
534
+
535
+ # accumulate scoring stats for tst_seg ngrams that match ref_seg ngrams
536
+ foreach $ngram (keys %tst_ngrams) {
537
+ next unless defined $ref_ngrams_max{$ngram};
538
+ my @wrds = split / /, $ngram;
539
+ $tst_info[@wrds] += $ngram_info{$ngram} * min($tst_ngrams{$ngram},$ref_ngrams_max{$ngram});
540
+ $match_count[@wrds] += my $count = min($tst_ngrams{$ngram},$ref_ngrams_max{$ngram});
541
+ printf "%.2f info for each of $count %d-grams = '%s'\n", $ngram_info{$ngram}, scalar @wrds, $ngram
542
+ if $detail >= 3;
543
+ }
544
+
545
+ return ($shortest_ref_length, [@match_count], [@tst_count], [@ref_count], [@tst_info], [@ref_info]);
546
+ }
547
+
548
+ #################################
549
+
550
+ sub bleu_score {
551
+
552
+ my ($shortest_ref_length, $matching_ngrams, $tst_ngrams, $sys, %SCOREmt) = @_;
553
+
554
+ my $score = 0;
555
+ my $iscore = 0;
556
+ my $len_score = min (0, 1-$shortest_ref_length/$tst_ngrams->[1]);
557
+ print "length ratio: ".($tst_ngrams->[1]/$shortest_ref_length)." ($tst_ngrams->[1]/$shortest_ref_length), penalty (log): $len_score\n";
558
+
559
+ for (my $j=1; $j<=$max_Ngram; $j++) {
560
+ if ($matching_ngrams->[$j] == 0) {
561
+ $SCOREmt{$j}{$sys}{cum}=0;
562
+ } else {
563
+ # Cumulative N-Gram score
564
+ $score += log ($matching_ngrams->[$j]/$tst_ngrams->[$j]);
565
+ $SCOREmt{$j}{$sys}{cum} = exp($score/$j + $len_score);
566
+ # Individual N-Gram score
567
+ $iscore = log ($matching_ngrams->[$j]/$tst_ngrams->[$j]);
568
+ $SCOREmt{$j}{$sys}{ind} = exp($iscore);
569
+ }
570
+ }
571
+ return $SCOREmt{4}{$sys}{cum};
572
+ }
573
+
574
+ #################################
575
+
576
+ sub nist_score {
577
+
578
+ my ($nsys, $matching_ngrams, $tst_ngrams, $ref_ngrams, $tst_info, $ref_info, $sys, %SCOREmt) = @_;
579
+
580
+ my $score = 0;
581
+ my $iscore = 0;
582
+
583
+
584
+ for (my $n=1; $n<=$max_Ngram; $n++) {
585
+ $score += $tst_info->[$n]/max($tst_ngrams->[$n],1);
586
+ $SCOREmt{$n}{$sys}{cum} = $score * nist_length_penalty($tst_ngrams->[1]/($ref_ngrams->[1]/$nsys));
587
+
588
+ $iscore = $tst_info->[$n]/max($tst_ngrams->[$n],1);
589
+ $SCOREmt{$n}{$sys}{ind} = $iscore * nist_length_penalty($tst_ngrams->[1]/($ref_ngrams->[1]/$nsys));
590
+ }
591
+ return $SCOREmt{5}{$sys}{cum};
592
+ }
593
+
594
+ #################################
595
+
596
+ sub Words2Ngrams { #convert a string of words to an Ngram count hash
597
+
598
+ my %count = ();
599
+
600
+ for (; @_; shift) {
601
+ my ($j, $ngram, $word);
602
+ for ($j=0; $j<$max_Ngram and defined($word=$_[$j]); $j++) {
603
+ $ngram .= defined $ngram ? " $word" : $word;
604
+ $count{$ngram}++;
605
+ }
606
+ }
607
+ return {%count};
608
+ }
609
+
610
+ #################################
611
+
612
+ sub NormalizeText {
613
+ my ($norm_text) = @_;
614
+
615
+ $norm_text =~ s/<skipped>//g; # strip "skipped" tags
616
+ $norm_text =~ s/\p{Hyphen}\p{Zl}//g; # strip end-of-line hyphenation and join lines
617
+ $norm_text =~ s/\p{Zl}/ /g; # join lines
618
+
619
+ # replace entities
620
+ $norm_text =~ s/&quot;/\"/g; # quote to "
621
+ $norm_text =~ s/&amp;/&/g; # ampersand to &
622
+ $norm_text =~ s/&lt;/</g; # less-than to <
623
+ $norm_text =~ s/&gt;/>/g; # greater-than to >
624
+ $norm_text =~ s/&apos;/\'/g; # apostrophe to '
625
+
626
+ $norm_text = lc( $norm_text ) unless $preserve_case; # lowercasing if needed
627
+ $norm_text =~ s/([^[:ascii:]])/ $1 /g if ( $split_non_ASCII );
628
+
629
+ # punctuation: tokenize any punctuation unless followed AND preceded by a digit
630
+ $norm_text =~ s/(\P{N})(\p{P})/$1 $2 /g;
631
+ $norm_text =~ s/(\p{P})(\P{N})/ $1 $2/g;
632
+
633
+ $norm_text =~ s/(\p{S})/ $1 /g; # tokenize symbols
634
+
635
+ $norm_text =~ s/\p{Z}+/ /g; # one space only between words
636
+ $norm_text =~ s/^\p{Z}+//; # no leading space
637
+ $norm_text =~ s/\p{Z}+$//; # no trailing space
638
+
639
+ return $norm_text;
640
+ }
641
+
642
+ #################################
643
+
644
+ sub nist_length_penalty {
645
+
646
+ my ($ratio) = @_;
647
+ return 1 if $ratio >= 1;
648
+ return 0 if $ratio <= 0;
649
+ my $ratio_x = 1.5;
650
+ my $score_x = 0.5;
651
+ my $beta = -log($score_x)/log($ratio_x)/log($ratio_x);
652
+ return exp (-$beta*log($ratio)*log($ratio));
653
+ }
654
+
655
+ #################################
656
+
657
+ sub date_time_stamp {
658
+
659
+ my ($sec, $min, $hour, $mday, $mon, $year, $wday, $yday, $isdst) = localtime();
660
+ my @months = qw(Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec);
661
+ my ($date, $time);
662
+
663
+ $time = sprintf "%2.2d:%2.2d:%2.2d", $hour, $min, $sec;
664
+ $date = sprintf "%4.4s %3.3s %s", 1900+$year, $months[$mon], $mday;
665
+ return ($date, $time);
666
+ }
667
+
668
+ #################################
669
+
670
+ sub extract_sgml_tag_and_span {
671
+
672
+ my ($name, $data) = @_;
673
+
674
+ ($data =~ m|<$name\s*([^>]*)>(.*?)</$name\s*>(.*)|si) ? ($1, $2, $3) : ();
675
+ }
676
+
677
+ #################################
678
+
679
+ sub extract_sgml_tag_attribute {
680
+
681
+ my ($name, $data) = @_;
682
+
683
+ ($data =~ m|$name\s*=\s*\"([^\"]*)\"|si) ? ($1) : ();
684
+ }
685
+
686
+ #################################
687
+
688
+ sub max {
689
+
690
+ my ($max, $next);
691
+
692
+ return unless defined ($max=pop);
693
+ while (defined ($next=pop)) {
694
+ $max = $next if $next > $max;
695
+ }
696
+ return $max;
697
+ }
698
+
699
+ #################################
700
+
701
+ sub min {
702
+
703
+ my ($min, $next);
704
+
705
+ return unless defined ($min=pop);
706
+ while (defined ($next=pop)) {
707
+ $min = $next if $next < $min;
708
+ }
709
+ return $min;
710
+ }
711
+
712
+ #################################
713
+
714
+ sub printout_report
715
+ {
716
+
717
+ if ( $METHOD eq "BOTH" ) {
718
+ foreach my $sys (sort @tst_sys) {
719
+ printf "NIST score = %2.4f BLEU score = %.4f for system \"$sys\"\n",$NISTmt{5}{$sys}{cum},$BLEUmt{4}{$sys}{cum};
720
+ }
721
+ } elsif ($METHOD eq "NIST" ) {
722
+ foreach my $sys (sort @tst_sys) {
723
+ printf "NIST score = %2.4f for system \"$sys\"\n",$NISTmt{5}{$sys}{cum};
724
+ }
725
+ } elsif ($METHOD eq "BLEU" ) {
726
+ foreach my $sys (sort @tst_sys) {
727
+ printf "\nBLEU score = %.4f for system \"$sys\"\n",$BLEUmt{4}{$sys}{cum};
728
+ }
729
+ }
730
+
731
+
732
+ printf "\n# ------------------------------------------------------------------------\n\n";
733
+ printf "Individual N-gram scoring\n";
734
+ printf " 1-gram 2-gram 3-gram 4-gram 5-gram 6-gram 7-gram 8-gram 9-gram\n";
735
+ printf " ------ ------ ------ ------ ------ ------ ------ ------ ------\n";
736
+
737
+ if (( $METHOD eq "BOTH" ) || ($METHOD eq "NIST")) {
738
+ foreach my $sys (sort @tst_sys) {
739
+ printf " NIST:";
740
+ for (my $i=1; $i<=$max_Ngram; $i++) {
741
+ printf " %2.4f ",$NISTmt{$i}{$sys}{ind}
742
+ }
743
+ printf " \"$sys\"\n";
744
+ }
745
+ printf "\n";
746
+ }
747
+
748
+ if (( $METHOD eq "BOTH" ) || ($METHOD eq "BLEU")) {
749
+ foreach my $sys (sort @tst_sys) {
750
+ printf " BLEU:";
751
+ for (my $i=1; $i<=$max_Ngram; $i++) {
752
+ printf " %2.4f ",$BLEUmt{$i}{$sys}{ind}
753
+ }
754
+ printf " \"$sys\"\n";
755
+ }
756
+ }
757
+
758
+ printf "\n# ------------------------------------------------------------------------\n";
759
+ printf "Cumulative N-gram scoring\n";
760
+ printf " 1-gram 2-gram 3-gram 4-gram 5-gram 6-gram 7-gram 8-gram 9-gram\n";
761
+ printf " ------ ------ ------ ------ ------ ------ ------ ------ ------\n";
762
+
763
+ if (( $METHOD eq "BOTH" ) || ($METHOD eq "NIST")) {
764
+ foreach my $sys (sort @tst_sys) {
765
+ printf " NIST:";
766
+ for (my $i=1; $i<=$max_Ngram; $i++) {
767
+ printf " %2.4f ",$NISTmt{$i}{$sys}{cum}
768
+ }
769
+ printf " \"$sys\"\n";
770
+ }
771
+ }
772
+ printf "\n";
773
+
774
+
775
+ if (( $METHOD eq "BOTH" ) || ($METHOD eq "BLEU")) {
776
+ foreach my $sys (sort @tst_sys) {
777
+ printf " BLEU:";
778
+ for (my $i=1; $i<=$max_Ngram; $i++) {
779
+ printf " %2.4f ",$BLEUmt{$i}{$sys}{cum}
780
+ }
781
+ printf " \"$sys\"\n";
782
+ }
783
+ }
784
+ }
mosesdecoder/scripts/generic/mteval-v13a.pl ADDED
@@ -0,0 +1,1170 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env perl
2
+
3
+ use warnings;
4
+ use strict;
5
+ use utf8;
6
+ use Encode;
7
+ use XML::Twig;
8
+
9
+ binmode STDOUT, ":utf8";
10
+ binmode STDERR, ":utf8";
11
+
12
+
13
+ #################################
14
+ # History:
15
+ #
16
+ # version 13a
17
+ # * modified the scoring functions to prevent division-by-zero errors when a system segment is empty
18
+ # * affected methods: 'bleu_score' and 'bleu_score_smoothing'
19
+ # * use \p{Line_Breaks} instead of \p{Hyphen} when stripping end-of-line hyphenation and join lines
20
+ # * because \p{Hyphen} is deprecated since 2016-06-01, see http://www.unicode.org/reports/tr14/#Hyphen
21
+ #
22
+ # version 13
23
+ # * Uses a XML parser to read data (only when extension is .xml)
24
+ # * Smoothing of the segment-level BLEU scores, done by default
25
+ # * smoothing method similar to that of bleu-1.04.pl (IBM)
26
+ # * see comments above the 'bleu_score' method for more details on how the smoothing is computed
27
+ # * added a '--no-smoothing' option to simulate old scripts behavior
28
+ # * Introduction of the 'brevity-penalty' option, taking one of two values:
29
+ # * 'closest' (default) : act as IBM BLEU (taking the closest reference translation length)
30
+ # * in case two reference translations are at the same distance, will take the shortest one
31
+ # * for more details regarding how the BP is computed, see comments of the 'brevity_penalty_closest' function
32
+ # * 'shortest' : act as previous versions of the script (taking shortest reference translation length)
33
+ # * Introduction of the 'international-tokenization' option, boolean, disabled by default
34
+ # by default (when the option is not provided), uses 11b's tokenization function
35
+ # when option specified, uses v12's tokenization function
36
+ # * Introduction of a 'Metrics MATR output' flag (option '--metricsMATR')
37
+ # when used, creates three files for both BLEU score and NIST score:
38
+ # * BLEU-seg.scr and NIST-seg.scr: contain segment-level scores
39
+ # * BLEU-doc.scr and NIST-doc.scr: contain document-level scores
40
+ # * BLEU-sys.scr and NIST-sys.scr: contain system-level scores
41
+ # * SGML parsing
42
+ # * script will halt if source, reference and test files don't share the same setid attribute value (used for metricsMATR output)
43
+ # * correct segment IDs extracted from the files (was previously using an array, and using the index as a segID for output)
44
+ # * detailed output flag (-d) can now be used when running both BLEU and NIST
45
+ #
46
+ # version 12
47
+ # * Text normalization changes:
48
+ # * convert entity references (only the entities declared in the DTD)
49
+ # * now uses unicode categories
50
+ # * tokenize punctuation unless followed AND preceded by digits
51
+ # * tokenize symbols
52
+ # * UTF-8 handling:
53
+ # * files are now read using utf8 mode
54
+ # * Added the '-e' command-line option to enclose non-ASCII characters between spaces
55
+ #
56
+ # version 11b -- text normalization modified:
57
+ # * take out the join digit line because it joins digits
58
+ # when it shouldn't have
59
+ # $norm_text =~ s/(\d)\s+(?=\d)/$1/g; #join digits
60
+ #
61
+ # version 11a -- corrected output of individual n-gram precision values
62
+ #
63
+ # version 11 -- bug fixes:
64
+ # * make filehandle operate in binary mode to prevent Perl from operating
65
+ # (by default in Red Hat 9) in UTF-8
66
+ # * fix failure on joining digits
67
+ # version 10 -- updated output to include more details of n-gram scoring.
68
+ # Defaults to generate both NIST and BLEU scores. Use -b for BLEU
69
+ # only, use -n for NIST only
70
+ #
71
+ # version 09d -- bug fix (for BLEU scoring, ngrams were fixed at 4
72
+ # being the max, regardless what was entered on the command line.)
73
+ #
74
+ # version 09c -- bug fix (During the calculation of ngram information,
75
+ # each ngram was being counted only once for each segment. This has
76
+ # been fixed so that each ngram is counted correctly in each segment.)
77
+ #
78
+ # version 09b -- text normalization modified:
79
+ # * option flag added to preserve upper case
80
+ # * non-ASCII characters left in place.
81
+ #
82
+ # version 09a -- text normalization modified:
83
+ # * &quot; and &amp; converted to "" and &, respectively
84
+ # * non-ASCII characters kept together (bug fix)
85
+ #
86
+ # version 09 -- modified to accommodate sgml tag and attribute
87
+ # names revised to conform to default SGML conventions.
88
+ #
89
+ # version 08 -- modifies the NIST metric in accordance with the
90
+ # findings on the 2001 Chinese-English dry run corpus. Also
91
+ # incorporates the BLEU metric as an option and supports the
92
+ # output of ngram detail.
93
+ #
94
+ # version 07 -- in response to the MT meeting on 28 Jan 2002 at ISI
95
+ # Keep strings of non-ASCII characters together as one word
96
+ # (rather than splitting them into one-character words).
97
+ # Change length penalty so that translations that are longer than
98
+ # the average reference translation are not penalized.
99
+ #
100
+ # version 06
101
+ # Prevent divide-by-zero when a segment has no evaluation N-grams.
102
+ # Correct segment index for level 3 debug output.
103
+ #
104
+ # version 05
105
+ # improve diagnostic error messages
106
+ #
107
+ # version 04
108
+ # tag segments
109
+ #
110
+ # version 03
111
+ # add detailed output option (intermediate document and segment scores)
112
+ #
113
+ # version 02
114
+ # accommodation of modified sgml tags and attributes
115
+ #
116
+ # version 01
117
+ # same as bleu version 15, but modified to provide formal score output.
118
+ #
119
+ # original IBM version
120
+ # Author: Kishore Papineni
121
+ # Date: 06/10/2001
122
+ #################################
123
+
124
+ ######
125
+ # Intro
126
+ my ($date, $time) = date_time_stamp();
127
+ print "MT evaluation scorer began on $date at $time\n";
128
+ print "command line: ", $0, " ", join(" ", @ARGV), "\n";
129
+ my $usage = "\n\nUsage: $0 -r <ref_file> -s <src_file> -t <tst_file>\n\n".
130
+ "Description: This Perl script evaluates MT system performance.\n".
131
+ "\n".
132
+ "Required arguments:\n".
133
+ " -r <ref_file> is a file containing the reference translations for\n".
134
+ " the documents to be evaluated.\n".
135
+ " -s <src_file> is a file containing the source documents for which\n".
136
+ " translations are to be evaluated\n".
137
+ " -t <tst_file> is a file containing the translations to be evaluated\n".
138
+ "\n".
139
+ "Optional arguments:\n".
140
+ " -h prints this help message to STDOUT\n".
141
+ " -c preserves upper-case alphabetic characters\n".
142
+ " -b generate BLEU scores only\n".
143
+ " -n generate NIST scores only\n".
144
+ " -d detailed output flag:\n".
145
+ " 0 (default) for system-level score only\n".
146
+ " 1 to include document-level scores\n".
147
+ " 2 to include segment-level scores\n".
148
+ " 3 to include ngram-level scores\n".
149
+ " -e enclose non-ASCII characters between spaces\n".
150
+ " --brevity-penalty ( closest | shortest )\n" .
151
+ " closest (default) : acts as IBM BLEU (takes the closest reference translation length)\n" .
152
+ " shortest : acts as previous versions of the script (takes the shortest reference translation length)\n" .
153
+ " --international-tokenization\n" .
154
+ " when specified, uses Unicode-based (only) tokenization rules\n" .
155
+ " when not specified (default), uses default tokenization (some language-dependent rules)\n" .
156
+ " --metricsMATR : create three files for both BLEU scores and NIST scores:\n" .
157
+ " BLEU-seg.scr and NIST-seg.scr : segment-level scores\n" .
158
+ " BLEU-doc.scr and NIST-doc.scr : document-level scores\n" .
159
+ " BLEU-sys.scr and NIST-sys.scr : system-level scores\n" .
160
+ " --no-smoothing : disable smoothing on BLEU scores\n" .
161
+ "\n";
162
+
163
+ use vars qw ($opt_r $opt_s $opt_t $opt_d $opt_h $opt_b $opt_n $opt_c $opt_x $opt_e);
164
+ use Getopt::Long;
165
+ my $ref_file = '';
166
+ my $src_file = '';
167
+ my $tst_file = '';
168
+ my $detail = 0;
169
+ my $help = '';
170
+ my $preserve_case = '';
171
+ my $split_non_ASCII = '';
172
+ my $brevity_penalty = 'closest';
173
+ my $international_tokenization;
174
+ my $metricsMATR_output = '';
175
+ my $no_smoothing = '';
176
+ our $opt_x = '';
177
+ our $opt_b = '';
178
+ our $opt_n = '';
179
+ GetOptions(
180
+ 'r=s' => \$ref_file,
181
+ 's=s' => \$src_file,
182
+ 't=s' => \$tst_file,
183
+ 'd:i' => \$detail,
184
+ 'h|help' => \$help,
185
+ 'b',
186
+ 'n',
187
+ 'c' => \$preserve_case,
188
+ 'x:s',
189
+ 'e' => \$split_non_ASCII,
190
+ 'brevity-penalty:s' => \$brevity_penalty,
191
+ 'international-tokenization' => \$international_tokenization,
192
+ 'metricsMATR-output' => \$metricsMATR_output,
193
+ 'no-smoothing' => \$no_smoothing
194
+ );
195
+ die $usage if $help;
196
+
197
+ die "Error in command line: ref_file not defined$usage" unless ( $ref_file );
198
+ die "Error in command line: src_file not defined$usage" unless ( $src_file );
199
+ die "Error in command line: tst_file not defined$usage" unless ( $tst_file );
200
+ my $BLEU_BP;
201
+ if ( !( $brevity_penalty cmp 'closest' ) )
202
+ {
203
+ $BLEU_BP = \&brevity_penalty_closest;
204
+ }
205
+ elsif ( !( $brevity_penalty cmp 'shortest' ) )
206
+ {
207
+ $BLEU_BP = \&brevity_penalty_shortest;
208
+ }
209
+ else
210
+ {
211
+ die "Incorrect value supplied for 'brevity_penalty'$usage";
212
+ }
213
+ my $TOKENIZATION = \&tokenization;
214
+ $TOKENIZATION = \&tokenization_international if ( $international_tokenization );
215
+
216
+ my $BLEU_SCORE = \&bleu_score;
217
+ $BLEU_SCORE = \&bleu_score_nosmoothing if ( $no_smoothing );
218
+
219
+ my $max_Ngram = 9;
220
+
221
+ my $METHOD = "BOTH";
222
+ if ( $opt_b ) { $METHOD = "BLEU"; }
223
+ if ( $opt_n ) { $METHOD = "NIST"; }
224
+ my $method;
225
+
226
+ ######
227
+ # Global variables
228
+ my ($src_lang, $tgt_lang, @tst_sys, @ref_sys); # evaluation parameters
229
+ my (%tst_data, %ref_data); # the data -- with structure: {system}{document}{segments}
230
+ my ($src_id, $ref_id, $tst_id); # unique identifiers for ref and tst translation sets
231
+ my %eval_docs; # document information for the evaluation data set
232
+ my %ngram_info; # the information obtained from (the last word in) the ngram
233
+
234
+ ######
235
+ # Get source document ID's
236
+ ($src_id) = get_source_info ($src_file);
237
+
238
+ ######
239
+ # Get reference translations
240
+ ($ref_id) = get_MT_data (\%ref_data, "RefSet", $ref_file);
241
+
242
+ compute_ngram_info ();
243
+
244
+ ######
245
+ # Get translations to evaluate
246
+ ($tst_id) = get_MT_data (\%tst_data, "TstSet", $tst_file);
247
+
248
+ ######
249
+ # Check data for completeness and correctness
250
+ check_MT_data ();
251
+
252
+ ######
253
+ #
254
+ my %NISTmt;
255
+ my %NISTOverall;
256
+ my %BLEUmt;
257
+ my %BLEUOverall;
258
+
259
+ ######
260
+ # Evaluate
261
+ print " Evaluation of $src_lang-to-$tgt_lang translation using:\n";
262
+ my $cum_seg = 0;
263
+ foreach my $doc (sort keys %eval_docs)
264
+ {
265
+ $cum_seg += scalar( keys( %{$eval_docs{$doc}{SEGS}} ) );
266
+ }
267
+ print " src set \"$src_id\" (", scalar keys %eval_docs, " docs, $cum_seg segs)\n";
268
+ print " ref set \"$ref_id\" (", scalar keys %ref_data, " refs)\n";
269
+ print " tst set \"$tst_id\" (", scalar keys %tst_data, " systems)\n\n";
270
+
271
+ foreach my $sys (sort @tst_sys)
272
+ {
273
+ for (my $n=1; $n<=$max_Ngram; $n++)
274
+ {
275
+ $NISTmt{$n}{$sys}{cum} = 0;
276
+ $NISTmt{$n}{$sys}{ind} = 0;
277
+ $BLEUmt{$n}{$sys}{cum} = 0;
278
+ $BLEUmt{$n}{$sys}{ind} = 0;
279
+ }
280
+ if ( ($METHOD eq "BOTH") || ($METHOD eq "NIST") )
281
+ {
282
+ $method="NIST";
283
+ score_system ($sys, \%NISTmt, \%NISTOverall);
284
+ }
285
+ if ( ($METHOD eq "BOTH") || ($METHOD eq "BLEU") )
286
+ {
287
+ $method="BLEU";
288
+ score_system ($sys, \%BLEUmt, \%BLEUOverall);
289
+ }
290
+ }
291
+
292
+ ######
293
+ printout_report ();
294
+ if ( $metricsMATR_output )
295
+ {
296
+ outputMetricsMATR( 'NIST', %NISTOverall ) if ( ( $METHOD eq 'BOTH' ) || ( $METHOD eq 'NIST' ) );
297
+ outputMetricsMATR( 'BLEU', %BLEUOverall ) if ( ( $METHOD eq 'BOTH' ) || ( $METHOD eq 'BLEU' ) );
298
+ }
299
+
300
+ ($date, $time) = date_time_stamp();
301
+ print "MT evaluation scorer ended on $date at $time\n";
302
+
303
+ exit 0;
304
+
305
+ #################################
306
+
307
+ sub get_source_info
308
+ {
309
+ my ($file) = @_;
310
+ my ($name, $id, $src, $doc, $seg);
311
+ my ($data, $tag, $span);
312
+
313
+ # Extension of the file determines the parser used:
314
+ # .xml : XML::Twig
315
+ # otherwise : simple SGML parsing functions
316
+ if ( $file =~ /\.xml$/i )
317
+ {
318
+ my $twig = XML::Twig->new();
319
+ $twig->parsefile( $file );
320
+ my $root = $twig->root;
321
+ my $currentSet = $root->first_child( 'srcset' );
322
+ die "Source XML file '$file' does not contain the 'srcset' element" if ( not $currentSet );
323
+ $id = $currentSet->{ 'att' }->{ 'setid' } or die "No 'setid' attribute value in '$file'";
324
+ $src = $currentSet->{ 'att' }->{ 'srclang' } or die "No srcset 'srclang' attribute value in '$file'";
325
+ die "Not the same srclang attribute values across sets" unless ( not defined $src_lang or $src eq $src_lang );
326
+ $src_lang = $src;
327
+ foreach my $currentDoc ( $currentSet->get_xpath( './/doc' ) )
328
+ {
329
+ my $docID = $currentDoc->{ 'att' }->{ 'docid' } or die "No document 'docid' attribute value in '$file'";
330
+ foreach my $currentSeg ( $currentDoc->get_xpath( './/seg' ) )
331
+ {
332
+ my $segID = $currentSeg->{ 'att' }->{ 'id' } or die "No segment 'id' attribute value in '$file'";
333
+ my $segData = $currentSeg->text;
334
+ ($eval_docs{$docID}{SEGS}{$segID}) = &{ $TOKENIZATION }( $segData );
335
+ }
336
+ }
337
+ }
338
+ else
339
+ {
340
+ #read data from file
341
+ open (FILE, $file) or die "\nUnable to open translation data file '$file'", $usage;
342
+ binmode FILE, ":utf8";
343
+ $data .= $_ while <FILE>;
344
+ close (FILE);
345
+
346
+ #get source set info
347
+ die "\n\nFATAL INPUT ERROR: no 'src_set' tag in src_file '$file'\n\n"
348
+ unless ($tag, $span, $data) = extract_sgml_tag_and_span ("SrcSet", $data);
349
+ die "\n\nFATAL INPUT ERROR: no tag attribute '$name' in file '$file'\n\n"
350
+ unless ($id) = extract_sgml_tag_attribute ($name="SetID", $tag);
351
+ die "\n\nFATAL INPUT ERROR: no tag attribute '$name' in file '$file'\n\n"
352
+ unless ($src) = extract_sgml_tag_attribute ($name="SrcLang", $tag);
353
+ die "\n\nFATAL INPUT ERROR: $name ('$src') in file '$file' inconsistent\n"
354
+ ." with $name in previous input data ('$src_lang')\n\n"
355
+ unless (not defined $src_lang or $src eq $src_lang);
356
+ $src_lang = $src;
357
+
358
+ #get doc info -- ID and # of segs
359
+ $data = $span;
360
+ while (($tag, $span, $data) = extract_sgml_tag_and_span ("Doc", $data))
361
+ {
362
+ die "\n\nFATAL INPUT ERROR: no tag attribute '$name' in file '$file'\n\n"
363
+ unless ($doc) = extract_sgml_tag_attribute ($name="DocID", $tag);
364
+ die "\n\nFATAL INPUT ERROR: duplicate '$name' in file '$file'\n\n"
365
+ if defined $eval_docs{$doc};
366
+ $span =~ s/[\s\n\r]+/ /g; # concatenate records
367
+ my $nseg=0, my $seg_data = $span;
368
+ while (($tag, $span, $seg_data) = extract_sgml_tag_and_span ("Seg", $seg_data))
369
+ {
370
+ die "\n\nFATAL INPUT ERROR: no attribute '$name' in file '$file'\n\n"
371
+ unless ($seg) = extract_sgml_tag_attribute( $name='id', $tag );
372
+ ($eval_docs{$doc}{SEGS}{$seg}) = &{ $TOKENIZATION }( $span );
373
+ $nseg++;
374
+ }
375
+ die "\n\nFATAL INPUT ERROR: no segments in document '$doc' in file '$file'\n\n"
376
+ if $nseg == 0;
377
+ }
378
+ die "\n\nFATAL INPUT ERROR: no documents in file '$file'\n\n"
379
+ unless keys %eval_docs > 0;
380
+ }
381
+ return $id;
382
+ }
383
+
384
+ #################################
385
+
386
+ sub get_MT_data
387
+ {
388
+ my ($docs, $set_tag, $file) = @_;
389
+ my ($name, $id, $src, $tgt, $sys, $doc, $seg);
390
+ my ($tag, $span, $data);
391
+
392
+ # Extension of the file determines the parser used:
393
+ # .xml : XML::Twig
394
+ # otherwise : simple SGML parsing functions
395
+ if ( $file =~ /\.xml$/i )
396
+ {
397
+ my $twig = XML::Twig->new();
398
+ $twig->parsefile( $file );
399
+ my $root = $twig->root;
400
+ foreach my $currentSet ( $root->get_xpath( 'refset' ), $root->get_xpath( 'tstset' ) )
401
+ {
402
+ $id = $currentSet->{ 'att' }->{ 'setid' } or die "No 'setid' attribute value in '$file'";
403
+ $src = $currentSet->{ 'att' }->{ 'srclang' } or die "No 'srclang' attribute value in '$file'";
404
+ $tgt = $currentSet->{ 'att' }->{ 'trglang' } or die "No 'trglang' attribute value in '$file'";
405
+ die "Not the same 'srclang' attribute value across sets" unless ( $src eq $src_lang );
406
+ die "Not the same 'trglang' attribute value across sets" unless ( ( not defined $tgt_lang ) or ( $tgt eq $tgt_lang ) );
407
+ $tgt_lang = $tgt;
408
+ my $sys;
409
+ if ( $currentSet->name eq 'tstset' )
410
+ {
411
+ $sys = $currentSet->{ 'att' }->{ 'sysid' } or die "No 'sysid' attribute value in '$file'";
412
+ }
413
+ else
414
+ {
415
+ $sys = $currentSet->{ 'att' }->{ 'refid' } or die "No 'refid' attribute value in '$file'";
416
+ }
417
+ foreach my $currentDoc ( $currentSet->get_xpath( './/doc' ) )
418
+ {
419
+ my $docID = $currentDoc->{ 'att' }->{ 'docid' } or die "No document 'docid' attribute value in '$file'";
420
+ $docs->{ $sys }{ $docID }{ FILE } = $file;
421
+ foreach my $currentSeg ( $currentDoc->get_xpath( './/seg' ) )
422
+ {
423
+ my $segID = $currentSeg->{ 'att' }->{ 'id' } or die "No segment 'id' attribute value in '$file'";
424
+ my $segData = $currentSeg->text;
425
+ ($docs->{$sys}{$docID}{SEGS}{$segID}) = &{ $TOKENIZATION }( $segData );
426
+ }
427
+ }
428
+ }
429
+ }
430
+ else
431
+ {
432
+ #read data from file
433
+ open (FILE, $file) or die "\nUnable to open translation data file '$file'", $usage;
434
+ binmode FILE, ":utf8";
435
+ $data .= $_ while <FILE>;
436
+ close (FILE);
437
+
438
+ #get tag info
439
+ while (($tag, $span, $data) = extract_sgml_tag_and_span ($set_tag, $data))
440
+ {
441
+ die "\n\nFATAL INPUT ERROR: no tag attribute '$name' in file '$file'\n\n"
442
+ unless ($id) = extract_sgml_tag_attribute ($name="SetID", $tag);
443
+ die "\n\nFATAL INPUT ERROR: no tag attribute '$name' in file '$file'\n\n"
444
+ unless ($src) = extract_sgml_tag_attribute ($name="SrcLang", $tag);
445
+ die "\n\nFATAL INPUT ERROR: $name ('$src') in file '$file' inconsistent\n"
446
+ ." with $name of source ('$src_lang')\n\n"
447
+ unless $src eq $src_lang;
448
+ die "\n\nFATAL INPUT ERROR: no tag attribute '$name' in file '$file'\n\n"
449
+ unless ($tgt) = extract_sgml_tag_attribute ($name="TrgLang", $tag);
450
+ die "\n\nFATAL INPUT ERROR: $name ('$tgt') in file '$file' inconsistent\n"
451
+ ." with $name of the evaluation ('$tgt_lang')\n\n"
452
+ unless (not defined $tgt_lang or $tgt eq $tgt_lang);
453
+ $tgt_lang = $tgt;
454
+
455
+ my $mtdata = $span;
456
+ while (($tag, $span, $mtdata) = extract_sgml_tag_and_span ("Doc", $mtdata))
457
+ {
458
+ die "\n\nFATAL INPUT ERROR: no tag attribute '$name' in file '$file'\n\n"
459
+ unless (my $sys) = extract_sgml_tag_attribute ($name="SysID", $tag);
460
+ die "\n\nFATAL INPUT ERROR: no tag attribute '$name' in file '$file'\n\n"
461
+ unless $doc = extract_sgml_tag_attribute ($name="DocID", $tag);
462
+ die "\n\nFATAL INPUT ERROR: document '$doc' for system '$sys' in file '$file'\n"
463
+ ." previously loaded from file '$docs->{$sys}{$doc}{FILE}'\n\n"
464
+ unless (not defined $docs->{$sys}{$doc});
465
+
466
+ $span =~ s/[\s\n\r]+/ /g; # concatenate records
467
+ my $nseg=0, my $seg_data = $span;
468
+ while (($tag, $span, $seg_data) = extract_sgml_tag_and_span ("Seg", $seg_data))
469
+ {
470
+ die "\n\nFATAL INPUT ERROR: no tag attribute '$name' in file '$file'\n\n"
471
+ unless $seg = extract_sgml_tag_attribute( $name="id", $tag );
472
+ ($docs->{$sys}{$doc}{SEGS}{$seg}) = &{ $TOKENIZATION }( $span );
473
+ $nseg++;
474
+ }
475
+ die "\n\nFATAL INPUT ERROR: no segments in document '$doc' in file '$file'\n\n" if $nseg == 0;
476
+ $docs->{$sys}{$doc}{FILE} = $file;
477
+ }
478
+ }
479
+ }
480
+ return $id;
481
+ }
482
+
483
+ #################################
484
+
485
+ sub check_MT_data
486
+ {
487
+ @tst_sys = sort keys %tst_data;
488
+ @ref_sys = sort keys %ref_data;
489
+
490
+ die "Not the same 'setid' attribute values across files" unless ( ( $src_id eq $tst_id ) && ( $src_id eq $ref_id ) );
491
+
492
+ #every evaluation document must be represented for every system and every reference
493
+ foreach my $doc (sort keys %eval_docs)
494
+ {
495
+ my $nseg_source = scalar( keys( %{$eval_docs{$doc}{SEGS}} ) );
496
+ foreach my $sys (@tst_sys)
497
+ {
498
+ die "\n\nFATAL ERROR: no document '$doc' for system '$sys'\n\n" unless defined $tst_data{$sys}{$doc};
499
+ my $nseg = scalar( keys( %{$tst_data{$sys}{$doc}{SEGS}} ) );
500
+ die "\n\nFATAL ERROR: translated documents must contain the same # of segments as the source, but\n"
501
+ ." document '$doc' for system '$sys' contains $nseg segments, while\n"
502
+ ." the source document contains $nseg_source segments.\n\n"
503
+ unless $nseg == $nseg_source;
504
+ }
505
+ foreach my $sys (@ref_sys)
506
+ {
507
+ die "\n\nFATAL ERROR: no document '$doc' for reference '$sys'\n\n" unless defined $ref_data{$sys}{$doc};
508
+ my $nseg = scalar( keys( %{$ref_data{$sys}{$doc}{SEGS}} ) );
509
+ die "\n\nFATAL ERROR: translated documents must contain the same # of segments as the source, but\n"
510
+ ." document '$doc' for system '$sys' contains $nseg segments, while\n"
511
+ ." the source document contains $nseg_source segments.\n\n"
512
+ unless $nseg == $nseg_source;
513
+ }
514
+ }
515
+ }
516
+
517
+ #################################
518
+
519
+ sub compute_ngram_info
520
+ {
521
+ my ($ref, $doc, $seg);
522
+ my (@wrds, $tot_wrds, %ngrams, $ngram, $mgram);
523
+ my (%ngram_count, @tot_ngrams);
524
+
525
+ foreach $ref (keys %ref_data)
526
+ {
527
+ foreach $doc (keys %{$ref_data{$ref}})
528
+ {
529
+ foreach $seg ( keys %{$ref_data{$ref}{$doc}{SEGS}})
530
+ {
531
+ @wrds = split /\s+/, $ref_data{ $ref }{ $doc }{ SEGS }{ $seg };
532
+ $tot_wrds += @wrds;
533
+ %ngrams = %{Words2Ngrams (@wrds)};
534
+ foreach $ngram (keys %ngrams)
535
+ {
536
+ $ngram_count{$ngram} += $ngrams{$ngram};
537
+ }
538
+ }
539
+ }
540
+ }
541
+
542
+ foreach $ngram (keys %ngram_count)
543
+ {
544
+ @wrds = split / /, $ngram;
545
+ pop @wrds, $mgram = join " ", @wrds;
546
+ $ngram_info{$ngram} = - log ($mgram ? $ngram_count{$ngram}/$ngram_count{$mgram} : $ngram_count{$ngram}/$tot_wrds) / log 2;
547
+ if (defined $opt_x and $opt_x eq "ngram info")
548
+ {
549
+ @wrds = split / /, $ngram;
550
+ printf "ngram info:%9.4f%6d%6d%8d%3d %s\n", $ngram_info{$ngram}, $ngram_count{$ngram},
551
+ $mgram ? $ngram_count{$mgram} : $tot_wrds, $tot_wrds, scalar @wrds, $ngram;
552
+ }
553
+ }
554
+ }
555
+
556
+ #################################
557
+
558
+ sub score_system
559
+ {
560
+ my ($sys, $ref, $doc, $SCOREmt, $overallScore);
561
+ ($sys, $SCOREmt, $overallScore) = @_;
562
+ my ($ref_length, $match_cnt, $tst_cnt, $ref_cnt, $tst_info, $ref_info);
563
+ my ($cum_ref_length, @cum_match, @cum_tst_cnt, @cum_ref_cnt, @cum_tst_info, @cum_ref_info);
564
+
565
+ $cum_ref_length = 0;
566
+ for (my $j=1; $j<=$max_Ngram; $j++)
567
+ {
568
+ $cum_match[$j] = $cum_tst_cnt[$j] = $cum_ref_cnt[$j] = $cum_tst_info[$j] = $cum_ref_info[$j] = 0;
569
+ }
570
+ foreach $doc (sort keys %eval_docs)
571
+ {
572
+ ($ref_length, $match_cnt, $tst_cnt, $ref_cnt, $tst_info, $ref_info) = score_document ($sys, $doc, $overallScore);
573
+ if ( $method eq "NIST" )
574
+ {
575
+ my %DOCmt = ();
576
+ my $docScore = nist_score( scalar( @ref_sys ), $match_cnt, $tst_cnt, $ref_cnt, $tst_info, $ref_info, $sys, \%DOCmt );
577
+ $overallScore->{ $sys }{ 'documents' }{ $doc }{ 'score' } = $docScore;
578
+ if ( $detail >= 1 )
579
+ {
580
+ printf "$method score using 5-grams = %.4f for system \"$sys\" on document \"$doc\" (%d segments, %d words)\n",
581
+ $docScore, scalar keys %{$tst_data{$sys}{$doc}{SEGS}}, $tst_cnt->[1];
582
+ }
583
+ }
584
+
585
+ if ( $method eq "BLEU" )
586
+ {
587
+ my %DOCmt = ();
588
+ my $docScore = &{$BLEU_SCORE}( $ref_length, $match_cnt, $tst_cnt, $sys, \%DOCmt );
589
+ $overallScore->{ $sys }{ 'documents' }{ $doc }{ 'score' } = $docScore;
590
+ if ( $detail >= 1 )
591
+ {
592
+ printf "$method score using 4-grams = %.4f for system \"$sys\" on document \"$doc\" (%d segments, %d words)\n",
593
+ $docScore, scalar keys %{$tst_data{$sys}{$doc}{SEGS}}, $tst_cnt->[1];
594
+ }
595
+ }
596
+
597
+ $cum_ref_length += $ref_length;
598
+ for (my $j=1; $j<=$max_Ngram; $j++)
599
+ {
600
+ $cum_match[$j] += $match_cnt->[$j];
601
+ $cum_tst_cnt[$j] += $tst_cnt->[$j];
602
+ $cum_ref_cnt[$j] += $ref_cnt->[$j];
603
+ $cum_tst_info[$j] += $tst_info->[$j];
604
+ $cum_ref_info[$j] += $ref_info->[$j];
605
+ printf "document info: $sys $doc %d-gram %d %d %d %9.4f %9.4f\n", $j, $match_cnt->[$j],
606
+ $tst_cnt->[$j], $ref_cnt->[$j], $tst_info->[$j], $ref_info->[$j]
607
+ if (defined $opt_x and $opt_x eq "document info");
608
+ }
609
+ }
610
+
611
+ if ($method eq "BLEU")
612
+ {
613
+ $overallScore->{ $sys }{ 'score' } = &{$BLEU_SCORE}($cum_ref_length, \@cum_match, \@cum_tst_cnt, $sys, $SCOREmt, 1);
614
+ }
615
+ if ($method eq "NIST")
616
+ {
617
+ $overallScore->{ $sys }{ 'score' } = nist_score (scalar @ref_sys, \@cum_match, \@cum_tst_cnt, \@cum_ref_cnt, \@cum_tst_info, \@cum_ref_info, $sys, $SCOREmt);
618
+ }
619
+ }
620
+
621
+ #################################
622
+
623
+ sub score_document
624
+ {
625
+ my ($sys, $ref, $doc, $overallScore);
626
+ ($sys, $doc, $overallScore) = @_;
627
+ my ($ref_length, $match_cnt, $tst_cnt, $ref_cnt, $tst_info, $ref_info);
628
+ my ($cum_ref_length, @cum_match, @cum_tst_cnt, @cum_ref_cnt, @cum_tst_info, @cum_ref_info);
629
+
630
+ $cum_ref_length = 0;
631
+ for (my $j=1; $j<=$max_Ngram; $j++)
632
+ {
633
+ $cum_match[$j] = $cum_tst_cnt[$j] = $cum_ref_cnt[$j] = $cum_tst_info[$j] = $cum_ref_info[$j] = 0;
634
+ }
635
+
636
+ #score each segment
637
+ foreach my $seg ( sort{ $a <=> $b } keys( %{$tst_data{$sys}{$doc}{SEGS}} ) )
638
+ {
639
+ my @ref_segments = ();
640
+ foreach $ref (@ref_sys)
641
+ {
642
+ push @ref_segments, $ref_data{$ref}{$doc}{SEGS}{$seg};
643
+ if ( $detail >= 3 )
644
+ {
645
+ printf "ref '$ref', seg $seg: %s\n", $ref_data{$ref}{$doc}{SEGS}{$seg}
646
+ }
647
+
648
+ }
649
+
650
+ printf "sys '$sys', seg $seg: %s\n", $tst_data{$sys}{$doc}{SEGS}{$seg} if ( $detail >= 3 );
651
+ ($ref_length, $match_cnt, $tst_cnt, $ref_cnt, $tst_info, $ref_info) = score_segment ($tst_data{$sys}{$doc}{SEGS}{$seg}, @ref_segments);
652
+
653
+ if ( $method eq "BLEU" )
654
+ {
655
+ my %DOCmt = ();
656
+ my $segScore = &{$BLEU_SCORE}($ref_length, $match_cnt, $tst_cnt, $sys, %DOCmt);
657
+ $overallScore->{ $sys }{ 'documents' }{ $doc }{ 'segments' }{ $seg }{ 'score' } = $segScore;
658
+ if ( $detail >= 2 )
659
+ {
660
+ printf " $method score using 4-grams = %.4f for system \"$sys\" on segment $seg of document \"$doc\" (%d words)\n", $segScore, $tst_cnt->[1]
661
+ }
662
+ }
663
+ if ( $method eq "NIST" )
664
+ {
665
+ my %DOCmt = ();
666
+ my $segScore = nist_score (scalar @ref_sys, $match_cnt, $tst_cnt, $ref_cnt, $tst_info, $ref_info, $sys, %DOCmt);
667
+ $overallScore->{ $sys }{ 'documents' }{ $doc }{ 'segments' }{ $seg }{ 'score' } = $segScore;
668
+ if ( $detail >= 2 )
669
+ {
670
+ printf " $method score using 5-grams = %.4f for system \"$sys\" on segment $seg of document \"$doc\" (%d words)\n", $segScore, $tst_cnt->[1];
671
+ }
672
+ }
673
+ $cum_ref_length += $ref_length;
674
+ for (my $j=1; $j<=$max_Ngram; $j++)
675
+ {
676
+ $cum_match[$j] += $match_cnt->[$j];
677
+ $cum_tst_cnt[$j] += $tst_cnt->[$j];
678
+ $cum_ref_cnt[$j] += $ref_cnt->[$j];
679
+ $cum_tst_info[$j] += $tst_info->[$j];
680
+ $cum_ref_info[$j] += $ref_info->[$j];
681
+ }
682
+ }
683
+ return ($cum_ref_length, [@cum_match], [@cum_tst_cnt], [@cum_ref_cnt], [@cum_tst_info], [@cum_ref_info]);
684
+ }
685
+
686
+ ###############################################################################################################################
687
+ # function returning the shortest reference length
688
+ # takes as input:
689
+ # - currentLength : the current (shortest) reference length
690
+ # - referenceSentenceLength : the current reference sentence length
691
+ # - candidateSentenceLength : the current candidate sentence length (unused)
692
+ ###############################################################################################################################
693
+ sub brevity_penalty_shortest
694
+ {
695
+ my ( $currentLength, $referenceSentenceLength, $candidateSentenceLength ) = @_;
696
+ return ( $referenceSentenceLength < $currentLength ? $referenceSentenceLength : $currentLength );
697
+ }
698
+
699
+ ###############################################################################################################################
700
+ # function returning the closest reference length (to the candidate sentence length)
701
+ # takes as input:
702
+ # - currentLength: the current (closest) reference length.
703
+ # - referenceSentenceLength : the current reference sentence length
704
+ # - candidateSentenceLength : the current candidate sentence length
705
+ # when two reference sentences are at the same distance, it will return the shortest reference sentence length
706
+ # example of 4 iterations, given:
707
+ # - one candidate sentence containing 7 tokens
708
+ # - one reference translation containing 11 tokens
709
+ # - one reference translation containing 8 tokens
710
+ # - one reference translation containing 6 tokens
711
+ # - one reference translation containing 7 tokens
712
+ # the multiple invocations will return:
713
+ # - currentLength is set to 11 (outside of this function)
714
+ # - brevity_penalty_closest( 11, 8, 7 ) returns 8, since abs( 8 - 7 ) < abs( 11 - 7 )
715
+ # - brevity_penalty_closest( 8, 6, 7 ) returns 6, since abs( 8 - 7 ) == abs( 6 - 7 ) AND 6 < 8
716
+ # - brevity_penalty_closest( 7, 6, 7 ) returns 7, since abs( 7 - 7 ) < abs( 6 - 7 )
717
+ ###############################################################################################################################
718
+ sub brevity_penalty_closest
719
+ {
720
+ my ( $currentLength, $referenceSentenceLength, $candidateSentenceLength ) = @_;
721
+ my $result = $currentLength;
722
+ if ( abs( $candidateSentenceLength - $referenceSentenceLength ) <= abs( $candidateSentenceLength - $currentLength ) )
723
+ {
724
+ if ( abs( $candidateSentenceLength - $referenceSentenceLength ) == abs( $candidateSentenceLength - $currentLength ) )
725
+ {
726
+ if ( $currentLength > $referenceSentenceLength )
727
+ {
728
+ $result = $referenceSentenceLength;
729
+ }
730
+ }
731
+ else
732
+ {
733
+ $result = $referenceSentenceLength;
734
+ }
735
+ }
736
+ return $result;
737
+ }
738
+
739
+ #################################
740
+
741
+ sub score_segment
742
+ {
743
+ my ($tst_seg, @ref_segs) = @_;
744
+ my (@tst_wrds, %tst_ngrams, @match_count, @tst_count, @tst_info);
745
+ my (@ref_wrds, $ref_seg, %ref_ngrams, %ref_ngrams_max, @ref_count, @ref_info);
746
+ my ($ngram);
747
+ my (@nwrds_ref);
748
+ my $ref_length;
749
+
750
+ for (my $j=1; $j<= $max_Ngram; $j++)
751
+ {
752
+ $match_count[$j] = $tst_count[$j] = $ref_count[$j] = $tst_info[$j] = $ref_info[$j] = 0;
753
+ }
754
+
755
+ # get the ngram counts for the test segment
756
+ @tst_wrds = split /\s+/, $tst_seg;
757
+ %tst_ngrams = %{Words2Ngrams (@tst_wrds)};
758
+ for (my $j=1; $j<=$max_Ngram; $j++)
759
+ {
760
+ # compute ngram counts
761
+ $tst_count[$j] = $j<=@tst_wrds ? (@tst_wrds - $j + 1) : 0;
762
+ }
763
+
764
+ # get the ngram counts for the reference segments
765
+ foreach $ref_seg (@ref_segs)
766
+ {
767
+ @ref_wrds = split /\s+/, $ref_seg;
768
+ %ref_ngrams = %{Words2Ngrams (@ref_wrds)};
769
+ foreach $ngram (keys %ref_ngrams)
770
+ {
771
+ # find the maximum # of occurrences
772
+ my @wrds = split / /, $ngram;
773
+ $ref_info[@wrds] += $ngram_info{$ngram};
774
+ $ref_ngrams_max{$ngram} = defined $ref_ngrams_max{$ngram} ? max ($ref_ngrams_max{$ngram}, $ref_ngrams{$ngram}) : $ref_ngrams{$ngram};
775
+ }
776
+ for (my $j=1; $j<=$max_Ngram; $j++)
777
+ {
778
+ # update ngram counts
779
+ $ref_count[$j] += $j<=@ref_wrds ? (@ref_wrds - $j + 1) : 0;
780
+ }
781
+ if ( not defined( $ref_length ) )
782
+ {
783
+ $ref_length = scalar( @ref_wrds );
784
+ }
785
+ else
786
+ {
787
+ $ref_length = &{$BLEU_BP}( $ref_length, scalar( @ref_wrds ), scalar( @tst_wrds ) );
788
+ }
789
+ }
790
+
791
+ # accumulate scoring stats for tst_seg ngrams that match ref_seg ngrams
792
+ foreach $ngram (keys %tst_ngrams)
793
+ {
794
+ next unless defined $ref_ngrams_max{$ngram};
795
+ my @wrds = split / /, $ngram;
796
+ $tst_info[@wrds] += $ngram_info{$ngram} * min($tst_ngrams{$ngram},$ref_ngrams_max{$ngram});
797
+ $match_count[@wrds] += my $count = min($tst_ngrams{$ngram},$ref_ngrams_max{$ngram});
798
+ printf "%.2f info for each of $count %d-grams = '%s'\n", $ngram_info{$ngram}, scalar @wrds, $ngram
799
+ if $detail >= 3;
800
+ }
801
+
802
+ return ($ref_length, [@match_count], [@tst_count], [@ref_count], [@tst_info], [@ref_info]);
803
+ }
804
+
805
+ #################################
806
+
807
+ sub bleu_score_nosmoothing
808
+ {
809
+ my ($ref_length, $matching_ngrams, $tst_ngrams, $sys, $SCOREmt) = @_;
810
+ my $score = 0;
811
+ my $iscore = 0;
812
+
813
+ for ( my $j = 1; $j <= $max_Ngram; ++$j )
814
+ {
815
+ if ($matching_ngrams->[ $j ] == 0)
816
+ {
817
+ $SCOREmt->{ $j }{ $sys }{ cum }=0;
818
+ }
819
+ else
820
+ {
821
+ my $len_score = min (0, 1-$ref_length/$tst_ngrams->[1]);
822
+ # Cumulative N-Gram score
823
+ $score += log( $matching_ngrams->[ $j ] / $tst_ngrams->[ $j ] );
824
+ $SCOREmt->{ $j }{ $sys }{ cum } = exp( $score / $j + $len_score );
825
+ # Individual N-Gram score
826
+ $iscore = log( $matching_ngrams->[ $j ] / $tst_ngrams->[ $j ] );
827
+ $SCOREmt->{ $j }{ $sys }{ ind } = exp( $iscore );
828
+ }
829
+ }
830
+ return $SCOREmt->{ 4 }{ $sys }{ cum };
831
+ }
832
+
833
+ ###############################################################################################################################
834
+ # Default method used to compute the BLEU score, using smoothing.
835
+ # Note that the method used can be overridden using the '--no-smoothing' command-line argument
836
+ # The smoothing is computed by taking 1 / ( 2^k ), instead of 0, for each precision score whose matching n-gram count is null
837
+ # k is 1 for the first 'n' value for which the n-gram match count is null
838
+ # For example, if the text contains:
839
+ # - one 2-gram match
840
+ # - and (consequently) two 1-gram matches
841
+ # the n-gram count for each individual precision score would be:
842
+ # - n=1 => prec_count = 2 (two unigrams)
843
+ # - n=2 => prec_count = 1 (one bigram)
844
+ # - n=3 => prec_count = 1/2 (no trigram, taking 'smoothed' value of 1 / ( 2^k ), with k=1)
845
+ # - n=4 => prec_count = 1/4 (no fourgram, taking 'smoothed' value of 1 / ( 2^k ), with k=2)
846
+ ###############################################################################################################################
847
+ sub bleu_score
848
+ {
849
+ my ($ref_length, $matching_ngrams, $tst_ngrams, $sys, $SCOREmt,$report_length) = @_;
850
+ my $score = 0;
851
+ my $iscore = 0;
852
+ my $exp_len_score = 0;
853
+ $exp_len_score = exp( min (0, 1 - $ref_length / $tst_ngrams->[ 1 ] ) ) if ( $tst_ngrams->[ 1 ] > 0 );
854
+ print "length ratio: ".($tst_ngrams->[1]/$ref_length)." ($tst_ngrams->[1]/$ref_length), penalty (log): ".log($exp_len_score)."\n" if $report_length;
855
+ my $smooth = 1;
856
+ for ( my $j = 1; $j <= $max_Ngram; ++$j )
857
+ {
858
+ if ( $tst_ngrams->[ $j ] == 0 )
859
+ {
860
+ $iscore = 0;
861
+ }
862
+ elsif ( $matching_ngrams->[ $j ] == 0 )
863
+ {
864
+ $smooth *= 2;
865
+ $iscore = log( 1 / ( $smooth * $tst_ngrams->[ $j ] ) );
866
+ }
867
+ else
868
+ {
869
+ $iscore = log( $matching_ngrams->[ $j ] / $tst_ngrams->[ $j ] );
870
+ }
871
+ $SCOREmt->{ $j }{ $sys }{ ind } = exp( $iscore );
872
+ $score += $iscore;
873
+ $SCOREmt->{ $j }{ $sys }{ cum } = exp( $score / $j ) * $exp_len_score;
874
+ }
875
+ return $SCOREmt->{ 4 }{ $sys }{ cum };
876
+ }
877
+
878
+ #################################
879
+
880
+ sub nist_score
881
+ {
882
+ my ($nsys, $matching_ngrams, $tst_ngrams, $ref_ngrams, $tst_info, $ref_info, $sys, $SCOREmt) = @_;
883
+ my $score = 0;
884
+ my $iscore = 0;
885
+
886
+ for (my $n=1; $n<=$max_Ngram; $n++)
887
+ {
888
+ $score += $tst_info->[$n]/max($tst_ngrams->[$n],1);
889
+ $SCOREmt->{$n}{$sys}{cum} = $score * nist_length_penalty($tst_ngrams->[1]/($ref_ngrams->[1]/$nsys));
890
+ $iscore = $tst_info->[$n]/max($tst_ngrams->[$n],1);
891
+ $SCOREmt->{$n}{$sys}{ind} = $iscore * nist_length_penalty($tst_ngrams->[1]/($ref_ngrams->[1]/$nsys));
892
+ }
893
+ return $SCOREmt->{5}{$sys}{cum};
894
+ }
895
+
896
+ #################################
897
+
898
+ sub Words2Ngrams
899
+ {
900
+ #convert a string of words to an Ngram count hash
901
+ my %count = ();
902
+
903
+ for (; @_; shift)
904
+ {
905
+ my ($j, $ngram, $word);
906
+ for ($j=0; $j<$max_Ngram and defined($word=$_[$j]); $j++)
907
+ {
908
+ $ngram .= defined $ngram ? " $word" : $word;
909
+ $count{$ngram}++;
910
+ }
911
+ }
912
+ return {%count};
913
+ }
914
+
915
+ #################################
916
+
917
+ sub tokenization
918
+ {
919
+ my ($norm_text) = @_;
920
+
921
+ # language-independent part:
922
+ $norm_text =~ s/<skipped>//g; # strip "skipped" tags
923
+ $norm_text =~ s/-\n//g; # strip end-of-line hyphenation and join lines
924
+ $norm_text =~ s/\n/ /g; # join lines
925
+ $norm_text =~ s/&quot;/"/g; # convert SGML tag for quote to "
926
+ $norm_text =~ s/&amp;/&/g; # convert SGML tag for ampersand to &
927
+ $norm_text =~ s/&lt;/</g; # convert SGML tag for less-than to <
928
+ $norm_text =~ s/&gt;/>/g; # convert SGML tag for greater-than to >
929
+
930
+ # language-dependent part (assuming Western languages):
931
+ $norm_text = " $norm_text ";
932
+ $norm_text =~ tr/[A-Z]/[a-z]/ unless $preserve_case;
933
+ $norm_text =~ s/([\{-\~\[-\` -\&\(-\+\:-\@\/])/ $1 /g; # tokenize punctuation
934
+ $norm_text =~ s/([^0-9])([\.,])/$1 $2 /g; # tokenize period and comma unless preceded by a digit
935
+ $norm_text =~ s/([\.,])([^0-9])/ $1 $2/g; # tokenize period and comma unless followed by a digit
936
+ $norm_text =~ s/([0-9])(-)/$1 $2 /g; # tokenize dash when preceded by a digit
937
+ $norm_text =~ s/\s+/ /g; # one space only between words
938
+ $norm_text =~ s/^\s+//; # no leading space
939
+ $norm_text =~ s/\s+$//; # no trailing space
940
+
941
+ return $norm_text;
942
+ }
943
+
944
+
945
+ sub tokenization_international
946
+ {
947
+ my ($norm_text) = @_;
948
+
949
+ $norm_text =~ s/<skipped>//g; # strip "skipped" tags
950
+ $norm_text =~ s/\p{Line_Break: Hyphen}\p{Zl}//g; # strip end-of-line hyphenation and join lines
951
+ $norm_text =~ s/\p{Zl}/ /g; # join lines
952
+
953
+ # replace entities
954
+ $norm_text =~ s/&quot;/\"/g; # quote to "
955
+ $norm_text =~ s/&amp;/&/g; # ampersand to &
956
+ $norm_text =~ s/&lt;/</g; # less-than to <
957
+ $norm_text =~ s/&gt;/>/g; # greater-than to >
958
+ $norm_text =~ s/&apos;/\'/g; # apostrophe to '
959
+
960
+ $norm_text = lc( $norm_text ) unless $preserve_case; # lowercasing if needed
961
+ $norm_text =~ s/([^[:ascii:]])/ $1 /g if ( $split_non_ASCII );
962
+
963
+ # punctuation: tokenize any punctuation unless followed AND preceded by a digit
964
+ $norm_text =~ s/(\P{N})(\p{P})/$1 $2 /g;
965
+ $norm_text =~ s/(\p{P})(\P{N})/ $1 $2/g;
966
+
967
+ $norm_text =~ s/(\p{S})/ $1 /g; # tokenize symbols
968
+
969
+ $norm_text =~ s/\p{Z}+/ /g; # one space only between words
970
+ $norm_text =~ s/^\p{Z}+//; # no leading space
971
+ $norm_text =~ s/\p{Z}+$//; # no trailing space
972
+
973
+ return $norm_text;
974
+ }
975
+
976
+ #################################
977
+
978
+ sub nist_length_penalty
979
+ {
980
+ my ($ratio) = @_;
981
+ return 1 if $ratio >= 1;
982
+ return 0 if $ratio <= 0;
983
+ my $ratio_x = 1.5;
984
+ my $score_x = 0.5;
985
+ my $beta = -log($score_x)/log($ratio_x)/log($ratio_x);
986
+ return exp (-$beta*log($ratio)*log($ratio));
987
+ }
988
+
989
+ #################################
990
+
991
+ sub date_time_stamp
992
+ {
993
+ my ($sec, $min, $hour, $mday, $mon, $year, $wday, $yday, $isdst) = localtime();
994
+ my @months = qw(Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec);
995
+ my ($date, $time);
996
+ $time = sprintf "%2.2d:%2.2d:%2.2d", $hour, $min, $sec;
997
+ $date = sprintf "%4.4s %3.3s %s", 1900+$year, $months[$mon], $mday;
998
+ return ($date, $time);
999
+ }
1000
+
1001
+ #################################
1002
+
1003
+ sub extract_sgml_tag_and_span
1004
+ {
1005
+ my ($name, $data) = @_;
1006
+ ($data =~ m|<$name\s*([^>]*)>(.*?)</$name\s*>(.*)|si) ? ($1, $2, $3) : ();
1007
+ }
1008
+
1009
+ #################################
1010
+
1011
+ sub extract_sgml_tag_attribute
1012
+ {
1013
+ my ($name, $data) = @_;
1014
+ ($data =~ m|$name\s*=\s*\"?([^\"]*)\"?|si) ? ($1) : ();
1015
+ }
1016
+
1017
+ #################################
1018
+
1019
+ sub max
1020
+ {
1021
+ my ($max, $next);
1022
+
1023
+ return unless defined ($max=pop);
1024
+ while (defined ($next=pop))
1025
+ {
1026
+ $max = $next if $next > $max;
1027
+ }
1028
+ return $max;
1029
+ }
1030
+
1031
+ #################################
1032
+
1033
+ sub min
1034
+ {
1035
+ my ($min, $next);
1036
+
1037
+ return unless defined ($min=pop);
1038
+ while (defined ($next=pop))
1039
+ {
1040
+ $min = $next if $next < $min;
1041
+ }
1042
+ return $min;
1043
+ }
1044
+
1045
+ #################################
1046
+
1047
+ sub printout_report
1048
+ {
1049
+ if ( $METHOD eq "BOTH" )
1050
+ {
1051
+ foreach my $sys (sort @tst_sys)
1052
+ {
1053
+ printf "NIST score = %2.4f BLEU score = %.4f for system \"$sys\"\n",$NISTmt{5}{$sys}{cum},$BLEUmt{4}{$sys}{cum};
1054
+ }
1055
+ }
1056
+ elsif ($METHOD eq "NIST" )
1057
+ {
1058
+ foreach my $sys (sort @tst_sys)
1059
+ {
1060
+ printf "NIST score = %2.4f for system \"$sys\"\n",$NISTmt{5}{$sys}{cum};
1061
+ }
1062
+ }
1063
+ elsif ($METHOD eq "BLEU" )
1064
+ {
1065
+ foreach my $sys (sort @tst_sys)
1066
+ {
1067
+ printf "\nBLEU score = %.4f for system \"$sys\"\n",$BLEUmt{4}{$sys}{cum};
1068
+ }
1069
+ }
1070
+ printf "\n# ------------------------------------------------------------------------\n\n";
1071
+ printf "Individual N-gram scoring\n";
1072
+ printf " 1-gram 2-gram 3-gram 4-gram 5-gram 6-gram 7-gram 8-gram 9-gram\n";
1073
+ printf " ------ ------ ------ ------ ------ ------ ------ ------ ------\n";
1074
+
1075
+ if ( ( $METHOD eq "BOTH" ) || ($METHOD eq "NIST") )
1076
+ {
1077
+ foreach my $sys (sort @tst_sys)
1078
+ {
1079
+ printf " NIST:";
1080
+ for (my $i=1; $i<=$max_Ngram; $i++)
1081
+ {
1082
+ printf " %2.4f ",$NISTmt{$i}{$sys}{ind}
1083
+ }
1084
+ printf " \"$sys\"\n";
1085
+ }
1086
+ printf "\n";
1087
+ }
1088
+
1089
+ if ( ( $METHOD eq "BOTH" ) || ($METHOD eq "BLEU") )
1090
+ {
1091
+ foreach my $sys (sort @tst_sys)
1092
+ {
1093
+ printf " BLEU:";
1094
+ for (my $i=1; $i<=$max_Ngram; $i++)
1095
+ {
1096
+ printf " %2.4f ",$BLEUmt{$i}{$sys}{ind}
1097
+ }
1098
+ printf " \"$sys\"\n";
1099
+ }
1100
+ }
1101
+
1102
+ printf "\n# ------------------------------------------------------------------------\n";
1103
+ printf "Cumulative N-gram scoring\n";
1104
+ printf " 1-gram 2-gram 3-gram 4-gram 5-gram 6-gram 7-gram 8-gram 9-gram\n";
1105
+ printf " ------ ------ ------ ------ ------ ------ ------ ------ ------\n";
1106
+
1107
+ if (( $METHOD eq "BOTH" ) || ($METHOD eq "NIST"))
1108
+ {
1109
+ foreach my $sys (sort @tst_sys)
1110
+ {
1111
+ printf " NIST:";
1112
+ for (my $i=1; $i<=$max_Ngram; $i++)
1113
+ {
1114
+ printf " %2.4f ",$NISTmt{$i}{$sys}{cum}
1115
+ }
1116
+ printf " \"$sys\"\n";
1117
+ }
1118
+ }
1119
+ printf "\n";
1120
+ if ( ( $METHOD eq "BOTH" ) || ($METHOD eq "BLEU") )
1121
+ {
1122
+ foreach my $sys (sort @tst_sys)
1123
+ {
1124
+ printf " BLEU:";
1125
+ for (my $i=1; $i<=$max_Ngram; $i++)
1126
+ {
1127
+ printf " %2.4f ",$BLEUmt{$i}{$sys}{cum}
1128
+ }
1129
+ printf " \"$sys\"\n";
1130
+ }
1131
+ }
1132
+ }
1133
+
1134
+ ###############################################################################################################################
1135
+ # Create three files, by using:
1136
+ # - $prefix : the prefix used for the output file names
1137
+ # - %overall : a hash containing seg/doc/sys-level scores:
1138
+ # - $overall{ $SYSTEM_ID }{ 'score' } => system-level score
1139
+ # - $overall{ $SYSTEM_ID }{ 'documents' }{ $DOCUMENT_ID }{ 'score' } => document-level score
1140
+ # - $overall{ $SYSTEM_ID }{ 'documents' }{ $DOCUMENT_ID }{ 'segments' }{ $SEGMENT_ID } => segment-level score
1141
+ ###############################################################################################################################
1142
+ sub outputMetricsMATR
1143
+ {
1144
+ my ( $prefix, %overall ) = @_;
1145
+ my $fileNameSys = $prefix . '-sys.scr';
1146
+ my $fileNameDoc = $prefix . '-doc.scr';
1147
+ my $fileNameSeg = $prefix . '-seg.scr';
1148
+ open FILEOUT_SYS, '>', $fileNameSys or die "Could not open file: ${fileNameSys}";
1149
+ open FILEOUT_DOC, '>', $fileNameDoc or die "Could not open file: ${fileNameDoc}";
1150
+ open FILEOUT_SEG, '>', $fileNameSeg or die "Could not open file: ${fileNameSeg}";
1151
+ foreach my $sys ( sort( keys( %overall ) ) )
1152
+ {
1153
+ my $scoreSys = $overall{ $sys }{ 'score' };
1154
+ print FILEOUT_SYS "${tst_id}\t${sys}\t${scoreSys}\n";
1155
+ foreach my $doc ( sort( keys( %{$overall{ $sys }{ 'documents' }} ) ) )
1156
+ {
1157
+ my $scoreDoc = $overall{ $sys }{ 'documents' }{ $doc }{ 'score' };
1158
+ print FILEOUT_DOC "${tst_id}\t${sys}\t${doc}\t${scoreDoc}\n";
1159
+ foreach my $seg ( sort{ $a <=> $b }( keys( %{$overall{ $sys }{ 'documents' }{ $doc }{ 'segments' }} ) ) )
1160
+ {
1161
+ my $scoreSeg = $overall{ $sys }{ 'documents' }{ $doc }{ 'segments' }{ $seg }{ 'score' };
1162
+ print FILEOUT_SEG "${tst_id}\t${sys}\t${doc}\t${seg}\t${scoreSeg}\n";
1163
+ }
1164
+ }
1165
+ }
1166
+ close FILEOUT_SEG;
1167
+ close FILEOUT_DOC;
1168
+ close FILEOUT_SYS;
1169
+ }
1170
+
mosesdecoder/scripts/generic/mteval-v14.pl ADDED
@@ -0,0 +1,1179 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env perl
2
+
3
+ use warnings;
4
+ use strict;
5
+ use utf8;
6
+ use Encode;
7
+ use XML::Twig;
8
+ use Sort::Naturally;
9
+
10
+ binmode STDOUT, ":utf8";
11
+ binmode STDERR, ":utf8";
12
+
13
+
14
+ #################################
15
+ # History:
16
+ #
17
+ # version 14
18
+ # (2016-03-29 lukas.diduch@nist.gov)
19
+ # * Fixed warning message in case seg-id is a string, by sorting in correct order using Sort::Naturally.
20
+ #
21
+ # version 13b
22
+ # * Fixed die 'bug' in case seg->id = 0
23
+ #
24
+ # version 13a
25
+ # * modified the scoring functions to prevent division-by-zero errors when a system segment is empty
26
+ # * affected methods: 'bleu_score' and 'bleu_score_smoothing'
27
+ #
28
+ # version 13
29
+ # * Uses a XML parser to read data (only when extension is .xml)
30
+ # * Smoothing of the segment-level BLEU scores, done by default
31
+ # * smoothing method similar to that of bleu-1.04.pl (IBM)
32
+ # * see comments above the 'bleu_score' method for more details on how the smoothing is computed
33
+ # * added a '--no-smoothing' option to simulate old scripts behavior
34
+ # * Introduction of the 'brevity-penalty' option, taking one of two values:
35
+ # * 'closest' (default) : act as IBM BLEU (taking the closest reference translation length)
36
+ # * in case two reference translations are at the same distance, will take the shortest one
37
+ # * for more details regarding how the BP is computed, see comments of the 'brevity_penalty_closest' function
38
+ # * 'shortest' : act as previous versions of the script (taking shortest reference translation length)
39
+ # * Introduction of the 'international-tokenization' option, boolean, disabled by default
40
+ # by default (when the option is not provided), uses 11b's tokenization function
41
+ # when option specified, uses v12's tokenization function
42
+ # * Introduction of a 'Metrics MATR output' flag (option '--metricsMATR')
43
+ # when used, creates three files for both BLEU score and NIST score:
44
+ # * BLEU-seg.scr and NIST-seg.scr: contain segment-level scores
45
+ # * BLEU-doc.scr and NIST-doc.scr: contain document-level scores
46
+ # * BLEU-sys.scr and NIST-sys.scr: contain system-level scores
47
+ # * SGML parsing
48
+ # * script will halt if source, reference and test files don't share the same setid attribute value (used for metricsMATR output)
49
+ # * correct segment IDs extracted from the files (was previously using an array, and using the index as a segID for output)
50
+ # * detailed output flag (-d) can now be used when running both BLEU and NIST
51
+ #
52
+ # version 12
53
+ # * Text normalization changes:
54
+ # * convert entity references (only the entities declared in the DTD)
55
+ # * now uses unicode categories
56
+ # * tokenize punctuation unless followed AND preceded by digits
57
+ # * tokenize symbols
58
+ # * UTF-8 handling:
59
+ # * files are now read using utf8 mode
60
+ # * Added the '-e' command-line option to enclose non-ASCII characters between spaces
61
+ #
62
+ # version 11b -- text normalization modified:
63
+ # * take out the join digit line because it joins digits
64
+ # when it shouldn't have
65
+ # $norm_text =~ s/(\d)\s+(?=\d)/$1/g; #join digits
66
+ #
67
+ # version 11a -- corrected output of individual n-gram precision values
68
+ #
69
+ # version 11 -- bug fixes:
70
+ # * make filehandle operate in binary mode to prevent Perl from operating
71
+ # (by default in Red Hat 9) in UTF-8
72
+ # * fix failure on joining digits
73
+ # version 10 -- updated output to include more details of n-gram scoring.
74
+ # Defaults to generate both NIST and BLEU scores. Use -b for BLEU
75
+ # only, use -n for NIST only
76
+ #
77
+ # version 09d -- bug fix (for BLEU scoring, ngrams were fixed at 4
78
+ # being the max, regardless what was entered on the command line.)
79
+ #
80
+ # version 09c -- bug fix (During the calculation of ngram information,
81
+ # each ngram was being counted only once for each segment. This has
82
+ # been fixed so that each ngram is counted correctly in each segment.)
83
+ #
84
+ # version 09b -- text normalization modified:
85
+ # * option flag added to preserve upper case
86
+ # * non-ASCII characters left in place.
87
+ #
88
+ # version 09a -- text normalization modified:
89
+ # * &quot; and &amp; converted to "" and &, respectively
90
+ # * non-ASCII characters kept together (bug fix)
91
+ #
92
+ # version 09 -- modified to accommodate sgml tag and attribute
93
+ # names revised to conform to default SGML conventions.
94
+ #
95
+ # version 08 -- modifies the NIST metric in accordance with the
96
+ # findings on the 2001 Chinese-English dry run corpus. Also
97
+ # incorporates the BLEU metric as an option and supports the
98
+ # output of ngram detail.
99
+ #
100
+ # version 07 -- in response to the MT meeting on 28 Jan 2002 at ISI
101
+ # Keep strings of non-ASCII characters together as one word
102
+ # (rather than splitting them into one-character words).
103
+ # Change length penalty so that translations that are longer than
104
+ # the average reference translation are not penalized.
105
+ #
106
+ # version 06
107
+ # Prevent divide-by-zero when a segment has no evaluation N-grams.
108
+ # Correct segment index for level 3 debug output.
109
+ #
110
+ # version 05
111
+ # improve diagnostic error messages
112
+ #
113
+ # version 04
114
+ # tag segments
115
+ #
116
+ # version 03
117
+ # add detailed output option (intermediate document and segment scores)
118
+ #
119
+ # version 02
120
+ # accommodation of modified sgml tags and attributes
121
+ #
122
+ # version 01
123
+ # same as bleu version 15, but modified to provide formal score output.
124
+ #
125
+ # original IBM version
126
+ # Author: Kishore Papineni
127
+ # Date: 06/10/2001
128
+ #################################
129
+
130
+ ######
131
+ # Intro
132
+ my ($date, $time) = date_time_stamp();
133
+ print "MT evaluation scorer began on $date at $time\n";
134
+ print "\ncommand line: ", $0, " ", join(" ", @ARGV), "\n";
135
+ my $usage = "\n\nUsage: $0 -r <ref_file> -s <src_file> -t <tst_file>\n\n".
136
+ "Description: This Perl script evaluates MT system performance.\n".
137
+ "\n".
138
+ "Required arguments:\n".
139
+ " -r <ref_file> is a file containing the reference translations for\n".
140
+ " the documents to be evaluated.\n".
141
+ " -s <src_file> is a file containing the source documents for which\n".
142
+ " translations are to be evaluated\n".
143
+ " -t <tst_file> is a file containing the translations to be evaluated\n".
144
+ "\n".
145
+ "Optional arguments:\n".
146
+ " -h prints this help message to STDOUT\n".
147
+ " -c preserves upper-case alphabetic characters\n".
148
+ " -b generate BLEU scores only\n".
149
+ " -n generate NIST scores only\n".
150
+ " -d detailed output flag:\n".
151
+ " 0 (default) for system-level score only\n".
152
+ " 1 to include document-level scores\n".
153
+ " 2 to include segment-level scores\n".
154
+ " 3 to include ngram-level scores\n".
155
+ " -e enclose non-ASCII characters between spaces\n".
156
+ " --brevity-penalty ( closest | shortest )\n" .
157
+ " closest (default) : acts as IBM BLEU (takes the closest reference translation length)\n" .
158
+ " shortest : acts as previous versions of the script (takes the shortest reference translation length)\n" .
159
+ " --international-tokenization\n" .
160
+ " when specified, uses Unicode-based (only) tokenization rules\n" .
161
+ " when not specified (default), uses default tokenization (some language-dependant rules)\n" .
162
+ " --metricsMATR : create three files for both BLEU scores and NIST scores:\n" .
163
+ " BLEU-seg.scr and NIST-seg.scr : segment-level scores\n" .
164
+ " BLEU-doc.scr and NIST-doc.scr : document-level scores\n" .
165
+ " BLEU-sys.scr and NIST-sys.scr : system-level scores\n" .
166
+ " --no-smoothing : disable smoothing on BLEU scores\n" .
167
+ "\n";
168
+
169
+ use vars qw ($opt_r $opt_s $opt_t $opt_d $opt_h $opt_b $opt_n $opt_c $opt_x $opt_e);
170
+ use Getopt::Long;
171
+ my $ref_file = '';
172
+ my $src_file = '';
173
+ my $tst_file = '';
174
+ my $detail = 0;
175
+ my $help = '';
176
+ my $preserve_case = '';
177
+ my $split_non_ASCII = '';
178
+ my $brevity_penalty = 'closest';
179
+ my $international_tokenization;
180
+ my $metricsMATR_output = '';
181
+ my $no_smoothing = '';
182
+ our $opt_x = '';
183
+ our $opt_b = '';
184
+ our $opt_n = '';
185
+ GetOptions(
186
+ 'r=s' => \$ref_file,
187
+ 's=s' => \$src_file,
188
+ 't=s' => \$tst_file,
189
+ 'd:i' => \$detail,
190
+ 'h|help' => \$help,
191
+ 'b',
192
+ 'n',
193
+ 'c' => \$preserve_case,
194
+ 'x:s',
195
+ 'e' => \$split_non_ASCII,
196
+ 'brevity-penalty:s' => \$brevity_penalty,
197
+ 'international-tokenization' => \$international_tokenization,
198
+ 'metricsMATR-output' => \$metricsMATR_output,
199
+ 'no-smoothing' => \$no_smoothing
200
+ );
201
+ die $usage if $help;
202
+
203
+ die "Error in command line: ref_file not defined$usage" unless ( $ref_file );
204
+ die "Error in command line: src_file not defined$usage" unless ( $src_file );
205
+ die "Error in command line: tst_file not defined$usage" unless ( $tst_file );
206
+ my $BLEU_BP;
207
+ if ( !( $brevity_penalty cmp 'closest' ) )
208
+ {
209
+ $BLEU_BP = \&brevity_penalty_closest;
210
+ }
211
+ elsif ( !( $brevity_penalty cmp 'shortest' ) )
212
+ {
213
+ $BLEU_BP = \&brevity_penalty_shortest;
214
+ }
215
+ else
216
+ {
217
+ die "Incorrect value supplied for 'brevity_penalty'$usage";
218
+ }
219
+ my $TOKENIZATION = \&tokenization;
220
+ $TOKENIZATION = \&tokenization_international if ( $international_tokenization );
221
+
222
+ my $BLEU_SCORE = \&bleu_score;
223
+ $BLEU_SCORE = \&bleu_score_nosmoothing if ( $no_smoothing );
224
+
225
+ my $max_Ngram = 9;
226
+
227
+ my $METHOD = "BOTH";
228
+ if ( $opt_b ) { $METHOD = "BLEU"; }
229
+ if ( $opt_n ) { $METHOD = "NIST"; }
230
+ my $method;
231
+
232
+ ######
233
+ # Global variables
234
+ my ($src_lang, $tgt_lang, @tst_sys, @ref_sys); # evaluation parameters
235
+ my (%tst_data, %ref_data); # the data -- with structure: {system}{document}{segments}
236
+ my ($src_id, $ref_id, $tst_id); # unique identifiers for ref and tst translation sets
237
+ my %eval_docs; # document information for the evaluation data set
238
+ my %ngram_info; # the information obtained from (the last word in) the ngram
239
+
240
+ ######
241
+ # Get source document ID's
242
+ ($src_id) = get_source_info ($src_file);
243
+
244
+ ######
245
+ # Get reference translations
246
+ ($ref_id) = get_MT_data (\%ref_data, "RefSet", $ref_file);
247
+
248
+ compute_ngram_info ();
249
+
250
+ ######
251
+ # Get translations to evaluate
252
+ ($tst_id) = get_MT_data (\%tst_data, "TstSet", $tst_file);
253
+
254
+ ######
255
+ # Check data for completeness and correctness
256
+ check_MT_data ();
257
+
258
+ ######
259
+ #
260
+ my %NISTmt;
261
+ my %NISTOverall;
262
+ my %BLEUmt;
263
+ my %BLEUOverall;
264
+
265
+ ######
266
+ # Evaluate
267
+ print "\nEvaluation of $src_lang-to-$tgt_lang translation using:\n";
268
+ my $cum_seg = 0;
269
+ foreach my $doc (sort keys %eval_docs)
270
+ {
271
+ $cum_seg += scalar( keys( %{$eval_docs{$doc}{SEGS}} ) );
272
+ }
273
+ print " src set \"$src_id\" (", scalar keys %eval_docs, " docs, $cum_seg segs)\n";
274
+ print " ref set \"$ref_id\" (", scalar keys %ref_data, " refs)\n";
275
+ print " tst set \"$tst_id\" (", scalar keys %tst_data, " systems)\n\n";
276
+
277
+ foreach my $sys (sort @tst_sys)
278
+ {
279
+ for (my $n=1; $n<=$max_Ngram; $n++)
280
+ {
281
+ $NISTmt{$n}{$sys}{cum} = 0;
282
+ $NISTmt{$n}{$sys}{ind} = 0;
283
+ $BLEUmt{$n}{$sys}{cum} = 0;
284
+ $BLEUmt{$n}{$sys}{ind} = 0;
285
+ }
286
+ if ( ($METHOD eq "BOTH") || ($METHOD eq "NIST") )
287
+ {
288
+ $method="NIST";
289
+ score_system ($sys, \%NISTmt, \%NISTOverall);
290
+ }
291
+ if ( ($METHOD eq "BOTH") || ($METHOD eq "BLEU") )
292
+ {
293
+ $method="BLEU";
294
+ score_system ($sys, \%BLEUmt, \%BLEUOverall);
295
+ }
296
+ }
297
+
298
+ ######
299
+ printout_report ();
300
+ if ( $metricsMATR_output )
301
+ {
302
+ outputMetricsMATR( 'NIST', %NISTOverall ) if ( ( $METHOD eq 'BOTH' ) || ( $METHOD eq 'NIST' ) );
303
+ outputMetricsMATR( 'BLEU', %BLEUOverall ) if ( ( $METHOD eq 'BOTH' ) || ( $METHOD eq 'BLEU' ) );
304
+ }
305
+
306
+ ($date, $time) = date_time_stamp();
307
+ print "\nMT evaluation scorer ended on $date at $time\n";
308
+
309
+ exit 0;
310
+
311
+ #################################
312
+
313
+ sub get_source_info
314
+ {
315
+ my ($file) = @_;
316
+ my ($name, $id, $src, $doc, $seg);
317
+ my ($data, $tag, $span);
318
+
319
+ # Extension of the file determines the parser used:
320
+ # .xml : XML::Twig
321
+ # otherwise : simple SGML parsing functions
322
+ if ( $file =~ /\.xml$/i )
323
+ {
324
+ my $twig = XML::Twig->new();
325
+ $twig->parsefile( $file );
326
+ my $root = $twig->root;
327
+ my $currentSet = $root->first_child( 'srcset' );
328
+ die "Source XML file '$file' does not contain the 'srcset' element" if ( not $currentSet );
329
+ $id = $currentSet->{ 'att' }->{ 'setid' } or die "No 'setid' attribute value in '$file'";
330
+ $src = $currentSet->{ 'att' }->{ 'srclang' } or die "No srcset 'srclang' attribute value in '$file'";
331
+ die "Not the same srclang attribute values across sets" unless ( not defined $src_lang or $src eq $src_lang );
332
+ $src_lang = $src;
333
+ foreach my $currentDoc ( $currentSet->get_xpath( './/doc' ) )
334
+ {
335
+ my $docID = $currentDoc->{ 'att' }->{ 'docid' } or die "No document 'docid' attribute value in '$file'";
336
+ foreach my $currentSeg ( $currentDoc->get_xpath( './/seg' ) )
337
+ {
338
+
339
+ my $segID = $currentSeg->{ 'att' }->{ 'id' };
340
+ die "No segment 'id' attribute value in '$file'" if (! defined $segID);
341
+ my $segData = $currentSeg->text;
342
+ ($eval_docs{$docID}{SEGS}{$segID}) = &{ $TOKENIZATION }( $segData );
343
+ }
344
+ }
345
+ }
346
+ else
347
+ {
348
+ #read data from file
349
+ open (FILE, $file) or die "\nUnable to open translation data file '$file'", $usage;
350
+ binmode FILE, ":utf8";
351
+ $data .= $_ while <FILE>;
352
+ close (FILE);
353
+
354
+ #get source set info
355
+ die "\n\nFATAL INPUT ERROR: no 'src_set' tag in src_file '$file'\n\n"
356
+ unless ($tag, $span, $data) = extract_sgml_tag_and_span ("SrcSet", $data);
357
+ die "\n\nFATAL INPUT ERROR: no tag attribute '$name' in file '$file'\n\n"
358
+ unless ($id) = extract_sgml_tag_attribute ($name="SetID", $tag);
359
+ die "\n\nFATAL INPUT ERROR: no tag attribute '$name' in file '$file'\n\n"
360
+ unless ($src) = extract_sgml_tag_attribute ($name="SrcLang", $tag);
361
+ die "\n\nFATAL INPUT ERROR: $name ('$src') in file '$file' inconsistent\n"
362
+ ." with $name in previous input data ('$src_lang')\n\n"
363
+ unless (not defined $src_lang or $src eq $src_lang);
364
+ $src_lang = $src;
365
+
366
+ #get doc info -- ID and # of segs
367
+ $data = $span;
368
+ while (($tag, $span, $data) = extract_sgml_tag_and_span ("Doc", $data))
369
+ {
370
+ die "\n\nFATAL INPUT ERROR: no tag attribute '$name' in file '$file'\n\n"
371
+ unless ($doc) = extract_sgml_tag_attribute ($name="DocID", $tag);
372
+ die "\n\nFATAL INPUT ERROR: duplicate '$name' in file '$file'\n\n"
373
+ if defined $eval_docs{$doc};
374
+ $span =~ s/[\s\n\r]+/ /g; # concatenate records
375
+ my $nseg=0, my $seg_data = $span;
376
+ while (($tag, $span, $seg_data) = extract_sgml_tag_and_span ("Seg", $seg_data))
377
+ {
378
+ die "\n\nFATAL INPUT ERROR: no attribute '$name' in file '$file'\n\n"
379
+ unless ($seg) = extract_sgml_tag_attribute( $name='id', $tag );
380
+ ($eval_docs{$doc}{SEGS}{$seg}) = &{ $TOKENIZATION }( $span );
381
+ $nseg++;
382
+ }
383
+ die "\n\nFATAL INPUT ERROR: no segments in document '$doc' in file '$file'\n\n"
384
+ if $nseg == 0;
385
+ }
386
+ die "\n\nFATAL INPUT ERROR: no documents in file '$file'\n\n"
387
+ unless keys %eval_docs > 0;
388
+ }
389
+ return $id;
390
+ }
391
+
392
+ #################################
393
+
394
+ sub get_MT_data
395
+ {
396
+ my ($docs, $set_tag, $file) = @_;
397
+ my ($name, $id, $src, $tgt, $sys, $doc, $seg);
398
+ my ($tag, $span, $data);
399
+
400
+ # Extension of the file determines the parser used:
401
+ # .xml : XML::Twig
402
+ # otherwise : simple SGML parsing functions
403
+ if ( $file =~ /\.xml$/i )
404
+ {
405
+ my $twig = XML::Twig->new();
406
+ $twig->parsefile( $file );
407
+ my $root = $twig->root;
408
+ foreach my $currentSet ( $root->get_xpath( 'refset' ), $root->get_xpath( 'tstset' ) )
409
+ {
410
+ $id = $currentSet->{ 'att' }->{ 'setid' } or die "No 'setid' attribute value in '$file'";
411
+ $src = $currentSet->{ 'att' }->{ 'srclang' } or die "No 'srclang' attribute value in '$file'";
412
+ $tgt = $currentSet->{ 'att' }->{ 'trglang' } or die "No 'trglang' attribute value in '$file'";
413
+ die "Not the same 'srclang' attribute value across sets" unless ( $src eq $src_lang );
414
+ die "Not the same 'trglang' attribute value across sets" unless ( ( not defined $tgt_lang ) or ( $tgt = $tgt_lang ) );
415
+ $tgt_lang = $tgt;
416
+ my $sys;
417
+ if ( $currentSet->name eq 'tstset' )
418
+ {
419
+ $sys = $currentSet->{ 'att' }->{ 'sysid' } or die "No 'sysid' attribute value in '$file'";
420
+ }
421
+ else
422
+ {
423
+ $sys = $currentSet->{ 'att' }->{ 'refid' } or die "No 'refid' attribute value in '$file'";
424
+ }
425
+ foreach my $currentDoc ( $currentSet->get_xpath( './/doc' ) )
426
+ {
427
+ my $docID = $currentDoc->{ 'att' }->{ 'docid' } or die "No document 'docid' attribute value in '$file'";
428
+ $docs->{ $sys }{ $docID }{ FILE } = $file;
429
+ foreach my $currentSeg ( $currentDoc->get_xpath( './/seg' ) )
430
+ {
431
+ my $segID = $currentSeg->{ 'att' }->{ 'id' };
432
+ die "No segment 'id' attribute value in '$file'" if (! defined $segID);
433
+ my $segData = $currentSeg->text;
434
+ ($docs->{$sys}{$docID}{SEGS}{$segID}) = &{ $TOKENIZATION }( $segData );
435
+ }
436
+ }
437
+ }
438
+ }
439
+ else
440
+ {
441
+ #read data from file
442
+ open (FILE, $file) or die "\nUnable to open translation data file '$file'", $usage;
443
+ binmode FILE, ":utf8";
444
+ $data .= $_ while <FILE>;
445
+ close (FILE);
446
+
447
+ #get tag info
448
+ while (($tag, $span, $data) = extract_sgml_tag_and_span ($set_tag, $data))
449
+ {
450
+ die "\n\nFATAL INPUT ERROR: no tag attribute '$name' in file '$file'\n\n"
451
+ unless ($id) = extract_sgml_tag_attribute ($name="SetID", $tag);
452
+ die "\n\nFATAL INPUT ERROR: no tag attribute '$name' in file '$file'\n\n"
453
+ unless ($src) = extract_sgml_tag_attribute ($name="SrcLang", $tag);
454
+ die "\n\nFATAL INPUT ERROR: $name ('$src') in file '$file' inconsistent\n"
455
+ ." with $name of source ('$src_lang')\n\n"
456
+ unless $src eq $src_lang;
457
+ die "\n\nFATAL INPUT ERROR: no tag attribute '$name' in file '$file'\n\n"
458
+ unless ($tgt) = extract_sgml_tag_attribute ($name="TrgLang", $tag);
459
+ die "\n\nFATAL INPUT ERROR: $name ('$tgt') in file '$file' inconsistent\n"
460
+ ." with $name of the evaluation ('$tgt_lang')\n\n"
461
+ unless (not defined $tgt_lang or $tgt eq $tgt_lang);
462
+ $tgt_lang = $tgt;
463
+
464
+ my $mtdata = $span;
465
+ while (($tag, $span, $mtdata) = extract_sgml_tag_and_span ("Doc", $mtdata))
466
+ {
467
+ die "\n\nFATAL INPUT ERROR: no tag attribute '$name' in file '$file'\n\n"
468
+ unless (my $sys) = extract_sgml_tag_attribute ($name="SysID", $tag);
469
+ die "\n\nFATAL INPUT ERROR: no tag attribute '$name' in file '$file'\n\n"
470
+ unless $doc = extract_sgml_tag_attribute ($name="DocID", $tag);
471
+ die "\n\nFATAL INPUT ERROR: document '$doc' for system '$sys' in file '$file'\n"
472
+ ." previously loaded from file '$docs->{$sys}{$doc}{FILE}'\n\n"
473
+ unless (not defined $docs->{$sys}{$doc});
474
+
475
+ $span =~ s/[\s\n\r]+/ /g; # concatenate records
476
+ my $nseg=0, my $seg_data = $span;
477
+ while (($tag, $span, $seg_data) = extract_sgml_tag_and_span ("Seg", $seg_data))
478
+ {
479
+ die "\n\nFATAIL INPUT ERROR: no tag attribute '$name' in file '$file'\n\n"
480
+ unless $seg = extract_sgml_tag_attribute( $name="id", $tag );
481
+ ($docs->{$sys}{$doc}{SEGS}{$seg}) = &{ $TOKENIZATION }( $span );
482
+ $nseg++;
483
+ }
484
+ die "\n\nFATAL INPUT ERROR: no segments in document '$doc' in file '$file'\n\n" if $nseg == 0;
485
+ $docs->{$sys}{$doc}{FILE} = $file;
486
+ }
487
+ }
488
+ }
489
+ return $id;
490
+ }
491
+
492
+ #################################
493
+
494
+ sub check_MT_data
495
+ {
496
+ @tst_sys = sort keys %tst_data;
497
+ @ref_sys = sort keys %ref_data;
498
+
499
+ die "Not the same 'setid' attribute values across files" unless ( ( $src_id eq $tst_id ) && ( $src_id eq $ref_id ) );
500
+
501
+ #every evaluation document must be represented for every system and every reference
502
+ foreach my $doc (sort keys %eval_docs)
503
+ {
504
+ my $nseg_source = scalar( keys( %{$eval_docs{$doc}{SEGS}} ) );
505
+ foreach my $sys (@tst_sys)
506
+ {
507
+ die "\n\nFATAL ERROR: no document '$doc' for system '$sys'\n\n" unless defined $tst_data{$sys}{$doc};
508
+ my $nseg = scalar( keys( %{$tst_data{$sys}{$doc}{SEGS}} ) );
509
+ die "\n\nFATAL ERROR: translated documents must contain the same # of segments as the source, but\n"
510
+ ." document '$doc' for system '$sys' contains $nseg segments, while\n"
511
+ ." the source document contains $nseg_source segments.\n\n"
512
+ unless $nseg == $nseg_source;
513
+ }
514
+ foreach my $sys (@ref_sys)
515
+ {
516
+ die "\n\nFATAL ERROR: no document '$doc' for reference '$sys'\n\n" unless defined $ref_data{$sys}{$doc};
517
+ my $nseg = scalar( keys( %{$ref_data{$sys}{$doc}{SEGS}} ) );
518
+ die "\n\nFATAL ERROR: translated documents must contain the same # of segments as the source, but\n"
519
+ ." document '$doc' for system '$sys' contains $nseg segments, while\n"
520
+ ." the source document contains $nseg_source segments.\n\n"
521
+ unless $nseg == $nseg_source;
522
+ }
523
+ }
524
+ }
525
+
526
+ #################################
527
+
528
+ sub compute_ngram_info
529
+ {
530
+ my ($ref, $doc, $seg);
531
+ my (@wrds, $tot_wrds, %ngrams, $ngram, $mgram);
532
+ my (%ngram_count, @tot_ngrams);
533
+
534
+ foreach $ref (keys %ref_data)
535
+ {
536
+ foreach $doc (keys %{$ref_data{$ref}})
537
+ {
538
+ foreach $seg ( keys %{$ref_data{$ref}{$doc}{SEGS}})
539
+ {
540
+ @wrds = split /\s+/, $ref_data{ $ref }{ $doc }{ SEGS }{ $seg };
541
+ $tot_wrds += @wrds;
542
+ %ngrams = %{Words2Ngrams (@wrds)};
543
+ foreach $ngram (keys %ngrams)
544
+ {
545
+ $ngram_count{$ngram} += $ngrams{$ngram};
546
+ }
547
+ }
548
+ }
549
+ }
550
+
551
+ foreach $ngram (keys %ngram_count)
552
+ {
553
+ @wrds = split / /, $ngram;
554
+ pop @wrds, $mgram = join " ", @wrds;
555
+ $ngram_info{$ngram} = - log ($mgram ? $ngram_count{$ngram}/$ngram_count{$mgram} : $ngram_count{$ngram}/$tot_wrds) / log 2;
556
+ if (defined $opt_x and $opt_x eq "ngram info")
557
+ {
558
+ @wrds = split / /, $ngram;
559
+ printf "ngram info:%9.4f%6d%6d%8d%3d %s\n", $ngram_info{$ngram}, $ngram_count{$ngram},
560
+ $mgram ? $ngram_count{$mgram} : $tot_wrds, $tot_wrds, scalar @wrds, $ngram;
561
+ }
562
+ }
563
+ }
564
+
565
+ #################################
566
+
567
+ sub score_system
568
+ {
569
+ my ($sys, $ref, $doc, $SCOREmt, $overallScore);
570
+ ($sys, $SCOREmt, $overallScore) = @_;
571
+ my ($ref_length, $match_cnt, $tst_cnt, $ref_cnt, $tst_info, $ref_info);
572
+ my ($cum_ref_length, @cum_match, @cum_tst_cnt, @cum_ref_cnt, @cum_tst_info, @cum_ref_info);
573
+
574
+ $cum_ref_length = 0;
575
+ for (my $j=1; $j<=$max_Ngram; $j++)
576
+ {
577
+ $cum_match[$j] = $cum_tst_cnt[$j] = $cum_ref_cnt[$j] = $cum_tst_info[$j] = $cum_ref_info[$j] = 0;
578
+ }
579
+ foreach $doc (sort keys %eval_docs)
580
+ {
581
+ ($ref_length, $match_cnt, $tst_cnt, $ref_cnt, $tst_info, $ref_info) = score_document ($sys, $doc, $overallScore);
582
+ if ( $method eq "NIST" )
583
+ {
584
+ my %DOCmt = ();
585
+ my $docScore = nist_score( scalar( @ref_sys ), $match_cnt, $tst_cnt, $ref_cnt, $tst_info, $ref_info, $sys, \%DOCmt );
586
+ $overallScore->{ $sys }{ 'documents' }{ $doc }{ 'score' } = $docScore;
587
+ if ( $detail >= 1 )
588
+ {
589
+ printf "$method score using 5-grams = %.4f for system \"$sys\" on document \"$doc\" (%d segments, %d words)\n",
590
+ $docScore, scalar keys %{$tst_data{$sys}{$doc}{SEGS}}, $tst_cnt->[1];
591
+ }
592
+ }
593
+
594
+ if ( $method eq "BLEU" )
595
+ {
596
+ my %DOCmt = ();
597
+ my $docScore = &{$BLEU_SCORE}( $ref_length, $match_cnt, $tst_cnt, $sys, \%DOCmt );
598
+ $overallScore->{ $sys }{ 'documents' }{ $doc }{ 'score' } = $docScore;
599
+ if ( $detail >= 1 )
600
+ {
601
+ printf "$method score using 4-grams = %.4f for system \"$sys\" on document \"$doc\" (%d segments, %d words)\n",
602
+ $docScore, scalar keys %{$tst_data{$sys}{$doc}{SEGS}}, $tst_cnt->[1];
603
+ }
604
+ }
605
+
606
+ $cum_ref_length += $ref_length;
607
+ for (my $j=1; $j<=$max_Ngram; $j++)
608
+ {
609
+ $cum_match[$j] += $match_cnt->[$j];
610
+ $cum_tst_cnt[$j] += $tst_cnt->[$j];
611
+ $cum_ref_cnt[$j] += $ref_cnt->[$j];
612
+ $cum_tst_info[$j] += $tst_info->[$j];
613
+ $cum_ref_info[$j] += $ref_info->[$j];
614
+ printf "document info: $sys $doc %d-gram %d %d %d %9.4f %9.4f\n", $j, $match_cnt->[$j],
615
+ $tst_cnt->[$j], $ref_cnt->[$j], $tst_info->[$j], $ref_info->[$j]
616
+ if (defined $opt_x and $opt_x eq "document info");
617
+ }
618
+ }
619
+
620
+ if ($method eq "BLEU")
621
+ {
622
+ $overallScore->{ $sys }{ 'score' } = &{$BLEU_SCORE}($cum_ref_length, \@cum_match, \@cum_tst_cnt, $sys, $SCOREmt);
623
+ }
624
+ if ($method eq "NIST")
625
+ {
626
+ $overallScore->{ $sys }{ 'score' } = nist_score (scalar @ref_sys, \@cum_match, \@cum_tst_cnt, \@cum_ref_cnt, \@cum_tst_info, \@cum_ref_info, $sys, $SCOREmt);
627
+ }
628
+ }
629
+
630
+ #################################
631
+
632
+ sub score_document
633
+ {
634
+ my ($sys, $ref, $doc, $overallScore);
635
+ ($sys, $doc, $overallScore) = @_;
636
+ my ($ref_length, $match_cnt, $tst_cnt, $ref_cnt, $tst_info, $ref_info);
637
+ my ($cum_ref_length, @cum_match, @cum_tst_cnt, @cum_ref_cnt, @cum_tst_info, @cum_ref_info);
638
+
639
+ $cum_ref_length = 0;
640
+ for (my $j=1; $j<=$max_Ngram; $j++)
641
+ {
642
+ $cum_match[$j] = $cum_tst_cnt[$j] = $cum_ref_cnt[$j] = $cum_tst_info[$j] = $cum_ref_info[$j] = 0;
643
+ }
644
+
645
+ # score each segment
646
+ foreach my $seg ( nsort keys( %{$tst_data{$sys}{$doc}{SEGS}} ) )
647
+ {
648
+
649
+ my @ref_segments = ();
650
+ foreach $ref (@ref_sys)
651
+ {
652
+ push @ref_segments, $ref_data{$ref}{$doc}{SEGS}{$seg};
653
+ if ( $detail >= 3 )
654
+ {
655
+ printf "ref '$ref', seg $seg: %s\n", $ref_data{$ref}{$doc}{SEGS}{$seg}
656
+ }
657
+
658
+ }
659
+
660
+ printf "sys '$sys', seg $seg: %s\n", $tst_data{$sys}{$doc}{SEGS}{$seg} if ( $detail >= 3 );
661
+ ($ref_length, $match_cnt, $tst_cnt, $ref_cnt, $tst_info, $ref_info) = score_segment ($tst_data{$sys}{$doc}{SEGS}{$seg}, @ref_segments);
662
+
663
+ if ( $method eq "BLEU" )
664
+ {
665
+ my %DOCmt = ();
666
+ my $segScore = &{$BLEU_SCORE}($ref_length, $match_cnt, $tst_cnt, $sys, %DOCmt);
667
+ $overallScore->{ $sys }{ 'documents' }{ $doc }{ 'segments' }{ $seg }{ 'score' } = $segScore;
668
+ if ( $detail >= 2 )
669
+ {
670
+ printf " $method score using 4-grams = %.4f for system \"$sys\" on segment $seg of document \"$doc\" (%d words)\n", $segScore, $tst_cnt->[1]
671
+ }
672
+ }
673
+ if ( $method eq "NIST" )
674
+ {
675
+ my %DOCmt = ();
676
+ my $segScore = nist_score (scalar @ref_sys, $match_cnt, $tst_cnt, $ref_cnt, $tst_info, $ref_info, $sys, %DOCmt);
677
+ $overallScore->{ $sys }{ 'documents' }{ $doc }{ 'segments' }{ $seg }{ 'score' } = $segScore;
678
+ if ( $detail >= 2 )
679
+ {
680
+ printf " $method score using 5-grams = %.4f for system \"$sys\" on segment $seg of document \"$doc\" (%d words)\n", $segScore, $tst_cnt->[1];
681
+ }
682
+ }
683
+ $cum_ref_length += $ref_length;
684
+ for (my $j=1; $j<=$max_Ngram; $j++)
685
+ {
686
+ $cum_match[$j] += $match_cnt->[$j];
687
+ $cum_tst_cnt[$j] += $tst_cnt->[$j];
688
+ $cum_ref_cnt[$j] += $ref_cnt->[$j];
689
+ $cum_tst_info[$j] += $tst_info->[$j];
690
+ $cum_ref_info[$j] += $ref_info->[$j];
691
+ }
692
+ }
693
+ return ($cum_ref_length, [@cum_match], [@cum_tst_cnt], [@cum_ref_cnt], [@cum_tst_info], [@cum_ref_info]);
694
+ }
695
+
696
+ ###############################################################################################################################
697
+ # function returning the shortest reference length
698
+ # takes as input:
699
+ # - currentLength : the current (shortest) reference length
700
+ # - referenceSentenceLength : the current reference sentence length
701
+ # - candidateSentenceLength : the current candidate sentence length (unused)
702
+ ###############################################################################################################################
703
+ sub brevity_penalty_shortest
704
+ {
705
+ my ( $currentLength, $referenceSentenceLength, $candidateSentenceLength ) = @_;
706
+ return ( $referenceSentenceLength < $currentLength ? $referenceSentenceLength : $currentLength );
707
+ }
708
+
709
+ ###############################################################################################################################
710
+ # function returning the closest reference length (to the candidate sentence length)
711
+ # takes as input:
712
+ # - currentLength: the current (closest) reference length.
713
+ # - referenceSentenceLength : the current reference sentence length
714
+ # - candidateSentenceLength : the current candidate sentence length
715
+ # when two reference sentences are at the same distance, it will return the shortest reference sentence length
716
+ # example of 4 iterations, given:
717
+ # - one candidate sentence containing 7 tokens
718
+ # - one reference translation containing 11 tokens
719
+ # - one reference translation containing 8 tokens
720
+ # - one reference translation containing 6 tokens
721
+ # - one reference translation containing 7 tokens
722
+ # the multiple invocations will return:
723
+ # - currentLength is set to 11 (outside of this function)
724
+ # - brevity_penalty_closest( 11, 8, 7 ) returns 8, since abs( 8 - 7 ) < abs( 11 - 7 )
725
+ # - brevity_penalty_closest( 8, 6, 7 ) returns 6, since abs( 8 - 7 ) == abs( 6 - 7 ) AND 6 < 8
726
+ # - brevity_penalty_closest( 7, 6, 7 ) returns 7, since abs( 7 - 7 ) < abs( 6 - 7 )
727
+ ###############################################################################################################################
728
+ sub brevity_penalty_closest
729
+ {
730
+ my ( $currentLength, $referenceSentenceLength, $candidateSentenceLength ) = @_;
731
+ my $result = $currentLength;
732
+ if ( abs( $candidateSentenceLength - $referenceSentenceLength ) <= abs( $candidateSentenceLength - $currentLength ) )
733
+ {
734
+ if ( abs( $candidateSentenceLength - $referenceSentenceLength ) == abs( $candidateSentenceLength - $currentLength ) )
735
+ {
736
+ if ( $currentLength > $referenceSentenceLength )
737
+ {
738
+ $result = $referenceSentenceLength;
739
+ }
740
+ }
741
+ else
742
+ {
743
+ $result = $referenceSentenceLength;
744
+ }
745
+ }
746
+ return $result;
747
+ }
748
+
749
+ #################################
750
+
751
+ sub score_segment
752
+ {
753
+ my ($tst_seg, @ref_segs) = @_;
754
+ my (@tst_wrds, %tst_ngrams, @match_count, @tst_count, @tst_info);
755
+ my (@ref_wrds, $ref_seg, %ref_ngrams, %ref_ngrams_max, @ref_count, @ref_info);
756
+ my ($ngram);
757
+ my (@nwrds_ref);
758
+ my $ref_length;
759
+
760
+ for (my $j=1; $j<= $max_Ngram; $j++)
761
+ {
762
+ $match_count[$j] = $tst_count[$j] = $ref_count[$j] = $tst_info[$j] = $ref_info[$j] = 0;
763
+ }
764
+
765
+ # get the ngram counts for the test segment
766
+ @tst_wrds = split /\s+/, $tst_seg;
767
+ %tst_ngrams = %{Words2Ngrams (@tst_wrds)};
768
+ for (my $j=1; $j<=$max_Ngram; $j++)
769
+ {
770
+ # compute ngram counts
771
+ $tst_count[$j] = $j<=@tst_wrds ? (@tst_wrds - $j + 1) : 0;
772
+ }
773
+
774
+ # get the ngram counts for the reference segments
775
+ foreach $ref_seg (@ref_segs)
776
+ {
777
+ @ref_wrds = split /\s+/, $ref_seg;
778
+ %ref_ngrams = %{Words2Ngrams (@ref_wrds)};
779
+ foreach $ngram (keys %ref_ngrams)
780
+ {
781
+ # find the maximum # of occurrences
782
+ my @wrds = split / /, $ngram;
783
+ $ref_info[@wrds] += $ngram_info{$ngram};
784
+ $ref_ngrams_max{$ngram} = defined $ref_ngrams_max{$ngram} ? max ($ref_ngrams_max{$ngram}, $ref_ngrams{$ngram}) : $ref_ngrams{$ngram};
785
+ }
786
+ for (my $j=1; $j<=$max_Ngram; $j++)
787
+ {
788
+ # update ngram counts
789
+ $ref_count[$j] += $j<=@ref_wrds ? (@ref_wrds - $j + 1) : 0;
790
+ }
791
+ if ( not defined( $ref_length ) )
792
+ {
793
+ $ref_length = scalar( @ref_wrds );
794
+ }
795
+ else
796
+ {
797
+ $ref_length = &{$BLEU_BP}( $ref_length, scalar( @ref_wrds ), scalar( @tst_wrds ) );
798
+ }
799
+ }
800
+
801
+ # accumulate scoring stats for tst_seg ngrams that match ref_seg ngrams
802
+ foreach $ngram (keys %tst_ngrams)
803
+ {
804
+ next unless defined $ref_ngrams_max{$ngram};
805
+ my @wrds = split / /, $ngram;
806
+ $tst_info[@wrds] += $ngram_info{$ngram} * min($tst_ngrams{$ngram},$ref_ngrams_max{$ngram});
807
+ $match_count[@wrds] += my $count = min($tst_ngrams{$ngram},$ref_ngrams_max{$ngram});
808
+ printf "%.2f info for each of $count %d-grams = '%s'\n", $ngram_info{$ngram}, scalar @wrds, $ngram
809
+ if $detail >= 3;
810
+ }
811
+
812
+ return ($ref_length, [@match_count], [@tst_count], [@ref_count], [@tst_info], [@ref_info]);
813
+ }
814
+
815
+ #################################
816
+
817
+ sub bleu_score_nosmoothing
818
+ {
819
+ my ($ref_length, $matching_ngrams, $tst_ngrams, $sys, $SCOREmt) = @_;
820
+ my $score = 0;
821
+ my $iscore = 0;
822
+
823
+ for ( my $j = 1; $j <= $max_Ngram; ++$j )
824
+ {
825
+ if ($matching_ngrams->[ $j ] == 0)
826
+ {
827
+ $SCOREmt->{ $j }{ $sys }{ cum }=0;
828
+ }
829
+ else
830
+ {
831
+ my $len_score = min (0, 1-$ref_length/$tst_ngrams->[1]);
832
+ # Cumulative N-Gram score
833
+ $score += log( $matching_ngrams->[ $j ] / $tst_ngrams->[ $j ] );
834
+ $SCOREmt->{ $j }{ $sys }{ cum } = exp( $score / $j + $len_score );
835
+ # Individual N-Gram score
836
+ $iscore = log( $matching_ngrams->[ $j ] / $tst_ngrams->[ $j ] );
837
+ $SCOREmt->{ $j }{ $sys }{ ind } = exp( $iscore );
838
+ }
839
+ }
840
+ return $SCOREmt->{ 4 }{ $sys }{ cum };
841
+ }
842
+
843
+ ###############################################################################################################################
844
+ # Default method used to compute the BLEU score, using smoothing.
845
+ # Note that the method used can be overridden using the '--no-smoothing' command-line argument
846
+ # The smoothing is computed by taking 1 / ( 2^k ), instead of 0, for each precision score whose matching n-gram count is null
847
+ # k is 1 for the first 'n' value for which the n-gram match count is null
848
+ # For example, if the text contains:
849
+ # - one 2-gram match
850
+ # - and (consequently) two 1-gram matches
851
+ # the n-gram count for each individual precision score would be:
852
+ # - n=1 => prec_count = 2 (two unigrams)
853
+ # - n=2 => prec_count = 1 (one bigram)
854
+ # - n=3 => prec_count = 1/2 (no trigram, taking 'smoothed' value of 1 / ( 2^k ), with k=1)
855
+ # - n=4 => prec_count = 1/4 (no fourgram, taking 'smoothed' value of 1 / ( 2^k ), with k=2)
856
+ ###############################################################################################################################
857
+ sub bleu_score
858
+ {
859
+ my ($ref_length, $matching_ngrams, $tst_ngrams, $sys, $SCOREmt) = @_;
860
+ my $score = 0;
861
+ my $iscore = 0;
862
+ my $exp_len_score = 0;
863
+ $exp_len_score = exp( min (0, 1 - $ref_length / $tst_ngrams->[ 1 ] ) ) if ( $tst_ngrams->[ 1 ] > 0 );
864
+ my $smooth = 1;
865
+ for ( my $j = 1; $j <= $max_Ngram; ++$j )
866
+ {
867
+ if ( $tst_ngrams->[ $j ] == 0 )
868
+ {
869
+ $iscore = 0;
870
+ }
871
+ elsif ( $matching_ngrams->[ $j ] == 0 )
872
+ {
873
+ $smooth *= 2;
874
+ $iscore = log( 1 / ( $smooth * $tst_ngrams->[ $j ] ) );
875
+ }
876
+ else
877
+ {
878
+ $iscore = log( $matching_ngrams->[ $j ] / $tst_ngrams->[ $j ] );
879
+ }
880
+ $SCOREmt->{ $j }{ $sys }{ ind } = exp( $iscore );
881
+ $score += $iscore;
882
+ $SCOREmt->{ $j }{ $sys }{ cum } = exp( $score / $j ) * $exp_len_score;
883
+ }
884
+ return $SCOREmt->{ 4 }{ $sys }{ cum };
885
+ }
886
+
887
+ #################################
888
+
889
+ sub nist_score
890
+ {
891
+ my ($nsys, $matching_ngrams, $tst_ngrams, $ref_ngrams, $tst_info, $ref_info, $sys, $SCOREmt) = @_;
892
+ my $score = 0;
893
+ my $iscore = 0;
894
+
895
+ for (my $n=1; $n<=$max_Ngram; $n++)
896
+ {
897
+ $score += $tst_info->[$n]/max($tst_ngrams->[$n],1);
898
+ $SCOREmt->{$n}{$sys}{cum} = $score * nist_length_penalty($tst_ngrams->[1]/($ref_ngrams->[1]/$nsys));
899
+ $iscore = $tst_info->[$n]/max($tst_ngrams->[$n],1);
900
+ $SCOREmt->{$n}{$sys}{ind} = $iscore * nist_length_penalty($tst_ngrams->[1]/($ref_ngrams->[1]/$nsys));
901
+ }
902
+ return $SCOREmt->{5}{$sys}{cum};
903
+ }
904
+
905
+ #################################
906
+
907
+ sub Words2Ngrams
908
+ {
909
+ #convert a string of words to an Ngram count hash
910
+ my %count = ();
911
+
912
+ for (; @_; shift)
913
+ {
914
+ my ($j, $ngram, $word);
915
+ for ($j=0; $j<$max_Ngram and defined($word=$_[$j]); $j++)
916
+ {
917
+ $ngram .= defined $ngram ? " $word" : $word;
918
+ $count{$ngram}++;
919
+ }
920
+ }
921
+ return {%count};
922
+ }
923
+
924
+ #################################
925
+
926
+ sub tokenization
927
+ {
928
+ my ($norm_text) = @_;
929
+
930
+ # language-independent part:
931
+ $norm_text =~ s/<skipped>//g; # strip "skipped" tags
932
+ $norm_text =~ s/-\n//g; # strip end-of-line hyphenation and join lines
933
+ $norm_text =~ s/\n/ /g; # join lines
934
+ $norm_text =~ s/&quot;/"/g; # convert SGML tag for quote to "
935
+ $norm_text =~ s/&amp;/&/g; # convert SGML tag for ampersand to &
936
+ $norm_text =~ s/&lt;/</g; # convert SGML tag for less-than to <
937
+ $norm_text =~ s/&gt;/>/g; # convert SGML tag for greater-than to >
938
+
939
+ # language-dependent part (assuming Western languages):
940
+ $norm_text = " $norm_text ";
941
+ $norm_text =~ tr/[A-Z]/[a-z]/ unless $preserve_case;
942
+ $norm_text =~ s/([\{-\~\[-\` -\&\(-\+\:-\@\/])/ $1 /g; # tokenize punctuation
943
+ $norm_text =~ s/([^0-9])([\.,])/$1 $2 /g; # tokenize period and comma unless preceded by a digit
944
+ $norm_text =~ s/([\.,])([^0-9])/ $1 $2/g; # tokenize period and comma unless followed by a digit
945
+ $norm_text =~ s/([0-9])(-)/$1 $2 /g; # tokenize dash when preceded by a digit
946
+ $norm_text =~ s/\s+/ /g; # one space only between words
947
+ $norm_text =~ s/^\s+//; # no leading space
948
+ $norm_text =~ s/\s+$//; # no trailing space
949
+
950
+ return $norm_text;
951
+ }
952
+
953
+
954
+ sub tokenization_international
955
+ {
956
+ my ($norm_text) = @_;
957
+
958
+ $norm_text =~ s/<skipped>//g; # strip "skipped" tags
959
+ #$norm_text =~ s/\p{Hyphen}\p{Zl}//g; # strip end-of-line hyphenation and join lines
960
+ $norm_text =~ s/\p{Zl}/ /g; # join lines
961
+
962
+ # replace entities
963
+ $norm_text =~ s/&quot;/\"/g; # quote to "
964
+ $norm_text =~ s/&amp;/&/g; # ampersand to &
965
+ $norm_text =~ s/&lt;/</g; # less-than to <
966
+ $norm_text =~ s/&gt;/>/g; # greater-than to >
967
+ $norm_text =~ s/&apos;/\'/g; # apostrophe to '
968
+
969
+ $norm_text = lc( $norm_text ) unless $preserve_case; # lowercasing if needed
970
+ $norm_text =~ s/([^[:ascii:]])/ $1 /g if ( $split_non_ASCII );
971
+
972
+ # punctuation: tokenize any punctuation unless followed AND preceded by a digit
973
+ $norm_text =~ s/(\P{N})(\p{P})/$1 $2 /g;
974
+ $norm_text =~ s/(\p{P})(\P{N})/ $1 $2/g;
975
+
976
+ $norm_text =~ s/(\p{S})/ $1 /g; # tokenize symbols
977
+
978
+ $norm_text =~ s/\p{Z}+/ /g; # one space only between words
979
+ $norm_text =~ s/^\p{Z}+//; # no leading space
980
+ $norm_text =~ s/\p{Z}+$//; # no trailing space
981
+
982
+ return $norm_text;
983
+ }
984
+
985
+ #################################
986
+
987
+ sub nist_length_penalty
988
+ {
989
+ my ($ratio) = @_;
990
+ return 1 if $ratio >= 1;
991
+ return 0 if $ratio <= 0;
992
+ my $ratio_x = 1.5;
993
+ my $score_x = 0.5;
994
+ my $beta = -log($score_x)/log($ratio_x)/log($ratio_x);
995
+ return exp (-$beta*log($ratio)*log($ratio));
996
+ }
997
+
998
+ #################################
999
+
1000
+ sub date_time_stamp
1001
+ {
1002
+ my ($sec, $min, $hour, $mday, $mon, $year, $wday, $yday, $isdst) = localtime();
1003
+ my @months = qw(Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec);
1004
+ my ($date, $time);
1005
+ $time = sprintf "%2.2d:%2.2d:%2.2d", $hour, $min, $sec;
1006
+ $date = sprintf "%4.4s %3.3s %s", 1900+$year, $months[$mon], $mday;
1007
+ return ($date, $time);
1008
+ }
1009
+
1010
+ #################################
1011
+
1012
+ sub extract_sgml_tag_and_span
1013
+ {
1014
+ my ($name, $data) = @_;
1015
+ ($data =~ m|<$name\s*([^>]*)>(.*?)</$name\s*>(.*)|si) ? ($1, $2, $3) : ();
1016
+ }
1017
+
1018
+ #################################
1019
+
1020
+ sub extract_sgml_tag_attribute
1021
+ {
1022
+ my ($name, $data) = @_;
1023
+ ($data =~ m|$name\s*=\s*\"([^\"]*)\"|si) ? ($1) : ();
1024
+ }
1025
+
1026
+ #################################
1027
+
1028
+ sub max
1029
+ {
1030
+ my ($max, $next);
1031
+
1032
+ return unless defined ($max=pop);
1033
+ while (defined ($next=pop))
1034
+ {
1035
+ $max = $next if $next > $max;
1036
+ }
1037
+ return $max;
1038
+ }
1039
+
1040
+ #################################
1041
+
1042
+ sub min
1043
+ {
1044
+ my ($min, $next);
1045
+
1046
+ return unless defined ($min=pop);
1047
+ while (defined ($next=pop))
1048
+ {
1049
+ $min = $next if $next < $min;
1050
+ }
1051
+ return $min;
1052
+ }
1053
+
1054
+ #################################
1055
+
1056
+ sub printout_report
1057
+ {
1058
+ if ( $METHOD eq "BOTH" )
1059
+ {
1060
+ foreach my $sys (sort @tst_sys)
1061
+ {
1062
+ printf "NIST score = %2.4f BLEU score = %.4f for system \"$sys\"\n",$NISTmt{5}{$sys}{cum},$BLEUmt{4}{$sys}{cum};
1063
+ }
1064
+ }
1065
+ elsif ($METHOD eq "NIST" )
1066
+ {
1067
+ foreach my $sys (sort @tst_sys)
1068
+ {
1069
+ printf "NIST score = %2.4f for system \"$sys\"\n",$NISTmt{5}{$sys}{cum};
1070
+ }
1071
+ }
1072
+ elsif ($METHOD eq "BLEU" )
1073
+ {
1074
+ foreach my $sys (sort @tst_sys)
1075
+ {
1076
+ printf "\nBLEU score = %.4f for system \"$sys\"\n",$BLEUmt{4}{$sys}{cum};
1077
+ }
1078
+ }
1079
+ printf "\n# ------------------------------------------------------------------------\n\n";
1080
+ printf "Individual N-gram scoring\n";
1081
+ printf " 1-gram 2-gram 3-gram 4-gram 5-gram 6-gram 7-gram 8-gram 9-gram\n";
1082
+ printf " ------ ------ ------ ------ ------ ------ ------ ------ ------\n";
1083
+
1084
+ if ( ( $METHOD eq "BOTH" ) || ($METHOD eq "NIST") )
1085
+ {
1086
+ foreach my $sys (sort @tst_sys)
1087
+ {
1088
+ printf " NIST:";
1089
+ for (my $i=1; $i<=$max_Ngram; $i++)
1090
+ {
1091
+ printf " %2.4f ",$NISTmt{$i}{$sys}{ind}
1092
+ }
1093
+ printf " \"$sys\"\n";
1094
+ }
1095
+ printf "\n";
1096
+ }
1097
+
1098
+ if ( ( $METHOD eq "BOTH" ) || ($METHOD eq "BLEU") )
1099
+ {
1100
+ foreach my $sys (sort @tst_sys)
1101
+ {
1102
+ printf " BLEU:";
1103
+ for (my $i=1; $i<=$max_Ngram; $i++)
1104
+ {
1105
+ printf " %2.4f ",$BLEUmt{$i}{$sys}{ind}
1106
+ }
1107
+ printf " \"$sys\"\n";
1108
+ }
1109
+ }
1110
+
1111
+ printf "\n# ------------------------------------------------------------------------\n";
1112
+ printf "\nCumulative N-gram scoring\n";
1113
+ printf " 1-gram 2-gram 3-gram 4-gram 5-gram 6-gram 7-gram 8-gram 9-gram\n";
1114
+ printf " ------ ------ ------ ------ ------ ------ ------ ------ ------\n";
1115
+
1116
+ if (( $METHOD eq "BOTH" ) || ($METHOD eq "NIST"))
1117
+ {
1118
+ foreach my $sys (sort @tst_sys)
1119
+ {
1120
+ printf " NIST:";
1121
+ for (my $i=1; $i<=$max_Ngram; $i++)
1122
+ {
1123
+ printf " %2.4f ",$NISTmt{$i}{$sys}{cum}
1124
+ }
1125
+ printf " \"$sys\"\n";
1126
+ }
1127
+ }
1128
+ printf "\n";
1129
+ if ( ( $METHOD eq "BOTH" ) || ($METHOD eq "BLEU") )
1130
+ {
1131
+ foreach my $sys (sort @tst_sys)
1132
+ {
1133
+ printf " BLEU:";
1134
+ for (my $i=1; $i<=$max_Ngram; $i++)
1135
+ {
1136
+ printf " %2.4f ",$BLEUmt{$i}{$sys}{cum}
1137
+ }
1138
+ printf " \"$sys\"\n";
1139
+ }
1140
+ }
1141
+ }
1142
+
1143
+ ###############################################################################################################################
1144
+ # Create three files, by using:
1145
+ # - $prefix : the prefix used for the output file names
1146
+ # - %overall : a hash containing seg/doc/sys-level scores:
1147
+ # - $overall{ $SYSTEM_ID }{ 'score' } => system-level score
1148
+ # - $overall{ $SYSTEM_ID }{ 'documents' }{ $DOCUMENT_ID }{ 'score' } => document-level score
1149
+ # - $overall{ $SYSTEM_ID }{ 'documents' }{ $DOCUMENT_ID }{ 'segments' }{ $SEGMENT_ID } => segment-level score
1150
+ ###############################################################################################################################
1151
+ sub outputMetricsMATR
1152
+ {
1153
+ my ( $prefix, %overall ) = @_;
1154
+ my $fileNameSys = $prefix . '-sys.scr';
1155
+ my $fileNameDoc = $prefix . '-doc.scr';
1156
+ my $fileNameSeg = $prefix . '-seg.scr';
1157
+ open FILEOUT_SYS, '>', $fileNameSys or die "Could not open file: ${fileNameSys}";
1158
+ open FILEOUT_DOC, '>', $fileNameDoc or die "Could not open file: ${fileNameDoc}";
1159
+ open FILEOUT_SEG, '>', $fileNameSeg or die "Could not open file: ${fileNameSeg}";
1160
+ foreach my $sys ( sort( keys( %overall ) ) )
1161
+ {
1162
+ my $scoreSys = $overall{ $sys }{ 'score' };
1163
+ print FILEOUT_SYS "${tst_id}\t${sys}\t${scoreSys}\n";
1164
+ foreach my $doc ( sort( keys( %{$overall{ $sys }{ 'documents' }} ) ) )
1165
+ {
1166
+ my $scoreDoc = $overall{ $sys }{ 'documents' }{ $doc }{ 'score' };
1167
+ print FILEOUT_DOC "${tst_id}\t${sys}\t${doc}\t${scoreDoc}\n";
1168
+ foreach my $seg ( nsort keys( %{$overall{ $sys }{ 'documents' }{ $doc }{ 'segments' }} ) )
1169
+ {
1170
+ my $scoreSeg = $overall{ $sys }{ 'documents' }{ $doc }{ 'segments' }{ $seg }{ 'score' };
1171
+ print FILEOUT_SEG "${tst_id}\t${sys}\t${doc}\t${seg}\t${scoreSeg}\n";
1172
+ }
1173
+ }
1174
+ }
1175
+ close FILEOUT_SEG;
1176
+ close FILEOUT_DOC;
1177
+ close FILEOUT_SYS;
1178
+ }
1179
+
mosesdecoder/scripts/generic/multi-bleu-detok.perl ADDED
@@ -0,0 +1,214 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/usr/bin/env perl
#
# This file is part of moses. Its use is licensed under the GNU Lesser General
# Public License version 2.1 or, at your option, any later version.

# This file uses the internal tokenization of mteval-v13a.pl,
# giving the exact same (case-sensitive) results on untokenized text.
# Using this script with detokenized output and untokenized references is
# preferrable over multi-bleu.perl, since scores aren't affected by tokenization differences.
#
# like multi-bleu.perl , it supports plain text input and multiple references.

# $Id$
use warnings;
use strict;

binmode(STDIN, ":utf8");
use open ':encoding(UTF-8)';

# Fix: check that $ARGV[0] is defined before comparing it - under
# "use warnings" running the script with no arguments previously emitted
# an "uninitialized value" warning before the usage message.
my $lowercase = 0;
if (defined $ARGV[0] && $ARGV[0] eq "-lc") {
  $lowercase = 1;
  shift;
}

# First positional argument: reference file, or stem of reference0, reference1, ...
my $stem = $ARGV[0];
if (!defined $stem) {
  print STDERR "usage: multi-bleu-detok.pl [-lc] reference < hypothesis\n";
  print STDERR "Reads the references from reference or reference0, reference1, ...\n";
  exit(1);
}

# Accept "stem.ref0" style names by appending ".ref" to the stem.
$stem .= ".ref" if !-e $stem && !-e $stem."0" && -e $stem.".ref0";

# $REF[$sentence] holds the list of reference translations for that sentence.
my @REF;
my $ref=0;
while(-e "$stem$ref") {
  &add_to_ref("$stem$ref",\@REF);
  $ref++;
}
&add_to_ref($stem,\@REF) if -e $stem;
die("ERROR: could not find reference file $stem") unless scalar @REF;

# add additional references explicitly specified on the command line
shift;
foreach my $stem (@ARGV) {
  &add_to_ref($stem,\@REF) if -e $stem;
}
49
+
50
+
51
+
52
# Read one reference file (plain text or gzipped) and append each tokenized
# line to the per-sentence reference lists in @$REF.
#   $file - path of the reference file
#   $REF  - array ref; $$REF[$i] collects all references for sentence $i
sub add_to_ref {
  my ($file,$REF) = @_;
  my $s=0;
  # Fix: match a literal ".gz" suffix - the dot was previously unescaped,
  # so any name ending in "gz" (e.g. "foogz") was piped through gzip.
  if ($file =~ /\.gz$/) {
    open(REF,"gzip -dc $file|") or die "Can't read $file";
  } else {
    open(REF,$file) or die "Can't read $file";
  }
  while(<REF>) {
    # Fix: chomp instead of chop - chop removes the last character
    # unconditionally and would corrupt the final line of a file that
    # lacks a trailing newline.
    chomp;
    $_ = tokenization($_);
    push @{$$REF[$s++]}, $_;
  }
  close(REF);
}
67
+
68
# Main scoring loop: accumulate modified n-gram precision counts (n=1..4)
# and length statistics over every hypothesis line read from STDIN, then
# compute and print the corpus-level BLEU score.
my(@CORRECT,@TOTAL,$length_translation,$length_reference);
my $s=0;
while(<STDIN>) {
  # NOTE(review): chop drops the final character if the last input line has
  # no trailing newline; chomp would be safer - confirm inputs always end
  # with a newline.
  chop;
  $_ = lc if $lowercase;
  $_ = tokenization($_);
  my @WORD = split;
  # %REF_NGRAM: per-sentence clipped reference counts - for each n-gram the
  # maximum count over all references of this sentence.
  my %REF_NGRAM = ();
  my $length_translation_this_sentence = scalar(@WORD);
  my ($closest_diff,$closest_length) = (9999,9999);
  foreach my $reference (@{$REF[$s]}) {
#      print "$s $_ <=> $reference\n";
    $reference = lc($reference) if $lowercase;
    my @WORD = split(' ',$reference);
    my $length = scalar(@WORD);
    my $diff = abs($length_translation_this_sentence-$length);
    # Track the reference length closest to the hypothesis length (ties
    # broken toward the shorter reference) for the brevity penalty.
    if ($diff < $closest_diff) {
      $closest_diff = $diff;
      $closest_length = $length;
      # print STDERR "$s: closest diff ".abs($length_translation_this_sentence-$length)." = abs($length_translation_this_sentence-$length), setting len: $closest_length\n";
    } elsif ($diff == $closest_diff) {
      $closest_length = $length if $length < $closest_length;
      # from two references with the same closeness to me
      # take the *shorter* into account, not the "first" one.
    }
    # Count this reference's n-grams; keys are "<n> w1 .. wn".
    for(my $n=1;$n<=4;$n++) {
      my %REF_NGRAM_N = ();
      for(my $start=0;$start<=$#WORD-($n-1);$start++) {
        my $ngram = "$n";
        for(my $w=0;$w<$n;$w++) {
          $ngram .= " ".$WORD[$start+$w];
        }
        $REF_NGRAM_N{$ngram}++;
      }
      # Clip: keep the maximum count seen across references.
      foreach my $ngram (keys %REF_NGRAM_N) {
        if (!defined($REF_NGRAM{$ngram}) ||
            $REF_NGRAM{$ngram} < $REF_NGRAM_N{$ngram}) {
          $REF_NGRAM{$ngram} = $REF_NGRAM_N{$ngram};
          # print "$i: REF_NGRAM{$ngram} = $REF_NGRAM{$ngram}<BR>\n";
        }
      }
    }
  }
  $length_translation += $length_translation_this_sentence;
  $length_reference += $closest_length;
  # Count hypothesis n-grams and credit each up to its clipped
  # reference count (standard modified precision).
  for(my $n=1;$n<=4;$n++) {
    my %T_NGRAM = ();
    for(my $start=0;$start<=$#WORD-($n-1);$start++) {
      my $ngram = "$n";
      for(my $w=0;$w<$n;$w++) {
        $ngram .= " ".$WORD[$start+$w];
      }
      $T_NGRAM{$ngram}++;
    }
    foreach my $ngram (keys %T_NGRAM) {
      $ngram =~ /^(\d+) /;
      my $n = $1;
      # my $corr = 0;
      # print "$i e $ngram $T_NGRAM{$ngram}<BR>\n";
      $TOTAL[$n] += $T_NGRAM{$ngram};
      if (defined($REF_NGRAM{$ngram})) {
        if ($REF_NGRAM{$ngram} >= $T_NGRAM{$ngram}) {
          $CORRECT[$n] += $T_NGRAM{$ngram};
          # $corr = $T_NGRAM{$ngram};
          # print "$i e correct1 $T_NGRAM{$ngram}<BR>\n";
        }
        else {
          $CORRECT[$n] += $REF_NGRAM{$ngram};
          # $corr = $REF_NGRAM{$ngram};
          # print "$i e correct2 $REF_NGRAM{$ngram}<BR>\n";
        }
      }
      # $REF_NGRAM{$ngram} = 0 if !defined $REF_NGRAM{$ngram};
      # print STDERR "$ngram: {$s, $REF_NGRAM{$ngram}, $T_NGRAM{$ngram}, $corr}\n"
    }
  }
  $s++;
}
my $brevity_penalty = 1;
my $bleu = 0;

# Per-order precisions; index 1..4.
my @bleu=();

for(my $n=1;$n<=4;$n++) {
  if (defined ($TOTAL[$n])){
    $bleu[$n]=($TOTAL[$n])?$CORRECT[$n]/$TOTAL[$n]:0;
    # print STDERR "CORRECT[$n]:$CORRECT[$n] TOTAL[$n]:$TOTAL[$n]\n";
  }else{
    $bleu[$n]=0;
  }
}

# No reference tokens at all: report zero and exit with failure status.
if ($length_reference==0){
  printf "BLEU = 0, 0/0/0/0 (BP=0, ratio=0, hyp_len=0, ref_len=0)\n";
  exit(1);
}

# Brevity penalty applies only when the hypothesis is shorter overall.
if ($length_translation<$length_reference) {
  $brevity_penalty = exp(1-$length_reference/$length_translation);
}
# Geometric mean of the four precisions (via my_log's floor for zeros),
# scaled by the brevity penalty.
$bleu = $brevity_penalty * exp((my_log( $bleu[1] ) +
				my_log( $bleu[2] ) +
				my_log( $bleu[3] ) +
				my_log( $bleu[4] ) ) / 4) ;
printf "BLEU = %.2f, %.1f/%.1f/%.1f/%.1f (BP=%.3f, ratio=%.3f, hyp_len=%d, ref_len=%d)\n",
	100*$bleu,
	100*$bleu[1],
	100*$bleu[2],
	100*$bleu[3],
	100*$bleu[4],
	$brevity_penalty,
	$length_translation / $length_reference,
	$length_translation,
	$length_reference;
182
+
183
# Natural logarithm with a large negative floor for zero/false inputs,
# so a zero n-gram precision does not blow up the geometric mean.
sub my_log {
  my ($value) = @_;
  return $value ? log($value) : -9999999999;
}
187
+
188
+
189
+
190
# mteval-v13a-style tokenization: unescape a few SGML entities, split
# punctuation from words, and normalize whitespace. Returns the
# tokenized string; the input is not modified.
sub tokenization
{
  my ($norm_text) = @_;

# language-independent part:
  $norm_text =~ s/<skipped>//g; # strip "skipped" tags
  $norm_text =~ s/-\n//g; # strip end-of-line hyphenation and join lines
  $norm_text =~ s/\n/ /g; # join lines
  $norm_text =~ s/&quot;/"/g;  # convert SGML tag for quote to "
  $norm_text =~ s/&amp;/&/g;   # convert SGML tag for ampersand to &
  $norm_text =~ s/&lt;/</g;    # convert SGML tag for less-than to <
  $norm_text =~ s/&gt;/>/g;    # convert SGML tag for greater-than to >

# language-dependent part (assuming Western languages):
  $norm_text = " $norm_text ";
  $norm_text =~ s/([\{-\~\[-\` -\&\(-\+\:-\@\/])/ $1 /g;   # tokenize punctuation
  $norm_text =~ s/([^0-9])([\.,])/$1 $2 /g; # tokenize period and comma unless preceded by a digit
  $norm_text =~ s/([\.,])([^0-9])/ $1 $2/g; # tokenize period and comma unless followed by a digit
  $norm_text =~ s/([0-9])(-)/$1 $2 /g; # tokenize dash when preceded by a digit
  $norm_text =~ s/\s+/ /g; # one space only between words
  $norm_text =~ s/^\s+//; # no leading space
  $norm_text =~ s/\s+$//; # no trailing space

  return $norm_text;
}
mosesdecoder/scripts/generic/multi-bleu.perl ADDED
@@ -0,0 +1,177 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/usr/bin/env perl
#
# This file is part of moses. Its use is licensed under the GNU Lesser General
# Public License version 2.1 or, at your option, any later version.

# $Id$
use warnings;
use strict;

# Fix: check that $ARGV[0] is defined before comparing it - under
# "use warnings" running the script with no arguments previously emitted
# an "uninitialized value" warning before the usage message.
my $lowercase = 0;
if (defined $ARGV[0] && $ARGV[0] eq "-lc") {
  $lowercase = 1;
  shift;
}

# First positional argument: reference file, or stem of reference0, reference1, ...
my $stem = $ARGV[0];
if (!defined $stem) {
  print STDERR "usage: multi-bleu.pl [-lc] reference < hypothesis\n";
  print STDERR "Reads the references from reference or reference0, reference1, ...\n";
  exit(1);
}

# Accept "stem.ref0" style names by appending ".ref" to the stem.
$stem .= ".ref" if !-e $stem && !-e $stem."0" && -e $stem.".ref0";

# $REF[$sentence] holds the list of reference translations for that sentence.
my @REF;
my $ref=0;
while(-e "$stem$ref") {
  &add_to_ref("$stem$ref",\@REF);
  $ref++;
}
&add_to_ref($stem,\@REF) if -e $stem;
die("ERROR: could not find reference file $stem") unless scalar @REF;

# add additional references explicitly specified on the command line
shift;
foreach my $stem (@ARGV) {
  &add_to_ref($stem,\@REF) if -e $stem;
}
+
40
+
41
+
42
# Read one reference file (plain text or gzipped) and append each line to
# the per-sentence reference lists in @$REF.
#   $file - path of the reference file
#   $REF  - array ref; $$REF[$i] collects all references for sentence $i
sub add_to_ref {
  my ($file,$REF) = @_;
  my $s=0;
  # Fix: match a literal ".gz" suffix - the dot was previously unescaped,
  # so any name ending in "gz" (e.g. "foogz") was piped through gzip.
  if ($file =~ /\.gz$/) {
    open(REF,"gzip -dc $file|") or die "Can't read $file";
  } else {
    open(REF,$file) or die "Can't read $file";
  }
  while(<REF>) {
    chomp;
    push @{$$REF[$s++]}, $_;
  }
  close(REF);
}
56
+
57
# Main scoring loop: accumulate modified n-gram precision counts (n=1..4)
# and length statistics over every tokenized hypothesis line from STDIN.
my(@CORRECT,@TOTAL,$length_translation,$length_reference);
my $s=0;
while(<STDIN>) {
  chomp;
  $_ = lc if $lowercase;
  my @WORD = split;
  # %REF_NGRAM: per-sentence clipped reference counts - for each n-gram the
  # maximum count over all references of this sentence.
  my %REF_NGRAM = ();
  my $length_translation_this_sentence = scalar(@WORD);
  my ($closest_diff,$closest_length) = (9999,9999);
  foreach my $reference (@{$REF[$s]}) {
#      print "$s $_ <=> $reference\n";
    $reference = lc($reference) if $lowercase;
    my @WORD = split(' ',$reference);
    my $length = scalar(@WORD);
    my $diff = abs($length_translation_this_sentence-$length);
    # Track the reference length closest to the hypothesis length (ties
    # broken toward the shorter reference) for the brevity penalty.
    if ($diff < $closest_diff) {
      $closest_diff = $diff;
      $closest_length = $length;
      # print STDERR "$s: closest diff ".abs($length_translation_this_sentence-$length)." = abs($length_translation_this_sentence-$length), setting len: $closest_length\n";
    } elsif ($diff == $closest_diff) {
      $closest_length = $length if $length < $closest_length;
      # from two references with the same closeness to me
      # take the *shorter* into account, not the "first" one.
    }
    # Count this reference's n-grams; keys are "<n> w1 .. wn".
    for(my $n=1;$n<=4;$n++) {
      my %REF_NGRAM_N = ();
      for(my $start=0;$start<=$#WORD-($n-1);$start++) {
        my $ngram = "$n";
        for(my $w=0;$w<$n;$w++) {
          $ngram .= " ".$WORD[$start+$w];
        }
        $REF_NGRAM_N{$ngram}++;
      }
      # Clip: keep the maximum count seen across references.
      foreach my $ngram (keys %REF_NGRAM_N) {
        if (!defined($REF_NGRAM{$ngram}) ||
            $REF_NGRAM{$ngram} < $REF_NGRAM_N{$ngram}) {
          $REF_NGRAM{$ngram} = $REF_NGRAM_N{$ngram};
          # print "$i: REF_NGRAM{$ngram} = $REF_NGRAM{$ngram}<BR>\n";
        }
      }
    }
  }
  $length_translation += $length_translation_this_sentence;
  $length_reference += $closest_length;
  # Count hypothesis n-grams and credit each up to its clipped
  # reference count (standard modified precision).
  for(my $n=1;$n<=4;$n++) {
    my %T_NGRAM = ();
    for(my $start=0;$start<=$#WORD-($n-1);$start++) {
      my $ngram = "$n";
      for(my $w=0;$w<$n;$w++) {
        $ngram .= " ".$WORD[$start+$w];
      }
      $T_NGRAM{$ngram}++;
    }
    foreach my $ngram (keys %T_NGRAM) {
      $ngram =~ /^(\d+) /;
      my $n = $1;
      # my $corr = 0;
      # print "$i e $ngram $T_NGRAM{$ngram}<BR>\n";
      $TOTAL[$n] += $T_NGRAM{$ngram};
      if (defined($REF_NGRAM{$ngram})) {
        if ($REF_NGRAM{$ngram} >= $T_NGRAM{$ngram}) {
          $CORRECT[$n] += $T_NGRAM{$ngram};
          # $corr = $T_NGRAM{$ngram};
          # print "$i e correct1 $T_NGRAM{$ngram}<BR>\n";
        }
        else {
          $CORRECT[$n] += $REF_NGRAM{$ngram};
          # $corr = $REF_NGRAM{$ngram};
          # print "$i e correct2 $REF_NGRAM{$ngram}<BR>\n";
        }
      }
      # $REF_NGRAM{$ngram} = 0 if !defined $REF_NGRAM{$ngram};
      # print STDERR "$ngram: {$s, $REF_NGRAM{$ngram}, $T_NGRAM{$ngram}, $corr}\n"
    }
  }
  $s++;
}
134
# Combine the accumulated counts into the corpus-level BLEU score
# and print the standard one-line report.
my $brevity_penalty = 1;
my $bleu = 0;

# Per-order precisions; index 1..4.
my @bleu=();

for(my $n=1;$n<=4;$n++) {
  if (defined ($TOTAL[$n])){
    $bleu[$n]=($TOTAL[$n])?$CORRECT[$n]/$TOTAL[$n]:0;
    # print STDERR "CORRECT[$n]:$CORRECT[$n] TOTAL[$n]:$TOTAL[$n]\n";
  }else{
    $bleu[$n]=0;
  }
}

# No reference tokens at all: report zero and exit with failure status.
if ($length_reference==0){
  printf "BLEU = 0, 0/0/0/0 (BP=0, ratio=0, hyp_len=0, ref_len=0)\n";
  exit(1);
}

# Brevity penalty applies only when the hypothesis is shorter overall.
if ($length_translation<$length_reference) {
  $brevity_penalty = exp(1-$length_reference/$length_translation);
}
# Geometric mean of the four precisions (via my_log's floor for zeros),
# scaled by the brevity penalty.
$bleu = $brevity_penalty * exp((my_log( $bleu[1] ) +
				my_log( $bleu[2] ) +
				my_log( $bleu[3] ) +
				my_log( $bleu[4] ) ) / 4) ;
printf "BLEU = %.2f, %.1f/%.1f/%.1f/%.1f (BP=%.3f, ratio=%.3f, hyp_len=%d, ref_len=%d)\n",
	100*$bleu,
	100*$bleu[1],
	100*$bleu[2],
	100*$bleu[3],
	100*$bleu[4],
	$brevity_penalty,
	$length_translation / $length_reference,
	$length_translation,
	$length_reference;


print STDERR "It is not advisable to publish scores from multi-bleu.perl. The scores depend on your tokenizer, which is unlikely to be reproducible from your paper or consistent across research groups. Instead you should detokenize then use mteval-v14.pl, which has a standard tokenization. Scores from multi-bleu.perl can still be used for internal purposes when you have a consistent tokenizer.\n";
173
+
174
# Natural logarithm with a large negative floor for zero/false inputs,
# so a zero n-gram precision does not blow up the geometric mean.
sub my_log {
  my ($value) = @_;
  return $value ? log($value) : -9999999999;
}
mosesdecoder/scripts/generic/multi_moses.py ADDED
@@ -0,0 +1,332 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/usr/bin/env python

# Written by Michael Denkowski
#
# This file is part of moses. Its use is licensed under the GNU Lesser General
# Public License version 2.1 or, at your option, any later version.

'''Parallelize decoding with multiple instances of moses on a local machine

To use with mert-moses.pl, activate --multi-moses and set the number of moses
instances and threads per instance with --decoder-flags='--threads P:T:E'

This script runs a specified number of moses instances, each using one or more
threads. The highest speed is generally seen with many single-threaded
instances while the lowest memory usage is seen with a single many-threaded
instance. It is recommended to use the maximum number of instances that will
fit into memory (up to the number of available CPUs) and distribute CPUs across
them equally. For example, a machine with 32 CPUs that can fit 3 copies of
moses into memory would use --threads 2:11:10 for 2 instances with 11 threads
each and an extra instance with 10 threads (3 instances total using all CPUs).

Memory mapped models can be shared by multiple processes and increase the number
of instances that can fit into memory:

Mmaped phrase tables (Ulrich Germann)
http://www.statmt.org/moses/?n=Advanced.Incremental#ntoc3

Mmaped mapped language models (Kenneth Heafield)
http://www.statmt.org/moses/?n=FactoredTraining.BuildingLanguageModel#ntoc19
'''

# Python 2 script: uses the "Queue" module (renamed "queue" in Python 3).
# NOTE(review): gzopen() below calls gzip.open but gzip is not imported at
# module scope.
import collections
import os
import Queue
import signal
import subprocess
import sys
import threading
import time

HELP = '''Multiple process decoding with Moses

Usage:
    {} moses --config moses.ini [options] [decoder flags]

Options:
    --threads P:T:E
        P: Number of parallel instances to run
        T: Number of threads per instance
        E: Number of threads in optional extra instance
        (default 1:1:0, overrides [threads] in moses.ini.  Specifying T
         and E is optional, e.g. --threads 16 starts 16 single-threaded
         instances)
    --n-best-list nbest.out N [distinct]: location and size of N-best list
    --show-weights: for mert-moses.pl, just call moses and exit

Other options (decoder flags) are passed through to moses instances
'''

# Defaults
INPUT = sys.stdin
PROCS = 1
THREADS = 1
EXTRA = 0
# Sentinel event used to tell worker/writer threads to shut down.
DONE = threading.Event()
PID = os.getpid()
# A very long time, used as Queue operation timeout even though we don't
# actually want a timeout but we do want interruptibility
# (https://bugs.python.org/issue1360)
NEVER = 60 * 60 * 24 * 365 * 1000

# Single unit of computation: decode a line, output result, signal done
Task = collections.namedtuple('Task', ['id', 'line', 'out', 'event'])
74
+
75
+
76
def kill_main(msg):
    '''kill -9 the main thread to stop everything immediately.

    Writes msg to stderr, then sends SIGKILL to this process (PID is the
    module-level pid recorded at startup), taking down all threads.
    '''
    sys.stderr.write('{}\n'.format(msg))
    os.kill(PID, signal.SIGKILL)
80
+
81
+
82
def gzopen(f):
    '''Open plain or gzipped text for reading.

    f: file path; names ending in ".gz" are opened via gzip.

    Fix: the module never imports gzip at top level, so calling this on a
    .gz file raised NameError.  Import it locally here to keep the fix
    self-contained.
    '''
    import gzip
    return gzip.open(f, 'rb') if f.endswith('.gz') else open(f, 'r')
85
+
86
+
87
def run_instance(cmd_base, threads, tasks, cpu_affinity, cpu_offset, n_best=False):
    '''Run an instance of moses that processes tasks (input lines) from a
    queue using a specified number of threads.

    cmd_base:     moses command line (copied; --threads and optionally
                  --cpu-affinity-offset are appended)
    threads:      thread count passed to this moses instance
    tasks:        shared Queue of Task tuples; a Task whose event is the
                  DONE sentinel stops the instance
    cpu_affinity: if true, pin the instance via --cpu-affinity-offset
    cpu_offset:   value for --cpu-affinity-offset
    n_best:       if true, speak the N-best protocol (sentinel blank line
                  after each input, "id ||| ..." output lines)

    Runs until the tasks queue yields DONE; on any exception the whole
    program is killed via kill_main().
    '''
    cmd = cmd_base[:]
    cmd.append('--threads')
    cmd.append(str(threads))

    if cpu_affinity:
        cmd.append('--cpu-affinity-offset')
        cmd.append(str(cpu_offset))

    #print 'BEFORE'
    #print cmd
    #print 'AFTER\n'

    try:
        # Queue of tasks instance is currently working on, limited to the number
        # of threads * 2 (minimal buffering).  The queue should be kept full for
        # optimal CPU usage.
        work = Queue.Queue(maxsize=(threads * 2))
        # Multi-threaded instance
        moses = subprocess.Popen(cmd, stdin=subprocess.PIPE, stdout=subprocess.PIPE)

        # Read and handle instance output as available
        def handle_output():
            while True:
                # Output line triggers task completion
                line = moses.stdout.readline()
                # End of output (instance finished)
                if not line:
                    break
                # Tasks complete in FIFO order: the oldest queued task
                # corresponds to this output line.
                task = work.get(timeout=NEVER)
                if n_best:
                    # Read and copy lines until sentinel line, copy real line id
                    # id ||| hypothesis words ||| feature scores ||| total score
                    (first_i, rest) = line.split(' ||| ', 1)
                    task.out.append(' ||| '.join((task.id, rest)))
                    while True:
                        line = moses.stdout.readline()
                        (i, rest) = line.split(' ||| ', 1)
                        # Sentinel
                        if i != first_i:
                            break
                        task.out.append(' ||| '.join((task.id, rest)))
                else:
                    task.out.append(line)
                # Signal task done
                task.event.set()
        # Output thread
        handler = threading.Thread(target=handle_output, args=())
        # Daemon: guaranteed to finish before non-daemons
        handler.setDaemon(True)
        handler.start()

        # Input thread: take tasks as they are available and add them to work
        # queue.  Stop when DONE encountered.
        while True:
            task = tasks.get(timeout=NEVER)
            work.put(task, timeout=NEVER)
            if task.event == DONE:
                break
            if n_best:
                # Input line followed by blank line (sentinel)
                moses.stdin.write(task.line)
                moses.stdin.write('\n')
            else:
                moses.stdin.write(task.line)

        # Cleanup: close stdin so moses drains and exits, then wait for the
        # process and its output handler.
        moses.stdin.close()
        moses.wait()
        handler.join()

    except:
        kill_main('Error with moses instance: see stderr')
162
+
163
+
164
def write_results(results, n_best=False, n_best_out=None):
    '''Write out results (output lines) from a queue as they are populated.

    results:    Queue of Task tuples in input order; a Task whose event is
                the DONE sentinel terminates the writer
    n_best:     if true, task.out holds N-best lines ("id ||| ...")
    n_best_out: open file for the N-best list (may be sys.stdout)

    Blocks on each task's event, so output is emitted strictly in input
    order regardless of which instance finished first.
    '''
    while True:
        task = results.get(timeout=NEVER)
        if task.event == DONE:
            break
        task.event.wait()
        if n_best:
            # Write top-best and N-best
            # id ||| hypothesis words ||| feature scores ||| total score
            top_best = task.out[0].split(' ||| ', 2)[1]
            # Except don't write top-best if writing N-best to stdout "-"
            if n_best_out != sys.stdout:
                sys.stdout.write('{}\n'.format(top_best))
                sys.stdout.flush()
            for line in task.out:
                n_best_out.write(line)
            n_best_out.flush()
        else:
            sys.stdout.write(task.out[0])
            sys.stdout.flush()
185
+
186
+
187
def main(argv):
    '''Entry point: parse wrapper options, start the moses instances and the
    result writer, then feed input lines as Tasks and shut everything down.
    '''
    # Defaults
    moses_ini = None
    input = INPUT
    procs = PROCS
    threads = THREADS
    extra = EXTRA
    n_best = False
    n_best_file = None
    n_best_size = None
    n_best_distinct = False
    n_best_out = None
    show_weights = False
    cpu_affinity = False

    # Decoder command
    cmd = argv[1:]

    # Parse special options and remove from cmd; everything unrecognized is
    # passed through to moses unchanged.
    i = 1
    while i < len(cmd):
        if cmd[i] in ('-f', '-config', '--config'):
            moses_ini = cmd[i + 1]
            # Do not remove from cmd
            i += 2
        elif cmd[i] in ('-i', '-input-file', '--input-file'):
            input = gzopen(cmd[i + 1])
            cmd = cmd[:i] + cmd[i + 2:]
        elif cmd[i] in ('-th', '-threads', '--threads'):
            # P:T:E
            args = cmd[i + 1].split(':')
            procs = int(args[0])
            if len(args) > 1:
                threads = int(args[1])
            if len(args) > 2:
                extra = int(args[2])
            cmd = cmd[:i] + cmd[i + 2:]
        elif cmd[i] in ('-n-best-list', '--n-best-list'):
            n_best = True
            n_best_file = cmd[i + 1]
            n_best_size = cmd[i + 2]
            # Optional "distinct"
            if i + 3 < len(cmd) and cmd[i + 3] == 'distinct':
                n_best_distinct = True
                cmd = cmd[:i] + cmd[i + 4:]
            else:
                cmd = cmd[:i] + cmd[i + 3:]
        # Handled specially for mert-moses.pl
        elif cmd[i] in ('-show-weights', '--show-weights'):
            show_weights = True
            # Do not remove from cmd
            i += 1
        elif cmd[i] in ('-cpu-affinity', '--cpu-affinity'):
            cpu_affinity = True
            cmd = cmd[:i] + cmd[i + 1:]
        else:
            i += 1

    # If mert-moses.pl passes -show-weights, just call moses
    if show_weights:
        sys.stdout.write(subprocess.check_output(cmd))
        sys.stdout.flush()
        return

    # Check inputs
    if not (len(cmd) > 0 and moses_ini):
        sys.stderr.write(HELP.format(os.path.basename(argv[0])))
        sys.exit(2)
    if not (os.path.isfile(cmd[0]) and os.access(cmd[0], os.X_OK)):
        raise Exception('moses "{}" is not executable\n'.format(cmd[0]))

    # Report settings
    sys.stderr.write('Moses flags: {}\n'.format(' '.join('\'{}\''.format(s) if ' ' in s else s for s in cmd[1:])))
    sys.stderr.write('Instances:   {}\n'.format(procs))
    sys.stderr.write('Threads per: {}\n'.format(threads))
    if extra:
        sys.stderr.write('Extra:       {}\n'.format(extra))
    if n_best:
        sys.stderr.write('N-best list: {} ({}{})\n'.format(n_best_file, n_best_size, ', distinct' if n_best_distinct else ''))

    # Task and result queues (buffer 8 * total threads input lines)
    tasks = Queue.Queue(maxsize=(8 * ((procs * threads) + extra)))
    results = Queue.Queue()

    # N-best capture: have each instance write its N-best list to stdout so
    # run_instance() can intercept and re-number it.
    if n_best:
        cmd.append('--n-best-list')
        cmd.append('-')
        cmd.append(n_best_size)
        if n_best_distinct:
            cmd.append('distinct')
        if n_best_file == '-':
            n_best_out = sys.stdout
        else:
            n_best_out = open(n_best_file, 'w')

    # Start instances (plus one extra instance if requested)
    cpu_offset = -threads
    instances = []
    for i in range(procs + (1 if extra else 0)):
        if cpu_affinity:
            cpu_offset += threads

        t = threading.Thread(target=run_instance, args=(cmd, (threads if i < procs else extra), tasks, cpu_affinity, cpu_offset, n_best))
        instances.append(t)
        # Daemon: guaranteed to finish before non-daemons
        t.setDaemon(True)
        t.start()
        #time.sleep(1)

    # Start results writer
    writer = threading.Thread(target=write_results, args=(results, n_best, n_best_out))
    writer.start()

    # Main loop: queue task for each input line.  Each task goes to both
    # queues: results (ordered output) and tasks (work distribution).
    id = 0
    while True:
        line = input.readline()
        if not line:
            break
        # (input, out lines, err lines, "done" event)
        task = Task(str(id), line, [], threading.Event())
        results.put(task, timeout=NEVER)
        tasks.put(task, timeout=NEVER)
        id += 1

    # Tell instances to exit
    for t in instances:
        tasks.put(Task(None, None, None, DONE), timeout=NEVER)
    for t in instances:
        t.join()

    # Stop results writer
    results.put(Task(None, None, None, DONE), timeout=NEVER)
    writer.join()

    # Cleanup
    if n_best:
        n_best_out.close()
326
+
327
+
328
# Script entry point: any unhandled exception hard-kills the process so no
# worker thread is left hanging.
if __name__ == '__main__':
    try:
        main(sys.argv)
    except:
        kill_main('Error with main I/O: see stderr')
mosesdecoder/scripts/generic/ph_numbers.perl ADDED
@@ -0,0 +1,106 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/usr/bin/env perl

package ph_numbers;

# Script to recognize and replace numbers in Moses training corpora
# and decoder input
#
# (c) 2013 TAUS
#
# This file is part of moses. Its use is licensed under the GNU Lesser General
# Public License version 2.1 or, at your option, any later version.

use warnings;
use strict;

# Modulino pattern: run() executes only when this file is invoked directly
# as a script; under require/use, caller() is true and nothing runs.
# (Getopt::Std below is loaded at compile time, before run() executes.)
run() unless caller();
use Getopt::Std;

# Debug tracing toggled via the DEBUG environment variable.
my $debug = $ENV{DEBUG} || 0;
20
+
21
# Command-line entry point: parse options and rewrite STDIN to STDOUT,
# replacing recognized numbers with a placeholder symbol.
#   -s / -t  source/target locale (parsed but not used in this sub)
#   -c       corpus mode: plain placeholder, no XML markup
#   -l       legacy <ne translation="..."> markup
#   -m       placeholder symbol (default '@num@')
#   -h       print usage and exit
sub run {
  my %opts;
  if(!getopts('s:t:cm:hl',\%opts) || $opts{h}) {
    print "Usage: perl $0 [-s source_locale][-t target_locale][-c][-h][-l][-m symbol] < in > out\n";
    exit;
  }
  my $sourceLocale = $opts{s} || "";
  my $targetLocale = $opts{t} || "";
  my $numberSymbol = $opts{m} || '@num@';
  while(<>) {
    chomp;
    # NOTE(review): mark_numbers() reads four arguments; the trailing $_
    # appears to be unused by the callee - confirm it can be dropped.
    print mark_numbers($_,$opts{c},$opts{l},$numberSymbol,$_),"\n";
  }
}
35
+
36
# Replace every number span found by recognize() in $input with the
# placeholder (corpus mode) or with <ne> markup (decoder-input modes),
# copying the non-number text through unchanged.  Returns the new string.
#   $input        - line of text
#   $corpusMode   - true: emit bare placeholder only
#   $legacyMode   - true: old-style <ne translation="NUMBER">sym</ne>
#   $numberSymbol - placeholder token (default '@num@')
sub mark_numbers {
  my $input = shift;
  my $corpusMode = shift;
  my $legacyMode = shift;
  my $numberSymbol = shift || '@num@';

  # List of [start, end) offsets of recognized numbers, left to right.
  my $numref = recognize($input);
  my $input_length = length($input);
  my $output = "";
  my $position = 0;
  for(my $i = 0 ; $i < scalar(@{$numref}) ; $i++) {
    my $numstart = $numref->[$i][0];
    my $numend = $numref->[$i][1];
    # Copy the literal text between the previous number and this one.
    if($position < $numstart) {
      $output .= substr($input,$position,$numstart-$position);
    }
    my $number = substr($input,$numstart,$numend-$numstart);
    if($corpusMode) {
      $output .= $numberSymbol;
    }
    else {
      if($legacyMode) {
        $output .= "<ne translation=\"$number\">$numberSymbol</ne>";
      }
      else {
        $output .= "<ne translation=\"$numberSymbol\" entity=\"$number\">$numberSymbol</ne>";
      }
    }
    $position = $numend;
  }
  # Copy any trailing text after the last number.
  $output .= substr($input,$position);
  return $output;
}
69
+
70
# Scan $input for number-like spans and return a reference to a list of
# [start, end) character offsets.  A span is kept only when it is bounded
# by the string edges or spaces on both sides (whole whitespace-delimited
# words only).
sub recognize {
  my $input = shift;
  #print STDERR "input=$input\n";

  my @recognized = ();
  # First regex finds a number token (optional sign, digits with . , e E);
  # \G with /g resumes each iteration where the previous match ended.
  while($input =~ /\G(.*?)(\s*)([+\-]?\p{Digit}*[\.,]?\p{Digit}+[\p{Digit}\.,+\-eE]*)/g) {
    my $start = $-[3];
    my $end = $+[3];
    # Greedily absorb following space-separated digit groups (e.g.
    # "1 000 000"); /c keeps pos() on failure so the outer loop resumes.
    while($input =~ /\G(\s+)(\p{Digit}+[\p{Digit}\.,+\-eE]*)/gc) {
      $end = $+[2];
    }

    # ALL characters in the word must be
    my $isRecognized = 1;
    if ($start == 0 || substr($input, $start - 1, 1) eq " ") {
      # 1st word, or previous char is a space
    }
    else {
      $isRecognized = 0;
    }

    if ($end == length($input) || substr($input, $end, 1) eq " ") {
      # last word, or next char is a space
    }
    else {
      $isRecognized = 0;
    }

    #print STDERR "start=$start end=$end len=" .length($input) ."\n";
    if ($isRecognized) {
      push @recognized,[$start,$end];
    }
  }
  return \@recognized;
}
105
+
106
+ 1;
mosesdecoder/scripts/generic/reverse-alignment.perl ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/usr/bin/env perl
#
# This file is part of moses. Its use is licensed under the GNU Lesser General
# Public License version 2.1 or, at your option, any later version.

use warnings;
use strict;

# Swap the two sides of each word-alignment point: every "s-t" token on a
# line becomes "t-s".  Output keeps one trailing space before the newline,
# matching the original format.
while (my $line = <STDIN>)
{
	chomp($line);
	foreach my $tok (split(/ /, $line))
	{
		my @pair = split(/-/, $tok);
		(@pair == 2) or die("Something wrong");
		print $pair[1] . "-" . $pair[0] . " ";
	}
	print "\n";
}
24
+
mosesdecoder/scripts/generic/score-parallel.perl ADDED
@@ -0,0 +1,428 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/usr/bin/env perl
#
# This file is part of moses. Its use is licensed under the GNU Lesser General
# Public License version 2.1 or, at your option, any later version.

# example
# ./score-parallel.perl 8 "gsort --batch-size=253" ./score ./extract.2.sorted.gz ./lex.2.f2e ./phrase-table.2.half.f2e --GoodTuring ./phrase-table.2.coc 0
# ./score-parallel.perl 8 "gsort --batch-size=253" ./score ./extract.2.inv.sorted.gz ./lex.2.e2f ./phrase-table.2.half.e2f --Inverse 1

# Parallel driver for the Moses "score" program: splits the (sorted)
# extract file into chunks on source-phrase boundaries, scores each chunk
# in a forked worker, then merges the partial phrase tables (and the
# optional .coc / label / parts-of-speech side files) back together.

use warnings;
use strict;
use File::Basename;

sub RunFork($);
sub systemCheck($);
sub GetSourcePhrase($);
sub NumStr($);
sub CutContextFile($$$);

# Prefer pigz (parallel gzip) when it is on the PATH.
my $GZIP_EXEC;
if(`which pigz`) {
  $GZIP_EXEC = 'pigz';
}
else {
  $GZIP_EXEC = 'gzip';
}
print STDERR "using $GZIP_EXEC \n";

# Maximum number of extract lines per chunk; a chunk may run longer so
# that it always ends on a source-phrase boundary.
#my $EXTRACT_SPLIT_LINES = 5000000;
my $EXTRACT_SPLIT_LINES = 50000000;

print STDERR "Started ".localtime() ."\n";

# Positional arguments (see examples above).
my $numParallel = $ARGV[0];
$numParallel = 1 if $numParallel < 1;

my $sortCmd = $ARGV[1];
my $scoreCmd = $ARGV[2];

my $extractFile = $ARGV[3]; # 1st arg of extract argument
my $lexFile = $ARGV[4];
my $ptHalf = $ARGV[5]; # output
my $inverse = 0;
my $sourceLabelsFile;
my $partsOfSpeechFile;
my $targetSyntacticPreferencesLabelsFile;

# Collect all remaining flags (except the trailing sort flag) to forward
# to the score program; a few flags also set local state here.
my $otherExtractArgs= "";
for (my $i = 6; $i < $#ARGV; ++$i)
{
  if ($ARGV[$i] eq '--SourceLabels') {
    $sourceLabelsFile = $ARGV[++$i];
    $otherExtractArgs .= "--SourceLabels --SourceLabelCountsLHS ";
    next;
  }
  if ($ARGV[$i] eq '--PartsOfSpeech') {
    $partsOfSpeechFile = $ARGV[++$i];
    $otherExtractArgs .= "--PartsOfSpeech ";
    next;
  }
  if ($ARGV[$i] eq '--TargetSyntacticPreferences') {
    $targetSyntacticPreferencesLabelsFile = $ARGV[++$i];
    $otherExtractArgs .= "--TargetSyntacticPreferences ";
    next;
  }
  if ($ARGV[$i] eq '--Inverse') {
    $inverse = 1;
    $otherExtractArgs .= $ARGV[$i] ." ";
    next;
  }
  $otherExtractArgs .= $ARGV[$i] ." ";
}
#$scoreCmd $extractFile $lexFile $ptHalf $otherExtractArgs

# --FlexibilityScore=<cmd> names a post-processing script; extract the
# command and strip the flag so it is not passed to the score program.
my $FlexibilityScore = $otherExtractArgs =~ /--FlexibilityScore/;
my $FlexibilityCmd = $otherExtractArgs;
$otherExtractArgs =~ s/--FlexibilityScore=\S+//; # don't pass flexibility_score command to score program
if ($FlexibilityCmd =~ /--FlexibilityScore=(\S+)/) {
  $FlexibilityCmd = $1;
}

my $doSort = $ARGV[$#ARGV]; # last arg

# Per-run scratch directory next to the output file; removed at the end.
my $TMPDIR=dirname($ptHalf) ."/tmp.$$";
mkdir $TMPDIR;

my $cmd;

# The extract context file (for flexibility scoring) is named by
# convention after the extract file.
my $extractFileContext;
if ($FlexibilityScore) {
  $extractFileContext = $extractFile;
  $extractFileContext =~ s/extract./extract.context./;
}

my $fileCount = 0;
if ($numParallel <= 1)
{ # don't do parallel. Just link the extract file into place
  $cmd = "ln -s $extractFile $TMPDIR/extract.0.gz";
  if ($FlexibilityScore) {
    $cmd .= " && ln -s $extractFileContext $TMPDIR/extract.context.0.gz";
  }
  print STDERR "$cmd \n";
  systemCheck($cmd);

  $fileCount = 1;
}
else
{ # cut up extract file into smaller mini-extract files.
  if ($extractFile =~ /\.gz$/) {
    open(IN, "gunzip -c $extractFile |") || die "can't open pipe to $extractFile";
  }
  else {
    open(IN, $extractFile) || die "can't open $extractFile";
  }

  # When flexibility scoring, the context file is split in lock-step with
  # the extract file (see CutContextFile); $lastlineContext carries the
  # one read-ahead line between chunk cuts.
  my $lastlineContext;
  if ($FlexibilityScore) {
    $lastlineContext = "";
    if ($extractFileContext =~ /\.gz$/) {
      open(IN_CONTEXT, "gunzip -c $extractFileContext |") || die "can't open pipe to $extractFileContext";
    }
    else {
      open(IN_CONTEXT, $extractFileContext) || die "can't open $extractFileContext";
    }
  }

  my $filePath = "$TMPDIR/extract.$fileCount.gz";
  open (OUT, "| $GZIP_EXEC -c > $filePath") or die "error starting $GZIP_EXEC $!";

  my $lineCount = 0;
  my $line;
  my $prevSourcePhrase = "";
  while ($line=<IN>)
  {
    chomp($line);
    ++$lineCount;

    if ($lineCount > $EXTRACT_SPLIT_LINES)
    { # over line limit. Cut off at next source phrase change
      my $sourcePhrase = GetSourcePhrase($line);

      if ($prevSourcePhrase eq "")
      { # start comparing
        $prevSourcePhrase = $sourcePhrase;
      }
      elsif ($sourcePhrase eq $prevSourcePhrase)
      { # can't cut off yet. Do nothing
      }
      else
      { # cut off, open next min-extract file & write to that instead
        close OUT;

        if ($FlexibilityScore) {
          $lastlineContext = CutContextFile($prevSourcePhrase, $fileCount, $lastlineContext);
        }
        $prevSourcePhrase = "";
        $lineCount = 0;
        ++$fileCount;
        my $filePath = $fileCount;
        $filePath = "$TMPDIR/extract.$filePath.gz";
        open (OUT, "| $GZIP_EXEC -c > $filePath") or die "error starting $GZIP_EXEC $!";
      }
    }
    else
    { # keep on writing to current mini-extract file
    }

    print OUT "$line\n";

  }
  close OUT;
  if ($FlexibilityScore) {
    $lastlineContext = CutContextFile($prevSourcePhrase, $fileCount, $lastlineContext);
  }
  ++$fileCount;
}


# create run scripts
my @runFiles = (0..($numParallel-1));
for (my $i = 0; $i < $numParallel; ++$i)
{
  my $path = "$TMPDIR/run.$i.sh";
  open(my $fh, ">", $path) or die "cannot open $path: $!";
  $runFiles[$i] = $fh;
}

# write scoring of mini-extracts to run scripts
# (chunk $i is assigned round-robin to worker $i % $numParallel)
for (my $i = 0; $i < $fileCount; ++$i)
{
  my $numStr = NumStr($i);

  my $fileInd = $i % $numParallel;
  my $fh = $runFiles[$fileInd];

  my $cmd = "$scoreCmd $TMPDIR/extract.$i.gz $lexFile $TMPDIR/phrase-table.half.$numStr.gz $otherExtractArgs 2>> /dev/stderr \n";
  print STDERR $cmd;

  # Optionally post-process each half phrase table with the flexibility
  # score script, replacing the chunk in place.
  if ($FlexibilityScore) {
    $cmd .= "gzip -cd $TMPDIR/phrase-table.half.$numStr.gz | $FlexibilityCmd $TMPDIR/extract.context.$i.gz";
    $cmd .= " --Inverse" if ($otherExtractArgs =~ /--Inverse/);
    $cmd .= " --Hierarchical" if ($otherExtractArgs =~ /--Hierarchical/);
    $cmd .= " | $GZIP_EXEC -c > $TMPDIR/phrase-table.half.$numStr.flex.gz\n";
    $cmd .= "mv $TMPDIR/phrase-table.half.$numStr.flex.gz $TMPDIR/phrase-table.half.$numStr.gz\n";
  }

  print $fh $cmd;
}

# close run script files
for (my $i = 0; $i < $numParallel; ++$i)
{
  close($runFiles[$i]);
  my $path = "$TMPDIR/run.$i.sh";
  systemCheck("chmod +x $path");
}

# run each score script in parallel
my @children;
for (my $i = 0; $i < $numParallel; ++$i)
{
  my $cmd = "$TMPDIR/run.$i.sh";
  my $pid = RunFork($cmd);
  push(@children, $pid);
}

# wait for everything is finished
foreach (@children) {
  waitpid($_, 0);
}

# merge & sort
$cmd = "\n\nOH SHIT. This should have been filled in \n\n";
if ($fileCount == 1 && !$doSort && !$FlexibilityScore)
{
  # Single chunk, no sorting requested: just move it into place.
  my $numStr = NumStr(0);
  $cmd = "mv $TMPDIR/phrase-table.half.$numStr.gz $ptHalf";
}
else
{
  # Concatenate (and optionally sort) all chunks into the final table.
  $cmd = "gunzip -c $TMPDIR/phrase-table.half.*.gz 2>> /dev/stderr";

  if ($doSort) {
    $cmd .= "| LC_ALL=C $sortCmd -T $TMPDIR ";
  }

  $cmd .= " | $GZIP_EXEC -c > $ptHalf 2>> /dev/stderr ";
}
print STDERR $cmd;
systemCheck($cmd);

# merge coc
# (.coc files hold one integer per line; merging sums them line-wise.
# NOTE(review): files after the first are assumed to have no more lines
# than the first -- extra lines would extend @arrayCOC only via the
# first file. TODO confirm all chunks emit equal-length .coc files.)
my $numStr = NumStr(0);
my $cocPath = "$TMPDIR/phrase-table.half.$numStr.gz.coc";

if (-e $cocPath)
{
  my @arrayCOC;
  my $line;

  # 1st file
  open(FHCOC, $cocPath) || die "can't open pipe to $cocPath";
  while ($line = <FHCOC>)
  {
    my $coc = int($line);
    push(@arrayCOC, $coc);
  }
  close(FHCOC);

  # all other files
  for (my $i = 1; $i < $fileCount; ++$i)
  {
    $numStr = NumStr($i);
    $cocPath = "$TMPDIR/phrase-table.half.$numStr.gz.coc";
    open(FHCOC, $cocPath) || die "can't open pipe to $cocPath";
    my $arrayInd = 0;
    while ($line = <FHCOC>)
    {
      my $coc = int($line);
      $arrayCOC[$arrayInd] += $coc;

      ++$arrayInd;
    }

    close(FHCOC);
  }

  # output
  $cocPath = "$ptHalf.coc";
  open(FHCOC, ">", $cocPath) or die "cannot open $cocPath: $!";
  for (my $i = 0; $i < @arrayCOC; ++$i)
  {
    print FHCOC $arrayCOC[$i]."\n";
  }
  close(FHCOC);
}

# merge source labels files
if (!$inverse && defined($sourceLabelsFile))
{
  my $cmd = "(echo \"GlueTop 0\"; echo \"GlueX 1\"; echo \"SSTART 2\"; echo \"SEND 3\"; cat $TMPDIR/phrase-table.half.*.gz.syntaxLabels.src | LC_ALL=C sort | uniq | perl -pe \"s/\$/ \@{[\$.+3]}/\") > $sourceLabelsFile";
  print STDERR "Merging source labels files: $cmd \n";
  `$cmd`;
}

# merge parts-of-speech files
if (!$inverse && defined($partsOfSpeechFile))
{
  my $cmd = "(echo \"SSTART 0\"; echo \"SEND 1\"; cat $TMPDIR/phrase-table.half.*.gz.partsOfSpeech | LC_ALL=C sort | uniq | perl -pe \"s/\$/ \@{[\$.+1]}/\") > $partsOfSpeechFile";
  print STDERR "Merging parts-of-speech files: $cmd \n";
  `$cmd`;
}

# merge target syntactic preferences labels files
if (!$inverse && defined($targetSyntacticPreferencesLabelsFile))
{
  my $cmd = "(echo \"GlueTop 0\"; echo \"GlueX 1\"; cat $TMPDIR/phrase-table.half.*.gz.syntaxLabels.tgtpref | LC_ALL=C sort | uniq | perl -pe \"s/\$/ \@{[\$.+1]}/\") > $targetSyntacticPreferencesLabelsFile";
  print STDERR "Merging target syntactic preferences labels files: $cmd \n";
  `$cmd`;
}

# Clean up the scratch directory.
$cmd = "rm -rf $TMPDIR \n";
print STDERR $cmd;
systemCheck($cmd);

print STDERR "Finished ".localtime() ."\n";
328
+ # -----------------------------------------
329
+ # -----------------------------------------
330
+
331
# Fork a child process that runs $cmd (via systemCheck) and exits.
# Returns the child's pid to the parent.
#
# Fix: fork() returns undef on failure, and "undef == 0" is numerically
# true, so the original made the PARENT fall into the child branch on a
# failed fork -- it would run the command itself and then exit. Check
# for undef explicitly and die instead.
sub RunFork($)
{
  my $cmd = shift;

  my $pid = fork();
  die "fork failed: $!" if !defined($pid);

  if ($pid == 0)
  { # child
    print STDERR $cmd;
    systemCheck($cmd);
    exit();
  }
  return $pid;
}
345
# Run a shell command and exit(1) if it fails.
#
# Fix: the original exited silently, leaving no clue which of the many
# commands this script runs was the one that failed. Report the command
# and its exit status on STDERR first.
sub systemCheck($)
{
  my $cmd = shift;
  my $retVal = system($cmd);
  if ($retVal != 0)
  {
    print STDERR "ERROR: command failed (exit status ".($retVal >> 8)."): $cmd\n";
    exit(1);
  }
}
354
+
355
# Return everything before the first "|||" separator on an extract line
# (including the trailing space, which callers rely on for comparison).
# NOTE(review): if the separator is missing, index() returns -1 and
# substr($line, 0, -1) yields the line minus its last character -- the
# same behaviour as the original code.
sub GetSourcePhrase($)
{
  my ($line) = @_;
  return substr($line, 0, index($line, "|||"));
}
362
+
363
+
364
# Zero-pad a chunk index to 7 digits so that lexicographic file-name
# order matches numeric order (e.g. 5 -> "0000005"). Indexes with more
# than 7 digits are returned unpadded, exactly as before.
sub NumStr($)
{
  my ($i) = @_;
  return sprintf("%07d", $i);
}
391
+
392
+
393
# Write one chunk of the extract *context* file, in lock-step with the
# extract-file chunk that was just closed.
#
# Arguments:
#   $lastsourcePhrase - last source phrase belonging to this chunk; copy
#                       context lines up to and including this phrase.
#   $fileCount        - chunk index, used to name the output file.
#   $lastline         - read-ahead line left over from the previous call
#                       (or "" on the first call); written first.
# Returns the one line read past the chunk boundary, so the next call
# can write it (the filehandle cursor has already consumed it).
#
# Uses globals: $TMPDIR, $GZIP_EXEC, and the IN_CONTEXT filehandle
# opened by the main script.
sub CutContextFile($$$)
{
  my($lastsourcePhrase, $fileCount, $lastline) = @_;
  my $line;
  my $sourcePhrase;

  my $filePath = "$TMPDIR/extract.context.$fileCount.gz";
  open (OUT_CONTEXT, "| $GZIP_EXEC -c > $filePath") or die "error starting $GZIP_EXEC $!";

  # Flush the read-ahead line from the previous chunk cut, if any.
  if ($lastline ne "") {
    print OUT_CONTEXT "$lastline\n";
  }

  #write all lines in context file until we meet last source phrase in extract file
  while ($line=<IN_CONTEXT>)
  {
    chomp($line);
    $sourcePhrase = GetSourcePhrase($line);
    print OUT_CONTEXT "$line\n";
    if ($sourcePhrase eq $lastsourcePhrase) {last;}
  }

  #write all lines in context file that correspond to last source phrase in extract file
  while ($line=<IN_CONTEXT>)
  {
    chomp($line);
    $sourcePhrase = GetSourcePhrase($line);
    if ($sourcePhrase ne $lastsourcePhrase) {last;}
    print OUT_CONTEXT "$line\n";
  }

  close(OUT_CONTEXT);

  # NOTE(review): $line is the first line of the NEXT chunk (or undef at
  # EOF); the caller stores it and passes it back in as $lastline.
  return $line;

}
mosesdecoder/scripts/generic/score_parallel.py ADDED
@@ -0,0 +1,776 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #! /usr/bin/env python
2
+ #
3
+ # This file is part of moses. Its use is licensed under the GNU Lesser General
4
+ # Public License version 2.1 or, at your option, any later version.
5
+ #
6
+ # Script contributed by Precision Translation Tools.
7
+
8
+ """Run Moses `score` jobs in parallel.
9
+
10
+ This script is a replacement for `score-parallel.perl`. The two are similar,
11
+ but there are differences in usage. In addition, this script can be called
12
+ directly from Python code without the need to run it as a separate process.
13
+ """
14
+
15
+ from __future__ import (
16
+ absolute_import,
17
+ print_function,
18
+ unicode_literals,
19
+ )
20
+
21
+ __metaclass__ = type
22
+
23
+ from argparse import ArgumentParser
24
+ from contextlib import contextmanager
25
+ from datetime import datetime
26
+ import errno
27
+ import gzip
28
+ from multiprocessing import Pool
29
+ import os
30
+ import os.path
31
+ import pipes
32
+ from shutil import rmtree
33
+ from subprocess import check_call
34
+ import sys
35
+ import tempfile
36
+
37
+
38
def get_unicode_type():
    """Return the Unicode string type appropriate to this Python version."""
    # Python 2 keeps text in the "unicode" type ("str" is a byte string
    # there); from Python 3 onward the default "str" type is the text type.
    return unicode if sys.version_info.major <= 2 else str


UNICODE_TYPE = get_unicode_type()
51
+
52
+
53
class CommandLineError(Exception):
    """Invalid command line."""
    # Raised for user-facing argument problems, e.g. a bad --jobs value
    # or a qualified executable path that is not actually executable.
55
+
56
+
57
class ProgramFailure(Exception):
    """Failure, not a bug, which is reported neatly to the user."""
    # Raised for environment problems such as a required external program
    # (score, gzip, sort) not being found on the PATH.
59
+
60
+
61
def parse_args():
    """Parse command line arguments, return as `Namespace`."""
    # Required inputs/outputs first, then optional behaviour switches,
    # then tool-location overrides. Defaults for the tool options are
    # resolved later by sanitize_args(), not here.
    parser = ArgumentParser(description=__doc__)
    parser.add_argument(
        '--extract-file', '-e', metavar='PATH', required=True,
        help=(
            "Path to input file: extract file (e.g. 'extract.sorted.gz' or "
            "'extract.inv.sorted.gz'). Required."))
    parser.add_argument(
        '--lex-file', '-l', metavar='PATH', required=True,
        help=(
            "Path to input file: lex file (e.g. 'lex.f2e' or 'lex.e2f'). "
            "Required."))
    parser.add_argument(
        '--output', '-o', metavar='PATH', required=True,
        help=(
            "Write phrase table to file PATH (e.g. 'phrase-table.half.f2e' "
            "or 'phrase-table.half.e2f'). Required."))
    parser.add_argument(
        '--inverse', '-i', action='store_true',
        help="Inverse scoring. Defaults to direct scoring.")
    parser.add_argument(
        '--labels-file', '-L', metavar='PATH',
        help="Also write source labels to file PATH.")
    parser.add_argument(
        '--parts-of-speech', '-p', metavar='PATH',
        help="Also write parts-of-speech file to PATH.")
    parser.add_argument(
        '--flexibility-score', '-F', metavar='PATH',
        help="Path to the 'flexibility_score.py' script. Defaults to none.")
    parser.add_argument(
        '--hierarchical', '-H', action='store_true',
        help="Process hierarchical rules.")
    parser.add_argument(
        '--args', '-a', metavar='ARGUMENTS',
        help="Additional arguments for `score` and `flexibility_score`.")
    parser.add_argument(
        '--sort', '-s', action='store_true',
        help="Sort output file.")
    parser.add_argument(
        '--jobs', '-j', metavar='N', type=int, default=1,
        help="Run up to N jobs in parallel. Defaults to %(default)s.")
    parser.add_argument(
        '--score-exe', '-x', metavar='PROGRAM',
        help="Name of, or path to, the 'score' executable.")
    parser.add_argument(
        '--sort-command', '-S', metavar='COMMAND-LINE',
        help=(
            "Command line for sorting text files to standard output. "
            "Must support operation as a pipe, as well as input files named "
            "as command-line arguments."))
    parser.add_argument(
        '--gzip-command', '-z', metavar='PROGRAM',
        help="Path to a gzip or pigz executable.")
    parser.add_argument(
        '--verbose', '-v', action='store_true',
        help="Print what's going on.")
    parser.add_argument(
        '--debug', '-d', action='store_true',
        help="Don't delete temporary directories when done.")
    return parser.parse_args()
122
+
123
+
124
def normalize_path(optional_path=None):
    """Return a cleaned-up version of a given filesystem path, or None.

    Converts the path to the operating system's native conventions, and
    removes redundancies like `.`.

    The return value will be `None`, an absolute path, or a relative path,
    same as the argument. But it will have redundant path separators,
    unnecessary detours through parent directories, and use of the current
    directory "." removed.
    """
    if optional_path is None:
        return None
    cleaned = os.path.normpath(optional_path)
    # Fold both separator styles into the native one.
    for separator in ('/', '\\'):
        cleaned = cleaned.replace(separator, os.path.sep)
    return cleaned
142
+
143
+
144
def quote(path):
    """Quote and escape a filename for use in a shell command.

    The Windows implementation is very limited and will break on anything
    more advanced than a space.
    """
    if os.name == 'posix':
        # pipes.quote was deprecated and removed in Python 3.13;
        # shlex.quote is its direct replacement. Fall back to pipes on
        # old interpreters (Python 2) that lack shlex.quote.
        try:
            from shlex import quote as shell_quote
        except ImportError:
            from pipes import quote as shell_quote
        return shell_quote(path)
    else:
        # TODO: Improve escaping for Windows.
        return '"%s"' % path
155
+
156
+
157
def sanitize_args(args):
    """Check `args` for sanity, clean up, and set nontrivial defaults.

    Mutates the argparse `Namespace` in place: resolves default sort,
    gzip, and score executables, and normalizes all path arguments.

    :raises CommandLineError: for an invalid --jobs value, or when no
        sort/gzip command can be found on the PATH.
    """
    if args.jobs < 1:
        raise CommandLineError("Number of parallel jobs must be 1 or more.")
    # Prefer neandersort (handles compressed files natively -- see
    # merge_and_sort), then GNU gsort, then plain sort.
    if args.sort_command is None:
        args.sort_command = find_first_executable(
            ['neandersort', 'gsort', 'sort'])
    if args.sort_command is None:
        raise CommandLineError(
            "No 'sort' command is available. "
            "Choose one using the --sort-command option.")
    # Prefer pigz (parallel gzip) when available.
    if args.gzip_command is None:
        args.gzip_command = find_first_executable(['pigz', 'gzip'])
    if args.gzip_command is None:
        raise CommandLineError(
            "No 'gzip' or 'pigz' command is available. "
            "Choose one using the --gzip-command option.")
    # NOTE(review): find_first_executable raises ProgramFailure rather
    # than returning None, so the two "is None" checks above look
    # unreachable -- confirm intended behaviour.
    if args.score_exe is None:
        # Look for "score" executable. It may be in the current project
        # directory somewhere, or in the PATH.
        moses_dir = os.path.dirname(os.path.dirname(
            os.path.abspath(__file__)))
        args.score_exe = find_first_executable(
            ['score'],
            [
                moses_dir,
                os.path.join(moses_dir, 'phrase-extract'),
                os.path.join(moses_dir, 'binaries'),
            ])
    # Normalize every path-valued argument to native conventions.
    args.extract_file = normalize_path(args.extract_file)
    args.lex_file = normalize_path(args.lex_file)
    args.output = normalize_path(args.output)
    args.labels_file = normalize_path(args.labels_file)
    args.parts_of_speech = normalize_path(args.parts_of_speech)
    args.flexibility_score = normalize_path(args.flexibility_score)
    args.score_exe = normalize_path(args.score_exe)
193
+
194
+
195
def add_exe_suffix(program):
    """Return the full filename for an executable.

    On Windows this appends an `.exe` suffix to the name; on other
    (POSIX-like) systems the original name is returned unchanged.
    """
    suffix = '.exe' if os.name == 'nt' else ''
    return program + suffix
207
+
208
+
209
def find_executable(exe, extra_path=None):
    """Return full path to an executable of the given name, or `None`.

    If the given name is a qualified path to an executable, it will be returned
    unchanged. A qualified path where no executable is found results in a
    `CommandLineError`.

    :param exe: Program name (with platform suffix) or qualified path.
    :param extra_path: Directories to search before `$PATH`.
    """
    if extra_path is None:
        extra_path = []

    if os.path.sep in exe:
        # The executable name includes a path. Only one place it can be.
        if not os.path.isfile(exe) or not os.access(exe, os.X_OK):
            raise CommandLineError("Not an executable: '%s'." % exe)
        return exe

    # Fix: PATH may be unset, in which case os.getenv returned None and
    # .split crashed; default to the empty string. Also require each
    # candidate to be a regular file -- os.access(path, X_OK) is true
    # for directories, so a directory named like the program used to
    # match.
    for path in extra_path + os.getenv('PATH', '').split(os.pathsep):
        full_path = os.path.join(path, exe)
        if os.path.isfile(full_path) and os.access(full_path, os.X_OK):
            return full_path
    return None
230
+
231
+
232
def find_first_executable(candidates, extra_path=None):
    """Find the first available of the given candidate programs.

    :raise ProgramFailure: If none of `candidates` was found.
    """
    for candidate in candidates:
        located = find_executable(add_exe_suffix(candidate), extra_path)
        if located is not None:
            return located
    raise ProgramFailure(
        "Could not find any of these executables in path: %s."
        % ', '.join(candidates))
244
+
245
+
246
def execute_shell(command, verbose=False):
    """Run `command` string through the shell.

    Inherits environment, but sets `LC_ALL` to `C` for predictable results,
    especially from sort commands.

    This uses a full-featured shell, including pipes, substitution, etc. So
    remember to quote/escape arguments where appropriate!
    """
    assert isinstance(command, UNICODE_TYPE), (
        "Wrong argument for execute_shell.")
    if verbose:
        print("Executing: %s" % command)
    environment = dict(os.environ)
    if os.name == 'posix':
        environment['LC_ALL'] = 'C'
    check_call(command, shell=True, env=environment)
263
+
264
+
265
@contextmanager
def tempdir(keep=False):
    """Context manager: temporary directory.

    Yields the path of a freshly created temporary directory. Unless
    `keep` is true (useful for debugging), the directory and everything
    in it is removed on exit.

    Fix: cleanup now runs in a `finally` block, so the directory is no
    longer leaked when the body raises an exception.
    """
    directory = tempfile.mkdtemp()
    try:
        yield directory
    finally:
        if not keep:
            rmtree(directory)
272
+
273
+
274
def make_dirs(path):
    """Equivalent to `mkdir -p -- path`."""
    try:
        os.makedirs(path)
    except OSError as error:
        # An already-existing directory is fine; re-raise anything else.
        if error.errno == errno.EEXIST:
            return
        raise
281
+
282
+
283
def open_file(path, mode='r'):
    """Open a file, which may be gzip-compressed.

    Files whose name ends in `.gz` are opened through the gzip module;
    anything else is opened as a plain file.
    """
    opener = gzip.open if path.endswith('.gz') else open
    return opener(path, mode)
289
+
290
+
291
def count_lines(filename):
    """Count the number of lines in `filename` (may be gzip-compressed)."""
    # Inline the gzip-or-plain choice so this helper stands alone.
    opener = gzip.open if filename.endswith('.gz') else open
    total = 0
    with opener(filename) as stream:
        for _ in stream:
            total += 1
    return total
298
+
299
+
300
def set_temp_dir():
    """Set temporary directory to `$MOSES_TEMP_DIR`, if set.

    Creates the directory if it does not exist yet; does nothing when
    the environment variable is unset.
    """
    configured = os.getenv('MOSES_TEMP_DIR')
    if configured is None:
        return
    make_dirs(configured)
    tempfile.tempdir = configured
309
+
310
+
311
def strip_newline(line):
    """Remove trailing carriage return and/or line feed, if present.

    Strips at most one LF and then at most one CR, so exactly one
    "\\r\\n" or "\\n" line ending disappears.
    """
    for terminator in ('\n', '\r'):
        if line.endswith(terminator):
            line = line[:-1]
    return line
318
+
319
+
320
def open_chunk_file(split_dir, chunk_number):
    """Open a file to write one chunk of the extract file."""
    chunk_path = os.path.join(split_dir, 'extract.%d.gz' % chunk_number)
    return open_file(chunk_path, 'w')
324
+
325
+
326
def name_context_chunk_file(split_dir, chunk_number):
    """Compose file name for one chunk of the extract context file."""
    basename = 'extract.context.%d.gz' % chunk_number
    return os.path.join(split_dir, basename)
330
+
331
+
332
def extract_source_phrase(line):
    """Extract the source phrase from an extract-file line.

    Accepts either bytes or text. Extract lines arrive as bytes when
    read straight from a gzip stream (as in `cut_context_file`), but
    `split_extract_files` decodes them to unicode before calling this
    function -- and splitting a text line on a bytes separator raises
    TypeError on Python 3. Pick the separator type to match the input.
    """
    separator = b'|||' if isinstance(line, bytes) else '|||'
    return line.split(separator, 1)[0]
335
+
336
+
337
def cut_context_file(last_source_phrase, chunk_file, last_line,
                     context_stream):
    """Write one chunk of extract context file into its own file.

    :param last_source_phrase: Last source phrase that should be in the
        chunk. Stop processing after this source phrase.
    :param chunk_file: Path to the extract context file for this chunk.
    :param last_line: Previously read line that may still need writing.
    :param context_stream: Extract context file, opened for reading.
    :return: Last line read from `context_stream`. This line will still
        need processing.
    """
    # NOTE(review): gzip.open(..., 'w') is a *binary* stream on Python 3,
    # while the lines written below are formatted with '%s\n' (text);
    # this path looks Python-2-only as written -- TODO confirm.
    # TODO: Use open_file.
    with gzip.open(chunk_file, 'w') as chunk:
        if last_line is not None:
            chunk.write('%s\n' % last_line)

        # Are we processing our last source phrase yet?
        on_last_source_phrase = False

        # Write all lines in context file until we meet last source phrase
        # in extract file.
        for line in context_stream:
            # Reading from a gzip file returns lines *including the newline*.
            # Either way, we want to ignore carriage returns as well.
            line = strip_newline(line)
            source_phrase = extract_source_phrase(line)
            if on_last_source_phrase and source_phrase != last_source_phrase:
                # First new source phrase after our last one. We're done.
                # The returned line has been consumed from the stream, so
                # the *next* chunk must write it first (see last_line).
                return line
            else:
                # Still adding lines to our chunk.
                chunk.write('%s\n' % line)
                if source_phrase == last_source_phrase:
                    # We're on our last source phrase now.
                    on_last_source_phrase = True
    # NOTE(review): falls off the end (returns None) at EOF of the
    # context stream -- callers treat None as "nothing left over".
373
+
374
+
375
def split_extract_files(split_dir, extract_file, extract_context_file=None,
                        jobs=1):
    """Split extract file into chunks, so we can process them in parallel.

    :param split_dir: A temporary directory where this function can write
        temporary files. The caller must ensure that this directory will be
        cleaned up after it's done with the files.
    :return: An iterable of tuples. Each tuple holds a partial extract file,
        and the corresponding context file (or None). The files may be in
        `split_dir`, or there may just be the original extract file.
    """
    if jobs == 1:
        # No splitting needed. Read the original file(s).
        return [(extract_file, extract_context_file)]

    # Otherwise: split files.
    files = []
    num_lines = count_lines(extract_file)
    # Ceiling division. Fix: the original used "/", which on Python 3
    # produces a float and trips the isinstance assertion below.
    chunk_size = (num_lines + jobs - 1) // jobs
    assert isinstance(chunk_size, int)

    line_count = 0
    chunk_number = 0
    prev_source_phrase = None
    last_line_context = None
    context_stream = None
    chunk_context_file = None
    extract_stream = open_file(extract_file)
    chunk_file = open_chunk_file(split_dir, chunk_number)
    if extract_context_file is not None:
        context_stream = open_file(extract_context_file)

    for line in extract_stream:
        line_count += 1
        line = line.decode('utf-8')
        line = strip_newline(line)
        if line_count >= chunk_size:
            # At or over chunk size. Cut off at next source phrase change.
            source_phrase = extract_source_phrase(line)
            if prev_source_phrase is None:
                # Start looking for a different source phrase.
                prev_source_phrase = source_phrase
            elif source_phrase == prev_source_phrase:
                # Can't cut yet. Still working on the same source phrase.
                pass
            else:
                # Hit first new source phrase after chunk limit. Cut new
                # file(s).
                chunk_file.close()
                if extract_context_file is not None:
                    chunk_context_file = name_context_chunk_file(
                        split_dir, chunk_number)
                    last_line_context = cut_context_file(
                        prev_source_phrase, chunk_context_file,
                        last_line_context, context_stream)
                files.append((chunk_file.name, chunk_context_file))

                # Start on new chunk.
                prev_source_phrase = None
                line_count = 0
                chunk_number += 1
                chunk_file = open_chunk_file(split_dir, chunk_number)
        chunk_file.write(('%s\n' % line).encode('utf-8'))

    chunk_file.close()
    if extract_context_file is not None:
        chunk_context_file = name_context_chunk_file(split_dir, chunk_number)
        # Fix: the original passed chunk_number (an int) where
        # cut_context_file expects the chunk file *path*.
        last_line_context = cut_context_file(
            prev_source_phrase, chunk_context_file, last_line_context,
            context_stream)
    files.append((chunk_file.name, chunk_context_file))
    return files
447
+
448
+
449
def compose_score_command(extract_file, context_file, half_file,
                          flex_half_file, args):
    """Compose command line text to run one instance of `score`.

    :param extract_file: One chunk of extract file.
    :param context_file: If doing flexibility scoring, one chunk of
        extract context file. Otherwise, None.
    :param half_file: Output path for this chunk's half phrase table
        (gzip-compressed).
    :param flex_half_file: Output path for this chunk's flexibility-scored
        half phrase table; None when not doing flexibility scoring.
    :param args: Arguments namespace.
    """
    command = [
        args.score_exe,
        extract_file,
        args.lex_file,
        half_file,
    ]
    # Fix: build_score_args() already includes args.args; the original
    # also appended args.args directly here, duplicating the extra
    # arguments on the score command line.
    other_args = build_score_args(args)
    if other_args != '':
        command.append(other_args)
    if context_file is not None:
        command += [
            '&&',
            # Fix: the half file is gzip-compressed, so decompress it
            # with the configured gzip/pigz tool; the original piped it
            # through bzcat (a bzip2 tool), which cannot read gzip data.
            quote(args.gzip_command),
            '-c', '-d',
            half_file,
            '|',
            quote(args.flexibility_score),
            quote(context_file),
        ]
        if args.inverse:
            command.append('--Inverse')
        if args.hierarchical:
            command.append('--Hierarchical')
        command += [
            '|',
            quote(args.gzip_command),
            '-c',
            '>%s' % quote(flex_half_file),
        ]
    return ' '.join(command)
491
+
492
+
493
def score_parallel(split_dir, file_pairs, args):
    """Run the `score` command in parallel.

    :param split_dir: Temporary directory where we can create split files.
    :param file_pairs: Sequence of tuples for the input files, one tuple
        per chunk of the work. Each tuple consists of a partial extract
        file, and optionally a partial extract context file.
    :param args: Arguments namespace.
    :return: A list of tuples. Each tuple contains two file paths. The first
        is for a partial half-phrase-table file. The second is for the
        corresponding partial flex file, if a context file is given; or
        `None` otherwise.
    :raises CalledProcessError: if any of the score commands failed.
    """
    partial_files = []
    async_results = []
    # Pool of worker processes for executing the partial "score" invocations
    # concurrently.
    pool = Pool(args.jobs)
    try:
        for chunk_num, file_pair in enumerate(file_pairs):
            half_file = os.path.join(
                split_dir, 'phrase-table.half.%06d.gz' % chunk_num)
            extract_file, context_file = file_pair
            if context_file is None:
                flex_half_file = None
            else:
                flex_half_file = os.path.join(
                    split_dir, 'phrase-table.half.%06d.flex.gz' % chunk_num)
            # Pickling of arguments for the pool is awkward on Windows, so
            # keep them simple. Compose the command line in the parent
            # process, then hand them to worker processes which execute them.
            command_line = compose_score_command(
                extract_file, context_file, half_file, flex_half_file, args)
            async_results.append(pool.apply_async(
                execute_shell, (command_line, ), {'verbose': args.verbose}))
            partial_files.append((half_file, flex_half_file))
        pool.close()
    except BaseException:
        pool.terminate()
        raise
    finally:
        pool.join()
    # Fix: the original discarded the AsyncResult objects, silently
    # ignoring failed score commands. Calling get() re-raises any
    # exception from the worker processes.
    for result in async_results:
        result.get()
    return partial_files
535
+
536
+
537
def merge_and_sort(files, output, sort_command=None, gzip_exe=None,
                   verbose=False):
    """Merge partial files.

    :param files: List of partial half-phrase-table files.
    :param output: Path for resulting combined phrase-table file.
    """
    # TODO: The Perl code mentioned "sort" and "flexibility_score" here.
    # What do we do with those?

    # Sort whether we're asked to or not, as a way of combining the input
    # files.
    if sort_command == 'neandersort':
        # Neandersort transparently decompresses input and compresses output.
        check_call([
            'neandersort',
            '-o', output,
        ] + files)
    else:
        # Fix: the final redirect was ">>" (append), so re-running over an
        # existing output file would silently duplicate its contents; the
        # Perl original truncates with ">".
        command = (
            "%(gzip)s -c -d %(files)s | "
            "%(sort)s | "
            "%(gzip)s -c >%(output)s"
            % {
                'gzip': quote(gzip_exe),
                'sort': sort_command,
                'files': ' '.join(map(quote, files)),
                'output': quote(output),
            })
        execute_shell(command, verbose=verbose)
567
+
568
+
569
def build_score_args(args):
    """Compose command line for the `score` program.

    Translates the relevant entries of the arguments namespace into the
    flags `score` understands, plus any free-form extra arguments.
    """
    parts = []
    if args.labels_file:
        parts.extend([
            '--SourceLabels',
            '--SourceLabelCountsLHS',
            '--SourceLabelSet',
        ])
    if args.parts_of_speech:
        parts.append('--PartsOfSpeech')
    if args.inverse:
        parts.append('--Inverse')
    if args.args is not None:
        parts.append(args.args)
    return ' '.join(parts)
585
+
586
+
587
+ def list_existing(paths):
588
+ """Return, in the same order, those of the given files which exist."""
589
+ return filter(os.path.exists, paths)
590
+
591
+
592
+ def compose_coc_path_for(path):
593
+ """Compose COC-file path for the given file."""
594
+ return '%s.coc' % path
595
+
596
+
597
+ def read_cocs(path):
598
+ """Read COC file at `path`, return contents as tuple of ints."""
599
+ with open(path) as lines:
600
+ return tuple(
601
+ int(line.rstrip('\r\n'))
602
+ for line in lines
603
+ )
604
+
605
+
606
+ def add_cocs(original, additional):
607
+ """Add two tuples of COCs. Extend as needed."""
608
+ assert not (original is None and additional is None), "No COCs to add!"
609
+ if original is None:
610
+ return additional
611
+ elif additional is None:
612
+ return original
613
+ else:
614
+ common = tuple(lhs + rhs for lhs, rhs in zip(original, additional))
615
+ return (
616
+ common +
617
+ tuple(original[len(common):]) +
618
+ tuple(additional[len(common):]))
619
+
620
+
621
+ def merge_coc(files, output):
622
+ """Merge COC files for the given partial files.
623
+
624
+ Each COC file is a series of integers, one per line. This reads them, and
625
+ adds them up line-wise into one file of the same format: the sum of the
626
+ numbers the respective files have at line 1, the sum of the numbers the
627
+ respective files have at line 2, and so on.
628
+ """
629
+ assert len(files) > 0, "No partial files - no work to do."
630
+ extract_files = [extract_file for extract_file, _ in files]
631
+ if not os.path.exists(compose_coc_path_for(extract_files[0])):
632
+ # Nothing to merge.
633
+ return
634
+ totals = None
635
+ # TODO: Shouldn't we just fail if any of these files is missing?
636
+ for coc_path in list_existing(map(compose_coc_path_for, extract_files)):
637
+ totals = add_cocs(totals, read_cocs(coc_path))
638
+
639
+ # Write to output file.
640
+ with open(output, 'w') as output_stream:
641
+ for entry in totals:
642
+ output_stream.write('%d\n' % entry)
643
+
644
+
645
+ def suffix_line_numbers(infile, outfile):
646
+ """Rewrite `infile` to `outfile`; suffix line number to each line.
647
+
648
+ The line number is zero-based, and separated from the rest of the line
649
+ by a single space.
650
+ """
651
+ temp_file = '%s.numbering' % outfile
652
+ with open(infile, 'r') as instream, open(outfile, 'w') as outstream:
653
+ line_no = 0
654
+ for line in instream:
655
+ outstream.write(line)
656
+ outstream.write(' %d\n' % line_no)
657
+ line_no += 1
658
+ os.rename(temp_file, outfile)
659
+
660
+
661
+ def compose_source_labels_path_for(path):
662
+ """Return source labels file path for given file."""
663
+ return '%s.syntaxLabels.src' % path
664
+
665
+
666
+ def merge_numbered_files(inputs, output, header_lines, sort_command,
667
+ verbose=False):
668
+ """Sort and merge files `inputs`, add header and line numbers.
669
+
670
+ :param inputs: Iterable of input files.
671
+ :param output: Output file.
672
+ :header_lines: Iterable of header lines.
673
+ :sort_command: Command line for sorting input files.
674
+ """
675
+ sort_temp = '%s.sorting' % output
676
+ with open(sort_temp, 'w') as stream:
677
+ for line in header_lines:
678
+ stream.write(line)
679
+ stream.write('\n')
680
+ execute_shell(
681
+ "%s %s >>%s" % (
682
+ sort_command,
683
+ ' '.join(map(quote, inputs)),
684
+ quote(sort_temp)),
685
+ verbose=verbose)
686
+ suffix_line_numbers(sort_temp, output)
687
+
688
+
689
+ def merge_source_labels(files, output, sort_command, verbose=False):
690
+ """Merge source labels files."""
691
+ # TODO: Shouldn't we just fail if any of these files is missing?
692
+ labels_files = list_existing(map(compose_source_labels_path_for, files))
693
+ header = [
694
+ 'GlueTop',
695
+ 'GlueX',
696
+ 'SSTART',
697
+ 'SEND',
698
+ ]
699
+ merge_numbered_files(
700
+ labels_files, output, header, sort_command, verbose=verbose)
701
+
702
+
703
+ def compose_parts_of_speech_path_for(path):
704
+ """Return parts-of-speech file path for given file."""
705
+ return '%s.partsOfSpeech' % path
706
+
707
+
708
+ def merge_parts_of_speech(files, output, sort_command, verbose=False):
709
+ """Merge parts-of-speech files into output."""
710
+ # TODO: Shouldn't we just fail if any of these files is missing?
711
+ parts_files = list_existing(map(compose_parts_of_speech_path_for, files))
712
+ header = [
713
+ 'SSTART',
714
+ 'SEND',
715
+ ]
716
+ merge_numbered_files(
717
+ parts_files, output, header, sort_command, verbose=verbose)
718
+
719
+
720
+ def main():
721
+ """Command-line entry point. Marshals and forwards to `score_parallel`."""
722
+ args = parse_args()
723
+ sanitize_args(args)
724
+ set_temp_dir()
725
+
726
+ if args.flexibility_score is None:
727
+ extract_context_file = None
728
+ else:
729
+ extract_context_file = args.extract_file.replace(
730
+ 'extract.', 'extract.context.')
731
+
732
+ if args.verbose:
733
+ print("Started %s." % datetime.now())
734
+ print("Using '%s' for gzip." % args.gzip_command)
735
+
736
+ with tempdir(args.debug) as split_dir:
737
+ extract_files = split_extract_files(
738
+ split_dir, args.extract_file,
739
+ extract_context_file=extract_context_file, jobs=args.jobs)
740
+
741
+ scored_files = score_parallel(split_dir, extract_files, args)
742
+
743
+ if args.verbose:
744
+ sys.stderr.write("Finished score %s.\n" % datetime.now())
745
+
746
+ # TODO: Pass on "sort" and "flexibility-score" arguments?
747
+ merge_and_sort(
748
+ [phrase_chunk for phrase_chunk, _ in scored_files], args.output,
749
+ sort_command=args.sort_command, gzip_exe=args.gzip_command,
750
+ verbose=args.verbose)
751
+ merge_coc(extract_files, compose_coc_path_for(args.output))
752
+
753
+ if not args.inverse and args.labels_file is not None:
754
+ if args.verbose:
755
+ print("Merging source labels files.")
756
+ merge_source_labels(
757
+ extract_files, args.labels_file,
758
+ sort_command=args.sort_command, verbose=args.verbose)
759
+
760
+ if not args.inverse and args.parts_of_speech is not None:
761
+ if args.verbose:
762
+ print("Merging parts-of-speech files.")
763
+ merge_parts_of_speech(
764
+ extract_files, args.parts_of_speech,
765
+ sort_command=args.sort_command, verbose=args.verbose)
766
+
767
+
768
+ if __name__ == '__main__':
769
+ try:
770
+ main()
771
+ except ProgramFailure as error:
772
+ sys.stderr.write('%s\n' % error)
773
+ sys.exit(1)
774
+ except CommandLineError as error:
775
+ sys.stderr.write("Command line error: %s\n" % error)
776
+ sys.exit(2)
mosesdecoder/scripts/generic/strip-xml.perl ADDED
@@ -0,0 +1,48 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env perl
2
+ #
3
+ # This file is part of moses. Its use is licensed under the GNU Lesser General
4
+ # Public License version 2.1 or, at your option, any later version.
5
+
6
+ use warnings;
7
+ use strict;
8
+
9
+ while (my $line = <STDIN>) {
10
+ chomp($line);
11
+ #print "$line\n";
12
+
13
+ my $len = length($line);
14
+ my $inXML = 0;
15
+ my $prevSpace = 1;
16
+ my $prevBar = 0;
17
+
18
+ for (my $i = 0; $i < $len; ++$i) {
19
+ my $c = substr($line, $i, 1);
20
+ if ($c eq "<" && !$prevBar) {
21
+ ++$inXML;
22
+ }
23
+ elsif ($c eq ">" && $inXML>0) {
24
+ --$inXML;
25
+ }
26
+ elsif ($prevSpace == 1 && $c eq " ")
27
+ { # duplicate space. Do nothing
28
+ }
29
+ elsif ($inXML == 0) {
30
+ if ($c eq " ") {
31
+ $prevSpace = 1;
32
+ $prevBar = 0;
33
+ }
34
+ elsif ($c eq "|") {
35
+ $prevSpace = 0;
36
+ $prevBar = 1;
37
+ }
38
+ else {
39
+ $prevSpace = 0;
40
+ $prevBar = 0;
41
+ }
42
+ print $c;
43
+ }
44
+ }
45
+
46
+ print "\n";
47
+ }
48
+
mosesdecoder/scripts/generic/trainlm-irst2.perl ADDED
@@ -0,0 +1,72 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env perl
2
+ #
3
+ # This file is part of moses. Its use is licensed under the GNU Lesser General
4
+ # Public License version 2.1 or, at your option, any later version.
5
+
6
+ # Compatible with sri LM-creating script, eg.
7
+ # ngram-count -order 5 -interpolate -wbdiscount -unk -text corpus.txt -lm lm.txt
8
+ # To use it in the EMS, add this to the [LM] section
9
+ # lm-training = "$moses-script-dir/generic/trainlm-irst2.perl -cores $cores -irst-dir $irst-dir"
10
+ # settings = ""
11
+ # Also, make sure that $irst-dir is defined (in the [LM] or [GENERAL] section.
12
+ # It should point to the root of the LM toolkit, eg
13
+ # irst-dir = /Users/hieu/workspace/irstlm/trunk/bin
14
+ # Set smoothing method in settings, if different from modified Kneser-Ney
15
+
16
+ use warnings;
17
+ use strict;
18
+ use FindBin qw($RealBin);
19
+ use Getopt::Long;
20
+
21
+ my $order = 3; # order of language model (default trigram)
22
+ my $corpusPath; # input text data
23
+ my $lmPath; # generated language model
24
+ my $cores = 2; # number of CPUs used
25
+ my $irstPath; # bin directory of IRSTLM
26
+ my $tempPath = "tmp"; # temp dir
27
+ my $pruneSingletons = 1; # 1 = prune singletons, 0 = keep singletons
28
+ my $smoothing = "msb"; # smoothing method: wb = witten-bell, sb = kneser-ney, msb = modified-kneser-ney
29
+ my $dummy;
30
+
31
+ GetOptions("order=s" => \$order,
32
+ "text=s" => \$corpusPath,
33
+ "lm=s" => \$lmPath,
34
+ "cores=s" => \$cores,
35
+ "irst-dir=s" => \$irstPath,
36
+ "temp-dir=s" => \$tempPath,
37
+ "p=i" => \$pruneSingletons, # irstlm parameter: prune singletons
38
+ "s=s" => \$smoothing, # irstlm parameter: smoothing method
39
+ "interpolate!" => \$dummy, #ignore
40
+ "kndiscount!" => \$dummy #ignore
41
+ ) or exit 1;
42
+
43
+ #die("ERROR: please set order") unless defined($order);
44
+ die("ERROR: please set text") unless defined($corpusPath);
45
+ die("ERROR: please set lm") unless defined($lmPath);
46
+ die("ERROR: please set irst-dir") unless defined($irstPath);
47
+
48
+
49
+ $tempPath .= "/irstlm-build-tmp.$$";
50
+ `mkdir -p $tempPath`;
51
+
52
+ # add <s> and </s>
53
+ my $cmd = "cat $corpusPath | $irstPath/add-start-end.sh > $tempPath/setagged";
54
+ print STDERR "EXECUTING $cmd\n";
55
+ `$cmd`;
56
+
57
+ # collect n-gram counts
58
+ $cmd = "$irstPath/ngt -i=$tempPath/setagged -n=$order -b=yes -o=$tempPath/counts";
59
+ print STDERR "EXECUTING $cmd\n";
60
+ `$cmd`;
61
+
62
+ # build lm
63
+ $cmd = "$irstPath/tlm -o=$lmPath -lm=$smoothing -bo=yes -n=$order -tr=$tempPath/counts";
64
+ $cmd .= " -ps=no" unless $pruneSingletons;
65
+ print STDERR "EXECUTING $cmd\n";
66
+ `$cmd`;
67
+
68
+ $cmd = "rm -rf $tempPath";
69
+ print STDERR "EXECUTING $cmd\n";
70
+ `$cmd`;
71
+
72
+ print STDERR "FINISH.\n";
mosesdecoder/scripts/share/nonbreaking_prefixes/README.txt ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ The language suffix can be found here:
2
+
3
+ http://www.loc.gov/standards/iso639-2/php/code_list.php
4
+
5
+ This code includes data from Daniel Naber's Language Tools (czech abbreviations).
6
+ This code includes data from czech wiktionary (also czech abbreviations).
7
+
8
+
mosesdecoder/scripts/share/nonbreaking_prefixes/nonbreaking_prefix.as ADDED
@@ -0,0 +1,65 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker.
2
+
3
+ #common exceptions
4
+ # Dr
5
+
6
+
7
+ #others
8
+
9
+
10
+ #phonetics
11
+ # A
12
+
13
+ # B
14
+ বি
15
+ # C
16
+ সি
17
+ # D
18
+ ডি
19
+ # E
20
+
21
+ # F
22
+ এফ
23
+ # G
24
+ জি
25
+ # H
26
+ এইচ
27
+ # I
28
+ আম
29
+ # J
30
+ জে
31
+ # K
32
+ কে
33
+ # L
34
+ এল
35
+ # M
36
+ এম
37
+ # N
38
+ এন
39
+ # O
40
+ হে
41
+ # P
42
+ পি
43
+ # Q
44
+ কিউ
45
+ # R
46
+ আর
47
+ # S
48
+ এস
49
+ # T
50
+ টি
51
+ # U
52
+ ইউ
53
+ # V
54
+ ভি
55
+ # W
56
+ ডব্লু
57
+ # X
58
+ এক্স
59
+ # Y
60
+ ওয়াই
61
+ # Z
62
+ জেড
63
+
64
+ #consonants
65
+
mosesdecoder/scripts/share/nonbreaking_prefixes/nonbreaking_prefix.ca ADDED
@@ -0,0 +1,75 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Dr
2
+ Dra
3
+ pàg
4
+ p
5
+ c
6
+ av
7
+ Sr
8
+ Sra
9
+ adm
10
+ esq
11
+ Prof
12
+ S.A
13
+ S.L
14
+ p.e
15
+ ptes
16
+ Sta
17
+ St
18
+ pl
19
+ màx
20
+ cast
21
+ dir
22
+ nre
23
+ fra
24
+ admdora
25
+ Emm
26
+ Excma
27
+ espf
28
+ dc
29
+ admdor
30
+ tel
31
+ angl
32
+ aprox
33
+ ca
34
+ dept
35
+ dj
36
+ dl
37
+ dt
38
+ ds
39
+ dg
40
+ dv
41
+ ed
42
+ entl
43
+ al
44
+ i.e
45
+ maj
46
+ smin
47
+ n
48
+ núm
49
+ pta
50
+ A
51
+ B
52
+ C
53
+ D
54
+ E
55
+ F
56
+ G
57
+ H
58
+ I
59
+ J
60
+ K
61
+ L
62
+ M
63
+ N
64
+ O
65
+ P
66
+ Q
67
+ R
68
+ S
69
+ T
70
+ U
71
+ V
72
+ W
73
+ X
74
+ Y
75
+ Z
mosesdecoder/scripts/share/nonbreaking_prefixes/nonbreaking_prefix.cs ADDED
@@ -0,0 +1,390 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Bc
2
+ BcA
3
+ Ing
4
+ Ing.arch
5
+ MUDr
6
+ MVDr
7
+ MgA
8
+ Mgr
9
+ JUDr
10
+ PhDr
11
+ RNDr
12
+ PharmDr
13
+ ThLic
14
+ ThDr
15
+ Ph.D
16
+ Th.D
17
+ prof
18
+ doc
19
+ CSc
20
+ DrSc
21
+ dr. h. c
22
+ PaedDr
23
+ Dr
24
+ PhMr
25
+ DiS
26
+ abt
27
+ ad
28
+ a.i
29
+ aj
30
+ angl
31
+ anon
32
+ apod
33
+ atd
34
+ atp
35
+ aut
36
+ bd
37
+ biogr
38
+ b.m
39
+ b.p
40
+ b.r
41
+ cca
42
+ cit
43
+ cizojaz
44
+ c.k
45
+ col
46
+ čes
47
+ čín
48
+ čj
49
+ ed
50
+ facs
51
+ fasc
52
+ fol
53
+ fot
54
+ franc
55
+ h.c
56
+ hist
57
+ hl
58
+ hrsg
59
+ ibid
60
+ il
61
+ ind
62
+ inv.č
63
+ jap
64
+ jhdt
65
+ jv
66
+ koed
67
+ kol
68
+ korej
69
+ kl
70
+ krit
71
+ lat
72
+ lit
73
+ m.a
74
+ maď
75
+ mj
76
+ mp
77
+ násl
78
+ např
79
+ nepubl
80
+ něm
81
+ no
82
+ nr
83
+ n.s
84
+ okr
85
+ odd
86
+ odp
87
+ obr
88
+ opr
89
+ orig
90
+ phil
91
+ pl
92
+ pokrač
93
+ pol
94
+ port
95
+ pozn
96
+ př.kr
97
+ př.n.l
98
+ přel
99
+ přeprac
100
+ příl
101
+ pseud
102
+ pt
103
+ red
104
+ repr
105
+ resp
106
+ revid
107
+ rkp
108
+ roč
109
+ roz
110
+ rozš
111
+ samost
112
+ sect
113
+ sest
114
+ seš
115
+ sign
116
+ sl
117
+ srv
118
+ stol
119
+ sv
120
+ šk
121
+ šk.ro
122
+ špan
123
+ tab
124
+ t.č
125
+ tis
126
+ tj
127
+
128
+ tzv
129
+ univ
130
+ uspoř
131
+ vol
132
+ vl.jm
133
+ vs
134
+ vyd
135
+ vyobr
136
+ zal
137
+ zejm
138
+ zkr
139
+ zprac
140
+ zvl
141
+ n.p
142
+ např
143
+ než
144
+ MUDr
145
+ abl
146
+ absol
147
+ adj
148
+ adv
149
+ ak
150
+ ak. sl
151
+ akt
152
+ alch
153
+ amer
154
+ anat
155
+ angl
156
+ anglosas
157
+ arab
158
+ arch
159
+ archit
160
+ arg
161
+ astr
162
+ astrol
163
+ att
164
+ bás
165
+ belg
166
+ bibl
167
+ biol
168
+ boh
169
+ bot
170
+ bulh
171
+ círk
172
+ csl
173
+ č
174
+ čas
175
+ čes
176
+ dat
177
+ děj
178
+ dep
179
+ dět
180
+ dial
181
+ dór
182
+ dopr
183
+ dosl
184
+ ekon
185
+ epic
186
+ etnonym
187
+ eufem
188
+ f
189
+ fam
190
+ fem
191
+ fil
192
+ film
193
+ form
194
+ fot
195
+ fr
196
+ fut
197
+ fyz
198
+ gen
199
+ geogr
200
+ geol
201
+ geom
202
+ germ
203
+ gram
204
+ hebr
205
+ herald
206
+ hist
207
+ hl
208
+ hovor
209
+ hud
210
+ hut
211
+ chcsl
212
+ chem
213
+ ie
214
+ imp
215
+ impf
216
+ ind
217
+ indoevr
218
+ inf
219
+ instr
220
+ interj
221
+ ión
222
+ iron
223
+ it
224
+ kanad
225
+ katalán
226
+ klas
227
+ kniž
228
+ komp
229
+ konj
230
+
231
+ konkr
232
+
233
+ kuch
234
+ lat
235
+ lék
236
+ les
237
+ lid
238
+ lit
239
+ liturg
240
+ lok
241
+ log
242
+ m
243
+ mat
244
+ meteor
245
+ metr
246
+ mod
247
+ ms
248
+ mysl
249
+ n
250
+ náb
251
+ námoř
252
+ neklas
253
+ něm
254
+ nesklon
255
+ nom
256
+ ob
257
+ obch
258
+ obyč
259
+ ojed
260
+ opt
261
+ part
262
+ pas
263
+ pejor
264
+ pers
265
+ pf
266
+ pl
267
+ plpf
268
+
269
+ práv
270
+ prep
271
+ předl
272
+ přivl
273
+ r
274
+ rcsl
275
+ refl
276
+ reg
277
+ rkp
278
+ ř
279
+ řec
280
+ s
281
+ samohl
282
+ sg
283
+ sl
284
+ souhl
285
+ spec
286
+ srov
287
+ stfr
288
+ střv
289
+ stsl
290
+ subj
291
+ subst
292
+ superl
293
+ sv
294
+ sz
295
+ táz
296
+ tech
297
+ telev
298
+ teol
299
+ trans
300
+ typogr
301
+ var
302
+ vedl
303
+ verb
304
+ vl. jm
305
+ voj
306
+ vok
307
+ vůb
308
+ vulg
309
+ výtv
310
+ vztaž
311
+ zahr
312
+ zájm
313
+ zast
314
+ zejm
315
+
316
+ zeměd
317
+ zkr
318
+
319
+ mj
320
+ dl
321
+ atp
322
+ sport
323
+ Mgr
324
+ horn
325
+ MVDr
326
+ JUDr
327
+ RSDr
328
+ Bc
329
+ PhDr
330
+ ThDr
331
+ Ing
332
+ aj
333
+ apod
334
+ PharmDr
335
+ pomn
336
+ ev
337
+ slang
338
+ nprap
339
+ odp
340
+ dop
341
+ pol
342
+ st
343
+ stol
344
+ p. n. l
345
+ před n. l
346
+ n. l
347
+ př. Kr
348
+ po Kr
349
+ př. n. l
350
+ odd
351
+ RNDr
352
+ tzv
353
+ atd
354
+ tzn
355
+ resp
356
+ tj
357
+ p
358
+ br
359
+ č. j
360
+ čj
361
+ č. p
362
+ čp
363
+ a. s
364
+ s. r. o
365
+ spol. s r. o
366
+ p. o
367
+ s. p
368
+ v. o. s
369
+ k. s
370
+ o. p. s
371
+ o. s
372
+ v. r
373
+ v z
374
+ ml
375
+
376
+ kr
377
+ mld
378
+ hod
379
+ popř
380
+ ap
381
+ event
382
+ rus
383
+ slov
384
+ rum
385
+ švýc
386
+ P. T
387
+ zvl
388
+ hor
389
+ dol
390
+ S.O.S