suricodes committed on
Commit
fd49381
·
verified ·
1 Parent(s): b5b8584

Upload folder using huggingface_hub

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .gitattributes +163 -0
  2. mosesdecoder/.beautify-ignore +38 -0
  3. mosesdecoder/.gitignore +90 -0
  4. mosesdecoder/.gitmodules +9 -0
  5. mosesdecoder/.travis.yml +24 -0
  6. mosesdecoder/COPYING +460 -0
  7. mosesdecoder/Jamroot +345 -0
  8. mosesdecoder/OnDiskPt/Jamfile +5 -0
  9. mosesdecoder/OnDiskPt/Main.cpp +273 -0
  10. mosesdecoder/OnDiskPt/Main.h +39 -0
  11. mosesdecoder/OnDiskPt/OnDiskQuery.cpp +83 -0
  12. mosesdecoder/OnDiskPt/OnDiskQuery.h +39 -0
  13. mosesdecoder/OnDiskPt/OnDiskWrapper.cpp +223 -0
  14. mosesdecoder/OnDiskPt/OnDiskWrapper.h +111 -0
  15. mosesdecoder/OnDiskPt/Phrase.cpp +108 -0
  16. mosesdecoder/OnDiskPt/Phrase.h +66 -0
  17. mosesdecoder/OnDiskPt/PhraseNode.cpp +268 -0
  18. mosesdecoder/OnDiskPt/PhraseNode.h +108 -0
  19. mosesdecoder/OnDiskPt/SourcePhrase.cpp +27 -0
  20. mosesdecoder/OnDiskPt/SourcePhrase.h +38 -0
  21. mosesdecoder/OnDiskPt/TargetPhrase.cpp +402 -0
  22. mosesdecoder/OnDiskPt/TargetPhrase.h +127 -0
  23. mosesdecoder/OnDiskPt/TargetPhraseCollection.cpp +171 -0
  24. mosesdecoder/OnDiskPt/TargetPhraseCollection.h +84 -0
  25. mosesdecoder/OnDiskPt/Vocab.cpp +101 -0
  26. mosesdecoder/OnDiskPt/Vocab.h +58 -0
  27. mosesdecoder/OnDiskPt/Word.cpp +144 -0
  28. mosesdecoder/OnDiskPt/Word.h +91 -0
  29. mosesdecoder/OnDiskPt/bin/gcc-9/release/link-static/threading-multi/CreateOnDiskPt +3 -0
  30. mosesdecoder/OnDiskPt/bin/gcc-9/release/link-static/threading-multi/Main.o +0 -0
  31. mosesdecoder/OnDiskPt/bin/gcc-9/release/link-static/threading-multi/OnDiskQuery.o +0 -0
  32. mosesdecoder/OnDiskPt/bin/gcc-9/release/link-static/threading-multi/OnDiskWrapper.o +0 -0
  33. mosesdecoder/OnDiskPt/bin/gcc-9/release/link-static/threading-multi/Phrase.o +0 -0
  34. mosesdecoder/OnDiskPt/bin/gcc-9/release/link-static/threading-multi/PhraseNode.o +0 -0
  35. mosesdecoder/OnDiskPt/bin/gcc-9/release/link-static/threading-multi/SourcePhrase.o +0 -0
  36. mosesdecoder/OnDiskPt/bin/gcc-9/release/link-static/threading-multi/TargetPhrase.o +0 -0
  37. mosesdecoder/OnDiskPt/bin/gcc-9/release/link-static/threading-multi/TargetPhraseCollection.o +0 -0
  38. mosesdecoder/OnDiskPt/bin/gcc-9/release/link-static/threading-multi/Vocab.o +0 -0
  39. mosesdecoder/OnDiskPt/bin/gcc-9/release/link-static/threading-multi/Word.o +0 -0
  40. mosesdecoder/OnDiskPt/bin/gcc-9/release/link-static/threading-multi/queryOnDiskPt +0 -0
  41. mosesdecoder/OnDiskPt/bin/gcc-9/release/link-static/threading-multi/queryOnDiskPt.o +0 -0
  42. mosesdecoder/OnDiskPt/queryOnDiskPt.cpp +86 -0
  43. mosesdecoder/README +14 -0
  44. mosesdecoder/azure-pipelines.yml +100 -0
  45. mosesdecoder/biconcor/Alignment.cpp +222 -0
  46. mosesdecoder/biconcor/Alignment.h +47 -0
  47. mosesdecoder/biconcor/CMakeLists.txt +5 -0
  48. mosesdecoder/biconcor/Jamfile +2 -0
  49. mosesdecoder/biconcor/Mismatch.cpp +292 -0
  50. mosesdecoder/biconcor/Mismatch.h +42 -0
.gitattributes CHANGED
@@ -36,3 +36,166 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
36
  HiSd/data.blm.sd filter=lfs diff=lfs merge=lfs -text
37
  HiSd/phrase-table.minphr filter=lfs diff=lfs merge=lfs -text
38
  HiSd/reordering-table.minlexr filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
36
  HiSd/data.blm.sd filter=lfs diff=lfs merge=lfs -text
37
  HiSd/phrase-table.minphr filter=lfs diff=lfs merge=lfs -text
38
  HiSd/reordering-table.minlexr filter=lfs diff=lfs merge=lfs -text
39
+ mosesdecoder/bin/build_binary filter=lfs diff=lfs merge=lfs -text
40
+ mosesdecoder/bin/consolidate filter=lfs diff=lfs merge=lfs -text
41
+ mosesdecoder/bin/consolidate-direct filter=lfs diff=lfs merge=lfs -text
42
+ mosesdecoder/bin/consolidate-reverse filter=lfs diff=lfs merge=lfs -text
43
+ mosesdecoder/bin/CreateOnDiskPt filter=lfs diff=lfs merge=lfs -text
44
+ mosesdecoder/bin/CreateProbingPT filter=lfs diff=lfs merge=lfs -text
45
+ mosesdecoder/bin/dump_counts filter=lfs diff=lfs merge=lfs -text
46
+ mosesdecoder/bin/evaluator filter=lfs diff=lfs merge=lfs -text
47
+ mosesdecoder/bin/extract filter=lfs diff=lfs merge=lfs -text
48
+ mosesdecoder/bin/extract-ghkm filter=lfs diff=lfs merge=lfs -text
49
+ mosesdecoder/bin/extract-lex filter=lfs diff=lfs merge=lfs -text
50
+ mosesdecoder/bin/extract-mixed-syntax filter=lfs diff=lfs merge=lfs -text
51
+ mosesdecoder/bin/extract-rules filter=lfs diff=lfs merge=lfs -text
52
+ mosesdecoder/bin/extractor filter=lfs diff=lfs merge=lfs -text
53
+ mosesdecoder/bin/filter filter=lfs diff=lfs merge=lfs -text
54
+ mosesdecoder/bin/filter-rule-table filter=lfs diff=lfs merge=lfs -text
55
+ mosesdecoder/bin/fragment filter=lfs diff=lfs merge=lfs -text
56
+ mosesdecoder/bin/gcc-9/debug/empty_test_static filter=lfs diff=lfs merge=lfs -text
57
+ mosesdecoder/bin/hgdecode filter=lfs diff=lfs merge=lfs -text
58
+ mosesdecoder/bin/kbmira filter=lfs diff=lfs merge=lfs -text
59
+ mosesdecoder/bin/kenlm_benchmark filter=lfs diff=lfs merge=lfs -text
60
+ mosesdecoder/bin/lexical-reordering-score filter=lfs diff=lfs merge=lfs -text
61
+ mosesdecoder/bin/lmbrgrid filter=lfs diff=lfs merge=lfs -text
62
+ mosesdecoder/bin/lmplz filter=lfs diff=lfs merge=lfs -text
63
+ mosesdecoder/bin/merge-sorted filter=lfs diff=lfs merge=lfs -text
64
+ mosesdecoder/bin/mert filter=lfs diff=lfs merge=lfs -text
65
+ mosesdecoder/bin/moses filter=lfs diff=lfs merge=lfs -text
66
+ mosesdecoder/bin/moses2 filter=lfs diff=lfs merge=lfs -text
67
+ mosesdecoder/bin/mosesserver filter=lfs diff=lfs merge=lfs -text
68
+ mosesdecoder/bin/postprocess-egret-forests filter=lfs diff=lfs merge=lfs -text
69
+ mosesdecoder/bin/prepare-expected-bleu-training filter=lfs diff=lfs merge=lfs -text
70
+ mosesdecoder/bin/pro filter=lfs diff=lfs merge=lfs -text
71
+ mosesdecoder/bin/processLexicalTable filter=lfs diff=lfs merge=lfs -text
72
+ mosesdecoder/bin/processLexicalTableMin filter=lfs diff=lfs merge=lfs -text
73
+ mosesdecoder/bin/processPhraseTableMin filter=lfs diff=lfs merge=lfs -text
74
+ mosesdecoder/bin/prunePhraseTable filter=lfs diff=lfs merge=lfs -text
75
+ mosesdecoder/bin/query filter=lfs diff=lfs merge=lfs -text
76
+ mosesdecoder/bin/queryLexicalTable filter=lfs diff=lfs merge=lfs -text
77
+ mosesdecoder/bin/queryPhraseTableMin filter=lfs diff=lfs merge=lfs -text
78
+ mosesdecoder/bin/relax-parse filter=lfs diff=lfs merge=lfs -text
79
+ mosesdecoder/bin/score filter=lfs diff=lfs merge=lfs -text
80
+ mosesdecoder/bin/score-stsg filter=lfs diff=lfs merge=lfs -text
81
+ mosesdecoder/bin/sentence-bleu-nbest filter=lfs diff=lfs merge=lfs -text
82
+ mosesdecoder/bin/statistics filter=lfs diff=lfs merge=lfs -text
83
+ mosesdecoder/bin/train-expected-bleu filter=lfs diff=lfs merge=lfs -text
84
+ mosesdecoder/bin/vwtrainer filter=lfs diff=lfs merge=lfs -text
85
+ mosesdecoder/cmph-2.0.2/lib/libcmph.a filter=lfs diff=lfs merge=lfs -text
86
+ mosesdecoder/cmph-2.0.2/src/.libs/libcmph.a filter=lfs diff=lfs merge=lfs -text
87
+ mosesdecoder/contrib/expected-bleu-training/bin/gcc-9/release/link-static/threading-multi/prepare-expected-bleu-training filter=lfs diff=lfs merge=lfs -text
88
+ mosesdecoder/contrib/expected-bleu-training/bin/gcc-9/release/link-static/threading-multi/train-expected-bleu filter=lfs diff=lfs merge=lfs -text
89
+ mosesdecoder/contrib/promix/test_data/esen.ep.model.filtered/phrase-table.0-0.1.1 filter=lfs diff=lfs merge=lfs -text
90
+ mosesdecoder/contrib/promix/test_data/esen.ep.model.filtered/phrase-table.0-0.1.1.binphr.tgtdata.wa filter=lfs diff=lfs merge=lfs -text
91
+ mosesdecoder/contrib/server/bin/gcc-9/release/link-static/threading-multi/mosesserver filter=lfs diff=lfs merge=lfs -text
92
+ mosesdecoder/lib/libmert_lib.a filter=lfs diff=lfs merge=lfs -text
93
+ mosesdecoder/lib/libmoses.a filter=lfs diff=lfs merge=lfs -text
94
+ mosesdecoder/lib/libmoses2_lib.a filter=lfs diff=lfs merge=lfs -text
95
+ mosesdecoder/lib/libmoses2decoder.a filter=lfs diff=lfs merge=lfs -text
96
+ mosesdecoder/lib/libprobingpt.a filter=lfs diff=lfs merge=lfs -text
97
+ mosesdecoder/lib/libsyntax_common.a filter=lfs diff=lfs merge=lfs -text
98
+ mosesdecoder/lm/bin/gcc-9/release/link-static/threading-multi/build_binary filter=lfs diff=lfs merge=lfs -text
99
+ mosesdecoder/lm/bin/gcc-9/release/link-static/threading-multi/fragment filter=lfs diff=lfs merge=lfs -text
100
+ mosesdecoder/lm/bin/gcc-9/release/link-static/threading-multi/kenlm_benchmark filter=lfs diff=lfs merge=lfs -text
101
+ mosesdecoder/lm/bin/gcc-9/release/link-static/threading-multi/query filter=lfs diff=lfs merge=lfs -text
102
+ mosesdecoder/lm/bin/left_test.test/gcc-9/release/link-static/threading-multi/left_test filter=lfs diff=lfs merge=lfs -text
103
+ mosesdecoder/lm/bin/model_test.test/gcc-9/release/link-static/threading-multi/model_test filter=lfs diff=lfs merge=lfs -text
104
+ mosesdecoder/lm/bin/model_test.test/gcc-9/release/link-static/threading-multi/model_test.o filter=lfs diff=lfs merge=lfs -text
105
+ mosesdecoder/lm/bin/partial_test.test/gcc-9/release/link-static/threading-multi/partial_test filter=lfs diff=lfs merge=lfs -text
106
+ mosesdecoder/lm/builder/bin/gcc-9/release/link-static/threading-multi/dump_counts filter=lfs diff=lfs merge=lfs -text
107
+ mosesdecoder/lm/builder/bin/gcc-9/release/link-static/threading-multi/lmplz filter=lfs diff=lfs merge=lfs -text
108
+ mosesdecoder/lm/filter/bin/gcc-9/release/link-static/threading-multi/filter filter=lfs diff=lfs merge=lfs -text
109
+ mosesdecoder/mert/bin/gcc-9/release/link-static/threading-multi/bleu_scorer_test filter=lfs diff=lfs merge=lfs -text
110
+ mosesdecoder/mert/bin/gcc-9/release/link-static/threading-multi/data_test filter=lfs diff=lfs merge=lfs -text
111
+ mosesdecoder/mert/bin/gcc-9/release/link-static/threading-multi/evaluator filter=lfs diff=lfs merge=lfs -text
112
+ mosesdecoder/mert/bin/gcc-9/release/link-static/threading-multi/extractor filter=lfs diff=lfs merge=lfs -text
113
+ mosesdecoder/mert/bin/gcc-9/release/link-static/threading-multi/feature_data_test filter=lfs diff=lfs merge=lfs -text
114
+ mosesdecoder/mert/bin/gcc-9/release/link-static/threading-multi/forest_rescore_test filter=lfs diff=lfs merge=lfs -text
115
+ mosesdecoder/mert/bin/gcc-9/release/link-static/threading-multi/hgdecode filter=lfs diff=lfs merge=lfs -text
116
+ mosesdecoder/mert/bin/gcc-9/release/link-static/threading-multi/hypergraph_test filter=lfs diff=lfs merge=lfs -text
117
+ mosesdecoder/mert/bin/gcc-9/release/link-static/threading-multi/kbmira filter=lfs diff=lfs merge=lfs -text
118
+ mosesdecoder/mert/bin/gcc-9/release/link-static/threading-multi/libmert_lib.a filter=lfs diff=lfs merge=lfs -text
119
+ mosesdecoder/mert/bin/gcc-9/release/link-static/threading-multi/mert filter=lfs diff=lfs merge=lfs -text
120
+ mosesdecoder/mert/bin/gcc-9/release/link-static/threading-multi/mira_feature_vector_test filter=lfs diff=lfs merge=lfs -text
121
+ mosesdecoder/mert/bin/gcc-9/release/link-static/threading-multi/ngram_test filter=lfs diff=lfs merge=lfs -text
122
+ mosesdecoder/mert/bin/gcc-9/release/link-static/threading-multi/optimizer_factory_test filter=lfs diff=lfs merge=lfs -text
123
+ mosesdecoder/mert/bin/gcc-9/release/link-static/threading-multi/point_test filter=lfs diff=lfs merge=lfs -text
124
+ mosesdecoder/mert/bin/gcc-9/release/link-static/threading-multi/pro filter=lfs diff=lfs merge=lfs -text
125
+ mosesdecoder/mert/bin/gcc-9/release/link-static/threading-multi/reference_test filter=lfs diff=lfs merge=lfs -text
126
+ mosesdecoder/mert/bin/gcc-9/release/link-static/threading-multi/sentence-bleu-nbest filter=lfs diff=lfs merge=lfs -text
127
+ mosesdecoder/mert/bin/gcc-9/release/link-static/threading-multi/singleton_test filter=lfs diff=lfs merge=lfs -text
128
+ mosesdecoder/mert/bin/gcc-9/release/link-static/threading-multi/TER/tools.o filter=lfs diff=lfs merge=lfs -text
129
+ mosesdecoder/mert/bin/gcc-9/release/link-static/threading-multi/timer_test filter=lfs diff=lfs merge=lfs -text
130
+ mosesdecoder/mert/bin/gcc-9/release/link-static/threading-multi/util_test filter=lfs diff=lfs merge=lfs -text
131
+ mosesdecoder/mert/bin/gcc-9/release/link-static/threading-multi/vocabulary_test filter=lfs diff=lfs merge=lfs -text
132
+ mosesdecoder/mert/evaluator filter=lfs diff=lfs merge=lfs -text
133
+ mosesdecoder/mert/extractor filter=lfs diff=lfs merge=lfs -text
134
+ mosesdecoder/mert/hgdecode filter=lfs diff=lfs merge=lfs -text
135
+ mosesdecoder/mert/kbmira filter=lfs diff=lfs merge=lfs -text
136
+ mosesdecoder/mert/mert filter=lfs diff=lfs merge=lfs -text
137
+ mosesdecoder/mert/pro filter=lfs diff=lfs merge=lfs -text
138
+ mosesdecoder/mert/sentence-bleu-nbest filter=lfs diff=lfs merge=lfs -text
139
+ mosesdecoder/misc/bin/gcc-9/release/link-static/threading-multi/merge-sorted filter=lfs diff=lfs merge=lfs -text
140
+ mosesdecoder/misc/bin/gcc-9/release/link-static/threading-multi/processLexicalTable filter=lfs diff=lfs merge=lfs -text
141
+ mosesdecoder/misc/bin/gcc-9/release/link-static/threading-multi/processLexicalTableMin filter=lfs diff=lfs merge=lfs -text
142
+ mosesdecoder/misc/bin/gcc-9/release/link-static/threading-multi/processPhraseTableMin filter=lfs diff=lfs merge=lfs -text
143
+ mosesdecoder/misc/bin/gcc-9/release/link-static/threading-multi/prunePhraseTable filter=lfs diff=lfs merge=lfs -text
144
+ mosesdecoder/misc/bin/gcc-9/release/link-static/threading-multi/queryLexicalTable filter=lfs diff=lfs merge=lfs -text
145
+ mosesdecoder/misc/bin/gcc-9/release/link-static/threading-multi/queryPhraseTableMin filter=lfs diff=lfs merge=lfs -text
146
+ mosesdecoder/moses/bin/gcc-9/release/link-static/threading-multi/libmoses.a filter=lfs diff=lfs merge=lfs -text
147
+ mosesdecoder/moses/bin/gcc-9/release/link-static/threading-multi/moses_test filter=lfs diff=lfs merge=lfs -text
148
+ mosesdecoder/moses/LM/bin/BackwardTest.test/gcc-9/release/link-static/threading-multi/BackwardTest filter=lfs diff=lfs merge=lfs -text
149
+ mosesdecoder/moses/TranslationModel/UG/util/ibm1-align filter=lfs diff=lfs merge=lfs -text
150
+ mosesdecoder/moses-cmd/bin/gcc-9/release/link-static/threading-multi/lmbrgrid filter=lfs diff=lfs merge=lfs -text
151
+ mosesdecoder/moses-cmd/bin/gcc-9/release/link-static/threading-multi/moses filter=lfs diff=lfs merge=lfs -text
152
+ mosesdecoder/moses-cmd/bin/gcc-9/release/link-static/threading-multi/vwtrainer filter=lfs diff=lfs merge=lfs -text
153
+ mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/libmoses2_lib.a filter=lfs diff=lfs merge=lfs -text
154
+ mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/libmoses2decoder.a filter=lfs diff=lfs merge=lfs -text
155
+ mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/moses2 filter=lfs diff=lfs merge=lfs -text
156
+ mosesdecoder/OnDiskPt/bin/gcc-9/release/link-static/threading-multi/CreateOnDiskPt filter=lfs diff=lfs merge=lfs -text
157
+ mosesdecoder/phrase-extract/bin/gcc-9/release/link-static/threading-multi/consolidate filter=lfs diff=lfs merge=lfs -text
158
+ mosesdecoder/phrase-extract/bin/gcc-9/release/link-static/threading-multi/consolidate-direct filter=lfs diff=lfs merge=lfs -text
159
+ mosesdecoder/phrase-extract/bin/gcc-9/release/link-static/threading-multi/consolidate-reverse filter=lfs diff=lfs merge=lfs -text
160
+ mosesdecoder/phrase-extract/bin/gcc-9/release/link-static/threading-multi/extract filter=lfs diff=lfs merge=lfs -text
161
+ mosesdecoder/phrase-extract/bin/gcc-9/release/link-static/threading-multi/extract-lex filter=lfs diff=lfs merge=lfs -text
162
+ mosesdecoder/phrase-extract/bin/gcc-9/release/link-static/threading-multi/extract-rules filter=lfs diff=lfs merge=lfs -text
163
+ mosesdecoder/phrase-extract/bin/gcc-9/release/link-static/threading-multi/relax-parse filter=lfs diff=lfs merge=lfs -text
164
+ mosesdecoder/phrase-extract/bin/gcc-9/release/link-static/threading-multi/score filter=lfs diff=lfs merge=lfs -text
165
+ mosesdecoder/phrase-extract/bin/gcc-9/release/link-static/threading-multi/statistics filter=lfs diff=lfs merge=lfs -text
166
+ mosesdecoder/phrase-extract/bin/ScoreFeatureTest.test/gcc-9/release/link-static/threading-multi/ScoreFeatureTest filter=lfs diff=lfs merge=lfs -text
167
+ mosesdecoder/phrase-extract/extract-ghkm/bin/gcc-9/release/link-static/threading-multi/extract-ghkm filter=lfs diff=lfs merge=lfs -text
168
+ mosesdecoder/phrase-extract/extract-mixed-syntax/bin/gcc-9/release/link-static/threading-multi/extract-mixed-syntax filter=lfs diff=lfs merge=lfs -text
169
+ mosesdecoder/phrase-extract/filter-rule-table/bin/gcc-9/release/link-static/threading-multi/filter-rule-table filter=lfs diff=lfs merge=lfs -text
170
+ mosesdecoder/phrase-extract/lexical-reordering/bin/gcc-9/release/link-static/threading-multi/lexical-reordering-score filter=lfs diff=lfs merge=lfs -text
171
+ mosesdecoder/phrase-extract/postprocess-egret-forests/bin/gcc-9/release/link-static/threading-multi/postprocess-egret-forests filter=lfs diff=lfs merge=lfs -text
172
+ mosesdecoder/phrase-extract/score-stsg/bin/gcc-9/release/link-static/threading-multi/score-stsg filter=lfs diff=lfs merge=lfs -text
173
+ mosesdecoder/phrase-extract/syntax-common/bin/gcc-9/release/link-static/threading-multi/libsyntax_common.a filter=lfs diff=lfs merge=lfs -text
174
+ mosesdecoder/phrase-extract/syntax-common/bin/gcc-9/release/link-static/threading-multi/tree_fragment_tokenizer_test filter=lfs diff=lfs merge=lfs -text
175
+ mosesdecoder/phrase-extract/syntax-common/bin/gcc-9/release/link-static/threading-multi/tree_test filter=lfs diff=lfs merge=lfs -text
176
+ mosesdecoder/probingpt/bin/gcc-9/release/link-static/threading-multi/CreateProbingPT filter=lfs diff=lfs merge=lfs -text
177
+ mosesdecoder/probingpt/bin/gcc-9/release/link-static/threading-multi/libprobingpt.a filter=lfs diff=lfs merge=lfs -text
178
+ mosesdecoder/util/bin/file_piece_test.test/gcc-9/release/link-static/threading-multi/file_piece_test filter=lfs diff=lfs merge=lfs -text
179
+ mosesdecoder/util/bin/gcc-9/release/link-static/threading-multi/bit_packing_test filter=lfs diff=lfs merge=lfs -text
180
+ mosesdecoder/util/bin/gcc-9/release/link-static/threading-multi/integer_to_string_test filter=lfs diff=lfs merge=lfs -text
181
+ mosesdecoder/util/bin/gcc-9/release/link-static/threading-multi/joint_sort_test filter=lfs diff=lfs merge=lfs -text
182
+ mosesdecoder/util/bin/gcc-9/release/link-static/threading-multi/multi_intersection_test filter=lfs diff=lfs merge=lfs -text
183
+ mosesdecoder/util/bin/gcc-9/release/link-static/threading-multi/pcqueue_test filter=lfs diff=lfs merge=lfs -text
184
+ mosesdecoder/util/bin/gcc-9/release/link-static/threading-multi/probing_hash_table_test filter=lfs diff=lfs merge=lfs -text
185
+ mosesdecoder/util/bin/gcc-9/release/link-static/threading-multi/random_test filter=lfs diff=lfs merge=lfs -text
186
+ mosesdecoder/util/bin/gcc-9/release/link-static/threading-multi/sized_iterator_test filter=lfs diff=lfs merge=lfs -text
187
+ mosesdecoder/util/bin/gcc-9/release/link-static/threading-multi/sorted_uniform_test filter=lfs diff=lfs merge=lfs -text
188
+ mosesdecoder/util/bin/gcc-9/release/link-static/threading-multi/string_stream_test filter=lfs diff=lfs merge=lfs -text
189
+ mosesdecoder/util/bin/gcc-9/release/link-static/threading-multi/tempfile_test filter=lfs diff=lfs merge=lfs -text
190
+ mosesdecoder/util/bin/gcc-9/release/link-static/threading-multi/tokenize_piece_test filter=lfs diff=lfs merge=lfs -text
191
+ mosesdecoder/util/bin/gcc-9/release/link-static/threading-multi/tokenize_test filter=lfs diff=lfs merge=lfs -text
192
+ mosesdecoder/util/stream/bin/gcc-9/release/link-static/threading-multi/io_test filter=lfs diff=lfs merge=lfs -text
193
+ mosesdecoder/util/stream/bin/gcc-9/release/link-static/threading-multi/rewindable_stream_test filter=lfs diff=lfs merge=lfs -text
194
+ mosesdecoder/util/stream/bin/gcc-9/release/link-static/threading-multi/sort_test filter=lfs diff=lfs merge=lfs -text
195
+ mosesdecoder/util/stream/bin/gcc-9/release/link-static/threading-multi/stream_test filter=lfs diff=lfs merge=lfs -text
196
+ mosesdecoder/xmlrpc-c-1.51.06/lib/abyss/src/libxmlrpc_abyss.a filter=lfs diff=lfs merge=lfs -text
197
+ mosesdecoder/xmlrpc-c-1.51.06/lib/libxmlrpc++.a filter=lfs diff=lfs merge=lfs -text
198
+ mosesdecoder/xmlrpc-c-1.51.06/lib/libxmlrpc.a filter=lfs diff=lfs merge=lfs -text
199
+ mosesdecoder/xmlrpc-c-1.51.06/lib/libxmlrpc_abyss.a filter=lfs diff=lfs merge=lfs -text
200
+ mosesdecoder/xmlrpc-c-1.51.06/src/cpp/libxmlrpc++.a filter=lfs diff=lfs merge=lfs -text
201
+ mosesdecoder/xmlrpc-c-1.51.06/src/libxmlrpc.a filter=lfs diff=lfs merge=lfs -text
mosesdecoder/.beautify-ignore ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Files and directories that beautify.py should not clean up.
2
+ #
3
+ # This file is not as advanced as, say, .gitignore. It only supports files
4
+ # and directory paths relative to the project root, one per line, no globs,
5
+ # no quotes.
6
+ #
7
+ # Leading and trailing whitespace is stripped from filenames, but internal
8
+ # whitespace is preserved.
9
+ #
10
+ # Lines starting with a hash mark, such as this one, are comments. The hash
11
+ # mark must be the first character on the line. Blank lines are ignored.
12
+ #
13
+ # The .beautify-ignore file must be encoded in UTF-8.
14
+
15
+ boost
16
+ contrib
17
+ irstlm
18
+ jam-files
19
+ lm
20
+ mingw/MosesGUI/icons_rc.py
21
+ mingw/MosesGUI/Ui_credits.py
22
+ mingw/MosesGUI/Ui_mainWindow.py
23
+ moses/TranslationModel/UG
24
+ moses/server
25
+ moses/parameters
26
+ moses/thread_safe_container.h
27
+ phrase-extract/pcfg-common
28
+ phrase-extract/syntax-common
29
+ randlm
30
+ # Filename suffixes in here are language codes, so e.g. ".pl" means
31
+ # Polish, not Perl.
32
+ scripts/share/nonbreaking_prefixes
33
+ search
34
+ srilm
35
+ util
36
+ xmlrpc-c
37
+ .git
38
+ util/ug_cache_with_timeout.h
mosesdecoder/.gitignore ADDED
@@ -0,0 +1,90 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ tools
2
+ *.d
3
+ *.pyc
4
+ *.lo
5
+ *.o
6
+ *.so
7
+ *.lo
8
+ *.o
9
+ *.la
10
+ *.a
11
+ *.swp
12
+ *.save
13
+ *.cmd
14
+ *~
15
+ *.gch
16
+ dist*
17
+ jam-files/bjam
18
+ jam-files/engine/bootstrap
19
+ jam-files/engine/bin.*
20
+ lm/build_binary
21
+ lm/query
22
+ mert/evaluator
23
+ mert/extractor
24
+ mert/hgdecode
25
+ mert/mert
26
+ mert/megam_i686.opt
27
+ mert/pro
28
+ mert/kbmira
29
+ misc/processLexicalTable
30
+ misc/processPhraseTable
31
+ misc/queryLexicalTable
32
+ mira/mira
33
+ mira/Makefile
34
+ mira/Makefile.in
35
+ misc/queryPhraseTable
36
+ moses-chart-cmd/src/moses_chart
37
+ moses-cmd/src/checkplf
38
+ moses-cmd/src/lmbrgrid
39
+ moses-cmd/src/moses
40
+ regression-testing/moses-reg-test-data-*
41
+ regression-testing/tests/mert.extractor-bin/FEATSTAT*
42
+ regression-testing/tests/mert.extractor-bin/SCORESTAT*
43
+ scripts/ems/biconcor/biconcor
44
+ scripts/release-exclude
45
+ scripts/training/cmert-0.5/mert
46
+ scripts/training/compact-rule-table/tools/compactify
47
+ scripts/training/eppex/counter
48
+ scripts/training/eppex/eppex
49
+ scripts/training/lexical-reordering/score
50
+ scripts/training/memscore/memscore
51
+ scripts/training/mbr/mbr
52
+ scripts/training/phrase-extract/consolidate
53
+ scripts/training/phrase-extract/consolidate-direct
54
+ scripts/training/phrase-extract/consolidate-reverse
55
+ scripts/training/phrase-extract/extract
56
+ scripts/training/phrase-extract/extract-ghkm/tools/extract-ghkm
57
+ scripts/training/phrase-extract/extract-lex
58
+ scripts/training/phrase-extract/extract-rules
59
+ scripts/training/phrase-extract/relax-parse
60
+ scripts/training/phrase-extract/score
61
+ scripts/training/phrase-extract/statistics
62
+ scripts/training/symal/symal
63
+ dist
64
+ bin
65
+ previous.sh
66
+ contrib/other-builds/*.xcodeproj/project.xcworkspace/
67
+ contrib/other-builds/*.xcodeproj/xcuserdata/
68
+ */*.xcodeproj/project.xcworkspace
69
+ */*.xcodeproj/xcuserdata
70
+
71
+ mert/sentence-bleu
72
+ mert/sentence-bleu-nbest
73
+ ._*
74
+ .DS_Store
75
+ *.pbxuser
76
+ *.mode1v3
77
+
78
+ *.exe
79
+ build/
80
+ nbproject/
81
+
82
+ mingw/MosesGUI/MosesGUI.e4p
83
+ mingw/MosesGUI/_eric4project/
84
+
85
+ contrib/m4m/merge-sorted
86
+ mert/hgdecode
87
+ .bash_history*
88
+ doxygen.conf
89
+ doxy
90
+ opt
mosesdecoder/.gitmodules ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ [submodule "contrib/arrow-pipelines/python/pcl"]
2
+ path = contrib/arrow-pipelines/python/pcl
3
+ url = https://github.com/ianj-als/pcl.git
4
+ [submodule "contrib/omtc/omtc"]
5
+ path = contrib/omtc/omtc
6
+ url = https://github.com/ianj-als/omtc.git
7
+ [submodule "regtest"]
8
+ path = regtest
9
+ url = https://github.com/moses-smt/moses-regression-tests
mosesdecoder/.travis.yml ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ sudo: false
2
+ dist: trusty
3
+ language: c
4
+ compiler: gcc
5
+ env:
6
+ matrix:
7
+ addons:
8
+ apt:
9
+ sources:
10
+ - ubuntu-toolchain-r-test
11
+ packages:
12
+ - subversion
13
+ - automake
14
+ - libtool
15
+ - zlib1g-dev
16
+ - libbz2-dev
17
+ - liblzma-dev
18
+ - libboost-all-dev
19
+ - libgoogle-perftools-dev
20
+ - libxmlrpc-c++.*-dev
21
+ - cmake
22
+ - csh
23
+ script:
24
+ - ./bjam -j4
mosesdecoder/COPYING ADDED
@@ -0,0 +1,460 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ GNU LESSER GENERAL PUBLIC LICENSE
3
+ Version 2.1, February 1999
4
+
5
+ Copyright (C) 1991, 1999 Free Software Foundation, Inc.
6
+ 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
7
+ Everyone is permitted to copy and distribute verbatim copies
8
+ of this license document, but changing it is not allowed.
9
+
10
+ [This is the first released version of the Lesser GPL. It also counts
11
+ as the successor of the GNU Library Public License, version 2, hence
12
+ the version number 2.1.]
13
+
14
+ Preamble
15
+
16
+ The licenses for most software are designed to take away your
17
+ freedom to share and change it. By contrast, the GNU General Public
18
+ Licenses are intended to guarantee your freedom to share and change
19
+ free software--to make sure the software is free for all its users.
20
+
21
+ This license, the Lesser General Public License, applies to some
22
+ specially designated software packages--typically libraries--of the
23
+ Free Software Foundation and other authors who decide to use it. You
24
+ can use it too, but we suggest you first think carefully about whether
25
+ this license or the ordinary General Public License is the better
26
+ strategy to use in any particular case, based on the explanations
27
+ below.
28
+
29
+ When we speak of free software, we are referring to freedom of use,
30
+ not price. Our General Public Licenses are designed to make sure that
31
+ you have the freedom to distribute copies of free software (and charge
32
+ for this service if you wish); that you receive source code or can get
33
+ it if you want it; that you can change the software and use pieces of
34
+ it in new free programs; and that you are informed that you can do
35
+ these things.
36
+
37
+ To protect your rights, we need to make restrictions that forbid
38
+ distributors to deny you these rights or to ask you to surrender these
39
+ rights. These restrictions translate to certain responsibilities for
40
+ you if you distribute copies of the library or if you modify it.
41
+
42
+ For example, if you distribute copies of the library, whether gratis
43
+ or for a fee, you must give the recipients all the rights that we gave
44
+ you. You must make sure that they, too, receive or can get the source
45
+ code. If you link other code with the library, you must provide
46
+ complete object files to the recipients, so that they can relink them
47
+ with the library after making changes to the library and recompiling
48
+ it. And you must show them these terms so they know their rights.
49
+
50
+ We protect your rights with a two-step method: (1) we copyright the
51
+ library, and (2) we offer you this license, which gives you legal
52
+ permission to copy, distribute and/or modify the library.
53
+
54
+ To protect each distributor, we want to make it very clear that
55
+ there is no warranty for the free library. Also, if the library is
56
+ modified by someone else and passed on, the recipients should know
57
+ that what they have is not the original version, so that the original
58
+ author's reputation will not be affected by problems that might be
59
+ introduced by others.
60
+
61
+ Finally, software patents pose a constant threat to the existence of
62
+ any free program. We wish to make sure that a company cannot
63
+ effectively restrict the users of a free program by obtaining a
64
+ restrictive license from a patent holder. Therefore, we insist that
65
+ any patent license obtained for a version of the library must be
66
+ consistent with the full freedom of use specified in this license.
67
+
68
+ Most GNU software, including some libraries, is covered by the
69
+ ordinary GNU General Public License. This license, the GNU Lesser
70
+ General Public License, applies to certain designated libraries, and
71
+ is quite different from the ordinary General Public License. We use
72
+ this license for certain libraries in order to permit linking those
73
+ libraries into non-free programs.
74
+
75
+ When a program is linked with a library, whether statically or using
76
+ a shared library, the combination of the two is legally speaking a
77
+ combined work, a derivative of the original library. The ordinary
78
+ General Public License therefore permits such linking only if the
79
+ entire combination fits its criteria of freedom. The Lesser General
80
+ Public License permits more lax criteria for linking other code with
81
+ the library.
82
+
83
+ We call this license the "Lesser" General Public License because it
84
+ does Less to protect the user's freedom than the ordinary General
85
+ Public License. It also provides other free software developers Less
86
+ of an advantage over competing non-free programs. These disadvantages
87
+ are the reason we use the ordinary General Public License for many
88
+ libraries. However, the Lesser license provides advantages in certain
89
+ special circumstances.
90
+
91
+ For example, on rare occasions, there may be a special need to
92
+ encourage the widest possible use of a certain library, so that it
93
+ becomes a de-facto standard. To achieve this, non-free programs must
94
+ be allowed to use the library. A more frequent case is that a free
95
+ library does the same job as widely used non-free libraries. In this
96
+ case, there is little to gain by limiting the free library to free
97
+ software only, so we use the Lesser General Public License.
98
+
99
+ In other cases, permission to use a particular library in non-free
100
+ programs enables a greater number of people to use a large body of
101
+ free software. For example, permission to use the GNU C Library in
102
+ non-free programs enables many more people to use the whole GNU
103
+ operating system, as well as its variant, the GNU/Linux operating
104
+ system.
105
+
106
+ Although the Lesser General Public License is Less protective of the
107
+ users' freedom, it does ensure that the user of a program that is
108
+ linked with the Library has the freedom and the wherewithal to run
109
+ that program using a modified version of the Library.
110
+
111
+ The precise terms and conditions for copying, distribution and
112
+ modification follow. Pay close attention to the difference between a
113
+ "work based on the library" and a "work that uses the library". The
114
+ former contains code derived from the library, whereas the latter must
115
+ be combined with the library in order to run.
116
+
117
+ GNU LESSER GENERAL PUBLIC LICENSE
118
+ TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION
119
+
120
+ 0. This License Agreement applies to any software library or other
121
+ program which contains a notice placed by the copyright holder or
122
+ other authorized party saying it may be distributed under the terms of
123
+ this Lesser General Public License (also called "this License").
124
+ Each licensee is addressed as "you".
125
+
126
+ A "library" means a collection of software functions and/or data
127
+ prepared so as to be conveniently linked with application programs
128
+ (which use some of those functions and data) to form executables.
129
+
130
+ The "Library", below, refers to any such software library or work
131
+ which has been distributed under these terms. A "work based on the
132
+ Library" means either the Library or any derivative work under
133
+ copyright law: that is to say, a work containing the Library or a
134
+ portion of it, either verbatim or with modifications and/or translated
135
+ straightforwardly into another language. (Hereinafter, translation is
136
+ included without limitation in the term "modification".)
137
+
138
+ "Source code" for a work means the preferred form of the work for
139
+ making modifications to it. For a library, complete source code means
140
+ all the source code for all modules it contains, plus any associated
141
+ interface definition files, plus the scripts used to control
142
+ compilation and installation of the library.
143
+
144
+ Activities other than copying, distribution and modification are not
145
+ covered by this License; they are outside its scope. The act of
146
+ running a program using the Library is not restricted, and output from
147
+ such a program is covered only if its contents constitute a work based
148
+ on the Library (independent of the use of the Library in a tool for
149
+ writing it). Whether that is true depends on what the Library does
150
+ and what the program that uses the Library does.
151
+
152
+ 1. You may copy and distribute verbatim copies of the Library's
153
+ complete source code as you receive it, in any medium, provided that
154
+ you conspicuously and appropriately publish on each copy an
155
+ appropriate copyright notice and disclaimer of warranty; keep intact
156
+ all the notices that refer to this License and to the absence of any
157
+ warranty; and distribute a copy of this License along with the
158
+ Library.
159
+
160
+ You may charge a fee for the physical act of transferring a copy,
161
+ and you may at your option offer warranty protection in exchange for a
162
+ fee.
163
+
164
+ 2. You may modify your copy or copies of the Library or any portion
165
+ of it, thus forming a work based on the Library, and copy and
166
+ distribute such modifications or work under the terms of Section 1
167
+ above, provided that you also meet all of these conditions:
168
+
169
+ a) The modified work must itself be a software library.
170
+
171
+ b) You must cause the files modified to carry prominent notices
172
+ stating that you changed the files and the date of any change.
173
+
174
+ c) You must cause the whole of the work to be licensed at no
175
+ charge to all third parties under the terms of this License.
176
+
177
+ d) If a facility in the modified Library refers to a function or a
178
+ table of data to be supplied by an application program that uses
179
+ the facility, other than as an argument passed when the facility
180
+ is invoked, then you must make a good faith effort to ensure that,
181
+ in the event an application does not supply such function or
182
+ table, the facility still operates, and performs whatever part of
183
+ its purpose remains meaningful.
184
+
185
+ (For example, a function in a library to compute square roots has
186
+ a purpose that is entirely well-defined independent of the
187
+ application. Therefore, Subsection 2d requires that any
188
+ application-supplied function or table used by this function must
189
+ be optional: if the application does not supply it, the square
190
+ root function must still compute square roots.)
191
+
192
+ These requirements apply to the modified work as a whole. If
193
+ identifiable sections of that work are not derived from the Library,
194
+ and can be reasonably considered independent and separate works in
195
+ themselves, then this License, and its terms, do not apply to those
196
+ sections when you distribute them as separate works. But when you
197
+ distribute the same sections as part of a whole which is a work based
198
+ on the Library, the distribution of the whole must be on the terms of
199
+ this License, whose permissions for other licensees extend to the
200
+ entire whole, and thus to each and every part regardless of who wrote
201
+ it.
202
+
203
+ Thus, it is not the intent of this section to claim rights or contest
204
+ your rights to work written entirely by you; rather, the intent is to
205
+ exercise the right to control the distribution of derivative or
206
+ collective works based on the Library.
207
+
208
+ In addition, mere aggregation of another work not based on the Library
209
+ with the Library (or with a work based on the Library) on a volume of
210
+ a storage or distribution medium does not bring the other work under
211
+ the scope of this License.
212
+
213
+ 3. You may opt to apply the terms of the ordinary GNU General Public
214
+ License instead of this License to a given copy of the Library. To do
215
+ this, you must alter all the notices that refer to this License, so
216
+ that they refer to the ordinary GNU General Public License, version 2,
217
+ instead of to this License. (If a newer version than version 2 of the
218
+ ordinary GNU General Public License has appeared, then you can specify
219
+ that version instead if you wish.) Do not make any other change in
220
+ these notices.
221
+
222
+ Once this change is made in a given copy, it is irreversible for
223
+ that copy, so the ordinary GNU General Public License applies to all
224
+ subsequent copies and derivative works made from that copy.
225
+
226
+ This option is useful when you wish to copy part of the code of
227
+ the Library into a program that is not a library.
228
+
229
+ 4. You may copy and distribute the Library (or a portion or
230
+ derivative of it, under Section 2) in object code or executable form
231
+ under the terms of Sections 1 and 2 above provided that you accompany
232
+ it with the complete corresponding machine-readable source code, which
233
+ must be distributed under the terms of Sections 1 and 2 above on a
234
+ medium customarily used for software interchange.
235
+
236
+ If distribution of object code is made by offering access to copy
237
+ from a designated place, then offering equivalent access to copy the
238
+ source code from the same place satisfies the requirement to
239
+ distribute the source code, even though third parties are not
240
+ compelled to copy the source along with the object code.
241
+
242
+ 5. A program that contains no derivative of any portion of the
243
+ Library, but is designed to work with the Library by being compiled or
244
+ linked with it, is called a "work that uses the Library". Such a
245
+ work, in isolation, is not a derivative work of the Library, and
246
+ therefore falls outside the scope of this License.
247
+
248
+ However, linking a "work that uses the Library" with the Library
249
+ creates an executable that is a derivative of the Library (because it
250
+ contains portions of the Library), rather than a "work that uses the
251
+ library". The executable is therefore covered by this License.
252
+ Section 6 states terms for distribution of such executables.
253
+
254
+ When a "work that uses the Library" uses material from a header file
255
+ that is part of the Library, the object code for the work may be a
256
+ derivative work of the Library even though the source code is not.
257
+ Whether this is true is especially significant if the work can be
258
+ linked without the Library, or if the work is itself a library. The
259
+ threshold for this to be true is not precisely defined by law.
260
+
261
+ If such an object file uses only numerical parameters, data
262
+ structure layouts and accessors, and small macros and small inline
263
+ functions (ten lines or less in length), then the use of the object
264
+ file is unrestricted, regardless of whether it is legally a derivative
265
+ work. (Executables containing this object code plus portions of the
266
+ Library will still fall under Section 6.)
267
+
268
+ Otherwise, if the work is a derivative of the Library, you may
269
+ distribute the object code for the work under the terms of Section 6.
270
+ Any executables containing that work also fall under Section 6,
271
+ whether or not they are linked directly with the Library itself.
272
+
273
+ 6. As an exception to the Sections above, you may also combine or
274
+ link a "work that uses the Library" with the Library to produce a
275
+ work containing portions of the Library, and distribute that work
276
+ under terms of your choice, provided that the terms permit
277
+ modification of the work for the customer's own use and reverse
278
+ engineering for debugging such modifications.
279
+
280
+ You must give prominent notice with each copy of the work that the
281
+ Library is used in it and that the Library and its use are covered by
282
+ this License. You must supply a copy of this License. If the work
283
+ during execution displays copyright notices, you must include the
284
+ copyright notice for the Library among them, as well as a reference
285
+ directing the user to the copy of this License. Also, you must do one
286
+ of these things:
287
+
288
+ a) Accompany the work with the complete corresponding
289
+ machine-readable source code for the Library including whatever
290
+ changes were used in the work (which must be distributed under
291
+ Sections 1 and 2 above); and, if the work is an executable linked
292
+ with the Library, with the complete machine-readable "work that
293
+ uses the Library", as object code and/or source code, so that the
294
+ user can modify the Library and then relink to produce a modified
295
+ executable containing the modified Library. (It is understood
296
+ that the user who changes the contents of definitions files in the
297
+ Library will not necessarily be able to recompile the application
298
+ to use the modified definitions.)
299
+
300
+ b) Use a suitable shared library mechanism for linking with the
301
+ Library. A suitable mechanism is one that (1) uses at run time a
302
+ copy of the library already present on the user's computer system,
303
+ rather than copying library functions into the executable, and (2)
304
+ will operate properly with a modified version of the library, if
305
+ the user installs one, as long as the modified version is
306
+ interface-compatible with the version that the work was made with.
307
+
308
+ c) Accompany the work with a written offer, valid for at least
309
+ three years, to give the same user the materials specified in
310
+ Subsection 6a, above, for a charge no more than the cost of
311
+ performing this distribution.
312
+
313
+ d) If distribution of the work is made by offering access to copy
314
+ from a designated place, offer equivalent access to copy the above
315
+ specified materials from the same place.
316
+
317
+ e) Verify that the user has already received a copy of these
318
+ materials or that you have already sent this user a copy.
319
+
320
+ For an executable, the required form of the "work that uses the
321
+ Library" must include any data and utility programs needed for
322
+ reproducing the executable from it. However, as a special exception,
323
+ the materials to be distributed need not include anything that is
324
+ normally distributed (in either source or binary form) with the major
325
+ components (compiler, kernel, and so on) of the operating system on
326
+ which the executable runs, unless that component itself accompanies
327
+ the executable.
328
+
329
+ It may happen that this requirement contradicts the license
330
+ restrictions of other proprietary libraries that do not normally
331
+ accompany the operating system. Such a contradiction means you cannot
332
+ use both them and the Library together in an executable that you
333
+ distribute.
334
+
335
+ 7. You may place library facilities that are a work based on the
336
+ Library side-by-side in a single library together with other library
337
+ facilities not covered by this License, and distribute such a combined
338
+ library, provided that the separate distribution of the work based on
339
+ the Library and of the other library facilities is otherwise
340
+ permitted, and provided that you do these two things:
341
+
342
+ a) Accompany the combined library with a copy of the same work
343
+ based on the Library, uncombined with any other library
344
+ facilities. This must be distributed under the terms of the
345
+ Sections above.
346
+
347
+ b) Give prominent notice with the combined library of the fact
348
+ that part of it is a work based on the Library, and explaining
349
+ where to find the accompanying uncombined form of the same work.
350
+
351
+ 8. You may not copy, modify, sublicense, link with, or distribute
352
+ the Library except as expressly provided under this License. Any
353
+ attempt otherwise to copy, modify, sublicense, link with, or
354
+ distribute the Library is void, and will automatically terminate your
355
+ rights under this License. However, parties who have received copies,
356
+ or rights, from you under this License will not have their licenses
357
+ terminated so long as such parties remain in full compliance.
358
+
359
+ 9. You are not required to accept this License, since you have not
360
+ signed it. However, nothing else grants you permission to modify or
361
+ distribute the Library or its derivative works. These actions are
362
+ prohibited by law if you do not accept this License. Therefore, by
363
+ modifying or distributing the Library (or any work based on the
364
+ Library), you indicate your acceptance of this License to do so, and
365
+ all its terms and conditions for copying, distributing or modifying
366
+ the Library or works based on it.
367
+
368
+ 10. Each time you redistribute the Library (or any work based on the
369
+ Library), the recipient automatically receives a license from the
370
+ original licensor to copy, distribute, link with or modify the Library
371
+ subject to these terms and conditions. You may not impose any further
372
+ restrictions on the recipients' exercise of the rights granted herein.
373
+ You are not responsible for enforcing compliance by third parties with
374
+ this License.
375
+
376
+ 11. If, as a consequence of a court judgment or allegation of patent
377
+ infringement or for any other reason (not limited to patent issues),
378
+ conditions are imposed on you (whether by court order, agreement or
379
+ otherwise) that contradict the conditions of this License, they do not
380
+ excuse you from the conditions of this License. If you cannot
381
+ distribute so as to satisfy simultaneously your obligations under this
382
+ License and any other pertinent obligations, then as a consequence you
383
+ may not distribute the Library at all. For example, if a patent
384
+ license would not permit royalty-free redistribution of the Library by
385
+ all those who receive copies directly or indirectly through you, then
386
+ the only way you could satisfy both it and this License would be to
387
+ refrain entirely from distribution of the Library.
388
+
389
+ If any portion of this section is held invalid or unenforceable under
390
+ any particular circumstance, the balance of the section is intended to
391
+ apply, and the section as a whole is intended to apply in other
392
+ circumstances.
393
+
394
+ It is not the purpose of this section to induce you to infringe any
395
+ patents or other property right claims or to contest validity of any
396
+ such claims; this section has the sole purpose of protecting the
397
+ integrity of the free software distribution system which is
398
+ implemented by public license practices. Many people have made
399
+ generous contributions to the wide range of software distributed
400
+ through that system in reliance on consistent application of that
401
+ system; it is up to the author/donor to decide if he or she is willing
402
+ to distribute software through any other system and a licensee cannot
403
+ impose that choice.
404
+
405
+ This section is intended to make thoroughly clear what is believed to
406
+ be a consequence of the rest of this License.
407
+
408
+ 12. If the distribution and/or use of the Library is restricted in
409
+ certain countries either by patents or by copyrighted interfaces, the
410
+ original copyright holder who places the Library under this License
411
+ may add an explicit geographical distribution limitation excluding those
412
+ countries, so that distribution is permitted only in or among
413
+ countries not thus excluded. In such case, this License incorporates
414
+ the limitation as if written in the body of this License.
415
+
416
+ 13. The Free Software Foundation may publish revised and/or new
417
+ versions of the Lesser General Public License from time to time.
418
+ Such new versions will be similar in spirit to the present version,
419
+ but may differ in detail to address new problems or concerns.
420
+
421
+ Each version is given a distinguishing version number. If the Library
422
+ specifies a version number of this License which applies to it and
423
+ "any later version", you have the option of following the terms and
424
+ conditions either of that version or of any later version published by
425
+ the Free Software Foundation. If the Library does not specify a
426
+ license version number, you may choose any version ever published by
427
+ the Free Software Foundation.
428
+
429
+ 14. If you wish to incorporate parts of the Library into other free
430
+ programs whose distribution conditions are incompatible with these,
431
+ write to the author to ask for permission. For software which is
432
+ copyrighted by the Free Software Foundation, write to the Free
433
+ Software Foundation; we sometimes make exceptions for this. Our
434
+ decision will be guided by the two goals of preserving the free status
435
+ of all derivatives of our free software and of promoting the sharing
436
+ and reuse of software generally.
437
+
438
+ NO WARRANTY
439
+
440
+ 15. BECAUSE THE LIBRARY IS LICENSED FREE OF CHARGE, THERE IS NO
441
+ WARRANTY FOR THE LIBRARY, TO THE EXTENT PERMITTED BY APPLICABLE LAW.
442
+ EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR
443
+ OTHER PARTIES PROVIDE THE LIBRARY "AS IS" WITHOUT WARRANTY OF ANY
444
+ KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE
445
+ IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
446
+ PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE
447
+ LIBRARY IS WITH YOU. SHOULD THE LIBRARY PROVE DEFECTIVE, YOU ASSUME
448
+ THE COST OF ALL NECESSARY SERVICING, REPAIR OR CORRECTION.
449
+
450
+ 16. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN
451
+ WRITING WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY
452
+ AND/OR REDISTRIBUTE THE LIBRARY AS PERMITTED ABOVE, BE LIABLE TO YOU
453
+ FOR DAMAGES, INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR
454
+ CONSEQUENTIAL DAMAGES ARISING OUT OF THE USE OR INABILITY TO USE THE
455
+ LIBRARY (INCLUDING BUT NOT LIMITED TO LOSS OF DATA OR DATA BEING
456
+ RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD PARTIES OR A
457
+ FAILURE OF THE LIBRARY TO OPERATE WITH ANY OTHER SOFTWARE), EVEN IF
458
+ SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH
459
+ DAMAGES.
460
+
mosesdecoder/Jamroot ADDED
@@ -0,0 +1,345 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #BUILDING MOSES
2
+
3
+ #PACKAGES
4
+ #Language models (optional):
5
+ #--with-irstlm=/path/to/irstlm
6
+ #--with-srilm=/path/to/srilm See moses/LM/Jamfile for more options.
7
+ #--with-maxent-srilm=true (requires a maxent-enabled version of SRILM to be specified via --with-srilm)
8
+ #--with-nplm=/path/to/nplm
9
+ #--with-randlm=/path/to/randlm
10
+ #KenLM is always compiled.
11
+ #
12
+ #--with-boost=/path/to/boost
13
+ #If Boost is in a non-standard location, specify it here. This directory is
14
+ #expected to contain include and lib or lib64.
15
+ #
16
+ #--with-xmlrpc-c=/path/to/xmlrpc-c for libxmlrpc-c (used by server)
17
+ #Note that, like language models, this is the --prefix where the library was
18
+ #installed, not some executable within the library.
19
+ #
20
+ #--no-xmlrpc-c
21
+ # Don't use xmlrpc-c library, even if it exists. Don't build moses server
22
+ #
23
+ #Compact phrase table and compact lexical reordering table
24
+ #--with-cmph=/path/to/cmph
25
+ #
26
+ #Thread-caching malloc (if present, used for multi-threaded builds by default)
27
+ #--without-tcmalloc does not compile with tcmalloc even if present
28
+ #--full-tcmalloc links against the full version (useful for memory profiling)
29
+ #
30
+ #REGRESSION TESTING
31
+ #--with-regtest=/path/to/moses-reg-test-data
32
+ #
33
+ #INSTALLATION
34
+ #--prefix=/path/to/prefix sets the install prefix [default is source root].
35
+ #--bindir=/path/to/prefix/bin sets the bin directory [PREFIX/bin]
36
+ #--libdir=/path/to/prefix/lib sets the lib directory [PREFIX/lib]
37
+ #--includedir=/path/to/prefix/include installs headers.
38
+ # Does not install if missing. No argument defaults to PREFIX/include .
39
+ #--install-scripts=/path/to/scripts copies scripts into a directory.
40
+ # Does not install if missing. No argument defaults to PREFIX/scripts .
41
+ #--git appends the git revision to the prefix directory.
42
+ #
43
+ #
44
+ #BUILD OPTIONS
45
+ # By default, the build is multi-threaded, optimized, and statically linked.
46
+ # Pass these to change the build:
47
+ #
48
+ # threading=single|multi controls threading (default multi)
49
+ #
50
+ # variant=release|debug|profile builds optimized (default), for debug, or for
51
+ # profiling
52
+ #
53
+ # link=static|shared controls preferred linking (default static)
54
+ # --static forces static linking (the default will fall
55
+ # back to shared)
56
+ #
57
+ # debug-symbols=on|off include or exclude (default) debugging
58
+ # information also known as -g
59
+ # --notrace compiles without TRACE macros
60
+ #
61
+ # --enable-boost-pool uses Boost pools for the memory SCFG table
62
+ #
63
+ # --enable-mpi switch on mpi
64
+ # --without-libsegfault does not link with libSegFault
65
+ #
66
+ # --max-kenlm-order maximum ngram order that kenlm can process (default 6)
67
+ #
68
+ # --max-factors maximum number of factors (default 4)
69
+ #
70
+ # --unlabelled-source ignore source labels (redundant in hiero or string-to-tree system)
71
+ # for better performance
72
+ #CONTROLLING THE BUILD
73
+ #-a to build from scratch
74
+ #-j$NCPUS to compile in parallel
75
+ #--clean to clean
76
+ #--debug-build to build with -Og. Only available with gcc 4.8+
77
+
78
+ import os ;
79
+ import option ;
80
+ import modules ;
81
+ import path ;
82
+ path-constant TOP : . ;
83
+
84
+ include $(TOP)/jam-files/sanity.jam ;
85
+
86
+ home = [ os.environ "HOME" ] ;
87
+ if [ path.exists $(home)/moses-environment.jam ]
88
+ {
89
+ # for those of us who don't like typing in command line bjam options all day long
90
+ include $(home)/moses-environment.jam ;
91
+ }
92
+ include $(TOP)/jam-files/check-environment.jam ; # get resource locations
93
+ # from environment variables
94
+ include $(TOP)/jam-files/xmlrpc-c.jam ; # xmlrpc-c stuff for the server
95
+ # include $(TOP)/jam-files/curlpp.jam ; # curlpp stuff for bias lookup (MMT only)
96
+
97
+ # exit "done" : 0 ;
98
+
99
+ max-order = [ option.get "max-kenlm-order" : 6 : 6 ] ;
100
+ if ! [ option.get "max-kenlm-order" ]
101
+ {
102
+ # some classes in Moses pull in header files from KenLM, so this needs to be
103
+ # defined here, not in moses/lm/Jamfile
104
+ option.set "max-kenlm-order" : 6 ;
105
+ requirements += <define>KENLM_MAX_ORDER=$(max-order) ;
106
+ }
107
+ # exit "all done" : 0 ;
108
+
109
+ boost 104400 ;
110
+ external-lib z ;
111
+
112
+ #lib dl : : <runtime-link>static:<link>static <runtime-link>shared:<link>shared ;
113
+ #requirements += <library>dl ;
114
+ requirements += <cxxflags>-std=c++0x ;
115
+
116
+ # Allow moses to report the git commit hash of the version used for compilation
117
+ moses_githash = [ _shell "git describe --dirty" ] ;
118
+ requirements += <define>MOSES_VERSION_ID=\\\"$(moses_githash)\\\" ;
119
+
120
+ if ! [ option.get "without-tcmalloc" : : "yes" ] && [ test_library "tcmalloc_minimal" ] {
121
+ if [ option.get "full-tcmalloc" : : "yes" ] {
122
+ external-lib unwind ;
123
+ external-lib tcmalloc_and_profiler : : unwind ;
124
+ requirements += <library>tcmalloc_and_profiler <library>unwind <cflags>-fno-omit-frame-pointer <cxxflags>-fno-omit-frame-pointer ;
125
+ } else {
126
+ external-lib tcmalloc_minimal ;
127
+ requirements += <threading>multi:<library>tcmalloc_minimal ;
128
+ }
129
+ } else {
130
+ echo "Tip: install tcmalloc for faster threading. See BUILD-INSTRUCTIONS.txt for more information." ;
131
+ }
132
+
133
+ if [ option.get "filter-warnings" : : "yes" ] {
134
+ # given the low coding standards in Moses, we may want to filter out
135
+ # warnings about poor coding practice that no-one is ever going to fix
136
+ # anyway ...
137
+ requirements += <cxxflags>-Wno-deprecated ;
138
+ requirements += <cxxflags>-Wno-reorder ;
139
+ requirements += <cxxflags>-Wno-sign-compare ;
140
+ requirements += <cxxflags>-Wno-unused-but-set-variable ;
141
+ requirements += <cxxflags>-Wno-unused-result ;
142
+ requirements += <cxxflags>-Wno-unused-variable ;
143
+ requirements += <cxxflags>-Wno-comment ;
144
+ requirements += <cxxflags>-Wno-strict-aliasing ;
145
+ requirements += <cxxflags>-Wno-overloaded-virtual ;
146
+ }
147
+
148
+ if [ option.get "debug-build" : : "yes" ] {
149
+ requirements += <cxxflags>-Og ;
150
+ echo "Building with -Og to enable easier profiling and debugging. Only available on gcc 4.8+." ;
151
+ }
152
+
153
+ if [ option.get "with-address-sanitizer" : : "yes" ] {
154
+ requirements += <cxxflags>-fsanitize=address ;
155
+ requirements += <cxxflags>-fno-omit-frame-pointer ;
156
+ requirements += <linkflags>-fsanitize=address ;
157
+ echo "Building with AddressSanitizer to enable debugging of memory errors. Only available on gcc 4.8+." ;
158
+ }
159
+
160
+ if [ option.get "enable-mpi" : : "yes" ] {
161
+ import mpi ;
162
+ using mpi ;
163
+ external-lib boost_mpi ;
164
+ external-lib boost_serialization ;
165
+ requirements += <define>MPI_ENABLE ;
166
+ requirements += <library>mpi ;
167
+ requirements += <library>boost_mpi ;
168
+ requirements += <library>boost_serialization ;
169
+ }
170
+
171
+ mmt = [ option.get "mmt" ] ;
172
+ if $(mmt) {
173
+ requirements += <define>MMT ;
174
+ requirements += <include>$(mmt) ;
175
+ mmt_githash = [ _shell "cd $(mmt) && git describe --dirty" ] ;
176
+ requirements += <define>MMT_VERSION_ID=\\\"$(mmt_githash)\\\" ;
177
+ }
178
+
179
+ requirements += [ option.get "notrace" : <define>TRACE_ENABLE=1 ] ;
180
+ requirements += [ option.get "enable-boost-pool" : : <define>USE_BOOST_POOL ] ;
181
+ requirements += [ option.get "with-mm" : : <define>PT_UG ] ;
182
+ requirements += [ option.get "with-mm" : : <define>MAX_NUM_FACTORS=4 ] ;
183
+ requirements += [ option.get "unlabelled-source" : : <define>UNLABELLED_SOURCE ] ;
184
+
185
+ if [ option.get "with-oxlm" ] {
186
+ external-lib gomp ;
187
+ requirements += <library>boost_serialization ;
188
+ requirements += <library>gomp ;
189
+ }
190
+
191
+ if [ option.get "with-cmph" : : "yes" ] {
192
+ requirements += <define>HAVE_CMPH ;
193
+ }
194
+
195
+ if [ option.get "with-icu" : : "yes" ]
196
+ {
197
+ external-lib icuuc ;
198
+ external-lib icuio ;
199
+ external-lib icui18n ;
200
+ requirements += <library>icuuc/<link>shared ;
201
+ requirements += <library>icuio/<link>shared ;
202
+ requirements += <library>icui18n/<link>shared ;
203
+ requirements += <cxxflags>-fPIC ;
204
+ requirements += <address-model>64 ;
205
+ # requirements += <runtime-link>shared ;
206
+ }
207
+
208
+ # for probing pt
209
+ external-lib boost_serialization ;
210
+ requirements += <library>boost_serialization/<runtime-link>static ;
211
+
212
+ if [ option.get "with-vw" ] {
213
+ requirements += <define>HAVE_VW ;
214
+ }
215
+
216
+ project : default-build
217
+ <threading>multi
218
+ <warnings>on
219
+ <debug-symbols>off
220
+ <variant>release
221
+ <link>static
222
+ ;
223
+
224
+ #Apparently OS X likes to link against iconv for fgetsUTF8.
225
+ lib iconv ;
226
+ requirements += <os>MACOSX:<library>iconv ;
227
+
228
+ project : requirements
229
+ <threading>multi:<define>WITH_THREADS
230
+ <threading>multi:<library>boost_thread
231
+ <library>boost_system
232
+ <library>boost_program_options
233
+ <define>_FILE_OFFSET_BITS=64 <define>_LARGE_FILES
234
+ $(requirements)
235
+ <include>.
236
+ ;
237
+
238
+
239
+ #Add directories here if you want their incidental targets too (i.e. tests).
240
+ build-projects lm util phrase-extract phrase-extract/syntax-common search moses moses/LM mert moses-cmd scripts regression-testing ;
241
+ # contrib/mira
242
+
243
+ if [ option.get "with-mm-extras" : : "yes" ]
244
+ {
245
+ alias mm-extras :
246
+ moses/TranslationModel/UG//bitext-find
247
+ moses/TranslationModel/UG//ptable-describe-features
248
+ moses/TranslationModel/UG//count-ptable-features
249
+ moses/TranslationModel/UG//ptable-sigtest-filter
250
+ moses/TranslationModel/UG//ptable-lookup
251
+ moses/TranslationModel/UG//ptable-lookup-corpus
252
+ moses/TranslationModel/UG//check-coverage
253
+ moses/TranslationModel/UG/mm//mtt-demo1
254
+ moses/TranslationModel/UG/mm//mtt-dump
255
+ moses/TranslationModel/UG/mm//mam2symal
256
+ moses/TranslationModel/UG/mm//mam_verify
257
+ moses/TranslationModel/UG/mm//mmlex-lookup
258
+ moses/TranslationModel/UG/mm//mtt-count-words
259
+ moses/TranslationModel/UG/mm//calc-coverage
260
+ moses/TranslationModel/UG//try-align
261
+ ;
262
+ }
263
+ else
264
+ {
265
+ alias mm-extras ;
266
+ }
267
+
268
+ if [ option.get "with-mm" : : "yes" ]
269
+ {
270
+ alias mm :
271
+ moses/TranslationModel/UG/mm//mtt-build
272
+ moses/TranslationModel/UG/mm//symal2mam
273
+ moses/TranslationModel/UG/mm//mmlex-build
274
+ ;
275
+ }
276
+ else
277
+ {
278
+ alias mm ;
279
+ }
280
+
281
+ if [ option.get "with-rephraser" : : "yes" ]
282
+ {
283
+ alias rephraser :
284
+ contrib/rephraser//paraphrase
285
+ ;
286
+ }
287
+ else
288
+ {
289
+ alias rephraser ;
290
+ }
291
+
292
+ alias programs :
293
+ lm//programs
294
+ moses-cmd//programs
295
+ OnDiskPt//CreateOnDiskPt
296
+ OnDiskPt//queryOnDiskPt
297
+ mert//programs
298
+ misc//programs
299
+ symal
300
+ phrase-extract
301
+ phrase-extract//lexical-reordering
302
+ phrase-extract//extract-ghkm
303
+ phrase-extract//pcfg-extract
304
+ phrase-extract//pcfg-score
305
+ phrase-extract//extract-mixed-syntax
306
+ phrase-extract//score-stsg
307
+ phrase-extract//filter-rule-table
308
+ phrase-extract//postprocess-egret-forests
309
+ biconcor
310
+ # contrib/mira//mira
311
+ contrib/server//mosesserver
312
+ mm
313
+ mm-extras
314
+ rephraser
315
+ contrib/c++tokenizer//tokenizer
316
+ contrib/expected-bleu-training//train-expected-bleu
317
+ contrib/expected-bleu-training//prepare-expected-bleu-training
318
+
319
+ probingpt//programs
320
+ moses2//programs
321
+ ;
322
+
323
+
324
+ install-bin-libs programs ;
325
+ install-headers headers-base : [ path.glob-tree biconcor contrib lm mert misc moses-cmd OnDiskPt phrase-extract symal util : *.hh *.h ] : . ;
326
+ install-headers headers-moses : moses//headers-to-install : moses ;
327
+
328
+ alias install : prefix-bin prefix-lib headers-base headers-moses ;
329
+
330
+ if ! [ option.get "includedir" : : $(prefix)/include ] {
331
+ explicit install headers-base headers-moses ;
332
+ }
333
+
334
+ if [ path.exists $(TOP)/dist ] && $(prefix) != dist {
335
+ echo "You have a $(TOP)/dist directory, but the build system now places files directly in the root i.e. $(TOP)/bin ." ;
336
+ echo "To disable this message, delete $(TOP)/dist ." ;
337
+ echo ;
338
+ }
339
+
340
+ #local temp = [ _shell "bash source ./s.sh" ] ;
341
+ local temp = [ _shell "mkdir -p $(PREFIX)/bin" ] ;
342
+ local temp = [ _shell "rm -f $(PREFIX)/bin/moses_chart" ] ;
343
+ local temp = [ _shell "cd $(PREFIX)/bin && ln -sf moses moses_chart" ] ;
344
+ local temp = [ _shell "cd $(PREFIX)/bin && ln -sf CreateProbingPT CreateProbingPT2" ] ;
345
+
mosesdecoder/OnDiskPt/Jamfile ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
# OnDiskPt library target, declared through the project's `fakelib` rule.
fakelib OnDiskPt
  : OnDiskWrapper.cpp
    SourcePhrase.cpp
    TargetPhrase.cpp
    Word.cpp
    Phrase.cpp
    PhraseNode.cpp
    TargetPhraseCollection.cpp
    Vocab.cpp
    OnDiskQuery.cpp
    ../moses//headers
  ;

# Executable built from Main.cpp.
exe CreateOnDiskPt
  : Main.cpp
    ..//boost_filesystem
    ../moses//moses
    OnDiskPt
  ;

# Executable built from queryOnDiskPt.cpp.
exe queryOnDiskPt
  : queryOnDiskPt.cpp
    ..//boost_filesystem
    ../moses//moses
    OnDiskPt
  ;
5
+
mosesdecoder/OnDiskPt/Main.cpp ADDED
@@ -0,0 +1,273 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ // $Id$
2
+ /***********************************************************************
3
+ Moses - factored phrase-based, hierarchical and syntactic language decoder
4
+ Copyright (C) 2009 Hieu Hoang
5
+
6
+ This library is free software; you can redistribute it and/or
7
+ modify it under the terms of the GNU Lesser General Public
8
+ License as published by the Free Software Foundation; either
9
+ version 2.1 of the License, or (at your option) any later version.
10
+
11
+ This library is distributed in the hope that it will be useful,
12
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
13
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14
+ Lesser General Public License for more details.
15
+
16
+ You should have received a copy of the GNU Lesser General Public
17
+ License along with this library; if not, write to the Free Software
18
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
19
+ ***********************************************************************/
20
+
21
+ #include <algorithm>
22
+ #include <iostream>
23
+ #include <string>
24
+ #include <vector>
25
+ #include <iterator>
26
+ #include <cassert>
27
+ #include "moses/InputFileStream.h"
28
+ #include "moses/Timer.h"
29
+ #include "moses/Util.h"
30
+ #include "OnDiskWrapper.h"
31
+ #include "SourcePhrase.h"
32
+ #include "TargetPhrase.h"
33
+ #include "TargetPhraseCollection.h"
34
+ #include "Word.h"
35
+ #include "Vocab.h"
36
+ #include "Main.h"
37
+
38
+ using namespace std;
39
+ using namespace OnDiskPt;
40
+
41
+ int main (int argc, char * const argv[])
42
+ {
43
+ // insert code here...
44
+ Moses::ResetUserTime();
45
+ Moses::PrintUserTime("Starting");
46
+
47
+ if (argc != 8) {
48
+ std::cerr << "Usage: " << argv[0] << " numSourceFactors numTargetFactors numScores tableLimit sortScoreIndex inputPath outputPath" << std::endl;
49
+ return 1;
50
+ }
51
+
52
+ int numSourceFactors = Moses::Scan<int>(argv[1])
53
+ , numTargetFactors = Moses::Scan<int>(argv[2])
54
+ , numScores = Moses::Scan<int>(argv[3])
55
+ , tableLimit = Moses::Scan<int>(argv[4]);
56
+ TargetPhraseCollection::s_sortScoreInd = Moses::Scan<int>(argv[5]);
57
+ assert(TargetPhraseCollection::s_sortScoreInd < numScores);
58
+
59
+ const string filePath = argv[6]
60
+ ,destPath = argv[7];
61
+
62
+ Moses::InputFileStream inStream(filePath);
63
+
64
+ OnDiskWrapper onDiskWrapper;
65
+ onDiskWrapper.BeginSave(destPath, numSourceFactors, numTargetFactors, numScores);
66
+
67
+ PhraseNode &rootNode = onDiskWrapper.GetRootSourceNode();
68
+ size_t lineNum = 0;
69
+ string line;
70
+
71
+ while(getline(inStream, line)) {
72
+ lineNum++;
73
+ if (lineNum%1000 == 0) cerr << "." << flush;
74
+ if (lineNum%10000 == 0) cerr << ":" << flush;
75
+ if (lineNum%100000 == 0) cerr << lineNum << flush;
76
+ //cerr << lineNum << " " << line << endl;
77
+
78
+ std::vector<float> misc(1);
79
+ SourcePhrase sourcePhrase;
80
+ TargetPhrase *targetPhrase = new TargetPhrase(numScores);
81
+ OnDiskPt::PhrasePtr spShort = Tokenize(sourcePhrase, *targetPhrase, line, onDiskWrapper, numScores, misc);
82
+ assert(misc.size() == onDiskWrapper.GetNumCounts());
83
+
84
+ rootNode.AddTargetPhrase(sourcePhrase, targetPhrase, onDiskWrapper, tableLimit, misc, spShort);
85
+ }
86
+
87
+ rootNode.Save(onDiskWrapper, 0, tableLimit);
88
+ onDiskWrapper.EndSave();
89
+
90
+ Moses::PrintUserTime("Finished");
91
+
92
+ //pause();
93
+ return 0;
94
+
95
+ } // main()
96
+
97
+ bool Flush(const OnDiskPt::SourcePhrase *prevSourcePhrase, const OnDiskPt::SourcePhrase *currSourcePhrase)
98
+ {
99
+ if (prevSourcePhrase == NULL)
100
+ return false;
101
+
102
+ assert(currSourcePhrase);
103
+ bool ret = (*currSourcePhrase > *prevSourcePhrase);
104
+ //cerr << *prevSourcePhrase << endl << *currSourcePhrase << " " << ret << endl << endl;
105
+
106
+ return ret;
107
+ }
108
+
109
+ OnDiskPt::PhrasePtr Tokenize(SourcePhrase &sourcePhrase, TargetPhrase &targetPhrase, const std::string &lineStr, OnDiskWrapper &onDiskWrapper, int numScores, vector<float> &misc)
110
+ {
111
+ char line[lineStr.size() + 1];
112
+ strcpy(line, lineStr.c_str());
113
+
114
+ stringstream sparseFeatures, property;
115
+
116
+ size_t scoreInd = 0;
117
+
118
+ // MAIN LOOP
119
+ size_t stage = 0;
120
+ /* 0 = source phrase
121
+ 1 = target phrase
122
+ 2 = scores
123
+ 3 = align
124
+ 4 = count
125
+ 7 = properties
126
+ */
127
+ char *tok = strtok (line," ");
128
+ OnDiskPt::PhrasePtr out(new Phrase());
129
+ while (tok != NULL) {
130
+ if (0 == strcmp(tok, "|||")) {
131
+ ++stage;
132
+ } else {
133
+ switch (stage) {
134
+ case 0: {
135
+ WordPtr w = Tokenize(sourcePhrase, tok, true, true, onDiskWrapper, 1);
136
+ if (w != NULL)
137
+ out->AddWord(w);
138
+
139
+ break;
140
+ }
141
+ case 1: {
142
+ Tokenize(targetPhrase, tok, false, true, onDiskWrapper, 0);
143
+ break;
144
+ }
145
+ case 2: {
146
+ float score = Moses::Scan<float>(tok);
147
+ targetPhrase.SetScore(score, scoreInd);
148
+ ++scoreInd;
149
+ break;
150
+ }
151
+ case 3: {
152
+ //targetPhrase.Create1AlignFromString(tok);
153
+ targetPhrase.CreateAlignFromString(tok);
154
+ break;
155
+ }
156
+ case 4: {
157
+ // store only the 3rd one (rule count)
158
+ float val = Moses::Scan<float>(tok);
159
+ misc[0] = val;
160
+ break;
161
+ }
162
+ case 5: {
163
+ // sparse features
164
+ sparseFeatures << tok << " ";
165
+ break;
166
+ }
167
+ case 6: {
168
+ property << tok << " ";
169
+ break;
170
+ }
171
+ default:
172
+ cerr << "ERROR in line " << line << endl;
173
+ assert(false);
174
+ break;
175
+ }
176
+ }
177
+
178
+ tok = strtok (NULL, " ");
179
+ } // while (tok != NULL)
180
+
181
+ assert(scoreInd == numScores);
182
+ targetPhrase.SetSparseFeatures(Moses::Trim(sparseFeatures.str()));
183
+ targetPhrase.SetProperty(Moses::Trim(property.str()));
184
+ targetPhrase.SortAlign();
185
+ return out;
186
+ } // Tokenize()
187
+
188
+ OnDiskPt::WordPtr Tokenize(OnDiskPt::Phrase &phrase
189
+ , const std::string &token, bool addSourceNonTerm, bool addTargetNonTerm
190
+ , OnDiskPt::OnDiskWrapper &onDiskWrapper, int retSourceTarget)
191
+ {
192
+ // retSourceTarget: 0 = don't return anything. 1 = source, 2 = target
193
+
194
+ bool nonTerm = false;
195
+ size_t tokSize = token.size();
196
+ int comStr =token.compare(0, 1, "[");
197
+
198
+ if (comStr == 0) {
199
+ comStr = token.compare(tokSize - 1, 1, "]");
200
+ nonTerm = comStr == 0;
201
+ }
202
+
203
+ OnDiskPt::WordPtr out;
204
+ if (nonTerm) {
205
+ // non-term
206
+ size_t splitPos = token.find_first_of("[", 2);
207
+ string wordStr = token.substr(0, splitPos);
208
+
209
+ if (splitPos == string::npos) {
210
+ // lhs - only 1 word
211
+ WordPtr word(new Word());
212
+ word->CreateFromString(wordStr, onDiskWrapper.GetVocab());
213
+ phrase.AddWord(word);
214
+ } else {
215
+ // source & target non-terms
216
+ if (addSourceNonTerm) {
217
+ WordPtr word(new Word());
218
+ word->CreateFromString(wordStr, onDiskWrapper.GetVocab());
219
+ phrase.AddWord(word);
220
+
221
+ if (retSourceTarget == 1) {
222
+ out = word;
223
+ }
224
+ }
225
+
226
+ wordStr = token.substr(splitPos, tokSize - splitPos);
227
+ if (addTargetNonTerm) {
228
+ WordPtr word(new Word());
229
+ word->CreateFromString(wordStr, onDiskWrapper.GetVocab());
230
+ phrase.AddWord(word);
231
+
232
+ if (retSourceTarget == 2) {
233
+ out = word;
234
+ }
235
+ }
236
+
237
+ }
238
+ } else {
239
+ // term
240
+ WordPtr word(new Word());
241
+ word->CreateFromString(token, onDiskWrapper.GetVocab());
242
+ phrase.AddWord(word);
243
+ out = word;
244
+ }
245
+
246
+ return out;
247
+ }
248
+
249
+ void InsertTargetNonTerminals(std::vector<std::string> &sourceToks, const std::vector<std::string> &targetToks, const ::AlignType &alignments)
250
+ {
251
+ for (int ind = alignments.size() - 1; ind >= 0; --ind) {
252
+ const ::AlignPair &alignPair = alignments[ind];
253
+ size_t sourcePos = alignPair.first
254
+ ,targetPos = alignPair.second;
255
+
256
+ const string &target = targetToks[targetPos];
257
+ sourceToks.insert(sourceToks.begin() + sourcePos + 1, target);
258
+
259
+ }
260
+ }
261
+
262
+ class AlignOrderer
263
+ {
264
+ public:
265
+ bool operator()(const ::AlignPair &a, const ::AlignPair &b) const {
266
+ return a.first < b.first;
267
+ }
268
+ };
269
+
270
+ void SortAlign(::AlignType &alignments)
271
+ {
272
+ std::sort(alignments.begin(), alignments.end(), AlignOrderer());
273
+ }
mosesdecoder/OnDiskPt/Main.h ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #pragma once
2
+ // $Id$
3
+ /***********************************************************************
4
+ Moses - factored phrase-based, hierarchical and syntactic language decoder
5
+ Copyright (C) 2009 Hieu Hoang
6
+
7
+ This library is free software; you can redistribute it and/or
8
+ modify it under the terms of the GNU Lesser General Public
9
+ License as published by the Free Software Foundation; either
10
+ version 2.1 of the License, or (at your option) any later version.
11
+
12
+ This library is distributed in the hope that it will be useful,
13
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15
+ Lesser General Public License for more details.
16
+
17
+ You should have received a copy of the GNU Lesser General Public
18
+ License along with this library; if not, write to the Free Software
19
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
20
+ ***********************************************************************/
21
+ #include <string>
22
+ #include "SourcePhrase.h"
23
+ #include "TargetPhrase.h"
24
+
25
+ typedef std::pair<size_t, size_t> AlignPair;
26
+ typedef std::vector<AlignPair> AlignType;
27
+
28
+ OnDiskPt::WordPtr Tokenize(OnDiskPt::Phrase &phrase
29
+ , const std::string &token, bool addSourceNonTerm, bool addTargetNonTerm
30
+ , OnDiskPt::OnDiskWrapper &onDiskWrapper, int retSourceTarget);
31
+ OnDiskPt::PhrasePtr Tokenize(OnDiskPt::SourcePhrase &sourcePhrase, OnDiskPt::TargetPhrase &targetPhrase
32
+ , const std::string &lineStr, OnDiskPt::OnDiskWrapper &onDiskWrapper
33
+ , int numScores
34
+ , std::vector<float> &misc);
35
+
36
+ void InsertTargetNonTerminals(std::vector<std::string> &sourceToks, const std::vector<std::string> &targetToks, const AlignType &alignments);
37
+ void SortAlign(AlignType &alignments);
38
+ bool Flush(const OnDiskPt::SourcePhrase *prevSource, const OnDiskPt::SourcePhrase *currSource);
39
+
mosesdecoder/OnDiskPt/OnDiskQuery.cpp ADDED
@@ -0,0 +1,83 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #include "OnDiskQuery.h"
2
+
3
+ namespace OnDiskPt
4
+ {
5
+
6
+ void OnDiskQuery::Tokenize(Phrase &phrase,
7
+ const std::string &token,
8
+ bool addSourceNonTerm,
9
+ bool addTargetNonTerm)
10
+ {
11
+ bool nonTerm = false;
12
+ size_t tokSize = token.size();
13
+ int comStr =token.compare(0, 1, "[");
14
+
15
+ if (comStr == 0) {
16
+ comStr = token.compare(tokSize - 1, 1, "]");
17
+ nonTerm = comStr == 0;
18
+ }
19
+
20
+ if (nonTerm) {
21
+ // non-term
22
+ size_t splitPos = token.find_first_of("[", 2);
23
+ std::string wordStr = token.substr(0, splitPos);
24
+
25
+ if (splitPos == std::string::npos) {
26
+ // lhs - only 1 word
27
+ WordPtr word (new Word());
28
+ word->CreateFromString(wordStr, m_wrapper.GetVocab());
29
+ phrase.AddWord(word);
30
+ } else {
31
+ // source & target non-terms
32
+ if (addSourceNonTerm) {
33
+ WordPtr word( new Word());
34
+ word->CreateFromString(wordStr, m_wrapper.GetVocab());
35
+ phrase.AddWord(word);
36
+ }
37
+
38
+ wordStr = token.substr(splitPos, tokSize - splitPos);
39
+ if (addTargetNonTerm) {
40
+ WordPtr word(new Word());
41
+ word->CreateFromString(wordStr, m_wrapper.GetVocab());
42
+ phrase.AddWord(word);
43
+ }
44
+
45
+ }
46
+ } else {
47
+ // term
48
+ WordPtr word(new Word());
49
+ word->CreateFromString(token, m_wrapper.GetVocab());
50
+ phrase.AddWord(word);
51
+ }
52
+ }
53
+
54
+ SourcePhrase OnDiskQuery::Tokenize(const std::vector<std::string>& tokens)
55
+ {
56
+ SourcePhrase sourcePhrase;
57
+ if (tokens.size() > 0) {
58
+ std::vector<std::string>::const_iterator token = tokens.begin();
59
+ for (; token + 1 != tokens.end(); ++token) {
60
+ Tokenize(sourcePhrase, *token, true, true);
61
+ }
62
+ // last position. LHS non-term
63
+ Tokenize(sourcePhrase, *token, false, true);
64
+ }
65
+ return sourcePhrase;
66
+ }
67
+
68
+ const PhraseNode* OnDiskQuery::Query(const SourcePhrase& sourcePhrase)
69
+ {
70
+ const PhraseNode *node = &m_wrapper.GetRootSourceNode();
71
+ assert(node);
72
+
73
+ for (size_t pos = 0; pos < sourcePhrase.GetSize(); ++pos) {
74
+ const Word &word = sourcePhrase.GetWord(pos);
75
+ node = node->GetChild(word, m_wrapper);
76
+ if (node == NULL) {
77
+ break;
78
+ }
79
+ }
80
+ return node;
81
+ }
82
+
83
+ }
mosesdecoder/OnDiskPt/OnDiskQuery.h ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #pragma once
2
+ #include <string>
3
+ #include <vector>
4
+ #include "OnDiskWrapper.h"
5
+ #include "Phrase.h"
6
+ #include "SourcePhrase.h"
7
+ #include "Word.h"
8
+ #include "PhraseNode.h"
9
+
10
+
11
+ namespace OnDiskPt
12
+ {
13
+
14
+ class OnDiskQuery
15
+ {
16
+ private:
17
+ OnDiskWrapper &m_wrapper;
18
+
19
+ public:
20
+
21
+ OnDiskQuery(OnDiskWrapper &wrapper):m_wrapper(wrapper) {}
22
+
23
+ void Tokenize(Phrase &phrase,
24
+ const std::string &token,
25
+ bool addSourceNonTerm,
26
+ bool addTargetNonTerm);
27
+
28
+ SourcePhrase Tokenize(const std::vector<std::string>& tokens);
29
+
30
+ const PhraseNode *Query(const SourcePhrase& sourcePhrase);
31
+
32
+ inline const PhraseNode *Query(const std::vector<std::string>& tokens) {
33
+ return Query(Tokenize(tokens));
34
+ }
35
+
36
+ };
37
+
38
+
39
+ }
mosesdecoder/OnDiskPt/OnDiskWrapper.cpp ADDED
@@ -0,0 +1,223 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ // $Id$
2
+ /***********************************************************************
3
+ Moses - factored phrase-based, hierarchical and syntactic language decoder
4
+ Copyright (C) 2009 Hieu Hoang
5
+
6
+ This library is free software; you can redistribute it and/or
7
+ modify it under the terms of the GNU Lesser General Public
8
+ License as published by the Free Software Foundation; either
9
+ version 2.1 of the License, or (at your option) any later version.
10
+
11
+ This library is distributed in the hope that it will be useful,
12
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
13
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14
+ Lesser General Public License for more details.
15
+
16
+ You should have received a copy of the GNU Lesser General Public
17
+ License along with this library; if not, write to the Free Software
18
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
19
+ ***********************************************************************/
20
+ #ifdef WIN32
21
+ #include <direct.h>
22
+ #endif
23
+ #include <sys/stat.h>
24
+ #include <string>
25
+ #include "OnDiskWrapper.h"
26
+ #include "moses/Util.h"
27
+ #include "util/exception.hh"
28
+ #include "util/string_stream.hh"
29
+
30
+ using namespace std;
31
+
32
+ namespace OnDiskPt
33
+ {
34
+
35
+ int OnDiskWrapper::VERSION_NUM = 7;
36
+
37
+ OnDiskWrapper::OnDiskWrapper()
38
+ {
39
+ }
40
+
41
+ OnDiskWrapper::~OnDiskWrapper()
42
+ {
43
+ delete m_rootSourceNode;
44
+ }
45
+
46
+ void OnDiskWrapper::BeginLoad(const std::string &filePath)
47
+ {
48
+ if (!OpenForLoad(filePath)) {
49
+ UTIL_THROW(util::FileOpenException, "Couldn't open for loading: " << filePath);
50
+ }
51
+
52
+ if (!m_vocab.Load(*this))
53
+ UTIL_THROW(util::FileOpenException, "Couldn't load vocab");
54
+
55
+ uint64_t rootFilePos = GetMisc("RootNodeOffset");
56
+ m_rootSourceNode = new PhraseNode(rootFilePos, *this);
57
+ }
58
+
59
+ bool OnDiskWrapper::OpenForLoad(const std::string &filePath)
60
+ {
61
+ m_fileSource.open((filePath + "/Source.dat").c_str(), ios::in | ios::binary);
62
+ UTIL_THROW_IF(!m_fileSource.is_open(),
63
+ util::FileOpenException,
64
+ "Couldn't open file " << filePath << "/Source.dat");
65
+
66
+ m_fileTargetInd.open((filePath + "/TargetInd.dat").c_str(), ios::in | ios::binary);
67
+ UTIL_THROW_IF(!m_fileTargetInd.is_open(),
68
+ util::FileOpenException,
69
+ "Couldn't open file " << filePath << "/TargetInd.dat");
70
+
71
+ m_fileTargetColl.open((filePath + "/TargetColl.dat").c_str(), ios::in | ios::binary);
72
+ UTIL_THROW_IF(!m_fileTargetColl.is_open(),
73
+ util::FileOpenException,
74
+ "Couldn't open file " << filePath << "/TargetColl.dat");
75
+
76
+ m_fileVocab.open((filePath + "/Vocab.dat").c_str(), ios::in);
77
+ UTIL_THROW_IF(!m_fileVocab.is_open(),
78
+ util::FileOpenException,
79
+ "Couldn't open file " << filePath << "/Vocab.dat");
80
+
81
+ m_fileMisc.open((filePath + "/Misc.dat").c_str(), ios::in);
82
+ UTIL_THROW_IF(!m_fileMisc.is_open(),
83
+ util::FileOpenException,
84
+ "Couldn't open file " << filePath << "/Misc.dat");
85
+
86
+ // set up root node
87
+ LoadMisc();
88
+ m_numSourceFactors = GetMisc("NumSourceFactors");
89
+ m_numTargetFactors = GetMisc("NumTargetFactors");
90
+ m_numScores = GetMisc("NumScores");
91
+
92
+ return true;
93
+ }
94
+
95
+ bool OnDiskWrapper::LoadMisc()
96
+ {
97
+ char line[100000];
98
+
99
+ while(m_fileMisc.getline(line, 100000)) {
100
+ vector<string> tokens;
101
+ Moses::Tokenize(tokens, line);
102
+ UTIL_THROW_IF2(tokens.size() != 2, "Except key value. Found " << line);
103
+
104
+
105
+ const string &key = tokens[0];
106
+ m_miscInfo[key] = Moses::Scan<uint64_t>(tokens[1]);
107
+ }
108
+
109
+ return true;
110
+ }
111
+
112
+ void OnDiskWrapper::BeginSave(const std::string &filePath
113
+ , int numSourceFactors, int numTargetFactors, int numScores)
114
+ {
115
+ m_numSourceFactors = numSourceFactors;
116
+ m_numTargetFactors = numTargetFactors;
117
+ m_numScores = numScores;
118
+ m_filePath = filePath;
119
+
120
+ #ifdef WIN32
121
+ mkdir(filePath.c_str());
122
+ #else
123
+ mkdir(filePath.c_str(), 0777);
124
+ #endif
125
+
126
+ m_fileSource.open((filePath + "/Source.dat").c_str(), ios::out | ios::in | ios::binary | ios::ate | ios::trunc);
127
+ UTIL_THROW_IF(!m_fileSource.is_open(),
128
+ util::FileOpenException,
129
+ "Couldn't open file " << filePath << "/Source.dat");
130
+
131
+ m_fileTargetInd.open((filePath + "/TargetInd.dat").c_str(), ios::out | ios::binary | ios::ate | ios::trunc);
132
+ UTIL_THROW_IF(!m_fileTargetInd.is_open(),
133
+ util::FileOpenException,
134
+ "Couldn't open file " << filePath << "/TargetInd.dat");
135
+
136
+ m_fileTargetColl.open((filePath + "/TargetColl.dat").c_str(), ios::out | ios::binary | ios::ate | ios::trunc);
137
+ UTIL_THROW_IF(!m_fileTargetColl.is_open(),
138
+ util::FileOpenException,
139
+ "Couldn't open file " << filePath << "/TargetColl.dat");
140
+
141
+ m_fileVocab.open((filePath + "/Vocab.dat").c_str(), ios::out | ios::ate | ios::trunc);
142
+ UTIL_THROW_IF(!m_fileVocab.is_open(),
143
+ util::FileOpenException,
144
+ "Couldn't open file " << filePath << "/Vocab.dat");
145
+
146
+ m_fileMisc.open((filePath + "/Misc.dat").c_str(), ios::out | ios::ate | ios::trunc);
147
+ UTIL_THROW_IF(!m_fileMisc.is_open(),
148
+ util::FileOpenException,
149
+ "Couldn't open file " << filePath << "/Misc.dat");
150
+
151
+ // offset by 1. 0 offset is reserved
152
+ char c = 0xff;
153
+ m_fileSource.write(&c, 1);
154
+ UTIL_THROW_IF2(1 != m_fileSource.tellp(),
155
+ "Couldn't write to stream m_fileSource");
156
+
157
+ m_fileTargetInd.write(&c, 1);
158
+ UTIL_THROW_IF2(1 != m_fileTargetInd.tellp(),
159
+ "Couldn't write to stream m_fileTargetInd");
160
+
161
+ m_fileTargetColl.write(&c, 1);
162
+ UTIL_THROW_IF2(1 != m_fileTargetColl.tellp(),
163
+ "Couldn't write to stream m_fileTargetColl");
164
+
165
+ // set up root node
166
+ UTIL_THROW_IF2(GetNumCounts() != 1,
167
+ "Not sure what this is...");
168
+
169
+ vector<float> counts(GetNumCounts());
170
+ counts[0] = DEFAULT_COUNT;
171
+ m_rootSourceNode = new PhraseNode();
172
+ m_rootSourceNode->AddCounts(counts);
173
+ }
174
+
175
+ void OnDiskWrapper::EndSave()
176
+ {
177
+ bool ret = m_rootSourceNode->Saved();
178
+ UTIL_THROW_IF2(!ret, "Root node not saved");
179
+
180
+ GetVocab().Save(*this);
181
+
182
+ SaveMisc();
183
+
184
+ m_fileMisc.close();
185
+ m_fileVocab.close();
186
+ m_fileSource.close();
187
+ m_fileTarget.close();
188
+ m_fileTargetInd.close();
189
+ m_fileTargetColl.close();
190
+ }
191
+
192
+ void OnDiskWrapper::SaveMisc()
193
+ {
194
+ m_fileMisc << "Version " << VERSION_NUM << endl;
195
+ m_fileMisc << "NumSourceFactors " << m_numSourceFactors << endl;
196
+ m_fileMisc << "NumTargetFactors " << m_numTargetFactors << endl;
197
+ m_fileMisc << "NumScores " << m_numScores << endl;
198
+ m_fileMisc << "RootNodeOffset " << m_rootSourceNode->GetFilePos() << endl;
199
+ }
200
+
201
+ size_t OnDiskWrapper::GetSourceWordSize() const
202
+ {
203
+ return sizeof(uint64_t) + sizeof(char);
204
+ }
205
+
206
+ size_t OnDiskWrapper::GetTargetWordSize() const
207
+ {
208
+ return sizeof(uint64_t) + sizeof(char);
209
+ }
210
+
211
+ uint64_t OnDiskWrapper::GetMisc(const std::string &key) const
212
+ {
213
+ std::map<std::string, uint64_t>::const_iterator iter;
214
+ iter = m_miscInfo.find(key);
215
+ UTIL_THROW_IF2(iter == m_miscInfo.end()
216
+ , "Couldn't find value for key " << key
217
+ );
218
+
219
+ return iter->second;
220
+ }
221
+
222
+
223
+ }
mosesdecoder/OnDiskPt/OnDiskWrapper.h ADDED
@@ -0,0 +1,111 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #pragma once
2
+ // $Id$
3
+ /***********************************************************************
4
+ Moses - factored phrase-based, hierarchical and syntactic language decoder
5
+ Copyright (C) 2009 Hieu Hoang
6
+
7
+ This library is free software; you can redistribute it and/or
8
+ modify it under the terms of the GNU Lesser General Public
9
+ License as published by the Free Software Foundation; either
10
+ version 2.1 of the License, or (at your option) any later version.
11
+
12
+ This library is distributed in the hope that it will be useful,
13
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15
+ Lesser General Public License for more details.
16
+
17
+ You should have received a copy of the GNU Lesser General Public
18
+ License along with this library; if not, write to the Free Software
19
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
20
+ ***********************************************************************/
21
+ #include <string>
22
+ #include <fstream>
23
+ #include "Vocab.h"
24
+ #include "PhraseNode.h"
25
+
26
+ namespace OnDiskPt
27
+ {
28
+ const float DEFAULT_COUNT = 66666;
29
+
30
+ /** Global class with the misc information needed to create and use the on-disk rule table.
31
+ * 1 object of this class should be instantiated per rule table.
32
+ * Currently only hierarchical/syntax models use this, but can & should be used with pb models too
33
+ */
34
+ class OnDiskWrapper
35
+ {
36
+ protected:
37
+ Vocab m_vocab;
38
+ std::string m_filePath;
39
+ int m_numSourceFactors, m_numTargetFactors, m_numScores;
40
+ std::fstream m_fileMisc, m_fileVocab, m_fileSource, m_fileTarget, m_fileTargetInd, m_fileTargetColl;
41
+
42
+ size_t m_defaultNodeSize;
43
+ PhraseNode *m_rootSourceNode;
44
+
45
+ std::map<std::string, uint64_t> m_miscInfo;
46
+
47
+ void SaveMisc();
48
+ bool OpenForLoad(const std::string &filePath);
49
+ bool LoadMisc();
50
+
51
+ public:
52
+ static int VERSION_NUM;
53
+
54
+ OnDiskWrapper();
55
+ ~OnDiskWrapper();
56
+
57
+ void BeginLoad(const std::string &filePath);
58
+
59
+ void BeginSave(const std::string &filePath
60
+ , int numSourceFactors, int numTargetFactors, int numScores);
61
+ void EndSave();
62
+
63
+ Vocab &GetVocab() {
64
+ return m_vocab;
65
+ }
66
+ const Vocab &GetVocab() const {
67
+ return m_vocab;
68
+ }
69
+
70
+ size_t GetSourceWordSize() const;
71
+ size_t GetTargetWordSize() const;
72
+
73
+ std::fstream &GetFileSource() {
74
+ return m_fileSource;
75
+ }
76
+ std::fstream &GetFileTargetInd() {
77
+ return m_fileTargetInd;
78
+ }
79
+ std::fstream &GetFileTargetColl() {
80
+ return m_fileTargetColl;
81
+ }
82
+ std::fstream &GetFileVocab() {
83
+ return m_fileVocab;
84
+ }
85
+
86
+ size_t GetNumSourceFactors() const {
87
+ return m_numSourceFactors;
88
+ }
89
+ size_t GetNumTargetFactors() const {
90
+ return m_numTargetFactors;
91
+ }
92
+
93
+ size_t GetNumScores() const {
94
+ return m_numScores;
95
+ }
96
+ size_t GetNumCounts() const {
97
+ return 1;
98
+ }
99
+
100
+ PhraseNode &GetRootSourceNode() {
101
+ return *m_rootSourceNode;
102
+ }
103
+ const PhraseNode &GetRootSourceNode() const {
104
+ return *m_rootSourceNode;
105
+ }
106
+
107
+ uint64_t GetMisc(const std::string &key) const;
108
+
109
+ };
110
+
111
+ }
mosesdecoder/OnDiskPt/Phrase.cpp ADDED
@@ -0,0 +1,108 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ // $Id$
2
+ /***********************************************************************
3
+ Moses - factored phrase-based, hierarchical and syntactic language decoder
4
+ Copyright (C) 2009 Hieu Hoang
5
+
6
+ This library is free software; you can redistribute it and/or
7
+ modify it under the terms of the GNU Lesser General Public
8
+ License as published by the Free Software Foundation; either
9
+ version 2.1 of the License, or (at your option) any later version.
10
+
11
+ This library is distributed in the hope that it will be useful,
12
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
13
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14
+ Lesser General Public License for more details.
15
+
16
+ You should have received a copy of the GNU Lesser General Public
17
+ License along with this library; if not, write to the Free Software
18
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
19
+ ***********************************************************************/
20
+ #include <iostream>
21
+ #include "moses/Util.h"
22
+ #include "Phrase.h"
23
+ #include "util/exception.hh"
24
+
25
+ using namespace std;
26
+
27
+ namespace OnDiskPt
28
+ {
29
+
30
+
31
+ void Phrase::AddWord(WordPtr word)
32
+ {
33
+ m_words.push_back(word);
34
+ }
35
+
36
+ void Phrase::AddWord(WordPtr word, size_t pos)
37
+ {
38
+ UTIL_THROW_IF2(!(pos < m_words.size()),
39
+ "Trying to get word " << pos << " when phrase size is " << m_words.size());
40
+ m_words.insert(m_words.begin() + pos + 1, word);
41
+ }
42
+
43
+ int Phrase::Compare(const Phrase &compare) const
44
+ {
45
+ int ret = 0;
46
+ for (size_t pos = 0; pos < GetSize(); ++pos) {
47
+ if (pos >= compare.GetSize()) {
48
+ // we're bigger than the other. Put 1st
49
+ ret = -1;
50
+ break;
51
+ }
52
+
53
+ const Word &thisWord = GetWord(pos)
54
+ ,&compareWord = compare.GetWord(pos);
55
+ int wordRet = thisWord.Compare(compareWord);
56
+ if (wordRet != 0) {
57
+ ret = wordRet;
58
+ break;
59
+ }
60
+ }
61
+
62
+ if (ret == 0) {
63
+ assert(compare.GetSize() >= GetSize());
64
+ ret = (compare.GetSize() > GetSize()) ? 1 : 0;
65
+ }
66
+ return ret;
67
+ }
68
+
69
+ //! transitive comparison
70
+ bool Phrase::operator<(const Phrase &compare) const
71
+ {
72
+ int ret = Compare(compare);
73
+ return ret < 0;
74
+ }
75
+
76
+ bool Phrase::operator>(const Phrase &compare) const
77
+ {
78
+ int ret = Compare(compare);
79
+ return ret > 0;
80
+ }
81
+
82
+ bool Phrase::operator==(const Phrase &compare) const
83
+ {
84
+ int ret = Compare(compare);
85
+ return ret == 0;
86
+ }
87
+
88
+ void Phrase::DebugPrint(ostream &out, const Vocab &vocab) const
89
+ {
90
+ for (size_t pos = 0; pos < GetSize(); ++pos) {
91
+ const Word &word = GetWord(pos);
92
+ word.DebugPrint(out, vocab);
93
+ out << " ";
94
+ }
95
+ }
96
+
97
+ std::ostream& operator<<(std::ostream &out, const Phrase &phrase)
98
+ {
99
+ for (size_t pos = 0; pos < phrase.GetSize(); ++pos) {
100
+ const Word &word = phrase.GetWord(pos);
101
+ out << word << " ";
102
+ }
103
+
104
+ return out;
105
+ }
106
+
107
+ }
108
+
mosesdecoder/OnDiskPt/Phrase.h ADDED
@@ -0,0 +1,66 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #pragma once
2
+ // $Id$
3
+ /***********************************************************************
4
+ Moses - factored phrase-based, hierarchical and syntactic language decoder
5
+ Copyright (C) 2009 Hieu Hoang
6
+
7
+ This library is free software; you can redistribute it and/or
8
+ modify it under the terms of the GNU Lesser General Public
9
+ License as published by the Free Software Foundation; either
10
+ version 2.1 of the License, or (at your option) any later version.
11
+
12
+ This library is distributed in the hope that it will be useful,
13
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15
+ Lesser General Public License for more details.
16
+
17
+ You should have received a copy of the GNU Lesser General Public
18
+ License along with this library; if not, write to the Free Software
19
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
20
+ ***********************************************************************/
21
+ #include <vector>
22
+ #include <iostream>
23
+ #include <boost/shared_ptr.hpp>
24
+ #include "Word.h"
25
+
26
+ namespace OnDiskPt
27
+ {
28
+ class Vocab;
29
+
30
+
31
+ /** A contiguous phrase. SourcePhrase & TargetPhrase inherit from this and add the on-disk functionality
32
+ */
33
+ class Phrase
34
+ {
35
+ friend std::ostream& operator<<(std::ostream&, const Phrase&);
36
+
37
+ protected:
38
+ std::vector<WordPtr> m_words;
39
+
40
+ public:
41
+ Phrase() {
42
+ }
43
+
44
+ virtual ~Phrase() {}
45
+
46
+ void AddWord(WordPtr word);
47
+ void AddWord(WordPtr word, size_t pos);
48
+
49
+ const Word &GetWord(size_t pos) const {
50
+ return *m_words[pos];
51
+ }
52
+ size_t GetSize() const {
53
+ return m_words.size();
54
+ }
55
+
56
+ virtual void DebugPrint(std::ostream &out, const Vocab &vocab) const;
57
+
58
+ int Compare(const Phrase &compare) const;
59
+ bool operator<(const Phrase &compare) const;
60
+ bool operator>(const Phrase &compare) const;
61
+ bool operator==(const Phrase &compare) const;
62
+ };
63
+
64
+ typedef boost::shared_ptr<Phrase> PhrasePtr;
65
+
66
+ }
mosesdecoder/OnDiskPt/PhraseNode.cpp ADDED
@@ -0,0 +1,268 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ // $Id$
2
+ /***********************************************************************
3
+ Moses - factored phrase-based, hierarchical and syntactic language decoder
4
+ Copyright (C) 2009 Hieu Hoang
5
+
6
+ This library is free software; you can redistribute it and/or
7
+ modify it under the terms of the GNU Lesser General Public
8
+ License as published by the Free Software Foundation; either
9
+ version 2.1 of the License, or (at your option) any later version.
10
+
11
+ This library is distributed in the hope that it will be useful,
12
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
13
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14
+ Lesser General Public License for more details.
15
+
16
+ You should have received a copy of the GNU Lesser General Public
17
+ License along with this library; if not, write to the Free Software
18
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
19
+ ***********************************************************************/
20
+ #include "PhraseNode.h"
21
+ #include "OnDiskWrapper.h"
22
+ #include "TargetPhraseCollection.h"
23
+ #include "SourcePhrase.h"
24
+ #include "moses/Util.h"
25
+ #include "util/exception.hh"
26
+
27
+ using namespace std;
28
+
29
+ namespace OnDiskPt
30
+ {
31
+
32
+ size_t PhraseNode::GetNodeSize(size_t numChildren, size_t wordSize, size_t countSize)
33
+ {
34
+ size_t ret = sizeof(uint64_t) * 2 // num children, value
35
+ + (wordSize + sizeof(uint64_t)) * numChildren // word + ptr to next source node
36
+ + sizeof(float) * countSize; // count info
37
+ return ret;
38
+ }
39
+
40
+ PhraseNode::PhraseNode()
41
+ : m_value(0)
42
+ ,m_currChild(NULL)
43
+ ,m_saved(false)
44
+ ,m_memLoad(NULL)
45
+ {
46
+ }
47
+
48
+ PhraseNode::PhraseNode(uint64_t filePos, OnDiskWrapper &onDiskWrapper)
49
+ :m_counts(onDiskWrapper.GetNumCounts())
50
+ {
51
+ // load saved node
52
+ m_filePos = filePos;
53
+
54
+ size_t countSize = onDiskWrapper.GetNumCounts();
55
+
56
+ std::fstream &file = onDiskWrapper.GetFileSource();
57
+ file.seekg(filePos);
58
+ assert(filePos == (uint64_t)file.tellg());
59
+
60
+ file.read((char*) &m_numChildrenLoad, sizeof(uint64_t));
61
+
62
+ size_t memAlloc = GetNodeSize(m_numChildrenLoad, onDiskWrapper.GetSourceWordSize(), countSize);
63
+ m_memLoad = (char*) malloc(memAlloc);
64
+
65
+ // go to start of node again
66
+ file.seekg(filePos);
67
+ assert(filePos == (uint64_t)file.tellg());
68
+
69
+ // read everything into memory
70
+ file.read(m_memLoad, memAlloc);
71
+ assert(filePos + memAlloc == (uint64_t)file.tellg());
72
+
73
+ // get value
74
+ m_value = ((uint64_t*)m_memLoad)[1];
75
+
76
+ // get counts
77
+ float *memFloat = (float*) (m_memLoad + sizeof(uint64_t) * 2);
78
+
79
+ assert(countSize == 1);
80
+ m_counts[0] = memFloat[0];
81
+
82
+ m_memLoadLast = m_memLoad + memAlloc;
83
+ }
84
+
85
+ PhraseNode::~PhraseNode()
86
+ {
87
+ free(m_memLoad);
88
+ }
89
+
90
+ float PhraseNode::GetCount(size_t ind) const
91
+ {
92
+ return m_counts[ind];
93
+ }
94
+
95
+ void PhraseNode::Save(OnDiskWrapper &onDiskWrapper, size_t pos, size_t tableLimit)
96
+ {
97
+ UTIL_THROW_IF2(m_saved, "Already saved");
98
+
99
+ // save this node
100
+ m_targetPhraseColl.Sort(tableLimit);
101
+ m_targetPhraseColl.Save(onDiskWrapper);
102
+ m_value = m_targetPhraseColl.GetFilePos();
103
+
104
+ size_t numCounts = onDiskWrapper.GetNumCounts();
105
+
106
+ size_t memAlloc = GetNodeSize(GetSize(), onDiskWrapper.GetSourceWordSize(), numCounts);
107
+ char *mem = (char*) malloc(memAlloc);
108
+ //memset(mem, 0xfe, memAlloc);
109
+
110
+ size_t memUsed = 0;
111
+ uint64_t *memArray = (uint64_t*) mem;
112
+ memArray[0] = GetSize(); // num of children
113
+ memArray[1] = m_value; // file pos of corresponding target phrases
114
+ memUsed += 2 * sizeof(uint64_t);
115
+
116
+ // count info
117
+ float *memFloat = (float*) (mem + memUsed);
118
+ UTIL_THROW_IF2(numCounts != 1, "Can only store 1 phrase count");
119
+ memFloat[0] = (m_counts.size() == 0) ? DEFAULT_COUNT : m_counts[0]; // if count = 0, put in very large num to make sure its still used. HACK
120
+ memUsed += sizeof(float) * numCounts;
121
+
122
+ // recursively save chm_countsildren
123
+ ChildColl::iterator iter;
124
+ for (iter = m_children.begin(); iter != m_children.end(); ++iter) {
125
+ const Word &childWord = iter->first;
126
+ PhraseNode &childNode = iter->second;
127
+
128
+ // recursive
129
+ if (!childNode.Saved())
130
+ childNode.Save(onDiskWrapper, pos + 1, tableLimit);
131
+
132
+ char *currMem = mem + memUsed;
133
+ size_t wordMemUsed = childWord.WriteToMemory(currMem);
134
+ memUsed += wordMemUsed;
135
+
136
+ uint64_t *memArray = (uint64_t*) (mem + memUsed);
137
+ memArray[0] = childNode.GetFilePos();
138
+ memUsed += sizeof(uint64_t);
139
+
140
+ }
141
+
142
+ // save this node
143
+ //Moses::DebugMem(mem, memAlloc);
144
+ assert(memUsed == memAlloc);
145
+
146
+ std::fstream &file = onDiskWrapper.GetFileSource();
147
+ m_filePos = file.tellp();
148
+ file.seekp(0, ios::end);
149
+ file.write(mem, memUsed);
150
+
151
+ uint64_t endPos = file.tellp();
152
+ assert(m_filePos + memUsed == endPos);
153
+
154
+ free(mem);
155
+
156
+ m_children.clear();
157
+ m_saved = true;
158
+ }
159
+
160
+ void PhraseNode::AddTargetPhrase(const SourcePhrase &sourcePhrase, TargetPhrase *targetPhrase
161
+ , OnDiskWrapper &onDiskWrapper, size_t tableLimit
162
+ , const std::vector<float> &counts, OnDiskPt::PhrasePtr spShort)
163
+ {
164
+ AddTargetPhrase(0, sourcePhrase, targetPhrase, onDiskWrapper, tableLimit, counts, spShort);
165
+ }
166
+
167
+ void PhraseNode::AddTargetPhrase(size_t pos, const SourcePhrase &sourcePhrase
168
+ , TargetPhrase *targetPhrase, OnDiskWrapper &onDiskWrapper
169
+ , size_t tableLimit, const std::vector<float> &counts, OnDiskPt::PhrasePtr spShort)
170
+ {
171
+ size_t phraseSize = sourcePhrase.GetSize();
172
+ if (pos < phraseSize) {
173
+ const Word &word = sourcePhrase.GetWord(pos);
174
+
175
+ PhraseNode &node = m_children[word];
176
+ if (m_currChild != &node) {
177
+ // new node
178
+ node.SetPos(pos);
179
+
180
+ if (m_currChild) {
181
+ m_currChild->Save(onDiskWrapper, pos, tableLimit);
182
+ }
183
+
184
+ m_currChild = &node;
185
+ }
186
+
187
+ // keep searching for target phrase node..
188
+ node.AddTargetPhrase(pos + 1, sourcePhrase, targetPhrase, onDiskWrapper, tableLimit, counts, spShort);
189
+ } else {
190
+ // drilled down to the right node
191
+ m_counts = counts;
192
+ targetPhrase->SetSourcePhrase(spShort);
193
+ m_targetPhraseColl.AddTargetPhrase(targetPhrase);
194
+ }
195
+ }
196
+
197
+ const PhraseNode *PhraseNode::GetChild(const Word &wordSought, OnDiskWrapper &onDiskWrapper) const
198
+ {
199
+ const PhraseNode *ret = NULL;
200
+
201
+ int l = 0;
202
+ int r = m_numChildrenLoad - 1;
203
+ int x;
204
+
205
+ while (r >= l) {
206
+ x = (l + r) / 2;
207
+
208
+ Word wordFound;
209
+ uint64_t childFilePos;
210
+ GetChild(wordFound, childFilePos, x, onDiskWrapper);
211
+
212
+ if (wordSought == wordFound) {
213
+ ret = new PhraseNode(childFilePos, onDiskWrapper);
214
+ break;
215
+ }
216
+ if (wordSought < wordFound)
217
+ r = x - 1;
218
+ else
219
+ l = x + 1;
220
+ }
221
+
222
+ return ret;
223
+ }
224
+
225
+ void PhraseNode::GetChild(Word &wordFound, uint64_t &childFilePos, size_t ind, OnDiskWrapper &onDiskWrapper) const
226
+ {
227
+
228
+ size_t wordSize = onDiskWrapper.GetSourceWordSize();
229
+ size_t childSize = wordSize + sizeof(uint64_t);
230
+
231
+ char *currMem = m_memLoad
232
+ + sizeof(uint64_t) * 2 // size & file pos of target phrase coll
233
+ + sizeof(float) * onDiskWrapper.GetNumCounts() // count info
234
+ + childSize * ind;
235
+
236
+ size_t memRead = ReadChild(wordFound, childFilePos, currMem);
237
+ assert(memRead == childSize);
238
+ }
239
+
240
+ size_t PhraseNode::ReadChild(Word &wordFound, uint64_t &childFilePos, const char *mem) const
241
+ {
242
+ size_t memRead = wordFound.ReadFromMemory(mem);
243
+
244
+ const char *currMem = mem + memRead;
245
+ uint64_t *memArray = (uint64_t*) (currMem);
246
+ childFilePos = memArray[0];
247
+
248
+ memRead += sizeof(uint64_t);
249
+ return memRead;
250
+ }
251
+
252
+ TargetPhraseCollection::shared_ptr
253
+ PhraseNode::
254
+ GetTargetPhraseCollection(size_t tableLimit, OnDiskWrapper &onDiskWrapper) const
255
+ {
256
+ TargetPhraseCollection::shared_ptr ret(new TargetPhraseCollection);
257
+ if (m_value > 0) ret->ReadFromFile(tableLimit, m_value, onDiskWrapper);
258
+ return ret;
259
+ }
260
+
261
+ std::ostream& operator<<(std::ostream &out, const PhraseNode &node)
262
+ {
263
+ out << "node (" << node.GetFilePos() << "," << node.GetValue() << "," << node.m_pos << ")";
264
+ return out;
265
+ }
266
+
267
+ }
268
+
mosesdecoder/OnDiskPt/PhraseNode.h ADDED
@@ -0,0 +1,108 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #pragma once
2
+ // $Id$
3
+ /***********************************************************************
4
+ Moses - factored phrase-based, hierarchical and syntactic language decoder
5
+ Copyright (C) 2009 Hieu Hoang
6
+
7
+ This library is free software; you can redistribute it and/or
8
+ modify it under the terms of the GNU Lesser General Public
9
+ License as published by the Free Software Foundation; either
10
+ version 2.1 of the License, or (at your option) any later version.
11
+
12
+ This library is distributed in the hope that it will be useful,
13
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15
+ Lesser General Public License for more details.
16
+
17
+ You should have received a copy of the GNU Lesser General Public
18
+ License along with this library; if not, write to the Free Software
19
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
20
+ ***********************************************************************/
21
+ #include <fstream>
22
+ #include <vector>
23
+ #include <map>
24
+ #include "Word.h"
25
+ #include "TargetPhraseCollection.h"
26
+ #include "Phrase.h"
27
+
28
+ namespace OnDiskPt
29
+ {
30
+
31
+ class OnDiskWrapper;
32
+ class SourcePhrase;
33
+
34
+ /** A node in the source tree trie */
35
+ class PhraseNode
36
+ {
37
+ friend std::ostream& operator<<(std::ostream&, const PhraseNode&);
38
+ protected:
39
+ uint64_t m_filePos, m_value;
40
+
41
+ typedef std::map<Word, PhraseNode> ChildColl;
42
+ ChildColl m_children;
43
+ PhraseNode *m_currChild;
44
+ bool m_saved;
45
+ size_t m_pos;
46
+ std::vector<float> m_counts;
47
+
48
+ TargetPhraseCollection m_targetPhraseColl;
49
+
50
+ char *m_memLoad, *m_memLoadLast;
51
+ uint64_t m_numChildrenLoad;
52
+
53
+ void AddTargetPhrase(size_t pos, const SourcePhrase &sourcePhrase
54
+ , TargetPhrase *targetPhrase, OnDiskWrapper &onDiskWrapper
55
+ , size_t tableLimit, const std::vector<float> &counts, OnDiskPt::PhrasePtr spShort);
56
+ size_t ReadChild(Word &wordFound, uint64_t &childFilePos, const char *mem) const;
57
+ void GetChild(Word &wordFound, uint64_t &childFilePos, size_t ind, OnDiskWrapper &onDiskWrapper) const;
58
+
59
+ public:
60
+ static size_t GetNodeSize(size_t numChildren, size_t wordSize, size_t countSize);
61
+
62
+ PhraseNode(); // unsaved node
63
+ PhraseNode(uint64_t filePos, OnDiskWrapper &onDiskWrapper); // load saved node
64
+ ~PhraseNode();
65
+
66
+ void Add(const Word &word, uint64_t nextFilePos, size_t wordSize);
67
+ void Save(OnDiskWrapper &onDiskWrapper, size_t pos, size_t tableLimit);
68
+
69
+ void AddTargetPhrase(const SourcePhrase &sourcePhrase, TargetPhrase *targetPhrase
70
+ , OnDiskWrapper &onDiskWrapper, size_t tableLimit
71
+ , const std::vector<float> &counts, OnDiskPt::PhrasePtr spShort);
72
+
73
+ uint64_t GetFilePos() const {
74
+ return m_filePos;
75
+ }
76
+ uint64_t GetValue() const {
77
+ return m_value;
78
+ }
79
+ void SetValue(uint64_t value) {
80
+ m_value = value;
81
+ }
82
+ size_t GetSize() const {
83
+ return m_children.size();
84
+ }
85
+
86
+ bool Saved() const {
87
+ return m_saved;
88
+ }
89
+
90
+ void SetPos(size_t pos) {
91
+ m_pos = pos;
92
+ }
93
+
94
+ const PhraseNode *GetChild(const Word &wordSought, OnDiskWrapper &onDiskWrapper) const;
95
+
96
+ TargetPhraseCollection::shared_ptr
97
+ GetTargetPhraseCollection(size_t tableLimit,
98
+ OnDiskWrapper &onDiskWrapper) const;
99
+
100
+ void AddCounts(const std::vector<float> &counts) {
101
+ m_counts = counts;
102
+ }
103
+ float GetCount(size_t ind) const;
104
+
105
+ };
106
+
107
+ }
108
+
mosesdecoder/OnDiskPt/SourcePhrase.cpp ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ // $Id$
2
+ /***********************************************************************
3
+ Moses - factored phrase-based, hierarchical and syntactic language decoder
4
+ Copyright (C) 2009 Hieu Hoang
5
+
6
+ This library is free software; you can redistribute it and/or
7
+ modify it under the terms of the GNU Lesser General Public
8
+ License as published by the Free Software Foundation; either
9
+ version 2.1 of the License, or (at your option) any later version.
10
+
11
+ This library is distributed in the hope that it will be useful,
12
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
13
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14
+ Lesser General Public License for more details.
15
+
16
+ You should have received a copy of the GNU Lesser General Public
17
+ License along with this library; if not, write to the Free Software
18
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
19
+ ***********************************************************************/
20
+ #include "SourcePhrase.h"
21
+
22
+ namespace OnDiskPt
23
+ {
24
+
25
+ }
26
+
27
+
mosesdecoder/OnDiskPt/SourcePhrase.h ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #pragma once
2
+ // $Id$
3
+ /***********************************************************************
4
+ Moses - factored phrase-based, hierarchical and syntactic language decoder
5
+ Copyright (C) 2009 Hieu Hoang
6
+
7
+ This library is free software; you can redistribute it and/or
8
+ modify it under the terms of the GNU Lesser General Public
9
+ License as published by the Free Software Foundation; either
10
+ version 2.1 of the License, or (at your option) any later version.
11
+
12
+ This library is distributed in the hope that it will be useful,
13
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15
+ Lesser General Public License for more details.
16
+
17
+ You should have received a copy of the GNU Lesser General Public
18
+ License along with this library; if not, write to the Free Software
19
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
20
+ ***********************************************************************/
21
+ #include <vector>
22
+ #include "Phrase.h"
23
+ #include "Word.h"
24
+
25
+ namespace OnDiskPt
26
+ {
27
+
28
+ /** A source phrase. It adds nothing to the normal Phrase class because source phrases are saved as tries.
29
+ */
30
+ class SourcePhrase: public Phrase
31
+ {
32
+ protected:
33
+
34
+ public:
35
+ };
36
+
37
+
38
+ }
mosesdecoder/OnDiskPt/TargetPhrase.cpp ADDED
@@ -0,0 +1,402 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ // $Id$
2
+ /***********************************************************************
3
+ Moses - factored phrase-based, hierarchical and syntactic language decoder
4
+ Copyright (C) 2009 Hieu Hoang
5
+
6
+ This library is free software; you can redistribute it and/or
7
+ modify it under the terms of the GNU Lesser General Public
8
+ License as published by the Free Software Foundation; either
9
+ version 2.1 of the License, or (at your option) any later version.
10
+
11
+ This library is distributed in the hope that it will be useful,
12
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
13
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14
+ Lesser General Public License for more details.
15
+
16
+ You should have received a copy of the GNU Lesser General Public
17
+ License along with this library; if not, write to the Free Software
18
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
19
+ ***********************************************************************/
20
+
21
+ #include <algorithm>
22
+ #include <iostream>
23
+ #include "moses/Util.h"
24
+ #include "TargetPhrase.h"
25
+ #include "OnDiskWrapper.h"
26
+ #include "util/exception.hh"
27
+
28
+ #include <boost/algorithm/string.hpp>
29
+
30
+ using namespace std;
31
+
32
+ namespace OnDiskPt
33
+ {
34
+
35
+ TargetPhrase::TargetPhrase(size_t numScores)
36
+ :m_scores(numScores)
37
+ {
38
+ }
39
+
40
+ TargetPhrase::TargetPhrase(const TargetPhrase &copy)
41
+ :Phrase(copy)
42
+ ,m_scores(copy.m_scores)
43
+ {
44
+
45
+ }
46
+
47
+ TargetPhrase::~TargetPhrase()
48
+ {
49
+ }
50
+
51
+ void TargetPhrase::SetLHS(WordPtr lhs)
52
+ {
53
+ AddWord(lhs);
54
+ }
55
+
56
+ void TargetPhrase::Create1AlignFromString(const std::string &align1Str)
57
+ {
58
+ vector<size_t> alignPoints;
59
+ Moses::Tokenize<size_t>(alignPoints, align1Str, "-");
60
+ UTIL_THROW_IF2(alignPoints.size() != 2, "Incorrectly formatted word alignment: " << align1Str);
61
+ m_align.push_back(pair<size_t, size_t>(alignPoints[0], alignPoints[1]) );
62
+ }
63
+
64
+ void TargetPhrase::CreateAlignFromString(const std::string &alignStr)
65
+ {
66
+ vector<std::string> alignPairs;
67
+ boost::split(alignPairs, alignStr, boost::is_any_of("\t "));
68
+ for (size_t i = 0; i < alignPairs.size(); ++i) {
69
+ vector<size_t> alignPoints;
70
+ Moses::Tokenize<size_t>(alignPoints, alignPairs[i], "-");
71
+ m_align.push_back(pair<size_t, size_t>(alignPoints[0], alignPoints[1]) );
72
+ }
73
+ }
74
+
75
+
76
+ void TargetPhrase::SetScore(float score, size_t ind)
77
+ {
78
+ assert(ind < m_scores.size());
79
+ m_scores[ind] = score;
80
+ }
81
+
82
+ class AlignOrderer
83
+ {
84
+ public:
85
+ bool operator()(const AlignPair &a, const AlignPair &b) const {
86
+ return a.first < b.first;
87
+ }
88
+ };
89
+
90
+ void TargetPhrase::SortAlign()
91
+ {
92
+ std::sort(m_align.begin(), m_align.end(), AlignOrderer());
93
+ }
94
+
95
+ char *TargetPhrase::WriteToMemory(OnDiskWrapper &onDiskWrapper, size_t &memUsed) const
96
+ {
97
+ size_t phraseSize = GetSize();
98
+ size_t targetWordSize = onDiskWrapper.GetTargetWordSize();
99
+
100
+ const PhrasePtr sp = GetSourcePhrase();
101
+ size_t spSize = sp->GetSize();
102
+ size_t sourceWordSize = onDiskWrapper.GetSourceWordSize();
103
+
104
+ size_t memNeeded = sizeof(uint64_t) // num of words
105
+ + targetWordSize * phraseSize // actual words. lhs as last words
106
+ + sizeof(uint64_t) // num source words
107
+ + sourceWordSize * spSize; // actual source words
108
+
109
+ memUsed = 0;
110
+ uint64_t *mem = (uint64_t*) malloc(memNeeded);
111
+
112
+ // write size
113
+ mem[0] = phraseSize;
114
+ memUsed += sizeof(uint64_t);
115
+
116
+ // write each word
117
+ for (size_t pos = 0; pos < phraseSize; ++pos) {
118
+ const Word &word = GetWord(pos);
119
+ char *currPtr = (char*)mem + memUsed;
120
+ memUsed += word.WriteToMemory((char*) currPtr);
121
+ }
122
+
123
+ // write size of source phrase and all source words
124
+ char *currPtr = (char*)mem + memUsed;
125
+ uint64_t *memTmp = (uint64_t*) currPtr;
126
+ memTmp[0] = spSize;
127
+ memUsed += sizeof(uint64_t);
128
+ for (size_t pos = 0; pos < spSize; ++pos) {
129
+ const Word &word = sp->GetWord(pos);
130
+ char *currPtr = (char*)mem + memUsed;
131
+ memUsed += word.WriteToMemory((char*) currPtr);
132
+ }
133
+
134
+ assert(memUsed == memNeeded);
135
+ return (char *) mem;
136
+ }
137
+
138
+ void TargetPhrase::Save(OnDiskWrapper &onDiskWrapper)
139
+ {
140
+ // save in target ind
141
+ size_t memUsed;
142
+ char *mem = WriteToMemory(onDiskWrapper, memUsed);
143
+
144
+ std::fstream &file = onDiskWrapper.GetFileTargetInd();
145
+
146
+ uint64_t startPos = file.tellp();
147
+
148
+ file.seekp(0, ios::end);
149
+ file.write(mem, memUsed);
150
+
151
+ #ifndef NDEBUG
152
+ uint64_t endPos = file.tellp();
153
+ assert(startPos + memUsed == endPos);
154
+ #endif
155
+
156
+ m_filePos = startPos;
157
+ free(mem);
158
+ }
159
+
160
+ char *TargetPhrase::WriteOtherInfoToMemory(OnDiskWrapper &onDiskWrapper, size_t &memUsed) const
161
+ {
162
+ // allocate mem
163
+ size_t numScores = onDiskWrapper.GetNumScores()
164
+ ,numAlign = GetAlign().size();
165
+ size_t sparseFeatureSize = m_sparseFeatures.size();
166
+ size_t propSize = m_property.size();
167
+
168
+ size_t memNeeded = sizeof(uint64_t) // file pos (phrase id)
169
+ + sizeof(uint64_t) + 2 * sizeof(uint64_t) * numAlign // align
170
+ + sizeof(float) * numScores // scores
171
+ + sizeof(uint64_t) + sparseFeatureSize // sparse features string
172
+ + sizeof(uint64_t) + propSize; // property string
173
+
174
+ char *mem = (char*) malloc(memNeeded);
175
+ //memset(mem, 0, memNeeded);
176
+
177
+ memUsed = 0;
178
+
179
+ // phrase id
180
+ memcpy(mem, &m_filePos, sizeof(uint64_t));
181
+ memUsed += sizeof(uint64_t);
182
+
183
+ // align
184
+ size_t tmp = WriteAlignToMemory(mem + memUsed);
185
+ memUsed += tmp;
186
+
187
+ // scores
188
+ memUsed += WriteScoresToMemory(mem + memUsed);
189
+
190
+ // sparse features
191
+ memUsed += WriteStringToMemory(mem + memUsed, m_sparseFeatures);
192
+
193
+ // property string
194
+ memUsed += WriteStringToMemory(mem + memUsed, m_property);
195
+
196
+ //DebugMem(mem, memNeeded);
197
+ assert(memNeeded == memUsed);
198
+ return mem;
199
+ }
200
+
201
+ size_t TargetPhrase::WriteStringToMemory(char *mem, const std::string &str) const
202
+ {
203
+ size_t memUsed = 0;
204
+ uint64_t *memTmp = (uint64_t*) mem;
205
+
206
+ size_t strSize = str.size();
207
+ memTmp[0] = strSize;
208
+ memUsed += sizeof(uint64_t);
209
+
210
+ const char *charStr = str.c_str();
211
+ memcpy(mem + memUsed, charStr, strSize);
212
+ memUsed += strSize;
213
+
214
+ return memUsed;
215
+ }
216
+
217
+ size_t TargetPhrase::WriteAlignToMemory(char *mem) const
218
+ {
219
+ size_t memUsed = 0;
220
+
221
+ // num of alignments
222
+ uint64_t numAlign = m_align.size();
223
+ memcpy(mem, &numAlign, sizeof(numAlign));
224
+ memUsed += sizeof(numAlign);
225
+
226
+ // actual alignments
227
+ AlignType::const_iterator iter;
228
+ for (iter = m_align.begin(); iter != m_align.end(); ++iter) {
229
+ const AlignPair &alignPair = *iter;
230
+
231
+ memcpy(mem + memUsed, &alignPair.first, sizeof(alignPair.first));
232
+ memUsed += sizeof(alignPair.first);
233
+
234
+ memcpy(mem + memUsed, &alignPair.second, sizeof(alignPair.second));
235
+ memUsed += sizeof(alignPair.second);
236
+ }
237
+
238
+ return memUsed;
239
+ }
240
+
241
+ size_t TargetPhrase::WriteScoresToMemory(char *mem) const
242
+ {
243
+ float *scoreMem = (float*) mem;
244
+
245
+ for (size_t ind = 0; ind < m_scores.size(); ++ind)
246
+ scoreMem[ind] = m_scores[ind];
247
+
248
+ size_t memUsed = sizeof(float) * m_scores.size();
249
+ return memUsed;
250
+ }
251
+
252
+ uint64_t TargetPhrase::ReadOtherInfoFromFile(uint64_t filePos, std::fstream &fileTPColl)
253
+ {
254
+ assert(filePos == (uint64_t)fileTPColl.tellg());
255
+
256
+ uint64_t memUsed = 0;
257
+ fileTPColl.read((char*) &m_filePos, sizeof(uint64_t));
258
+ memUsed += sizeof(uint64_t);
259
+ assert(m_filePos != 0);
260
+
261
+ memUsed += ReadAlignFromFile(fileTPColl);
262
+ assert((memUsed + filePos) == (uint64_t)fileTPColl.tellg());
263
+
264
+ memUsed += ReadScoresFromFile(fileTPColl);
265
+ assert((memUsed + filePos) == (uint64_t)fileTPColl.tellg());
266
+
267
+ // sparse features
268
+ memUsed += ReadStringFromFile(fileTPColl, m_sparseFeatures);
269
+
270
+ // properties
271
+ memUsed += ReadStringFromFile(fileTPColl, m_property);
272
+
273
+ return memUsed;
274
+ }
275
+
276
+ uint64_t TargetPhrase::ReadStringFromFile(std::fstream &fileTPColl, std::string &outStr)
277
+ {
278
+ uint64_t bytesRead = 0;
279
+
280
+ uint64_t strSize;
281
+ fileTPColl.read((char*) &strSize, sizeof(uint64_t));
282
+ bytesRead += sizeof(uint64_t);
283
+
284
+ if (strSize) {
285
+ char *mem = (char*) malloc(strSize + 1);
286
+ mem[strSize] = '\0';
287
+ fileTPColl.read(mem, strSize);
288
+ outStr = string(mem);
289
+ free(mem);
290
+
291
+ bytesRead += strSize;
292
+ }
293
+
294
+ return bytesRead;
295
+ }
296
+
297
+ uint64_t TargetPhrase::ReadFromFile(std::fstream &fileTP)
298
+ {
299
+ uint64_t bytesRead = 0;
300
+
301
+ fileTP.seekg(m_filePos);
302
+
303
+ uint64_t numWords;
304
+ fileTP.read((char*) &numWords, sizeof(uint64_t));
305
+ bytesRead += sizeof(uint64_t);
306
+
307
+ for (size_t ind = 0; ind < numWords; ++ind) {
308
+ WordPtr word(new Word());
309
+ bytesRead += word->ReadFromFile(fileTP);
310
+ AddWord(word);
311
+ }
312
+
313
+ // read source words
314
+ uint64_t numSourceWords;
315
+ fileTP.read((char*) &numSourceWords, sizeof(uint64_t));
316
+ bytesRead += sizeof(uint64_t);
317
+
318
+ PhrasePtr sp(new SourcePhrase());
319
+ for (size_t ind = 0; ind < numSourceWords; ++ind) {
320
+ WordPtr word( new Word());
321
+ bytesRead += word->ReadFromFile(fileTP);
322
+ sp->AddWord(word);
323
+ }
324
+ SetSourcePhrase(sp);
325
+
326
+ return bytesRead;
327
+ }
328
+
329
+ uint64_t TargetPhrase::ReadAlignFromFile(std::fstream &fileTPColl)
330
+ {
331
+ uint64_t bytesRead = 0;
332
+
333
+ uint64_t numAlign;
334
+ fileTPColl.read((char*) &numAlign, sizeof(uint64_t));
335
+ bytesRead += sizeof(uint64_t);
336
+
337
+ for (size_t ind = 0; ind < numAlign; ++ind) {
338
+ AlignPair alignPair;
339
+ fileTPColl.read((char*) &alignPair.first, sizeof(uint64_t));
340
+ fileTPColl.read((char*) &alignPair.second, sizeof(uint64_t));
341
+ m_align.push_back(alignPair);
342
+
343
+ bytesRead += sizeof(uint64_t) * 2;
344
+ }
345
+
346
+ return bytesRead;
347
+ }
348
+
349
+ uint64_t TargetPhrase::ReadScoresFromFile(std::fstream &fileTPColl)
350
+ {
351
+ UTIL_THROW_IF2(m_scores.size() == 0, "Translation rules must must have some scores");
352
+
353
+ uint64_t bytesRead = 0;
354
+
355
+ for (size_t ind = 0; ind < m_scores.size(); ++ind) {
356
+ fileTPColl.read((char*) &m_scores[ind], sizeof(float));
357
+
358
+ bytesRead += sizeof(float);
359
+ }
360
+
361
+ std::transform(m_scores.begin(),m_scores.end(),m_scores.begin(), Moses::TransformScore);
362
+ std::transform(m_scores.begin(),m_scores.end(),m_scores.begin(), Moses::FloorScore);
363
+
364
+ return bytesRead;
365
+ }
366
+
367
+ void TargetPhrase::DebugPrint(ostream &out, const Vocab &vocab) const
368
+ {
369
+ Phrase::DebugPrint(out, vocab);
370
+
371
+ for (size_t ind = 0; ind < m_align.size(); ++ind) {
372
+ const AlignPair &alignPair = m_align[ind];
373
+ out << alignPair.first << "-" << alignPair.second << " ";
374
+ }
375
+ out << ", ";
376
+
377
+ for (size_t ind = 0; ind < m_scores.size(); ++ind) {
378
+ out << m_scores[ind] << " ";
379
+ }
380
+
381
+ return;
382
+ }
383
+
384
+ std::ostream& operator<<(std::ostream &out, const TargetPhrase &phrase)
385
+ {
386
+ out << (const Phrase&) phrase << ", " ;
387
+
388
+ for (size_t ind = 0; ind < phrase.m_align.size(); ++ind) {
389
+ const AlignPair &alignPair = phrase.m_align[ind];
390
+ out << alignPair.first << "-" << alignPair.second << " ";
391
+ }
392
+ out << ", ";
393
+
394
+ for (size_t ind = 0; ind < phrase.m_scores.size(); ++ind) {
395
+ out << phrase.m_scores[ind] << " ";
396
+ }
397
+
398
+ return out;
399
+ }
400
+
401
+ } // namespace
402
+
mosesdecoder/OnDiskPt/TargetPhrase.h ADDED
@@ -0,0 +1,127 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #pragma once
2
+ // $Id$
3
+ /***********************************************************************
4
+ Moses - factored phrase-based, hierarchical and syntactic language decoder
5
+ Copyright (C) 2009 Hieu Hoang
6
+
7
+ This library is free software; you can redistribute it and/or
8
+ modify it under the terms of the GNU Lesser General Public
9
+ License as published by the Free Software Foundation; either
10
+ version 2.1 of the License, or (at your option) any later version.
11
+
12
+ This library is distributed in the hope that it will be useful,
13
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15
+ Lesser General Public License for more details.
16
+
17
+ You should have received a copy of the GNU Lesser General Public
18
+ License along with this library; if not, write to the Free Software
19
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
20
+ ***********************************************************************/
21
+
22
+ #include <fstream>
23
+ #include <string>
24
+ #include <vector>
25
+ #include "Word.h"
26
+ #include "Phrase.h"
27
+ #include "SourcePhrase.h"
28
+
29
+ namespace Moses
30
+ {
31
+ class PhraseDictionary;
32
+ class TargetPhrase;
33
+ class Phrase;
34
+ }
35
+
36
+ namespace OnDiskPt
37
+ {
38
+
39
+ typedef std::pair<uint64_t, uint64_t> AlignPair;
40
+ typedef std::vector<AlignPair> AlignType;
41
+
42
+ class Vocab;
43
+
44
+ /** A target phrase, with the score breakdowns, alignment info and assorted other information it needs.
45
+ * Readable and writeable to disk
46
+ */
47
+ class TargetPhrase: public Phrase
48
+ {
49
+ friend std::ostream& operator<<(std::ostream&, const TargetPhrase&);
50
+ protected:
51
+ AlignType m_align;
52
+ PhrasePtr m_sourcePhrase;
53
+ std::string m_sparseFeatures, m_property;
54
+
55
+ std::vector<float> m_scores;
56
+ uint64_t m_filePos;
57
+
58
+ size_t WriteAlignToMemory(char *mem) const;
59
+ size_t WriteScoresToMemory(char *mem) const;
60
+ size_t WriteStringToMemory(char *mem, const std::string &str) const;
61
+
62
+ uint64_t ReadAlignFromFile(std::fstream &fileTPColl);
63
+ uint64_t ReadScoresFromFile(std::fstream &fileTPColl);
64
+ uint64_t ReadStringFromFile(std::fstream &fileTPColl, std::string &outStr);
65
+
66
+ public:
67
+ TargetPhrase() {
68
+ }
69
+ TargetPhrase(size_t numScores);
70
+ TargetPhrase(const TargetPhrase &copy);
71
+ virtual ~TargetPhrase();
72
+
73
+ void SetSourcePhrase(PhrasePtr p) {
74
+ m_sourcePhrase = p;
75
+ }
76
+ const PhrasePtr GetSourcePhrase() const {
77
+ return m_sourcePhrase;
78
+ }
79
+ const std::vector<float> &GetScores() const {
80
+ return m_scores;
81
+ }
82
+
83
+ void SetLHS(WordPtr lhs);
84
+
85
+ void Create1AlignFromString(const std::string &align1Str);
86
+ void CreateAlignFromString(const std::string &align1Str);
87
+ void SetScore(float score, size_t ind);
88
+
89
+ const AlignType &GetAlign() const {
90
+ return m_align;
91
+ }
92
+ void SortAlign();
93
+
94
+ char *WriteToMemory(OnDiskWrapper &onDiskWrapper, size_t &memUsed) const;
95
+ char *WriteOtherInfoToMemory(OnDiskWrapper &onDiskWrapper, size_t &memUsed) const;
96
+ void Save(OnDiskWrapper &onDiskWrapper);
97
+
98
+ uint64_t GetFilePos() const {
99
+ return m_filePos;
100
+ }
101
+ float GetScore(size_t ind) const {
102
+ return m_scores[ind];
103
+ }
104
+
105
+ uint64_t ReadOtherInfoFromFile(uint64_t filePos, std::fstream &fileTPColl);
106
+ uint64_t ReadFromFile(std::fstream &fileTP);
107
+
108
+ virtual void DebugPrint(std::ostream &out, const Vocab &vocab) const;
109
+
110
+ const std::string &GetProperty() const {
111
+ return m_property;
112
+ }
113
+
114
+ void SetProperty(const std::string &value) {
115
+ m_property = value;
116
+ }
117
+
118
+ const std::string &GetSparseFeatures() const {
119
+ return m_sparseFeatures;
120
+ }
121
+
122
+ void SetSparseFeatures(const std::string &value) {
123
+ m_sparseFeatures = value;
124
+ }
125
+ };
126
+
127
+ }
mosesdecoder/OnDiskPt/TargetPhraseCollection.cpp ADDED
@@ -0,0 +1,171 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ // $Id$
2
+ /***********************************************************************
3
+ Moses - factored phrase-based, hierarchical and syntactic language decoder
4
+ Copyright (C) 2009 Hieu Hoang
5
+
6
+ This library is free software; you can redistribute it and/or
7
+ modify it under the terms of the GNU Lesser General Public
8
+ License as published by the Free Software Foundation; either
9
+ version 2.1 of the License, or (at your option) any later version.
10
+
11
+ This library is distributed in the hope that it will be useful,
12
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
13
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14
+ Lesser General Public License for more details.
15
+
16
+ You should have received a copy of the GNU Lesser General Public
17
+ License along with this library; if not, write to the Free Software
18
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
19
+ ***********************************************************************/
20
+
21
+ #include <algorithm>
22
+ #include <iostream>
23
+ #include "moses/Util.h"
24
+ #include "TargetPhraseCollection.h"
25
+ #include "Vocab.h"
26
+ #include "OnDiskWrapper.h"
27
+
28
+ using namespace std;
29
+
30
+ namespace OnDiskPt
31
+ {
32
+
33
+ size_t TargetPhraseCollection::s_sortScoreInd;
34
+
35
+ TargetPhraseCollection::TargetPhraseCollection()
36
+ :m_filePos(777)
37
+ {}
38
+
39
+ TargetPhraseCollection::TargetPhraseCollection(const TargetPhraseCollection &copy)
40
+ :m_filePos(copy.m_filePos)
41
+ ,m_debugStr(copy.m_debugStr)
42
+ {
43
+ }
44
+
45
+ TargetPhraseCollection::~TargetPhraseCollection()
46
+ {
47
+ Moses::RemoveAllInColl(m_coll);
48
+ }
49
+
50
+ void TargetPhraseCollection::AddTargetPhrase(TargetPhrase *targetPhrase)
51
+ {
52
+ m_coll.push_back(targetPhrase);
53
+ }
54
+
55
+ void TargetPhraseCollection::Sort(size_t tableLimit)
56
+ {
57
+ std::sort(m_coll.begin(), m_coll.end(), TargetPhraseOrderByScore());
58
+
59
+ if (tableLimit && m_coll.size() > tableLimit) {
60
+ CollType::iterator iter;
61
+ for (iter = m_coll.begin() + tableLimit ; iter != m_coll.end(); ++iter) {
62
+ delete *iter;
63
+ }
64
+ m_coll.resize(tableLimit);
65
+ }
66
+ }
67
+
68
+ void TargetPhraseCollection::Save(OnDiskWrapper &onDiskWrapper)
69
+ {
70
+ std::fstream &file = onDiskWrapper.GetFileTargetColl();
71
+
72
+ size_t memUsed = sizeof(uint64_t);
73
+ char *mem = (char*) malloc(memUsed);
74
+
75
+ // size of coll
76
+ uint64_t numPhrases = GetSize();
77
+ ((uint64_t*)mem)[0] = numPhrases;
78
+
79
+ // MAIN LOOP
80
+ CollType::iterator iter;
81
+ for (iter = m_coll.begin(); iter != m_coll.end(); ++iter) {
82
+ // save phrase
83
+ TargetPhrase &targetPhrase = **iter;
84
+ targetPhrase.Save(onDiskWrapper);
85
+
86
+ // save coll
87
+ size_t memUsedTPOtherInfo;
88
+ char *memTPOtherInfo = targetPhrase.WriteOtherInfoToMemory(onDiskWrapper, memUsedTPOtherInfo);
89
+
90
+ // expand existing mem
91
+ mem = (char*) realloc(mem, memUsed + memUsedTPOtherInfo);
92
+ memcpy(mem + memUsed, memTPOtherInfo, memUsedTPOtherInfo);
93
+ memUsed += memUsedTPOtherInfo;
94
+
95
+ free(memTPOtherInfo);
96
+ }
97
+
98
+ // total number of bytes
99
+ //((uint64_t*)mem)[0] = (uint64_t) memUsed;
100
+
101
+ uint64_t startPos = file.tellp();
102
+ file.seekp(0, ios::end);
103
+ file.write((char*) mem, memUsed);
104
+
105
+ free(mem);
106
+
107
+ #ifndef NDEBUG
108
+ uint64_t endPos = file.tellp();
109
+ assert(startPos + memUsed == endPos);
110
+ #endif
111
+ m_filePos = startPos;
112
+
113
+ }
114
+
115
+ void TargetPhraseCollection::ReadFromFile(size_t tableLimit, uint64_t filePos, OnDiskWrapper &onDiskWrapper)
116
+ {
117
+ fstream &fileTPColl = onDiskWrapper.GetFileTargetColl();
118
+ fstream &fileTP = onDiskWrapper.GetFileTargetInd();
119
+
120
+ size_t numScores = onDiskWrapper.GetNumScores();
121
+
122
+
123
+ uint64_t numPhrases;
124
+
125
+ uint64_t currFilePos = filePos;
126
+ fileTPColl.seekg(filePos);
127
+ fileTPColl.read((char*) &numPhrases, sizeof(uint64_t));
128
+
129
+ // table limit
130
+ if (tableLimit) {
131
+ numPhrases = std::min(numPhrases, (uint64_t) tableLimit);
132
+ }
133
+
134
+ currFilePos += sizeof(uint64_t);
135
+
136
+ for (size_t ind = 0; ind < numPhrases; ++ind) {
137
+ TargetPhrase *tp = new TargetPhrase(numScores);
138
+
139
+ uint64_t sizeOtherInfo = tp->ReadOtherInfoFromFile(currFilePos, fileTPColl);
140
+ tp->ReadFromFile(fileTP);
141
+
142
+ currFilePos += sizeOtherInfo;
143
+
144
+ m_coll.push_back(tp);
145
+ }
146
+ }
147
+
148
+ uint64_t TargetPhraseCollection::GetFilePos() const
149
+ {
150
+ return m_filePos;
151
+ }
152
+
153
+ const std::string TargetPhraseCollection::GetDebugStr() const
154
+ {
155
+ return m_debugStr;
156
+ }
157
+
158
+ void TargetPhraseCollection::SetDebugStr(const std::string &str)
159
+ {
160
+ m_debugStr = str;
161
+ }
162
+
163
+ const TargetPhrase &TargetPhraseCollection::GetTargetPhrase(size_t ind) const
164
+ {
165
+ assert(ind < GetSize());
166
+ return *m_coll[ind];
167
+ }
168
+
169
+ }
170
+
171
+
mosesdecoder/OnDiskPt/TargetPhraseCollection.h ADDED
@@ -0,0 +1,84 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ // $Id$
2
+ /***********************************************************************
3
+ Moses - factored phrase-based, hierarchical and syntactic language decoder
4
+ Copyright (C) 2009 Hieu Hoang
5
+
6
+ This library is free software; you can redistribute it and/or
7
+ modify it under the terms of the GNU Lesser General Public
8
+ License as published by the Free Software Foundation; either
9
+ version 2.1 of the License, or (at your option) any later version.
10
+
11
+ This library is distributed in the hope that it will be useful,
12
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
13
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14
+ Lesser General Public License for more details.
15
+
16
+ You should have received a copy of the GNU Lesser General Public
17
+ License along with this library; if not, write to the Free Software
18
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
19
+ ***********************************************************************/
20
+ #pragma once
21
+
22
+ #include "TargetPhrase.h"
23
+ #include "Vocab.h"
24
+ #include <boost/shared_ptr.hpp>
25
+
26
+ namespace Moses
27
+ {
28
+ class TargetPhraseCollection;
29
+ class PhraseDictionary;
30
+ }
31
+
32
+ namespace OnDiskPt
33
+ {
34
+
35
+ /** A vector of target phrases
36
+ */
37
+ class TargetPhraseCollection
38
+ {
39
+ class TargetPhraseOrderByScore
40
+ {
41
+ public:
42
+ bool operator()(const TargetPhrase* a, const TargetPhrase *b) const {
43
+ return a->GetScore(s_sortScoreInd) > b->GetScore(s_sortScoreInd);
44
+ }
45
+ };
46
+
47
+ protected:
48
+ typedef std::vector<TargetPhrase*> CollType;
49
+ CollType m_coll;
50
+ uint64_t m_filePos;
51
+ std::string m_debugStr;
52
+
53
+ public:
54
+ typedef boost::shared_ptr<TargetPhraseCollection const> shared_const_ptr;
55
+ typedef boost::shared_ptr<TargetPhraseCollection> shared_ptr;
56
+
57
+ static size_t s_sortScoreInd;
58
+
59
+ TargetPhraseCollection();
60
+ TargetPhraseCollection(const TargetPhraseCollection &copy);
61
+
62
+ ~TargetPhraseCollection();
63
+ void AddTargetPhrase(TargetPhrase *targetPhrase);
64
+ void Sort(size_t tableLimit);
65
+
66
+ void Save(OnDiskWrapper &onDiskWrapper);
67
+
68
+ size_t GetSize() const {
69
+ return m_coll.size();
70
+ }
71
+
72
+ const TargetPhrase &GetTargetPhrase(size_t ind) const;
73
+
74
+ uint64_t GetFilePos() const;
75
+
76
+ void ReadFromFile(size_t tableLimit, uint64_t filePos, OnDiskWrapper &onDiskWrapper);
77
+
78
+ const std::string GetDebugStr() const;
79
+ void SetDebugStr(const std::string &str);
80
+
81
+ };
82
+
83
+ }
84
+
mosesdecoder/OnDiskPt/Vocab.cpp ADDED
@@ -0,0 +1,101 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ // $Id$
2
+ /***********************************************************************
3
+ Moses - factored phrase-based, hierarchical and syntactic language decoder
4
+ Copyright (C) 2009 Hieu Hoang
5
+
6
+ This library is free software; you can redistribute it and/or
7
+ modify it under the terms of the GNU Lesser General Public
8
+ License as published by the Free Software Foundation; either
9
+ version 2.1 of the License, or (at your option) any later version.
10
+
11
+ This library is distributed in the hope that it will be useful,
12
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
13
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14
+ Lesser General Public License for more details.
15
+
16
+ You should have received a copy of the GNU Lesser General Public
17
+ License along with this library; if not, write to the Free Software
18
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
19
+ ***********************************************************************/
20
+ #include <string>
21
+ #include <fstream>
22
+ #include "OnDiskWrapper.h"
23
+ #include "Vocab.h"
24
+ #include "moses/Util.h"
25
+ #include "util/exception.hh"
26
+
27
+ using namespace std;
28
+
29
+ namespace OnDiskPt
30
+ {
31
+
32
+ bool Vocab::Load(OnDiskWrapper &onDiskWrapper)
33
+ {
34
+ fstream &file = onDiskWrapper.GetFileVocab();
35
+
36
+ string line;
37
+ while(getline(file, line)) {
38
+ vector<string> tokens;
39
+ Moses::Tokenize(tokens, line);
40
+ UTIL_THROW_IF2(tokens.size() != 2, "Vocab file corrupted");
41
+ const string &key = tokens[0];
42
+ m_vocabColl[key] = Moses::Scan<uint64_t>(tokens[1]);
43
+ }
44
+
45
+ // create lookup
46
+ // assume contiguous vocab id
47
+ m_lookup.resize(m_vocabColl.size() + 1);
48
+ m_nextId = m_lookup.size();
49
+
50
+ CollType::const_iterator iter;
51
+ for (iter = m_vocabColl.begin(); iter != m_vocabColl.end(); ++iter) {
52
+ uint32_t vocabId = iter->second;
53
+ const std::string &word = iter->first;
54
+
55
+ m_lookup[vocabId] = word;
56
+ }
57
+
58
+ return true;
59
+ }
60
+
61
+ void Vocab::Save(OnDiskWrapper &onDiskWrapper)
62
+ {
63
+ fstream &file = onDiskWrapper.GetFileVocab();
64
+ CollType::const_iterator iterVocab;
65
+ for (iterVocab = m_vocabColl.begin(); iterVocab != m_vocabColl.end(); ++iterVocab) {
66
+ const string &word = iterVocab->first;
67
+ uint32_t vocabId = iterVocab->second;
68
+
69
+ file << word << " " << vocabId << endl;
70
+ }
71
+ }
72
+
73
+ uint64_t Vocab::AddVocabId(const std::string &str)
74
+ {
75
+ // find string id
76
+ CollType::const_iterator iter = m_vocabColl.find(str);
77
+ if (iter == m_vocabColl.end()) {
78
+ // add new vocab entry
79
+ m_vocabColl[str] = m_nextId;
80
+ return m_nextId++;
81
+ } else {
82
+ // return existing entry
83
+ return iter->second;
84
+ }
85
+ }
86
+
87
+ uint64_t Vocab::GetVocabId(const std::string &str, bool &found) const
88
+ {
89
+ // find string id
90
+ CollType::const_iterator iter = m_vocabColl.find(str);
91
+ if (iter == m_vocabColl.end()) {
92
+ found = false;
93
+ return 0; //return whatever
94
+ } else {
95
+ // return existing entry
96
+ found = true;
97
+ return iter->second;
98
+ }
99
+ }
100
+
101
+ }
mosesdecoder/OnDiskPt/Vocab.h ADDED
@@ -0,0 +1,58 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #pragma once
2
+ // $Id$
3
+ /***********************************************************************
4
+ Moses - factored phrase-based, hierarchical and syntactic language decoder
5
+ Copyright (C) 2009 Hieu Hoang
6
+
7
+ This library is free software; you can redistribute it and/or
8
+ modify it under the terms of the GNU Lesser General Public
9
+ License as published by the Free Software Foundation; either
10
+ version 2.1 of the License, or (at your option) any later version.
11
+
12
+ This library is distributed in the hope that it will be useful,
13
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15
+ Lesser General Public License for more details.
16
+
17
+ You should have received a copy of the GNU Lesser General Public
18
+ License along with this library; if not, write to the Free Software
19
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
20
+ ***********************************************************************/
21
+ #include <string>
22
+ #include <map>
23
+ #include "moses/TypeDef.h"
24
+
25
+
26
+ namespace OnDiskPt
27
+ {
28
+
29
+ class OnDiskWrapper;
30
+
31
+ /* A bidirectional map of string<->contiguous id
32
+ * No distinction between source and target language
33
+ */
34
+ class Vocab
35
+ {
36
+ protected:
37
+ typedef std::map<std::string, uint64_t> CollType;
38
+ CollType m_vocabColl;
39
+
40
+ std::vector<std::string> m_lookup; // opposite of m_vocabColl
41
+ uint64_t m_nextId; // starts @ 1
42
+
43
+ public:
44
+ Vocab()
45
+ :m_nextId(1) {
46
+ }
47
+ uint64_t AddVocabId(const std::string &str);
48
+ uint64_t GetVocabId(const std::string &str, bool &found) const;
49
+ const std::string &GetString(uint64_t vocabId) const {
50
+ return m_lookup[vocabId];
51
+ }
52
+
53
+ bool Load(OnDiskWrapper &onDiskWrapper);
54
+ void Save(OnDiskWrapper &onDiskWrapper);
55
+ };
56
+
57
+ }
58
+
mosesdecoder/OnDiskPt/Word.cpp ADDED
@@ -0,0 +1,144 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ // $Id$
2
+ /***********************************************************************
3
+ Moses - factored phrase-based, hierarchical and syntactic language decoder
4
+ Copyright (C) 2009 Hieu Hoang
5
+
6
+ This library is free software; you can redistribute it and/or
7
+ modify it under the terms of the GNU Lesser General Public
8
+ License as published by the Free Software Foundation; either
9
+ version 2.1 of the License, or (at your option) any later version.
10
+
11
+ This library is distributed in the hope that it will be useful,
12
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
13
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14
+ Lesser General Public License for more details.
15
+
16
+ You should have received a copy of the GNU Lesser General Public
17
+ License along with this library; if not, write to the Free Software
18
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
19
+ ***********************************************************************/
20
+
21
#include <cstring>

#include <boost/algorithm/string/predicate.hpp>
#include "moses/Util.h"
#include "Word.h"

#include "util/tokenize_piece.hh"
#include "util/exception.hh"
27
+
28
+ using namespace std;
29
+ using namespace boost::algorithm;
30
+
31
+ namespace OnDiskPt
32
+ {
33
+
34
+ Word::Word(const Word &copy)
35
+ :m_isNonTerminal(copy.m_isNonTerminal)
36
+ ,m_vocabId(copy.m_vocabId)
37
+ {}
38
+
39
+ Word::~Word()
40
+ {}
41
+
42
+ void Word::CreateFromString(const std::string &inString, Vocab &vocab)
43
+ {
44
+ if (starts_with(inString, "[") && ends_with(inString, "]")) {
45
+ // non-term
46
+ m_isNonTerminal = true;
47
+ string str = inString.substr(1, inString.size() - 2);
48
+ m_vocabId = vocab.AddVocabId(str);
49
+ } else {
50
+ m_isNonTerminal = false;
51
+ m_vocabId = vocab.AddVocabId(inString);
52
+ }
53
+
54
+ }
55
+
56
+ size_t Word::WriteToMemory(char *mem) const
57
+ {
58
+ uint64_t *vocabMem = (uint64_t*) mem;
59
+ vocabMem[0] = m_vocabId;
60
+
61
+ size_t size = sizeof(uint64_t);
62
+
63
+ // is non-term
64
+ char bNonTerm = (char) m_isNonTerminal;
65
+ mem[size] = bNonTerm;
66
+ ++size;
67
+
68
+ return size;
69
+ }
70
+
71
+ size_t Word::ReadFromMemory(const char *mem)
72
+ {
73
+ uint64_t *vocabMem = (uint64_t*) mem;
74
+ m_vocabId = vocabMem[0];
75
+
76
+ size_t memUsed = sizeof(uint64_t);
77
+
78
+ // is non-term
79
+ char bNonTerm;
80
+ bNonTerm = mem[memUsed];
81
+ m_isNonTerminal = (bool) bNonTerm;
82
+ ++memUsed;
83
+
84
+ return memUsed;
85
+ }
86
+
87
+ size_t Word::ReadFromFile(std::fstream &file)
88
+ {
89
+ const size_t memAlloc = sizeof(uint64_t) + sizeof(char);
90
+ char mem[sizeof(uint64_t) + sizeof(char)];
91
+ file.read(mem, memAlloc);
92
+
93
+ size_t memUsed = ReadFromMemory(mem);
94
+ assert(memAlloc == memUsed);
95
+
96
+ return memAlloc;
97
+ }
98
+
99
+ int Word::Compare(const Word &compare) const
100
+ {
101
+ int ret;
102
+
103
+ if (m_isNonTerminal != compare.m_isNonTerminal)
104
+ return m_isNonTerminal ?-1 : 1;
105
+
106
+ if (m_vocabId < compare.m_vocabId)
107
+ ret = -1;
108
+ else if (m_vocabId > compare.m_vocabId)
109
+ ret = 1;
110
+ else
111
+ ret = 0;
112
+
113
+ return ret;
114
+ }
115
+
116
+ bool Word::operator<(const Word &compare) const
117
+ {
118
+ int ret = Compare(compare);
119
+ return ret < 0;
120
+ }
121
+
122
+ bool Word::operator==(const Word &compare) const
123
+ {
124
+ int ret = Compare(compare);
125
+ return ret == 0;
126
+ }
127
+
128
+ void Word::DebugPrint(ostream &out, const Vocab &vocab) const
129
+ {
130
+ const string &str = vocab.GetString(m_vocabId);
131
+ out << str;
132
+ }
133
+
134
+ std::ostream& operator<<(std::ostream &out, const Word &word)
135
+ {
136
+ out << "(";
137
+ out << word.m_vocabId;
138
+
139
+ out << (word.m_isNonTerminal ? "n" : "t");
140
+ out << ")";
141
+
142
+ return out;
143
+ }
144
+ }
mosesdecoder/OnDiskPt/Word.h ADDED
@@ -0,0 +1,91 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #pragma once
2
+ // $Id$
3
+ /***********************************************************************
4
+ Moses - factored phrase-based, hierarchical and syntactic language decoder
5
+ Copyright (C) 2009 Hieu Hoang
6
+
7
+ This library is free software; you can redistribute it and/or
8
+ modify it under the terms of the GNU Lesser General Public
9
+ License as published by the Free Software Foundation; either
10
+ version 2.1 of the License, or (at your option) any later version.
11
+
12
+ This library is distributed in the hope that it will be useful,
13
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15
+ Lesser General Public License for more details.
16
+
17
+ You should have received a copy of the GNU Lesser General Public
18
+ License along with this library; if not, write to the Free Software
19
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
20
+ ***********************************************************************/
21
+ #include <string>
22
+ #include <vector>
23
+ #include <iostream>
24
+ #include <fstream>
25
+ #include <boost/shared_ptr.hpp>
26
+ #include "Vocab.h"
27
+
28
+ namespace Moses
29
+ {
30
+ class Word;
31
+ }
32
+
33
+ namespace OnDiskPt
34
+ {
35
+ class Vocab;
36
+
37
+ /* A wrapper around a vocab id, and a boolean indicating whther it is a term or non-term.
38
+ * Factors can be represented by using a vocab string with | character, eg go|VB
39
+ */
40
+ class Word
41
+ {
42
+ friend std::ostream& operator<<(std::ostream&, const Word&);
43
+
44
+ private:
45
+ bool m_isNonTerminal;
46
+ uint64_t m_vocabId;
47
+
48
+ public:
49
+ explicit Word() {
50
+ }
51
+
52
+ explicit Word(bool isNonTerminal)
53
+ :m_isNonTerminal(isNonTerminal)
54
+ ,m_vocabId(0) {
55
+ }
56
+
57
+ Word(const Word &copy);
58
+ ~Word();
59
+
60
+
61
+ void CreateFromString(const std::string &inString, Vocab &vocab);
62
+ bool IsNonTerminal() const {
63
+ return m_isNonTerminal;
64
+ }
65
+
66
+ size_t WriteToMemory(char *mem) const;
67
+ size_t ReadFromMemory(const char *mem);
68
+ size_t ReadFromFile(std::fstream &file);
69
+
70
+ uint64_t GetVocabId() const {
71
+ return m_vocabId;
72
+ }
73
+
74
+ void SetVocabId(uint64_t vocabId) {
75
+ m_vocabId = vocabId;
76
+ }
77
+
78
+ void DebugPrint(std::ostream &out, const Vocab &vocab) const;
79
+ inline const std::string &GetString(const Vocab &vocab) const {
80
+ return vocab.GetString(m_vocabId);
81
+ }
82
+
83
+ int Compare(const Word &compare) const;
84
+ bool operator<(const Word &compare) const;
85
+ bool operator==(const Word &compare) const;
86
+
87
+ };
88
+
89
+ typedef boost::shared_ptr<Word> WordPtr;
90
+ }
91
+
mosesdecoder/OnDiskPt/bin/gcc-9/release/link-static/threading-multi/CreateOnDiskPt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:924fb66d9f0e64d799938679376511b174bd77ad4cbe1d218e33a9c3278402a3
3
+ size 9824568
mosesdecoder/OnDiskPt/bin/gcc-9/release/link-static/threading-multi/Main.o ADDED
Binary file (67.2 kB). View file
 
mosesdecoder/OnDiskPt/bin/gcc-9/release/link-static/threading-multi/OnDiskQuery.o ADDED
Binary file (23.2 kB). View file
 
mosesdecoder/OnDiskPt/bin/gcc-9/release/link-static/threading-multi/OnDiskWrapper.o ADDED
Binary file (115 kB). View file
 
mosesdecoder/OnDiskPt/bin/gcc-9/release/link-static/threading-multi/Phrase.o ADDED
Binary file (26 kB). View file
 
mosesdecoder/OnDiskPt/bin/gcc-9/release/link-static/threading-multi/PhraseNode.o ADDED
Binary file (37.6 kB). View file
 
mosesdecoder/OnDiskPt/bin/gcc-9/release/link-static/threading-multi/SourcePhrase.o ADDED
Binary file (2.19 kB). View file
 
mosesdecoder/OnDiskPt/bin/gcc-9/release/link-static/threading-multi/TargetPhrase.o ADDED
Binary file (115 kB). View file
 
mosesdecoder/OnDiskPt/bin/gcc-9/release/link-static/threading-multi/TargetPhraseCollection.o ADDED
Binary file (18.1 kB). View file
 
mosesdecoder/OnDiskPt/bin/gcc-9/release/link-static/threading-multi/Vocab.o ADDED
Binary file (28.1 kB). View file
 
mosesdecoder/OnDiskPt/bin/gcc-9/release/link-static/threading-multi/Word.o ADDED
Binary file (6.87 kB). View file
 
mosesdecoder/OnDiskPt/bin/gcc-9/release/link-static/threading-multi/queryOnDiskPt ADDED
Binary file (977 kB). View file
 
mosesdecoder/OnDiskPt/bin/gcc-9/release/link-static/threading-multi/queryOnDiskPt.o ADDED
Binary file (18.3 kB). View file
 
mosesdecoder/OnDiskPt/queryOnDiskPt.cpp ADDED
@@ -0,0 +1,86 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ // Query binary phrase tables.
2
+ // Christian Hardmeier, 16 May 2010
3
+
4
+ #include <cstdlib>
5
+ #include <cstring>
6
+ #include <string>
7
+ #include <vector>
8
+
9
+ #include "moses/Util.h"
10
+ #include "OnDiskWrapper.h"
11
+ #include "SourcePhrase.h"
12
+ #include "OnDiskQuery.h"
13
+
14
+ using namespace std;
15
+ using namespace OnDiskPt;
16
+
17
+ void usage();
18
+
19
+ typedef unsigned int uint;
20
+
21
+ int main(int argc, char **argv)
22
+ {
23
+ int tableLimit = 20;
24
+ std::string ttable = "";
25
+ // bool useAlignments = false;
26
+
27
+ for(int i = 1; i < argc; i++) {
28
+ if(!strcmp(argv[i], "-tlimit")) {
29
+ if(i + 1 == argc)
30
+ usage();
31
+ tableLimit = atoi(argv[++i]);
32
+ } else if(!strcmp(argv[i], "-t")) {
33
+ if(i + 1 == argc)
34
+ usage();
35
+ ttable = argv[++i];
36
+ } else
37
+ usage();
38
+ }
39
+
40
+ if(ttable == "")
41
+ usage();
42
+
43
+ OnDiskWrapper onDiskWrapper;
44
+ onDiskWrapper.BeginLoad(ttable);
45
+ OnDiskQuery onDiskQuery(onDiskWrapper);
46
+
47
+ cerr << "Ready..." << endl;
48
+
49
+ std::string line;
50
+ while(getline(std::cin, line)) {
51
+ std::vector<std::string> tokens;
52
+ tokens = Moses::Tokenize(line, " ");
53
+
54
+ cerr << "line: " << line << endl;
55
+ const PhraseNode* node = onDiskQuery.Query(tokens);
56
+
57
+ if (node) {
58
+ // source phrase points to a bunch of rules
59
+ TargetPhraseCollection::shared_ptr coll = node->GetTargetPhraseCollection(tableLimit, onDiskWrapper);
60
+ string str = coll->GetDebugStr();
61
+ cout << "Found " << coll->GetSize() << endl;
62
+
63
+ for (size_t ind = 0; ind < coll->GetSize(); ++ind) {
64
+ const TargetPhrase &targetPhrase = coll->GetTargetPhrase(ind);
65
+ cerr << " ";
66
+ targetPhrase.DebugPrint(cerr, onDiskWrapper.GetVocab());
67
+ cerr << endl;
68
+ }
69
+ } else {
70
+ cout << "Not found" << endl;
71
+ }
72
+
73
+ std::cout << '\n';
74
+ std::cout.flush();
75
+ }
76
+
77
+ cerr << "Finished." << endl;
78
+ }
79
+
80
// Prints the command-line help to stderr and terminates with exit status 1.
// The synopsis lists exactly the options that main() parses.
void usage()
{
  std::cerr << "Usage: queryOnDiskPt [-tlimit <table limit>] -t <ttable>\n"
            "-tlimit <table limit> max number of rules per source phrase (default: 20)\n"
            "-t <ttable> phrase table\n";
  exit(1);
}
mosesdecoder/README ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Instructions for building and installing Moses are online:
2
+
3
+ http://www.statmt.org/moses/?n=Development.GetStarted
4
+
5
+ Questions should be directed to the mailing list (don't forget to register before sending emails):
6
+ http://mailman.mit.edu/mailman/listinfo/moses-support
7
+
8
+ Some of the code is not originally part of Moses, but is periodically copied
9
+ into the source tree from elsewhere:
10
+
11
+ * "bjam-files" is taken from Boost.
12
+ * "util" and "lm" are taken from KenLM: https://github.com/kpu/kenlm
13
+
14
+
mosesdecoder/azure-pipelines.yml ADDED
@@ -0,0 +1,100 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Starter pipeline
2
+ # Start with a minimal pipeline that you can customize to build and deploy your code.
3
+ # Add steps that build, run tests, deploy, and more:
4
+ # https://aka.ms/yaml
5
+
6
+ trigger:
7
+ - master
8
+
9
+ pool:
10
+ #vmImage: 'ubuntu-latest'
11
+ vmImage: 'ubuntu-16.04'
12
+
13
+ steps:
14
+
15
+ - script: |
16
+ echo Printing some environment information
17
+ echo HOME: $HOME
18
+ echo
19
+ echo UBUNTU VERSION:
20
+ cat /etc/lsb-release
21
+ echo
22
+ echo CPU INFO
23
+ cat /proc/cpuinfo
24
+ echo
25
+ echo MEM INFO
26
+ cat /proc/meminfo
27
+ echo
28
+ echo DISK INFO
29
+ df -h
30
+ echo
31
+ echo PWD: $PWD
32
+ echo
33
+ ls
34
+ displayName: 'Printing some environment information'
35
+
36
+
37
+ ## Installation commands for Ubuntu
38
+ - script: |
39
+ sudo apt-get install \
40
+ g++ \
41
+ git \
42
+ subversion \
43
+ automake \
44
+ libtool \
45
+ zlib1g-dev \
46
+ libicu-dev \
47
+ libboost-all-dev \
48
+ libssl-dev \
49
+ libbz2-dev \
50
+ liblzma-dev \
51
+ python-dev \
52
+ graphviz \
53
+ imagemagick \
54
+ make \
55
+ cmake \
56
+ libgoogle-perftools-dev \
57
+ autoconf \
58
+ doxygen
59
+ displayName: 'Install Ubuntu packages'
60
+
61
+ - script: |
62
+ wget "https://sourceforge.net/projects/cmph/files/v2.0.2/cmph-2.0.2.tar.gz/download"
63
+ mv download cmph-2.0.2.tar.gz
64
+ tar xvzf cmph-2.0.2.tar.gz
65
+ cd cmph-2.0.2
66
+ ./configure --prefix=$PWD
67
+ make
68
+ make install
69
+ cd ..
70
+ displayName: 'Build and Install cmph'
71
+
72
+ - script: |
73
+ wget "https://sourceforge.net/projects/xmlrpc-c/files/Xmlrpc-c%20Super%20Stable/1.51.06/xmlrpc-c-1.51.06.tgz/download"
74
+ mv download xmlrpc-c-1.51.06.tgz
75
+ tar xvzf xmlrpc-c-1.51.06.tgz
76
+ cd xmlrpc-c-1.51.06
77
+ ./configure --prefix=$PWD
78
+ make
79
+ make install
80
+ sudo ldconfig
81
+ cd ..
82
+ displayName: 'Build and Install xmlrpc-c'
83
+
84
+ - script: |
85
+ ./bjam \
86
+ --with-cmph=$PWD/cmph-2.0.2 \
87
+ --with-xmlrpc-c=$PWD/xmlrpc-c-1.51.06 \
88
+ -j2
89
+ displayName: 'Build Moses'
90
+
91
+ # - script: |
92
+ # ./bjam \
93
+ # -j2
94
+ # displayName: 'Build Moses'
95
+
96
+ # - task: ComponentGovernanceComponentDetection@0
97
+ # inputs:
98
+ # scanType: 'Register'
99
+ # verbosity: 'Verbose'
100
+ # alertWarningLevel: 'High'
mosesdecoder/biconcor/Alignment.cpp ADDED
@@ -0,0 +1,222 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #include "Alignment.h"
2
+
3
+ #include <fstream>
4
+ #include <string>
5
+ #include <cstdlib>
6
+ #include <cstring>
7
+
8
+ namespace
9
+ {
10
+
11
+ const int LINE_MAX_LENGTH = 10000;
12
+
13
+ } // namespace
14
+
15
+ using namespace std;
16
+
17
+ void Alignment::Create(const string& fileName)
18
+ {
19
+ ifstream textFile;
20
+ char line[LINE_MAX_LENGTH];
21
+
22
+ // count the number of words first;
23
+ textFile.open(fileName.c_str());
24
+
25
+ if (!textFile) {
26
+ cerr << "No such file or directory: " << fileName << endl;
27
+ exit(1);
28
+ }
29
+
30
+ istream *fileP = &textFile;
31
+ m_size = 0;
32
+ m_sentenceCount = 0;
33
+ while(!fileP->eof()) {
34
+ SAFE_GETLINE((*fileP), line, LINE_MAX_LENGTH, '\n');
35
+ if (fileP->eof()) break;
36
+ vector<string> alignmentSequence = Tokenize( line );
37
+ m_size += alignmentSequence.size();
38
+ m_sentenceCount++;
39
+ }
40
+ textFile.close();
41
+ cerr << m_size << " alignment points" << endl;
42
+
43
+ // allocate memory
44
+ m_array = (int*) calloc( sizeof(int), m_size*2 );
45
+ m_sentenceEnd = (INDEX*) calloc( sizeof( INDEX ), m_sentenceCount );
46
+
47
+ if (m_array == NULL) {
48
+ cerr << "Error: cannot allocate memory to m_array" << endl;
49
+ exit(1);
50
+ }
51
+
52
+ if (m_sentenceEnd == NULL) {
53
+ cerr << "Error: cannot allocate memory to m_sentenceEnd" << endl;
54
+ exit(1);
55
+ }
56
+
57
+ // fill the array
58
+ int alignmentPointIndex = 0;
59
+ int sentenceId = 0;
60
+
61
+ textFile.open(fileName.c_str());
62
+ if (!textFile) {
63
+ cerr << "Failed to open " << fileName << endl;
64
+ exit(1);
65
+ }
66
+
67
+ fileP = &textFile;
68
+ while(!fileP->eof()) {
69
+ SAFE_GETLINE((*fileP), line, LINE_MAX_LENGTH, '\n');
70
+ if (fileP->eof()) break;
71
+ vector<string> alignmentSequence = Tokenize( line );
72
+ for(size_t i=0; i<alignmentSequence.size(); i++) {
73
+ int s,t;
74
+ // cout << "scaning " << alignmentSequence[i].c_str() << endl;
75
+ if (! sscanf(alignmentSequence[i].c_str(), "%d-%d", &s, &t)) {
76
+ cerr << "WARNING: " << alignmentSequence[i] << " is a bad alignment point in sentence " << sentenceId << endl;
77
+ }
78
+ m_array[alignmentPointIndex++] = (char) s;
79
+ m_array[alignmentPointIndex++] = (char) t;
80
+ }
81
+ m_sentenceEnd[ sentenceId++ ] = alignmentPointIndex - 2;
82
+ }
83
+ textFile.close();
84
+ cerr << "done reading " << (alignmentPointIndex/2) << " alignment points, " << sentenceId << " sentences." << endl;
85
+ }
86
+
87
+ Alignment::Alignment()
88
+ : m_array(NULL),
89
+ m_sentenceEnd(NULL),
90
+ m_size(0),
91
+ m_sentenceCount(0) {}
92
+
93
+ Alignment::~Alignment()
94
+ {
95
+ if (m_array != NULL) {
96
+ free(m_array);
97
+ }
98
+ if (m_sentenceEnd != NULL) {
99
+ free(m_sentenceEnd);
100
+ }
101
+ }
102
+
103
+ vector<string> Alignment::Tokenize( const char input[] )
104
+ {
105
+ vector< string > token;
106
+ bool betweenWords = true;
107
+ int start=0;
108
+ int i=0;
109
+ for(; input[i] != '\0'; i++) {
110
+ bool isSpace = (input[i] == ' ' || input[i] == '\t');
111
+
112
+ if (!isSpace && betweenWords) {
113
+ start = i;
114
+ betweenWords = false;
115
+ } else if (isSpace && !betweenWords) {
116
+ token.push_back( string( input+start, i-start ) );
117
+ betweenWords = true;
118
+ }
119
+ }
120
+ if (!betweenWords)
121
+ token.push_back( string( input+start, i-start ) );
122
+ return token;
123
+ }
124
+
125
+ bool Alignment::PhraseAlignment( INDEX sentence, int target_length,
126
+ int source_start, int source_end,
127
+ int &target_start, int &target_end,
128
+ int &pre_null, int &post_null )
129
+ {
130
+ // get index for first alignment point
131
+ INDEX sentenceStart = 0;
132
+ if (sentence > 0) {
133
+ sentenceStart = m_sentenceEnd[ sentence-1 ] + 2;
134
+ }
135
+
136
+ // get target phrase boundaries
137
+ target_start = target_length;
138
+ target_end = 0;
139
+ for(INDEX ap = sentenceStart; ap <= m_sentenceEnd[ sentence ]; ap += 2 ) {
140
+ int source = m_array[ ap ];
141
+ if (source >= source_start && source <= source_end ) {
142
+ int target = m_array[ ap+1 ];
143
+ if (target < target_start) target_start = target;
144
+ if (target > target_end ) target_end = target;
145
+ }
146
+ }
147
+ if (target_start == target_length) {
148
+ return false; // done if no alignment points
149
+ }
150
+
151
+ // check consistency
152
+ for(INDEX ap = sentenceStart; ap <= m_sentenceEnd[ sentence ]; ap += 2 ) {
153
+ int target = m_array[ ap+1 ];
154
+ if (target >= target_start && target <= target_end ) {
155
+ int source = m_array[ ap ];
156
+ if (source < source_start || source > source_end) {
157
+ return false; // alignment point out of range
158
+ }
159
+ }
160
+ }
161
+
162
+ // create array for unaligned words
163
+ for( int i=0; i<target_length; i++ ) {
164
+ m_unaligned[i] = true;
165
+ }
166
+ for(INDEX ap = sentenceStart; ap <= m_sentenceEnd[ sentence ]; ap += 2 ) {
167
+ int target = m_array[ ap+1 ];
168
+ m_unaligned[ target ] = false;
169
+ }
170
+
171
+ // prior unaligned words
172
+ pre_null = 0;
173
+ for(int target = target_start-1; target >= 0 && m_unaligned[ target ]; target--) {
174
+ pre_null++;
175
+ }
176
+
177
+ // post unaligned words;
178
+ post_null = 0;
179
+ for(int target = target_end+1; target < target_length && m_unaligned[ target ]; target++) {
180
+ post_null++;
181
+ }
182
+ return true;
183
+ }
184
+
185
+ void Alignment::Save(const string& fileName ) const
186
+ {
187
+ FILE *pFile = fopen ( (fileName + ".align").c_str() , "w" );
188
+ if (pFile == NULL) {
189
+ cerr << "Cannot open " << fileName << ".align" << endl;
190
+ exit(1);
191
+ }
192
+
193
+ fwrite( &m_size, sizeof(INDEX), 1, pFile );
194
+ fwrite( m_array, sizeof(int), m_size*2, pFile ); // corpus
195
+
196
+ fwrite( &m_sentenceCount, sizeof(INDEX), 1, pFile );
197
+ fwrite( m_sentenceEnd, sizeof(INDEX), m_sentenceCount, pFile); // sentence index
198
+ fclose( pFile );
199
+ }
200
+
201
+ void Alignment::Load(const string& fileName )
202
+ {
203
+ FILE *pFile = fopen ( (fileName + ".align").c_str() , "r" );
204
+ if (pFile == NULL) {
205
+ cerr << "no such file or directory: " << fileName << ".align" << endl;
206
+ exit(1);
207
+ }
208
+
209
+ cerr << "loading from " << fileName << ".align" << endl;
210
+
211
+ fread( &m_size, sizeof(INDEX), 1, pFile );
212
+ cerr << "alignment points in corpus: " << m_size << endl;
213
+ m_array = (int*) calloc( sizeof(int), m_size*2 );
214
+ fread( m_array, sizeof(int), m_size*2, pFile ); // corpus
215
+
216
+ fread( &m_sentenceCount, sizeof(INDEX), 1, pFile );
217
+ cerr << "sentences in corpus: " << m_sentenceCount << endl;
218
+ m_sentenceEnd = (INDEX*) calloc( sizeof(INDEX), m_sentenceCount );
219
+ fread( m_sentenceEnd, sizeof(INDEX), m_sentenceCount, pFile); // sentence index
220
+ fclose( pFile );
221
+ cerr << "done loading\n";
222
+ }
mosesdecoder/biconcor/Alignment.h ADDED
@@ -0,0 +1,47 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #pragma once
2
+
3
+ #include "Vocabulary.h"
4
+
5
// Word-alignment store for a parallel corpus.
//
// Alignment points are kept in one flat array of ints: entry 2*k is the source
// word position and entry 2*k+1 the target word position of point k (see
// GetSourceWord / GetTargetWord). m_sentenceEnd[s] is the array index of the
// source entry of the LAST alignment point of sentence s (inclusive), which is
// why the next sentence starts two entries later.
class Alignment
{
public:
  typedef unsigned int INDEX;

private:
  int *m_array;            // interleaved (source, target) word positions
  INDEX *m_sentenceEnd;    // per sentence: index of its last point's source entry
  INDEX m_size;            // number of alignment points; m_array holds m_size*2 ints
  INDEX m_sentenceCount;   // number of entries in m_sentenceEnd
  char m_unaligned[ 256 ]; // here for speed (local to PhraseAlignment)

  // No copying allowed.
  Alignment(const Alignment&);
  void operator=(const Alignment&);

public:
  Alignment();
  ~Alignment();

  void Create(const std::string& fileName );
  bool PhraseAlignment( INDEX sentence, int target_length,
                        int source_start, int source_end,
                        int &target_start, int &target_end,
                        int &pre_null, int &post_null );
  void Load(const std::string& fileName );
  void Save(const std::string& fileName ) const;
  std::vector<std::string> Tokenize( const char input[] );

  // Index into m_array of the first alignment point of the given sentence.
  INDEX GetSentenceStart( INDEX sentence ) const {
    if (sentence == 0) return 0;
    return m_sentenceEnd[ sentence-1 ] + 2;
  }

  // Number of alignment points of the given sentence.
  // m_sentenceEnd is inclusive, so the count is (end - start)/2 + 1. Adding
  // the 2 before subtracting keeps the unsigned arithmetic correct for a
  // sentence without any points, where end == start - 2 and the result is 0.
  INDEX GetNumberOfAlignmentPoints( INDEX sentence ) const {
    return ( m_sentenceEnd[ sentence ] + 2 - GetSentenceStart( sentence ) ) / 2;
  }

  // Source word position of the alignment_point-th point of the sentence.
  int GetSourceWord( INDEX sentence, INDEX alignment_point ) const {
    return m_array[ GetSentenceStart( sentence ) + alignment_point*2 ];
  }

  // Target word position of the alignment_point-th point of the sentence.
  int GetTargetWord( INDEX sentence, INDEX alignment_point ) const {
    return m_array[ GetSentenceStart( sentence ) + alignment_point*2 + 1 ];
  }
};
mosesdecoder/biconcor/CMakeLists.txt ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ project(biconcor)
2
+
3
+ FILE(GLOB biconcor_source *.cpp)
4
+
5
+ add_executable(biconcor ${biconcor_source})
mosesdecoder/biconcor/Jamfile ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ exe biconcor : Vocabulary.cpp SuffixArray.cpp TargetCorpus.cpp Alignment.cpp Mismatch.cpp PhrasePair.cpp PhrasePairCollection.cpp biconcor.cpp base64.cpp ;
2
+ exe phrase-lookup : Vocabulary.cpp SuffixArray.cpp phrase-lookup.cpp ;
mosesdecoder/biconcor/Mismatch.cpp ADDED
@@ -0,0 +1,292 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #include "Mismatch.h"
2
+
3
+ #include <fstream>
4
+ #include <iostream>
5
+ #include <cstring>
6
+ #include <string>
7
+ #include <cstdlib>
8
+
9
+ #include "SuffixArray.h"
10
+ #include "TargetCorpus.h"
11
+ #include "Alignment.h"
12
+ #include "Vocabulary.h"
13
+
14
+ using namespace std;
15
+
16
// Annotation labels assigned to source/target words while rendering a mismatch.
// The numeric values are significant: PrintClippedHTML uses them as indexes
// into its label_class table, so the order below must not change.
enum {
  UNANNOTATED,   // 0: word carries no label
  PRE_ALIGNED,   // 1
  POST_ALIGNED,  // 2
  UNALIGNED,     // 3
  MISALIGNED,    // 4
  ALIGNED        // 5
};
24
+
25
+ Mismatch::Mismatch( SuffixArray *sa, TargetCorpus *tc, Alignment *a, INDEX sentence_id, INDEX position, int source_length, int target_length, int source_start, int source_end )
26
+ :m_suffixArray(sa)
27
+ ,m_targetCorpus(tc)
28
+ ,m_alignment(a)
29
+ ,m_sentence_id(sentence_id)
30
+ ,m_source_length(source_length)
31
+ ,m_target_length(target_length)
32
+ ,m_source_position(position)
33
+ ,m_source_start(source_start)
34
+ ,m_source_end(source_end)
35
+ ,m_unaligned(true)
36
+ {
37
+ // initialize unaligned indexes
38
+ for (int i = 0; i < m_source_length; i++) {
39
+ m_source_unaligned[i] = true;
40
+ }
41
+ for (int i = 0; i < m_target_length; i++) {
42
+ m_target_unaligned[i] = true;
43
+ }
44
+ m_num_alignment_points =
45
+ m_alignment->GetNumberOfAlignmentPoints( sentence_id );
46
+ for(INDEX ap=0; ap<m_num_alignment_points; ap++) {
47
+ m_source_unaligned[ (int)m_alignment->GetSourceWord( sentence_id, ap ) ] = false;
48
+ m_target_unaligned[ (int)m_alignment->GetTargetWord( sentence_id, ap ) ] = false;
49
+ }
50
+ for(int i = source_start; i <= source_end; i++) {
51
+ if (!m_source_unaligned[ i ]) {
52
+ m_unaligned = false;
53
+ }
54
+ }
55
+ }
56
+
57
+ Mismatch::~Mismatch () {}
58
+
59
+ void Mismatch::PrintClippedHTML( ostream* out, int width )
60
+ {
61
+ int source_annotation[256], target_annotation[256];
62
+ vector< string > label_class;
63
+ label_class.push_back( "" );
64
+ label_class.push_back( "mismatch_pre_aligned" );
65
+ label_class.push_back( "mismatch_post_aligned" );
66
+ label_class.push_back( "null_aligned" );
67
+ label_class.push_back( "mismatch_misaligned" );
68
+ label_class.push_back( "mismatch_aligned" );
69
+
70
+ for(int i=0; i<m_source_length; i++) source_annotation[i] = UNANNOTATED;
71
+ for(int i=0; i<m_target_length; i++) target_annotation[i] = UNANNOTATED;
72
+
73
+ if (m_unaligned) {
74
+ // find alignment points for prior and next word(s) and
75
+ // center target phrase around those.
76
+ bool found_aligned = false;
77
+ for(int i=1; i<m_source_length && !found_aligned; i++) {
78
+ if (m_source_start-i >= 0) {
79
+ int word_id = m_source_start-i;
80
+ source_annotation[ word_id ] = UNALIGNED;
81
+ if (!m_source_unaligned[ word_id ]) {
82
+ found_aligned = true;
83
+ LabelSourceMatches( source_annotation, target_annotation, word_id, PRE_ALIGNED );
84
+ }
85
+ }
86
+
87
+ if (m_source_end+i < m_source_length) {
88
+ int word_id = m_source_end+i;
89
+ source_annotation[ word_id ] = UNALIGNED;
90
+ if (!m_source_unaligned[ word_id ]) {
91
+ found_aligned = true;
92
+ LabelSourceMatches( source_annotation, target_annotation, word_id, POST_ALIGNED );
93
+ }
94
+ }
95
+ }
96
+
97
+ }
98
+ // misalignment
99
+ else {
100
+ // label aligned output words
101
+ for(int i=m_source_start; i<=m_source_end; i++)
102
+ LabelSourceMatches( source_annotation, target_annotation, i, ALIGNED );
103
+
104
+ // find first and last
105
+ int target_start = -1;
106
+ int target_end = -1;
107
+ for(int i=0; i<m_target_length; i++)
108
+ if (target_annotation[i] == ALIGNED) {
109
+ if (target_start == -1)
110
+ target_start = i;
111
+ target_end = i;
112
+ }
113
+ // go over all enclosed target words
114
+ for(int i=target_start; i<=target_end; i++) {
115
+ // label other target words as unaligned or misaligned
116
+ if (m_target_unaligned[ i ])
117
+ target_annotation[ i ] = UNALIGNED;
118
+ else {
119
+ if (target_annotation[ i ] != ALIGNED)
120
+ target_annotation[ i ] = MISALIGNED;
121
+ // loop over aligned source words
122
+ for(INDEX ap=0; ap<m_num_alignment_points; ap++) {
123
+ if (m_alignment->GetTargetWord( m_sentence_id, ap ) == i) {
124
+ int source_word = m_alignment->GetSourceWord( m_sentence_id, ap );
125
+ // if not part of the source phrase -> also misaligned
126
+ if (source_word < m_source_start || source_word > m_source_end)
127
+ source_annotation[ source_word ] = MISALIGNED;
128
+ }
129
+ }
130
+ }
131
+ }
132
+ // closure
133
+ bool change = true;
134
+ while(change) {
135
+ change = false;
136
+ for(INDEX ap=0; ap<m_num_alignment_points; ap++) {
137
+ int source_word = m_alignment->GetSourceWord( m_sentence_id, ap );
138
+ int target_word = m_alignment->GetTargetWord( m_sentence_id, ap );
139
+ if (source_annotation[source_word] != UNANNOTATED &&
140
+ target_annotation[target_word] == UNANNOTATED) {
141
+ target_annotation[target_word] = MISALIGNED;
142
+ change = true;
143
+ }
144
+ if (source_annotation[source_word] == UNANNOTATED &&
145
+ target_annotation[target_word] != UNANNOTATED) {
146
+ source_annotation[source_word] = MISALIGNED;
147
+ change = true;
148
+ }
149
+ }
150
+ }
151
+ }
152
+
153
+ // print source
154
+ // shorten source context if too long
155
+ int sentence_start = m_source_position - m_source_start;
156
+ int context_space = width/2;
157
+ for(int i=m_source_start; i<=m_source_end; i++)
158
+ context_space -= m_suffixArray->GetWord( sentence_start + i ).size() + 1;
159
+ context_space /= 2;
160
+
161
+ int remaining = context_space;
162
+ int start_word = m_source_start;
163
+ for(; start_word>0 && remaining>0; start_word--)
164
+ remaining -= m_suffixArray->GetWord( sentence_start + start_word-1 ).size() + 1;
165
+ if (remaining<0 || start_word == -1) start_word++;
166
+
167
+ remaining = context_space;
168
+ int end_word = m_source_end;
169
+ for(; end_word<m_source_length && remaining>0; end_word++)
170
+ remaining -= m_suffixArray->GetWord( sentence_start + end_word ).size() + 1;
171
+ end_word--;
172
+
173
+ // output with markup
174
+ *out << "<tr><td class=\"pp_source_left\">";
175
+ char current_label = UNANNOTATED;
176
+ if (start_word>0) {
177
+ current_label = source_annotation[start_word-1];
178
+ *out << "... ";
179
+ }
180
+ for(int i=start_word; i<=end_word; i++) {
181
+ // change to phrase block
182
+ if (i == m_source_start) {
183
+ if (current_label != UNANNOTATED && i!=start_word)
184
+ *out << "</span>";
185
+ *out << "</td><td class=\"pp_source\">";
186
+ current_label = UNANNOTATED;
187
+ }
188
+
189
+ // change to labeled word
190
+ else if (source_annotation[i] != current_label &&
191
+ source_annotation[i] != ALIGNED) {
192
+ if (current_label != UNANNOTATED && i!=start_word)
193
+ *out << "</span>";
194
+ if (source_annotation[i] != UNANNOTATED)
195
+ *out << "<span class=\""
196
+ << label_class[ source_annotation[i] ]
197
+ << "\">";
198
+ current_label = source_annotation[i];
199
+ }
200
+
201
+ // output word
202
+ *out << m_suffixArray->GetWord( sentence_start + i ) << " ";
203
+
204
+ // change to right context block
205
+ if (i == m_source_end) {
206
+ *out << "</td><td class=\"pp_source_right\">";
207
+ current_label = UNANNOTATED;
208
+ }
209
+ }
210
+
211
+ if (current_label != UNANNOTATED && end_word>m_source_end)
212
+ *out << "</span>";
213
+ if (end_word<m_source_length-1)
214
+ *out << "... ";
215
+
216
+ // print target
217
+ // shorten target context if too long
218
+ int target_start = -1;
219
+ int target_end=0;
220
+ for(int i=0; i<m_target_length; i++)
221
+ if (target_annotation[i] != UNANNOTATED) {
222
+ if (target_start == -1)
223
+ target_start = i;
224
+ target_end = i;
225
+ }
226
+
227
+ context_space = width/2;
228
+ for(int i=target_start; i<=target_end; i++)
229
+ context_space -= m_targetCorpus->GetWord( m_sentence_id, i ).size() + 1;
230
+ while (context_space < 0) { // shorten matched part, if too long
231
+ context_space +=
232
+ m_targetCorpus->GetWord( m_sentence_id, target_start ).size() +
233
+ m_targetCorpus->GetWord( m_sentence_id, target_end ).size() + 2;
234
+ target_start++;
235
+ target_end--;
236
+ }
237
+ context_space /= 2;
238
+
239
+ remaining = context_space;
240
+ start_word = target_start;
241
+ for(; start_word>0 && remaining>0; start_word--) {
242
+ //cerr << "remaining: " << remaining << ", start_word: " << start_word << endl;
243
+ remaining -= m_targetCorpus->GetWord( m_sentence_id, start_word-1 ).size() + 1;
244
+ }
245
+ if (remaining<0 || start_word == -1) start_word++;
246
+
247
+ remaining = context_space;
248
+ end_word = target_end;
249
+ for(; end_word<m_target_length && remaining>0; end_word++) {
250
+ //cerr << "remaining: " << remaining << ", end_word: " << end_word << endl;
251
+ remaining -= m_targetCorpus->GetWord( m_sentence_id, end_word ).size() + 1;
252
+ }
253
+ end_word--;
254
+
255
+ // output with markup
256
+ *out << "</td><td class=\"mismatch_target\">";
257
+ current_label = UNANNOTATED;
258
+ if (start_word>0) {
259
+ current_label = target_annotation[start_word-1];
260
+ *out << "... ";
261
+ }
262
+ for(int i=start_word; i<=end_word; i++) {
263
+ if (target_annotation[i] != current_label) {
264
+ if (current_label != UNANNOTATED && i!=start_word)
265
+ *out << "</span>";
266
+ if (target_annotation[i] != UNANNOTATED)
267
+ *out << "<span class=\""
268
+ << label_class[ target_annotation[i] ]
269
+ << "\">";
270
+ current_label = target_annotation[i];
271
+ }
272
+
273
+ // output word
274
+ *out << m_targetCorpus->GetWord( m_sentence_id, i ) << " ";
275
+ }
276
+
277
+ if (current_label != UNANNOTATED && end_word>target_end)
278
+ *out << "</span>";
279
+ if (end_word<m_target_length-1)
280
+ *out << "... ";
281
+ *out << "</td></tr>";
282
+ }
283
+
284
+ void Mismatch::LabelSourceMatches(int *source_annotation, int *target_annotation, int source_id, int label )
285
+ {
286
+ for(INDEX ap=0; ap<m_num_alignment_points; ap++) {
287
+ if (m_alignment->GetSourceWord( m_sentence_id, ap ) == source_id) {
288
+ source_annotation[ source_id ] = label;
289
+ target_annotation[ m_alignment->GetTargetWord( m_sentence_id, ap ) ] = label;
290
+ }
291
+ }
292
+ }
mosesdecoder/biconcor/Mismatch.h ADDED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #pragma once
2
+
3
+ #include <iosfwd>
4
+
5
// Only pointers to these types are stored, so forward declarations suffice.
class SuffixArray;
class TargetCorpus;
class Alignment;

// One occurrence of a source phrase together with the alignment flags needed
// to render it as an HTML table row (PrintClippedHTML).
class Mismatch
{
public:
  typedef unsigned int INDEX;

  Mismatch( SuffixArray *sa, TargetCorpus *tc, Alignment *a, INDEX sentence_id, INDEX position, int source_length, int target_length, int source_start, int source_end );
  ~Mismatch();

  // True when no word of the source phrase is aligned.
  bool Unaligned() const { return m_unaligned; }

  // Writes the occurrence as a <tr> element to *out.
  void PrintClippedHTML(std::ostream* out, int width );

  // Labels a source word and the target words aligned to it.
  void LabelSourceMatches(int *source_annotation, int *target_annotation, int source_id, int label );

private:
  // Corpus components; stored as plain pointers.
  SuffixArray *m_suffixArray;
  TargetCorpus *m_targetCorpus;
  Alignment *m_alignment;

  INDEX m_sentence_id;
  INDEX m_num_alignment_points;
  int m_source_length;
  int m_target_length;
  INDEX m_source_position;
  int m_source_start;               // first word of the phrase (inclusive)
  int m_source_end;                 // last word of the phrase (inclusive)
  bool m_source_unaligned[ 256 ];   // per source word: true if it has no alignment point
  bool m_target_unaligned[ 256 ];   // per target word: true if it has no alignment point
  bool m_unaligned;

  // No copying allowed.
  Mismatch(const Mismatch&);
  void operator=(const Mismatch&);
};