Upload folder using huggingface_hub
Browse files. This view is limited to 50 files because it contains too many changes. See raw diff.
- .gitattributes +163 -0
- mosesdecoder/.beautify-ignore +38 -0
- mosesdecoder/.gitignore +90 -0
- mosesdecoder/.gitmodules +9 -0
- mosesdecoder/.travis.yml +24 -0
- mosesdecoder/COPYING +460 -0
- mosesdecoder/Jamroot +345 -0
- mosesdecoder/OnDiskPt/Jamfile +5 -0
- mosesdecoder/OnDiskPt/Main.cpp +273 -0
- mosesdecoder/OnDiskPt/Main.h +39 -0
- mosesdecoder/OnDiskPt/OnDiskQuery.cpp +83 -0
- mosesdecoder/OnDiskPt/OnDiskQuery.h +39 -0
- mosesdecoder/OnDiskPt/OnDiskWrapper.cpp +223 -0
- mosesdecoder/OnDiskPt/OnDiskWrapper.h +111 -0
- mosesdecoder/OnDiskPt/Phrase.cpp +108 -0
- mosesdecoder/OnDiskPt/Phrase.h +66 -0
- mosesdecoder/OnDiskPt/PhraseNode.cpp +268 -0
- mosesdecoder/OnDiskPt/PhraseNode.h +108 -0
- mosesdecoder/OnDiskPt/SourcePhrase.cpp +27 -0
- mosesdecoder/OnDiskPt/SourcePhrase.h +38 -0
- mosesdecoder/OnDiskPt/TargetPhrase.cpp +402 -0
- mosesdecoder/OnDiskPt/TargetPhrase.h +127 -0
- mosesdecoder/OnDiskPt/TargetPhraseCollection.cpp +171 -0
- mosesdecoder/OnDiskPt/TargetPhraseCollection.h +84 -0
- mosesdecoder/OnDiskPt/Vocab.cpp +101 -0
- mosesdecoder/OnDiskPt/Vocab.h +58 -0
- mosesdecoder/OnDiskPt/Word.cpp +144 -0
- mosesdecoder/OnDiskPt/Word.h +91 -0
- mosesdecoder/OnDiskPt/bin/gcc-9/release/link-static/threading-multi/CreateOnDiskPt +3 -0
- mosesdecoder/OnDiskPt/bin/gcc-9/release/link-static/threading-multi/Main.o +0 -0
- mosesdecoder/OnDiskPt/bin/gcc-9/release/link-static/threading-multi/OnDiskQuery.o +0 -0
- mosesdecoder/OnDiskPt/bin/gcc-9/release/link-static/threading-multi/OnDiskWrapper.o +0 -0
- mosesdecoder/OnDiskPt/bin/gcc-9/release/link-static/threading-multi/Phrase.o +0 -0
- mosesdecoder/OnDiskPt/bin/gcc-9/release/link-static/threading-multi/PhraseNode.o +0 -0
- mosesdecoder/OnDiskPt/bin/gcc-9/release/link-static/threading-multi/SourcePhrase.o +0 -0
- mosesdecoder/OnDiskPt/bin/gcc-9/release/link-static/threading-multi/TargetPhrase.o +0 -0
- mosesdecoder/OnDiskPt/bin/gcc-9/release/link-static/threading-multi/TargetPhraseCollection.o +0 -0
- mosesdecoder/OnDiskPt/bin/gcc-9/release/link-static/threading-multi/Vocab.o +0 -0
- mosesdecoder/OnDiskPt/bin/gcc-9/release/link-static/threading-multi/Word.o +0 -0
- mosesdecoder/OnDiskPt/bin/gcc-9/release/link-static/threading-multi/queryOnDiskPt +0 -0
- mosesdecoder/OnDiskPt/bin/gcc-9/release/link-static/threading-multi/queryOnDiskPt.o +0 -0
- mosesdecoder/OnDiskPt/queryOnDiskPt.cpp +86 -0
- mosesdecoder/README +14 -0
- mosesdecoder/azure-pipelines.yml +100 -0
- mosesdecoder/biconcor/Alignment.cpp +222 -0
- mosesdecoder/biconcor/Alignment.h +47 -0
- mosesdecoder/biconcor/CMakeLists.txt +5 -0
- mosesdecoder/biconcor/Jamfile +2 -0
- mosesdecoder/biconcor/Mismatch.cpp +292 -0
- mosesdecoder/biconcor/Mismatch.h +42 -0
.gitattributes
CHANGED
|
@@ -36,3 +36,166 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
| 36 |
HiSd/data.blm.sd filter=lfs diff=lfs merge=lfs -text
|
| 37 |
HiSd/phrase-table.minphr filter=lfs diff=lfs merge=lfs -text
|
| 38 |
HiSd/reordering-table.minlexr filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 36 |
HiSd/data.blm.sd filter=lfs diff=lfs merge=lfs -text
|
| 37 |
HiSd/phrase-table.minphr filter=lfs diff=lfs merge=lfs -text
|
| 38 |
HiSd/reordering-table.minlexr filter=lfs diff=lfs merge=lfs -text
|
| 39 |
+
mosesdecoder/bin/build_binary filter=lfs diff=lfs merge=lfs -text
|
| 40 |
+
mosesdecoder/bin/consolidate filter=lfs diff=lfs merge=lfs -text
|
| 41 |
+
mosesdecoder/bin/consolidate-direct filter=lfs diff=lfs merge=lfs -text
|
| 42 |
+
mosesdecoder/bin/consolidate-reverse filter=lfs diff=lfs merge=lfs -text
|
| 43 |
+
mosesdecoder/bin/CreateOnDiskPt filter=lfs diff=lfs merge=lfs -text
|
| 44 |
+
mosesdecoder/bin/CreateProbingPT filter=lfs diff=lfs merge=lfs -text
|
| 45 |
+
mosesdecoder/bin/dump_counts filter=lfs diff=lfs merge=lfs -text
|
| 46 |
+
mosesdecoder/bin/evaluator filter=lfs diff=lfs merge=lfs -text
|
| 47 |
+
mosesdecoder/bin/extract filter=lfs diff=lfs merge=lfs -text
|
| 48 |
+
mosesdecoder/bin/extract-ghkm filter=lfs diff=lfs merge=lfs -text
|
| 49 |
+
mosesdecoder/bin/extract-lex filter=lfs diff=lfs merge=lfs -text
|
| 50 |
+
mosesdecoder/bin/extract-mixed-syntax filter=lfs diff=lfs merge=lfs -text
|
| 51 |
+
mosesdecoder/bin/extract-rules filter=lfs diff=lfs merge=lfs -text
|
| 52 |
+
mosesdecoder/bin/extractor filter=lfs diff=lfs merge=lfs -text
|
| 53 |
+
mosesdecoder/bin/filter filter=lfs diff=lfs merge=lfs -text
|
| 54 |
+
mosesdecoder/bin/filter-rule-table filter=lfs diff=lfs merge=lfs -text
|
| 55 |
+
mosesdecoder/bin/fragment filter=lfs diff=lfs merge=lfs -text
|
| 56 |
+
mosesdecoder/bin/gcc-9/debug/empty_test_static filter=lfs diff=lfs merge=lfs -text
|
| 57 |
+
mosesdecoder/bin/hgdecode filter=lfs diff=lfs merge=lfs -text
|
| 58 |
+
mosesdecoder/bin/kbmira filter=lfs diff=lfs merge=lfs -text
|
| 59 |
+
mosesdecoder/bin/kenlm_benchmark filter=lfs diff=lfs merge=lfs -text
|
| 60 |
+
mosesdecoder/bin/lexical-reordering-score filter=lfs diff=lfs merge=lfs -text
|
| 61 |
+
mosesdecoder/bin/lmbrgrid filter=lfs diff=lfs merge=lfs -text
|
| 62 |
+
mosesdecoder/bin/lmplz filter=lfs diff=lfs merge=lfs -text
|
| 63 |
+
mosesdecoder/bin/merge-sorted filter=lfs diff=lfs merge=lfs -text
|
| 64 |
+
mosesdecoder/bin/mert filter=lfs diff=lfs merge=lfs -text
|
| 65 |
+
mosesdecoder/bin/moses filter=lfs diff=lfs merge=lfs -text
|
| 66 |
+
mosesdecoder/bin/moses2 filter=lfs diff=lfs merge=lfs -text
|
| 67 |
+
mosesdecoder/bin/mosesserver filter=lfs diff=lfs merge=lfs -text
|
| 68 |
+
mosesdecoder/bin/postprocess-egret-forests filter=lfs diff=lfs merge=lfs -text
|
| 69 |
+
mosesdecoder/bin/prepare-expected-bleu-training filter=lfs diff=lfs merge=lfs -text
|
| 70 |
+
mosesdecoder/bin/pro filter=lfs diff=lfs merge=lfs -text
|
| 71 |
+
mosesdecoder/bin/processLexicalTable filter=lfs diff=lfs merge=lfs -text
|
| 72 |
+
mosesdecoder/bin/processLexicalTableMin filter=lfs diff=lfs merge=lfs -text
|
| 73 |
+
mosesdecoder/bin/processPhraseTableMin filter=lfs diff=lfs merge=lfs -text
|
| 74 |
+
mosesdecoder/bin/prunePhraseTable filter=lfs diff=lfs merge=lfs -text
|
| 75 |
+
mosesdecoder/bin/query filter=lfs diff=lfs merge=lfs -text
|
| 76 |
+
mosesdecoder/bin/queryLexicalTable filter=lfs diff=lfs merge=lfs -text
|
| 77 |
+
mosesdecoder/bin/queryPhraseTableMin filter=lfs diff=lfs merge=lfs -text
|
| 78 |
+
mosesdecoder/bin/relax-parse filter=lfs diff=lfs merge=lfs -text
|
| 79 |
+
mosesdecoder/bin/score filter=lfs diff=lfs merge=lfs -text
|
| 80 |
+
mosesdecoder/bin/score-stsg filter=lfs diff=lfs merge=lfs -text
|
| 81 |
+
mosesdecoder/bin/sentence-bleu-nbest filter=lfs diff=lfs merge=lfs -text
|
| 82 |
+
mosesdecoder/bin/statistics filter=lfs diff=lfs merge=lfs -text
|
| 83 |
+
mosesdecoder/bin/train-expected-bleu filter=lfs diff=lfs merge=lfs -text
|
| 84 |
+
mosesdecoder/bin/vwtrainer filter=lfs diff=lfs merge=lfs -text
|
| 85 |
+
mosesdecoder/cmph-2.0.2/lib/libcmph.a filter=lfs diff=lfs merge=lfs -text
|
| 86 |
+
mosesdecoder/cmph-2.0.2/src/.libs/libcmph.a filter=lfs diff=lfs merge=lfs -text
|
| 87 |
+
mosesdecoder/contrib/expected-bleu-training/bin/gcc-9/release/link-static/threading-multi/prepare-expected-bleu-training filter=lfs diff=lfs merge=lfs -text
|
| 88 |
+
mosesdecoder/contrib/expected-bleu-training/bin/gcc-9/release/link-static/threading-multi/train-expected-bleu filter=lfs diff=lfs merge=lfs -text
|
| 89 |
+
mosesdecoder/contrib/promix/test_data/esen.ep.model.filtered/phrase-table.0-0.1.1 filter=lfs diff=lfs merge=lfs -text
|
| 90 |
+
mosesdecoder/contrib/promix/test_data/esen.ep.model.filtered/phrase-table.0-0.1.1.binphr.tgtdata.wa filter=lfs diff=lfs merge=lfs -text
|
| 91 |
+
mosesdecoder/contrib/server/bin/gcc-9/release/link-static/threading-multi/mosesserver filter=lfs diff=lfs merge=lfs -text
|
| 92 |
+
mosesdecoder/lib/libmert_lib.a filter=lfs diff=lfs merge=lfs -text
|
| 93 |
+
mosesdecoder/lib/libmoses.a filter=lfs diff=lfs merge=lfs -text
|
| 94 |
+
mosesdecoder/lib/libmoses2_lib.a filter=lfs diff=lfs merge=lfs -text
|
| 95 |
+
mosesdecoder/lib/libmoses2decoder.a filter=lfs diff=lfs merge=lfs -text
|
| 96 |
+
mosesdecoder/lib/libprobingpt.a filter=lfs diff=lfs merge=lfs -text
|
| 97 |
+
mosesdecoder/lib/libsyntax_common.a filter=lfs diff=lfs merge=lfs -text
|
| 98 |
+
mosesdecoder/lm/bin/gcc-9/release/link-static/threading-multi/build_binary filter=lfs diff=lfs merge=lfs -text
|
| 99 |
+
mosesdecoder/lm/bin/gcc-9/release/link-static/threading-multi/fragment filter=lfs diff=lfs merge=lfs -text
|
| 100 |
+
mosesdecoder/lm/bin/gcc-9/release/link-static/threading-multi/kenlm_benchmark filter=lfs diff=lfs merge=lfs -text
|
| 101 |
+
mosesdecoder/lm/bin/gcc-9/release/link-static/threading-multi/query filter=lfs diff=lfs merge=lfs -text
|
| 102 |
+
mosesdecoder/lm/bin/left_test.test/gcc-9/release/link-static/threading-multi/left_test filter=lfs diff=lfs merge=lfs -text
|
| 103 |
+
mosesdecoder/lm/bin/model_test.test/gcc-9/release/link-static/threading-multi/model_test filter=lfs diff=lfs merge=lfs -text
|
| 104 |
+
mosesdecoder/lm/bin/model_test.test/gcc-9/release/link-static/threading-multi/model_test.o filter=lfs diff=lfs merge=lfs -text
|
| 105 |
+
mosesdecoder/lm/bin/partial_test.test/gcc-9/release/link-static/threading-multi/partial_test filter=lfs diff=lfs merge=lfs -text
|
| 106 |
+
mosesdecoder/lm/builder/bin/gcc-9/release/link-static/threading-multi/dump_counts filter=lfs diff=lfs merge=lfs -text
|
| 107 |
+
mosesdecoder/lm/builder/bin/gcc-9/release/link-static/threading-multi/lmplz filter=lfs diff=lfs merge=lfs -text
|
| 108 |
+
mosesdecoder/lm/filter/bin/gcc-9/release/link-static/threading-multi/filter filter=lfs diff=lfs merge=lfs -text
|
| 109 |
+
mosesdecoder/mert/bin/gcc-9/release/link-static/threading-multi/bleu_scorer_test filter=lfs diff=lfs merge=lfs -text
|
| 110 |
+
mosesdecoder/mert/bin/gcc-9/release/link-static/threading-multi/data_test filter=lfs diff=lfs merge=lfs -text
|
| 111 |
+
mosesdecoder/mert/bin/gcc-9/release/link-static/threading-multi/evaluator filter=lfs diff=lfs merge=lfs -text
|
| 112 |
+
mosesdecoder/mert/bin/gcc-9/release/link-static/threading-multi/extractor filter=lfs diff=lfs merge=lfs -text
|
| 113 |
+
mosesdecoder/mert/bin/gcc-9/release/link-static/threading-multi/feature_data_test filter=lfs diff=lfs merge=lfs -text
|
| 114 |
+
mosesdecoder/mert/bin/gcc-9/release/link-static/threading-multi/forest_rescore_test filter=lfs diff=lfs merge=lfs -text
|
| 115 |
+
mosesdecoder/mert/bin/gcc-9/release/link-static/threading-multi/hgdecode filter=lfs diff=lfs merge=lfs -text
|
| 116 |
+
mosesdecoder/mert/bin/gcc-9/release/link-static/threading-multi/hypergraph_test filter=lfs diff=lfs merge=lfs -text
|
| 117 |
+
mosesdecoder/mert/bin/gcc-9/release/link-static/threading-multi/kbmira filter=lfs diff=lfs merge=lfs -text
|
| 118 |
+
mosesdecoder/mert/bin/gcc-9/release/link-static/threading-multi/libmert_lib.a filter=lfs diff=lfs merge=lfs -text
|
| 119 |
+
mosesdecoder/mert/bin/gcc-9/release/link-static/threading-multi/mert filter=lfs diff=lfs merge=lfs -text
|
| 120 |
+
mosesdecoder/mert/bin/gcc-9/release/link-static/threading-multi/mira_feature_vector_test filter=lfs diff=lfs merge=lfs -text
|
| 121 |
+
mosesdecoder/mert/bin/gcc-9/release/link-static/threading-multi/ngram_test filter=lfs diff=lfs merge=lfs -text
|
| 122 |
+
mosesdecoder/mert/bin/gcc-9/release/link-static/threading-multi/optimizer_factory_test filter=lfs diff=lfs merge=lfs -text
|
| 123 |
+
mosesdecoder/mert/bin/gcc-9/release/link-static/threading-multi/point_test filter=lfs diff=lfs merge=lfs -text
|
| 124 |
+
mosesdecoder/mert/bin/gcc-9/release/link-static/threading-multi/pro filter=lfs diff=lfs merge=lfs -text
|
| 125 |
+
mosesdecoder/mert/bin/gcc-9/release/link-static/threading-multi/reference_test filter=lfs diff=lfs merge=lfs -text
|
| 126 |
+
mosesdecoder/mert/bin/gcc-9/release/link-static/threading-multi/sentence-bleu-nbest filter=lfs diff=lfs merge=lfs -text
|
| 127 |
+
mosesdecoder/mert/bin/gcc-9/release/link-static/threading-multi/singleton_test filter=lfs diff=lfs merge=lfs -text
|
| 128 |
+
mosesdecoder/mert/bin/gcc-9/release/link-static/threading-multi/TER/tools.o filter=lfs diff=lfs merge=lfs -text
|
| 129 |
+
mosesdecoder/mert/bin/gcc-9/release/link-static/threading-multi/timer_test filter=lfs diff=lfs merge=lfs -text
|
| 130 |
+
mosesdecoder/mert/bin/gcc-9/release/link-static/threading-multi/util_test filter=lfs diff=lfs merge=lfs -text
|
| 131 |
+
mosesdecoder/mert/bin/gcc-9/release/link-static/threading-multi/vocabulary_test filter=lfs diff=lfs merge=lfs -text
|
| 132 |
+
mosesdecoder/mert/evaluator filter=lfs diff=lfs merge=lfs -text
|
| 133 |
+
mosesdecoder/mert/extractor filter=lfs diff=lfs merge=lfs -text
|
| 134 |
+
mosesdecoder/mert/hgdecode filter=lfs diff=lfs merge=lfs -text
|
| 135 |
+
mosesdecoder/mert/kbmira filter=lfs diff=lfs merge=lfs -text
|
| 136 |
+
mosesdecoder/mert/mert filter=lfs diff=lfs merge=lfs -text
|
| 137 |
+
mosesdecoder/mert/pro filter=lfs diff=lfs merge=lfs -text
|
| 138 |
+
mosesdecoder/mert/sentence-bleu-nbest filter=lfs diff=lfs merge=lfs -text
|
| 139 |
+
mosesdecoder/misc/bin/gcc-9/release/link-static/threading-multi/merge-sorted filter=lfs diff=lfs merge=lfs -text
|
| 140 |
+
mosesdecoder/misc/bin/gcc-9/release/link-static/threading-multi/processLexicalTable filter=lfs diff=lfs merge=lfs -text
|
| 141 |
+
mosesdecoder/misc/bin/gcc-9/release/link-static/threading-multi/processLexicalTableMin filter=lfs diff=lfs merge=lfs -text
|
| 142 |
+
mosesdecoder/misc/bin/gcc-9/release/link-static/threading-multi/processPhraseTableMin filter=lfs diff=lfs merge=lfs -text
|
| 143 |
+
mosesdecoder/misc/bin/gcc-9/release/link-static/threading-multi/prunePhraseTable filter=lfs diff=lfs merge=lfs -text
|
| 144 |
+
mosesdecoder/misc/bin/gcc-9/release/link-static/threading-multi/queryLexicalTable filter=lfs diff=lfs merge=lfs -text
|
| 145 |
+
mosesdecoder/misc/bin/gcc-9/release/link-static/threading-multi/queryPhraseTableMin filter=lfs diff=lfs merge=lfs -text
|
| 146 |
+
mosesdecoder/moses/bin/gcc-9/release/link-static/threading-multi/libmoses.a filter=lfs diff=lfs merge=lfs -text
|
| 147 |
+
mosesdecoder/moses/bin/gcc-9/release/link-static/threading-multi/moses_test filter=lfs diff=lfs merge=lfs -text
|
| 148 |
+
mosesdecoder/moses/LM/bin/BackwardTest.test/gcc-9/release/link-static/threading-multi/BackwardTest filter=lfs diff=lfs merge=lfs -text
|
| 149 |
+
mosesdecoder/moses/TranslationModel/UG/util/ibm1-align filter=lfs diff=lfs merge=lfs -text
|
| 150 |
+
mosesdecoder/moses-cmd/bin/gcc-9/release/link-static/threading-multi/lmbrgrid filter=lfs diff=lfs merge=lfs -text
|
| 151 |
+
mosesdecoder/moses-cmd/bin/gcc-9/release/link-static/threading-multi/moses filter=lfs diff=lfs merge=lfs -text
|
| 152 |
+
mosesdecoder/moses-cmd/bin/gcc-9/release/link-static/threading-multi/vwtrainer filter=lfs diff=lfs merge=lfs -text
|
| 153 |
+
mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/libmoses2_lib.a filter=lfs diff=lfs merge=lfs -text
|
| 154 |
+
mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/libmoses2decoder.a filter=lfs diff=lfs merge=lfs -text
|
| 155 |
+
mosesdecoder/moses2/bin/gcc-9/release/link-static/threading-multi/moses2 filter=lfs diff=lfs merge=lfs -text
|
| 156 |
+
mosesdecoder/OnDiskPt/bin/gcc-9/release/link-static/threading-multi/CreateOnDiskPt filter=lfs diff=lfs merge=lfs -text
|
| 157 |
+
mosesdecoder/phrase-extract/bin/gcc-9/release/link-static/threading-multi/consolidate filter=lfs diff=lfs merge=lfs -text
|
| 158 |
+
mosesdecoder/phrase-extract/bin/gcc-9/release/link-static/threading-multi/consolidate-direct filter=lfs diff=lfs merge=lfs -text
|
| 159 |
+
mosesdecoder/phrase-extract/bin/gcc-9/release/link-static/threading-multi/consolidate-reverse filter=lfs diff=lfs merge=lfs -text
|
| 160 |
+
mosesdecoder/phrase-extract/bin/gcc-9/release/link-static/threading-multi/extract filter=lfs diff=lfs merge=lfs -text
|
| 161 |
+
mosesdecoder/phrase-extract/bin/gcc-9/release/link-static/threading-multi/extract-lex filter=lfs diff=lfs merge=lfs -text
|
| 162 |
+
mosesdecoder/phrase-extract/bin/gcc-9/release/link-static/threading-multi/extract-rules filter=lfs diff=lfs merge=lfs -text
|
| 163 |
+
mosesdecoder/phrase-extract/bin/gcc-9/release/link-static/threading-multi/relax-parse filter=lfs diff=lfs merge=lfs -text
|
| 164 |
+
mosesdecoder/phrase-extract/bin/gcc-9/release/link-static/threading-multi/score filter=lfs diff=lfs merge=lfs -text
|
| 165 |
+
mosesdecoder/phrase-extract/bin/gcc-9/release/link-static/threading-multi/statistics filter=lfs diff=lfs merge=lfs -text
|
| 166 |
+
mosesdecoder/phrase-extract/bin/ScoreFeatureTest.test/gcc-9/release/link-static/threading-multi/ScoreFeatureTest filter=lfs diff=lfs merge=lfs -text
|
| 167 |
+
mosesdecoder/phrase-extract/extract-ghkm/bin/gcc-9/release/link-static/threading-multi/extract-ghkm filter=lfs diff=lfs merge=lfs -text
|
| 168 |
+
mosesdecoder/phrase-extract/extract-mixed-syntax/bin/gcc-9/release/link-static/threading-multi/extract-mixed-syntax filter=lfs diff=lfs merge=lfs -text
|
| 169 |
+
mosesdecoder/phrase-extract/filter-rule-table/bin/gcc-9/release/link-static/threading-multi/filter-rule-table filter=lfs diff=lfs merge=lfs -text
|
| 170 |
+
mosesdecoder/phrase-extract/lexical-reordering/bin/gcc-9/release/link-static/threading-multi/lexical-reordering-score filter=lfs diff=lfs merge=lfs -text
|
| 171 |
+
mosesdecoder/phrase-extract/postprocess-egret-forests/bin/gcc-9/release/link-static/threading-multi/postprocess-egret-forests filter=lfs diff=lfs merge=lfs -text
|
| 172 |
+
mosesdecoder/phrase-extract/score-stsg/bin/gcc-9/release/link-static/threading-multi/score-stsg filter=lfs diff=lfs merge=lfs -text
|
| 173 |
+
mosesdecoder/phrase-extract/syntax-common/bin/gcc-9/release/link-static/threading-multi/libsyntax_common.a filter=lfs diff=lfs merge=lfs -text
|
| 174 |
+
mosesdecoder/phrase-extract/syntax-common/bin/gcc-9/release/link-static/threading-multi/tree_fragment_tokenizer_test filter=lfs diff=lfs merge=lfs -text
|
| 175 |
+
mosesdecoder/phrase-extract/syntax-common/bin/gcc-9/release/link-static/threading-multi/tree_test filter=lfs diff=lfs merge=lfs -text
|
| 176 |
+
mosesdecoder/probingpt/bin/gcc-9/release/link-static/threading-multi/CreateProbingPT filter=lfs diff=lfs merge=lfs -text
|
| 177 |
+
mosesdecoder/probingpt/bin/gcc-9/release/link-static/threading-multi/libprobingpt.a filter=lfs diff=lfs merge=lfs -text
|
| 178 |
+
mosesdecoder/util/bin/file_piece_test.test/gcc-9/release/link-static/threading-multi/file_piece_test filter=lfs diff=lfs merge=lfs -text
|
| 179 |
+
mosesdecoder/util/bin/gcc-9/release/link-static/threading-multi/bit_packing_test filter=lfs diff=lfs merge=lfs -text
|
| 180 |
+
mosesdecoder/util/bin/gcc-9/release/link-static/threading-multi/integer_to_string_test filter=lfs diff=lfs merge=lfs -text
|
| 181 |
+
mosesdecoder/util/bin/gcc-9/release/link-static/threading-multi/joint_sort_test filter=lfs diff=lfs merge=lfs -text
|
| 182 |
+
mosesdecoder/util/bin/gcc-9/release/link-static/threading-multi/multi_intersection_test filter=lfs diff=lfs merge=lfs -text
|
| 183 |
+
mosesdecoder/util/bin/gcc-9/release/link-static/threading-multi/pcqueue_test filter=lfs diff=lfs merge=lfs -text
|
| 184 |
+
mosesdecoder/util/bin/gcc-9/release/link-static/threading-multi/probing_hash_table_test filter=lfs diff=lfs merge=lfs -text
|
| 185 |
+
mosesdecoder/util/bin/gcc-9/release/link-static/threading-multi/random_test filter=lfs diff=lfs merge=lfs -text
|
| 186 |
+
mosesdecoder/util/bin/gcc-9/release/link-static/threading-multi/sized_iterator_test filter=lfs diff=lfs merge=lfs -text
|
| 187 |
+
mosesdecoder/util/bin/gcc-9/release/link-static/threading-multi/sorted_uniform_test filter=lfs diff=lfs merge=lfs -text
|
| 188 |
+
mosesdecoder/util/bin/gcc-9/release/link-static/threading-multi/string_stream_test filter=lfs diff=lfs merge=lfs -text
|
| 189 |
+
mosesdecoder/util/bin/gcc-9/release/link-static/threading-multi/tempfile_test filter=lfs diff=lfs merge=lfs -text
|
| 190 |
+
mosesdecoder/util/bin/gcc-9/release/link-static/threading-multi/tokenize_piece_test filter=lfs diff=lfs merge=lfs -text
|
| 191 |
+
mosesdecoder/util/bin/gcc-9/release/link-static/threading-multi/tokenize_test filter=lfs diff=lfs merge=lfs -text
|
| 192 |
+
mosesdecoder/util/stream/bin/gcc-9/release/link-static/threading-multi/io_test filter=lfs diff=lfs merge=lfs -text
|
| 193 |
+
mosesdecoder/util/stream/bin/gcc-9/release/link-static/threading-multi/rewindable_stream_test filter=lfs diff=lfs merge=lfs -text
|
| 194 |
+
mosesdecoder/util/stream/bin/gcc-9/release/link-static/threading-multi/sort_test filter=lfs diff=lfs merge=lfs -text
|
| 195 |
+
mosesdecoder/util/stream/bin/gcc-9/release/link-static/threading-multi/stream_test filter=lfs diff=lfs merge=lfs -text
|
| 196 |
+
mosesdecoder/xmlrpc-c-1.51.06/lib/abyss/src/libxmlrpc_abyss.a filter=lfs diff=lfs merge=lfs -text
|
| 197 |
+
mosesdecoder/xmlrpc-c-1.51.06/lib/libxmlrpc++.a filter=lfs diff=lfs merge=lfs -text
|
| 198 |
+
mosesdecoder/xmlrpc-c-1.51.06/lib/libxmlrpc.a filter=lfs diff=lfs merge=lfs -text
|
| 199 |
+
mosesdecoder/xmlrpc-c-1.51.06/lib/libxmlrpc_abyss.a filter=lfs diff=lfs merge=lfs -text
|
| 200 |
+
mosesdecoder/xmlrpc-c-1.51.06/src/cpp/libxmlrpc++.a filter=lfs diff=lfs merge=lfs -text
|
| 201 |
+
mosesdecoder/xmlrpc-c-1.51.06/src/libxmlrpc.a filter=lfs diff=lfs merge=lfs -text
|
mosesdecoder/.beautify-ignore
ADDED
|
@@ -0,0 +1,38 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Files and directories that beautify.py should not clean up.
|
| 2 |
+
#
|
| 3 |
+
# This file is not as advanced as, say, .gitignore. It only supports files
|
| 4 |
+
# and directory paths relative to the project root, one per line, no globs,
|
| 5 |
+
# no quotes.
|
| 6 |
+
#
|
| 7 |
+
# Leading and trailing whitespace is stripped from filenames, but internal
|
| 8 |
+
# whitespace is preserved.
|
| 9 |
+
#
|
| 10 |
+
# Lines starting with a hash mark, such as this one, are comments. The hash
|
| 11 |
+
# mark must be the first character on the line. Blank lines are ignored.
|
| 12 |
+
#
|
| 13 |
+
# The .beautify-ignore file must be encoded in UTF-8.
|
| 14 |
+
|
| 15 |
+
boost
|
| 16 |
+
contrib
|
| 17 |
+
irstlm
|
| 18 |
+
jam-files
|
| 19 |
+
lm
|
| 20 |
+
mingw/MosesGUI/icons_rc.py
|
| 21 |
+
mingw/MosesGUI/Ui_credits.py
|
| 22 |
+
mingw/MosesGUI/Ui_mainWindow.py
|
| 23 |
+
moses/TranslationModel/UG
|
| 24 |
+
moses/server
|
| 25 |
+
moses/parameters
|
| 26 |
+
moses/thread_safe_container.h
|
| 27 |
+
phrase-extract/pcfg-common
|
| 28 |
+
phrase-extract/syntax-common
|
| 29 |
+
randlm
|
| 30 |
+
# Filename suffixes in here are language codes, so e.g. ".pl" means
|
| 31 |
+
# Polish, not Perl.
|
| 32 |
+
scripts/share/nonbreaking_prefixes
|
| 33 |
+
search
|
| 34 |
+
srilm
|
| 35 |
+
util
|
| 36 |
+
xmlrpc-c
|
| 37 |
+
.git
|
| 38 |
+
util/ug_cache_with_timeout.h
|
mosesdecoder/.gitignore
ADDED
|
@@ -0,0 +1,90 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
tools
|
| 2 |
+
*.d
|
| 3 |
+
*.pyc
|
| 4 |
+
*.lo
|
| 5 |
+
*.o
|
| 6 |
+
*.so
|
| 7 |
+
*.lo
|
| 8 |
+
*.o
|
| 9 |
+
*.la
|
| 10 |
+
*.a
|
| 11 |
+
*.swp
|
| 12 |
+
*.save
|
| 13 |
+
*.cmd
|
| 14 |
+
*~
|
| 15 |
+
*.gch
|
| 16 |
+
dist*
|
| 17 |
+
jam-files/bjam
|
| 18 |
+
jam-files/engine/bootstrap
|
| 19 |
+
jam-files/engine/bin.*
|
| 20 |
+
lm/build_binary
|
| 21 |
+
lm/query
|
| 22 |
+
mert/evaluator
|
| 23 |
+
mert/extractor
|
| 24 |
+
mert/hgdecode
|
| 25 |
+
mert/mert
|
| 26 |
+
mert/megam_i686.opt
|
| 27 |
+
mert/pro
|
| 28 |
+
mert/kbmira
|
| 29 |
+
misc/processLexicalTable
|
| 30 |
+
misc/processPhraseTable
|
| 31 |
+
misc/queryLexicalTable
|
| 32 |
+
mira/mira
|
| 33 |
+
mira/Makefile
|
| 34 |
+
mira/Makefile.in
|
| 35 |
+
misc/queryPhraseTable
|
| 36 |
+
moses-chart-cmd/src/moses_chart
|
| 37 |
+
moses-cmd/src/checkplf
|
| 38 |
+
moses-cmd/src/lmbrgrid
|
| 39 |
+
moses-cmd/src/moses
|
| 40 |
+
regression-testing/moses-reg-test-data-*
|
| 41 |
+
regression-testing/tests/mert.extractor-bin/FEATSTAT*
|
| 42 |
+
regression-testing/tests/mert.extractor-bin/SCORESTAT*
|
| 43 |
+
scripts/ems/biconcor/biconcor
|
| 44 |
+
scripts/release-exclude
|
| 45 |
+
scripts/training/cmert-0.5/mert
|
| 46 |
+
scripts/training/compact-rule-table/tools/compactify
|
| 47 |
+
scripts/training/eppex/counter
|
| 48 |
+
scripts/training/eppex/eppex
|
| 49 |
+
scripts/training/lexical-reordering/score
|
| 50 |
+
scripts/training/memscore/memscore
|
| 51 |
+
scripts/training/mbr/mbr
|
| 52 |
+
scripts/training/phrase-extract/consolidate
|
| 53 |
+
scripts/training/phrase-extract/consolidate-direct
|
| 54 |
+
scripts/training/phrase-extract/consolidate-reverse
|
| 55 |
+
scripts/training/phrase-extract/extract
|
| 56 |
+
scripts/training/phrase-extract/extract-ghkm/tools/extract-ghkm
|
| 57 |
+
scripts/training/phrase-extract/extract-lex
|
| 58 |
+
scripts/training/phrase-extract/extract-rules
|
| 59 |
+
scripts/training/phrase-extract/relax-parse
|
| 60 |
+
scripts/training/phrase-extract/score
|
| 61 |
+
scripts/training/phrase-extract/statistics
|
| 62 |
+
scripts/training/symal/symal
|
| 63 |
+
dist
|
| 64 |
+
bin
|
| 65 |
+
previous.sh
|
| 66 |
+
contrib/other-builds/*.xcodeproj/project.xcworkspace/
|
| 67 |
+
contrib/other-builds/*.xcodeproj/xcuserdata/
|
| 68 |
+
*/*.xcodeproj/project.xcworkspace
|
| 69 |
+
*/*.xcodeproj/xcuserdata
|
| 70 |
+
|
| 71 |
+
mert/sentence-bleu
|
| 72 |
+
mert/sentence-bleu-nbest
|
| 73 |
+
._*
|
| 74 |
+
.DS_Store
|
| 75 |
+
*.pbxuser
|
| 76 |
+
*.mode1v3
|
| 77 |
+
|
| 78 |
+
*.exe
|
| 79 |
+
build/
|
| 80 |
+
nbproject/
|
| 81 |
+
|
| 82 |
+
mingw/MosesGUI/MosesGUI.e4p
|
| 83 |
+
mingw/MosesGUI/_eric4project/
|
| 84 |
+
|
| 85 |
+
contrib/m4m/merge-sorted
|
| 86 |
+
mert/hgdecode
|
| 87 |
+
.bash_history*
|
| 88 |
+
doxygen.conf
|
| 89 |
+
doxy
|
| 90 |
+
opt
|
mosesdecoder/.gitmodules
ADDED
|
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[submodule "contrib/arrow-pipelines/python/pcl"]
|
| 2 |
+
path = contrib/arrow-pipelines/python/pcl
|
| 3 |
+
url = https://github.com/ianj-als/pcl.git
|
| 4 |
+
[submodule "contrib/omtc/omtc"]
|
| 5 |
+
path = contrib/omtc/omtc
|
| 6 |
+
url = https://github.com/ianj-als/omtc.git
|
| 7 |
+
[submodule "regtest"]
|
| 8 |
+
path = regtest
|
| 9 |
+
url = https://github.com/moses-smt/moses-regression-tests
|
mosesdecoder/.travis.yml
ADDED
|
@@ -0,0 +1,24 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
sudo: false
|
| 2 |
+
dist: trusty
|
| 3 |
+
language: c
|
| 4 |
+
compiler: gcc
|
| 5 |
+
env:
|
| 6 |
+
matrix:
|
| 7 |
+
addons:
|
| 8 |
+
apt:
|
| 9 |
+
sources:
|
| 10 |
+
- ubuntu-toolchain-r-test
|
| 11 |
+
packages:
|
| 12 |
+
- subversion
|
| 13 |
+
- automake
|
| 14 |
+
- libtool
|
| 15 |
+
- zlib1g-dev
|
| 16 |
+
- libbz2-dev
|
| 17 |
+
- liblzma-dev
|
| 18 |
+
- libboost-all-dev
|
| 19 |
+
- libgoogle-perftools-dev
|
| 20 |
+
- libxmlrpc-c++.*-dev
|
| 21 |
+
- cmake
|
| 22 |
+
- csh
|
| 23 |
+
script:
|
| 24 |
+
- ./bjam -j4
|
mosesdecoder/COPYING
ADDED
|
@@ -0,0 +1,460 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
GNU LESSER GENERAL PUBLIC LICENSE
|
| 3 |
+
Version 2.1, February 1999
|
| 4 |
+
|
| 5 |
+
Copyright (C) 1991, 1999 Free Software Foundation, Inc.
|
| 6 |
+
51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
|
| 7 |
+
Everyone is permitted to copy and distribute verbatim copies
|
| 8 |
+
of this license document, but changing it is not allowed.
|
| 9 |
+
|
| 10 |
+
[This is the first released version of the Lesser GPL. It also counts
|
| 11 |
+
as the successor of the GNU Library Public License, version 2, hence
|
| 12 |
+
the version number 2.1.]
|
| 13 |
+
|
| 14 |
+
Preamble
|
| 15 |
+
|
| 16 |
+
The licenses for most software are designed to take away your
|
| 17 |
+
freedom to share and change it. By contrast, the GNU General Public
|
| 18 |
+
Licenses are intended to guarantee your freedom to share and change
|
| 19 |
+
free software--to make sure the software is free for all its users.
|
| 20 |
+
|
| 21 |
+
This license, the Lesser General Public License, applies to some
|
| 22 |
+
specially designated software packages--typically libraries--of the
|
| 23 |
+
Free Software Foundation and other authors who decide to use it. You
|
| 24 |
+
can use it too, but we suggest you first think carefully about whether
|
| 25 |
+
this license or the ordinary General Public License is the better
|
| 26 |
+
strategy to use in any particular case, based on the explanations
|
| 27 |
+
below.
|
| 28 |
+
|
| 29 |
+
When we speak of free software, we are referring to freedom of use,
|
| 30 |
+
not price. Our General Public Licenses are designed to make sure that
|
| 31 |
+
you have the freedom to distribute copies of free software (and charge
|
| 32 |
+
for this service if you wish); that you receive source code or can get
|
| 33 |
+
it if you want it; that you can change the software and use pieces of
|
| 34 |
+
it in new free programs; and that you are informed that you can do
|
| 35 |
+
these things.
|
| 36 |
+
|
| 37 |
+
To protect your rights, we need to make restrictions that forbid
|
| 38 |
+
distributors to deny you these rights or to ask you to surrender these
|
| 39 |
+
rights. These restrictions translate to certain responsibilities for
|
| 40 |
+
you if you distribute copies of the library or if you modify it.
|
| 41 |
+
|
| 42 |
+
For example, if you distribute copies of the library, whether gratis
|
| 43 |
+
or for a fee, you must give the recipients all the rights that we gave
|
| 44 |
+
you. You must make sure that they, too, receive or can get the source
|
| 45 |
+
code. If you link other code with the library, you must provide
|
| 46 |
+
complete object files to the recipients, so that they can relink them
|
| 47 |
+
with the library after making changes to the library and recompiling
|
| 48 |
+
it. And you must show them these terms so they know their rights.
|
| 49 |
+
|
| 50 |
+
We protect your rights with a two-step method: (1) we copyright the
|
| 51 |
+
library, and (2) we offer you this license, which gives you legal
|
| 52 |
+
permission to copy, distribute and/or modify the library.
|
| 53 |
+
|
| 54 |
+
To protect each distributor, we want to make it very clear that
|
| 55 |
+
there is no warranty for the free library. Also, if the library is
|
| 56 |
+
modified by someone else and passed on, the recipients should know
|
| 57 |
+
that what they have is not the original version, so that the original
|
| 58 |
+
author's reputation will not be affected by problems that might be
|
| 59 |
+
introduced by others.
|
| 60 |
+
|
| 61 |
+
Finally, software patents pose a constant threat to the existence of
|
| 62 |
+
any free program. We wish to make sure that a company cannot
|
| 63 |
+
effectively restrict the users of a free program by obtaining a
|
| 64 |
+
restrictive license from a patent holder. Therefore, we insist that
|
| 65 |
+
any patent license obtained for a version of the library must be
|
| 66 |
+
consistent with the full freedom of use specified in this license.
|
| 67 |
+
|
| 68 |
+
Most GNU software, including some libraries, is covered by the
|
| 69 |
+
ordinary GNU General Public License. This license, the GNU Lesser
|
| 70 |
+
General Public License, applies to certain designated libraries, and
|
| 71 |
+
is quite different from the ordinary General Public License. We use
|
| 72 |
+
this license for certain libraries in order to permit linking those
|
| 73 |
+
libraries into non-free programs.
|
| 74 |
+
|
| 75 |
+
When a program is linked with a library, whether statically or using
|
| 76 |
+
a shared library, the combination of the two is legally speaking a
|
| 77 |
+
combined work, a derivative of the original library. The ordinary
|
| 78 |
+
General Public License therefore permits such linking only if the
|
| 79 |
+
entire combination fits its criteria of freedom. The Lesser General
|
| 80 |
+
Public License permits more lax criteria for linking other code with
|
| 81 |
+
the library.
|
| 82 |
+
|
| 83 |
+
We call this license the "Lesser" General Public License because it
|
| 84 |
+
does Less to protect the user's freedom than the ordinary General
|
| 85 |
+
Public License. It also provides other free software developers Less
|
| 86 |
+
of an advantage over competing non-free programs. These disadvantages
|
| 87 |
+
are the reason we use the ordinary General Public License for many
|
| 88 |
+
libraries. However, the Lesser license provides advantages in certain
|
| 89 |
+
special circumstances.
|
| 90 |
+
|
| 91 |
+
For example, on rare occasions, there may be a special need to
|
| 92 |
+
encourage the widest possible use of a certain library, so that it
|
| 93 |
+
becomes a de-facto standard. To achieve this, non-free programs must
|
| 94 |
+
be allowed to use the library. A more frequent case is that a free
|
| 95 |
+
library does the same job as widely used non-free libraries. In this
|
| 96 |
+
case, there is little to gain by limiting the free library to free
|
| 97 |
+
software only, so we use the Lesser General Public License.
|
| 98 |
+
|
| 99 |
+
In other cases, permission to use a particular library in non-free
|
| 100 |
+
programs enables a greater number of people to use a large body of
|
| 101 |
+
free software. For example, permission to use the GNU C Library in
|
| 102 |
+
non-free programs enables many more people to use the whole GNU
|
| 103 |
+
operating system, as well as its variant, the GNU/Linux operating
|
| 104 |
+
system.
|
| 105 |
+
|
| 106 |
+
Although the Lesser General Public License is Less protective of the
|
| 107 |
+
users' freedom, it does ensure that the user of a program that is
|
| 108 |
+
linked with the Library has the freedom and the wherewithal to run
|
| 109 |
+
that program using a modified version of the Library.
|
| 110 |
+
|
| 111 |
+
The precise terms and conditions for copying, distribution and
|
| 112 |
+
modification follow. Pay close attention to the difference between a
|
| 113 |
+
"work based on the library" and a "work that uses the library". The
|
| 114 |
+
former contains code derived from the library, whereas the latter must
|
| 115 |
+
be combined with the library in order to run.
|
| 116 |
+
|
| 117 |
+
GNU LESSER GENERAL PUBLIC LICENSE
|
| 118 |
+
TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION
|
| 119 |
+
|
| 120 |
+
0. This License Agreement applies to any software library or other
|
| 121 |
+
program which contains a notice placed by the copyright holder or
|
| 122 |
+
other authorized party saying it may be distributed under the terms of
|
| 123 |
+
this Lesser General Public License (also called "this License").
|
| 124 |
+
Each licensee is addressed as "you".
|
| 125 |
+
|
| 126 |
+
A "library" means a collection of software functions and/or data
|
| 127 |
+
prepared so as to be conveniently linked with application programs
|
| 128 |
+
(which use some of those functions and data) to form executables.
|
| 129 |
+
|
| 130 |
+
The "Library", below, refers to any such software library or work
|
| 131 |
+
which has been distributed under these terms. A "work based on the
|
| 132 |
+
Library" means either the Library or any derivative work under
|
| 133 |
+
copyright law: that is to say, a work containing the Library or a
|
| 134 |
+
portion of it, either verbatim or with modifications and/or translated
|
| 135 |
+
straightforwardly into another language. (Hereinafter, translation is
|
| 136 |
+
included without limitation in the term "modification".)
|
| 137 |
+
|
| 138 |
+
"Source code" for a work means the preferred form of the work for
|
| 139 |
+
making modifications to it. For a library, complete source code means
|
| 140 |
+
all the source code for all modules it contains, plus any associated
|
| 141 |
+
interface definition files, plus the scripts used to control
|
| 142 |
+
compilation and installation of the library.
|
| 143 |
+
|
| 144 |
+
Activities other than copying, distribution and modification are not
|
| 145 |
+
covered by this License; they are outside its scope. The act of
|
| 146 |
+
running a program using the Library is not restricted, and output from
|
| 147 |
+
such a program is covered only if its contents constitute a work based
|
| 148 |
+
on the Library (independent of the use of the Library in a tool for
|
| 149 |
+
writing it). Whether that is true depends on what the Library does
|
| 150 |
+
and what the program that uses the Library does.
|
| 151 |
+
|
| 152 |
+
1. You may copy and distribute verbatim copies of the Library's
|
| 153 |
+
complete source code as you receive it, in any medium, provided that
|
| 154 |
+
you conspicuously and appropriately publish on each copy an
|
| 155 |
+
appropriate copyright notice and disclaimer of warranty; keep intact
|
| 156 |
+
all the notices that refer to this License and to the absence of any
|
| 157 |
+
warranty; and distribute a copy of this License along with the
|
| 158 |
+
Library.
|
| 159 |
+
|
| 160 |
+
You may charge a fee for the physical act of transferring a copy,
|
| 161 |
+
and you may at your option offer warranty protection in exchange for a
|
| 162 |
+
fee.
|
| 163 |
+
|
| 164 |
+
2. You may modify your copy or copies of the Library or any portion
|
| 165 |
+
of it, thus forming a work based on the Library, and copy and
|
| 166 |
+
distribute such modifications or work under the terms of Section 1
|
| 167 |
+
above, provided that you also meet all of these conditions:
|
| 168 |
+
|
| 169 |
+
a) The modified work must itself be a software library.
|
| 170 |
+
|
| 171 |
+
b) You must cause the files modified to carry prominent notices
|
| 172 |
+
stating that you changed the files and the date of any change.
|
| 173 |
+
|
| 174 |
+
c) You must cause the whole of the work to be licensed at no
|
| 175 |
+
charge to all third parties under the terms of this License.
|
| 176 |
+
|
| 177 |
+
d) If a facility in the modified Library refers to a function or a
|
| 178 |
+
table of data to be supplied by an application program that uses
|
| 179 |
+
the facility, other than as an argument passed when the facility
|
| 180 |
+
is invoked, then you must make a good faith effort to ensure that,
|
| 181 |
+
in the event an application does not supply such function or
|
| 182 |
+
table, the facility still operates, and performs whatever part of
|
| 183 |
+
its purpose remains meaningful.
|
| 184 |
+
|
| 185 |
+
(For example, a function in a library to compute square roots has
|
| 186 |
+
a purpose that is entirely well-defined independent of the
|
| 187 |
+
application. Therefore, Subsection 2d requires that any
|
| 188 |
+
application-supplied function or table used by this function must
|
| 189 |
+
be optional: if the application does not supply it, the square
|
| 190 |
+
root function must still compute square roots.)
|
| 191 |
+
|
| 192 |
+
These requirements apply to the modified work as a whole. If
|
| 193 |
+
identifiable sections of that work are not derived from the Library,
|
| 194 |
+
and can be reasonably considered independent and separate works in
|
| 195 |
+
themselves, then this License, and its terms, do not apply to those
|
| 196 |
+
sections when you distribute them as separate works. But when you
|
| 197 |
+
distribute the same sections as part of a whole which is a work based
|
| 198 |
+
on the Library, the distribution of the whole must be on the terms of
|
| 199 |
+
this License, whose permissions for other licensees extend to the
|
| 200 |
+
entire whole, and thus to each and every part regardless of who wrote
|
| 201 |
+
it.
|
| 202 |
+
|
| 203 |
+
Thus, it is not the intent of this section to claim rights or contest
|
| 204 |
+
your rights to work written entirely by you; rather, the intent is to
|
| 205 |
+
exercise the right to control the distribution of derivative or
|
| 206 |
+
collective works based on the Library.
|
| 207 |
+
|
| 208 |
+
In addition, mere aggregation of another work not based on the Library
|
| 209 |
+
with the Library (or with a work based on the Library) on a volume of
|
| 210 |
+
a storage or distribution medium does not bring the other work under
|
| 211 |
+
the scope of this License.
|
| 212 |
+
|
| 213 |
+
3. You may opt to apply the terms of the ordinary GNU General Public
|
| 214 |
+
License instead of this License to a given copy of the Library. To do
|
| 215 |
+
this, you must alter all the notices that refer to this License, so
|
| 216 |
+
that they refer to the ordinary GNU General Public License, version 2,
|
| 217 |
+
instead of to this License. (If a newer version than version 2 of the
|
| 218 |
+
ordinary GNU General Public License has appeared, then you can specify
|
| 219 |
+
that version instead if you wish.) Do not make any other change in
|
| 220 |
+
these notices.
|
| 221 |
+
|
| 222 |
+
Once this change is made in a given copy, it is irreversible for
|
| 223 |
+
that copy, so the ordinary GNU General Public License applies to all
|
| 224 |
+
subsequent copies and derivative works made from that copy.
|
| 225 |
+
|
| 226 |
+
This option is useful when you wish to copy part of the code of
|
| 227 |
+
the Library into a program that is not a library.
|
| 228 |
+
|
| 229 |
+
4. You may copy and distribute the Library (or a portion or
|
| 230 |
+
derivative of it, under Section 2) in object code or executable form
|
| 231 |
+
under the terms of Sections 1 and 2 above provided that you accompany
|
| 232 |
+
it with the complete corresponding machine-readable source code, which
|
| 233 |
+
must be distributed under the terms of Sections 1 and 2 above on a
|
| 234 |
+
medium customarily used for software interchange.
|
| 235 |
+
|
| 236 |
+
If distribution of object code is made by offering access to copy
|
| 237 |
+
from a designated place, then offering equivalent access to copy the
|
| 238 |
+
source code from the same place satisfies the requirement to
|
| 239 |
+
distribute the source code, even though third parties are not
|
| 240 |
+
compelled to copy the source along with the object code.
|
| 241 |
+
|
| 242 |
+
5. A program that contains no derivative of any portion of the
|
| 243 |
+
Library, but is designed to work with the Library by being compiled or
|
| 244 |
+
linked with it, is called a "work that uses the Library". Such a
|
| 245 |
+
work, in isolation, is not a derivative work of the Library, and
|
| 246 |
+
therefore falls outside the scope of this License.
|
| 247 |
+
|
| 248 |
+
However, linking a "work that uses the Library" with the Library
|
| 249 |
+
creates an executable that is a derivative of the Library (because it
|
| 250 |
+
contains portions of the Library), rather than a "work that uses the
|
| 251 |
+
library". The executable is therefore covered by this License.
|
| 252 |
+
Section 6 states terms for distribution of such executables.
|
| 253 |
+
|
| 254 |
+
When a "work that uses the Library" uses material from a header file
|
| 255 |
+
that is part of the Library, the object code for the work may be a
|
| 256 |
+
derivative work of the Library even though the source code is not.
|
| 257 |
+
Whether this is true is especially significant if the work can be
|
| 258 |
+
linked without the Library, or if the work is itself a library. The
|
| 259 |
+
threshold for this to be true is not precisely defined by law.
|
| 260 |
+
|
| 261 |
+
If such an object file uses only numerical parameters, data
|
| 262 |
+
structure layouts and accessors, and small macros and small inline
|
| 263 |
+
functions (ten lines or less in length), then the use of the object
|
| 264 |
+
file is unrestricted, regardless of whether it is legally a derivative
|
| 265 |
+
work. (Executables containing this object code plus portions of the
|
| 266 |
+
Library will still fall under Section 6.)
|
| 267 |
+
|
| 268 |
+
Otherwise, if the work is a derivative of the Library, you may
|
| 269 |
+
distribute the object code for the work under the terms of Section 6.
|
| 270 |
+
Any executables containing that work also fall under Section 6,
|
| 271 |
+
whether or not they are linked directly with the Library itself.
|
| 272 |
+
|
| 273 |
+
6. As an exception to the Sections above, you may also combine or
|
| 274 |
+
link a "work that uses the Library" with the Library to produce a
|
| 275 |
+
work containing portions of the Library, and distribute that work
|
| 276 |
+
under terms of your choice, provided that the terms permit
|
| 277 |
+
modification of the work for the customer's own use and reverse
|
| 278 |
+
engineering for debugging such modifications.
|
| 279 |
+
|
| 280 |
+
You must give prominent notice with each copy of the work that the
|
| 281 |
+
Library is used in it and that the Library and its use are covered by
|
| 282 |
+
this License. You must supply a copy of this License. If the work
|
| 283 |
+
during execution displays copyright notices, you must include the
|
| 284 |
+
copyright notice for the Library among them, as well as a reference
|
| 285 |
+
directing the user to the copy of this License. Also, you must do one
|
| 286 |
+
of these things:
|
| 287 |
+
|
| 288 |
+
a) Accompany the work with the complete corresponding
|
| 289 |
+
machine-readable source code for the Library including whatever
|
| 290 |
+
changes were used in the work (which must be distributed under
|
| 291 |
+
Sections 1 and 2 above); and, if the work is an executable linked
|
| 292 |
+
with the Library, with the complete machine-readable "work that
|
| 293 |
+
uses the Library", as object code and/or source code, so that the
|
| 294 |
+
user can modify the Library and then relink to produce a modified
|
| 295 |
+
executable containing the modified Library. (It is understood
|
| 296 |
+
that the user who changes the contents of definitions files in the
|
| 297 |
+
Library will not necessarily be able to recompile the application
|
| 298 |
+
to use the modified definitions.)
|
| 299 |
+
|
| 300 |
+
b) Use a suitable shared library mechanism for linking with the
|
| 301 |
+
Library. A suitable mechanism is one that (1) uses at run time a
|
| 302 |
+
copy of the library already present on the user's computer system,
|
| 303 |
+
rather than copying library functions into the executable, and (2)
|
| 304 |
+
will operate properly with a modified version of the library, if
|
| 305 |
+
the user installs one, as long as the modified version is
|
| 306 |
+
interface-compatible with the version that the work was made with.
|
| 307 |
+
|
| 308 |
+
c) Accompany the work with a written offer, valid for at least
|
| 309 |
+
three years, to give the same user the materials specified in
|
| 310 |
+
Subsection 6a, above, for a charge no more than the cost of
|
| 311 |
+
performing this distribution.
|
| 312 |
+
|
| 313 |
+
d) If distribution of the work is made by offering access to copy
|
| 314 |
+
from a designated place, offer equivalent access to copy the above
|
| 315 |
+
specified materials from the same place.
|
| 316 |
+
|
| 317 |
+
e) Verify that the user has already received a copy of these
|
| 318 |
+
materials or that you have already sent this user a copy.
|
| 319 |
+
|
| 320 |
+
For an executable, the required form of the "work that uses the
|
| 321 |
+
Library" must include any data and utility programs needed for
|
| 322 |
+
reproducing the executable from it. However, as a special exception,
|
| 323 |
+
the materials to be distributed need not include anything that is
|
| 324 |
+
normally distributed (in either source or binary form) with the major
|
| 325 |
+
components (compiler, kernel, and so on) of the operating system on
|
| 326 |
+
which the executable runs, unless that component itself accompanies
|
| 327 |
+
the executable.
|
| 328 |
+
|
| 329 |
+
It may happen that this requirement contradicts the license
|
| 330 |
+
restrictions of other proprietary libraries that do not normally
|
| 331 |
+
accompany the operating system. Such a contradiction means you cannot
|
| 332 |
+
use both them and the Library together in an executable that you
|
| 333 |
+
distribute.
|
| 334 |
+
|
| 335 |
+
7. You may place library facilities that are a work based on the
|
| 336 |
+
Library side-by-side in a single library together with other library
|
| 337 |
+
facilities not covered by this License, and distribute such a combined
|
| 338 |
+
library, provided that the separate distribution of the work based on
|
| 339 |
+
the Library and of the other library facilities is otherwise
|
| 340 |
+
permitted, and provided that you do these two things:
|
| 341 |
+
|
| 342 |
+
a) Accompany the combined library with a copy of the same work
|
| 343 |
+
based on the Library, uncombined with any other library
|
| 344 |
+
facilities. This must be distributed under the terms of the
|
| 345 |
+
Sections above.
|
| 346 |
+
|
| 347 |
+
b) Give prominent notice with the combined library of the fact
|
| 348 |
+
that part of it is a work based on the Library, and explaining
|
| 349 |
+
where to find the accompanying uncombined form of the same work.
|
| 350 |
+
|
| 351 |
+
8. You may not copy, modify, sublicense, link with, or distribute
|
| 352 |
+
the Library except as expressly provided under this License. Any
|
| 353 |
+
attempt otherwise to copy, modify, sublicense, link with, or
|
| 354 |
+
distribute the Library is void, and will automatically terminate your
|
| 355 |
+
rights under this License. However, parties who have received copies,
|
| 356 |
+
or rights, from you under this License will not have their licenses
|
| 357 |
+
terminated so long as such parties remain in full compliance.
|
| 358 |
+
|
| 359 |
+
9. You are not required to accept this License, since you have not
|
| 360 |
+
signed it. However, nothing else grants you permission to modify or
|
| 361 |
+
distribute the Library or its derivative works. These actions are
|
| 362 |
+
prohibited by law if you do not accept this License. Therefore, by
|
| 363 |
+
modifying or distributing the Library (or any work based on the
|
| 364 |
+
Library), you indicate your acceptance of this License to do so, and
|
| 365 |
+
all its terms and conditions for copying, distributing or modifying
|
| 366 |
+
the Library or works based on it.
|
| 367 |
+
|
| 368 |
+
10. Each time you redistribute the Library (or any work based on the
|
| 369 |
+
Library), the recipient automatically receives a license from the
|
| 370 |
+
original licensor to copy, distribute, link with or modify the Library
|
| 371 |
+
subject to these terms and conditions. You may not impose any further
|
| 372 |
+
restrictions on the recipients' exercise of the rights granted herein.
|
| 373 |
+
You are not responsible for enforcing compliance by third parties with
|
| 374 |
+
this License.
|
| 375 |
+
|
| 376 |
+
11. If, as a consequence of a court judgment or allegation of patent
|
| 377 |
+
infringement or for any other reason (not limited to patent issues),
|
| 378 |
+
conditions are imposed on you (whether by court order, agreement or
|
| 379 |
+
otherwise) that contradict the conditions of this License, they do not
|
| 380 |
+
excuse you from the conditions of this License. If you cannot
|
| 381 |
+
distribute so as to satisfy simultaneously your obligations under this
|
| 382 |
+
License and any other pertinent obligations, then as a consequence you
|
| 383 |
+
may not distribute the Library at all. For example, if a patent
|
| 384 |
+
license would not permit royalty-free redistribution of the Library by
|
| 385 |
+
all those who receive copies directly or indirectly through you, then
|
| 386 |
+
the only way you could satisfy both it and this License would be to
|
| 387 |
+
refrain entirely from distribution of the Library.
|
| 388 |
+
|
| 389 |
+
If any portion of this section is held invalid or unenforceable under
|
| 390 |
+
any particular circumstance, the balance of the section is intended to
|
| 391 |
+
apply, and the section as a whole is intended to apply in other
|
| 392 |
+
circumstances.
|
| 393 |
+
|
| 394 |
+
It is not the purpose of this section to induce you to infringe any
|
| 395 |
+
patents or other property right claims or to contest validity of any
|
| 396 |
+
such claims; this section has the sole purpose of protecting the
|
| 397 |
+
integrity of the free software distribution system which is
|
| 398 |
+
implemented by public license practices. Many people have made
|
| 399 |
+
generous contributions to the wide range of software distributed
|
| 400 |
+
through that system in reliance on consistent application of that
|
| 401 |
+
system; it is up to the author/donor to decide if he or she is willing
|
| 402 |
+
to distribute software through any other system and a licensee cannot
|
| 403 |
+
impose that choice.
|
| 404 |
+
|
| 405 |
+
This section is intended to make thoroughly clear what is believed to
|
| 406 |
+
be a consequence of the rest of this License.
|
| 407 |
+
|
| 408 |
+
12. If the distribution and/or use of the Library is restricted in
|
| 409 |
+
certain countries either by patents or by copyrighted interfaces, the
|
| 410 |
+
original copyright holder who places the Library under this License
|
| 411 |
+
may add an explicit geographical distribution limitation excluding those
|
| 412 |
+
countries, so that distribution is permitted only in or among
|
| 413 |
+
countries not thus excluded. In such case, this License incorporates
|
| 414 |
+
the limitation as if written in the body of this License.
|
| 415 |
+
|
| 416 |
+
13. The Free Software Foundation may publish revised and/or new
|
| 417 |
+
versions of the Lesser General Public License from time to time.
|
| 418 |
+
Such new versions will be similar in spirit to the present version,
|
| 419 |
+
but may differ in detail to address new problems or concerns.
|
| 420 |
+
|
| 421 |
+
Each version is given a distinguishing version number. If the Library
|
| 422 |
+
specifies a version number of this License which applies to it and
|
| 423 |
+
"any later version", you have the option of following the terms and
|
| 424 |
+
conditions either of that version or of any later version published by
|
| 425 |
+
the Free Software Foundation. If the Library does not specify a
|
| 426 |
+
license version number, you may choose any version ever published by
|
| 427 |
+
the Free Software Foundation.
|
| 428 |
+
|
| 429 |
+
14. If you wish to incorporate parts of the Library into other free
|
| 430 |
+
programs whose distribution conditions are incompatible with these,
|
| 431 |
+
write to the author to ask for permission. For software which is
|
| 432 |
+
copyrighted by the Free Software Foundation, write to the Free
|
| 433 |
+
Software Foundation; we sometimes make exceptions for this. Our
|
| 434 |
+
decision will be guided by the two goals of preserving the free status
|
| 435 |
+
of all derivatives of our free software and of promoting the sharing
|
| 436 |
+
and reuse of software generally.
|
| 437 |
+
|
| 438 |
+
NO WARRANTY
|
| 439 |
+
|
| 440 |
+
15. BECAUSE THE LIBRARY IS LICENSED FREE OF CHARGE, THERE IS NO
|
| 441 |
+
WARRANTY FOR THE LIBRARY, TO THE EXTENT PERMITTED BY APPLICABLE LAW.
|
| 442 |
+
EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR
|
| 443 |
+
OTHER PARTIES PROVIDE THE LIBRARY "AS IS" WITHOUT WARRANTY OF ANY
|
| 444 |
+
KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE
|
| 445 |
+
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
| 446 |
+
PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE
|
| 447 |
+
LIBRARY IS WITH YOU. SHOULD THE LIBRARY PROVE DEFECTIVE, YOU ASSUME
|
| 448 |
+
THE COST OF ALL NECESSARY SERVICING, REPAIR OR CORRECTION.
|
| 449 |
+
|
| 450 |
+
16. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN
|
| 451 |
+
WRITING WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY
|
| 452 |
+
AND/OR REDISTRIBUTE THE LIBRARY AS PERMITTED ABOVE, BE LIABLE TO YOU
|
| 453 |
+
FOR DAMAGES, INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR
|
| 454 |
+
CONSEQUENTIAL DAMAGES ARISING OUT OF THE USE OR INABILITY TO USE THE
|
| 455 |
+
LIBRARY (INCLUDING BUT NOT LIMITED TO LOSS OF DATA OR DATA BEING
|
| 456 |
+
RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD PARTIES OR A
|
| 457 |
+
FAILURE OF THE LIBRARY TO OPERATE WITH ANY OTHER SOFTWARE), EVEN IF
|
| 458 |
+
SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH
|
| 459 |
+
DAMAGES.
|
| 460 |
+
|
mosesdecoder/Jamroot
ADDED
|
@@ -0,0 +1,345 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#BUILDING MOSES
|
| 2 |
+
|
| 3 |
+
#PACKAGES
|
| 4 |
+
#Language models (optional):
|
| 5 |
+
#--with-irstlm=/path/to/irstlm
|
| 6 |
+
#--with-srilm=/path/to/srilm See moses/LM/Jamfile for more options.
|
| 7 |
+
#--with-maxent-srilm=true (requires a maxent-enabled version of SRILM to be specified via --with-srilm)
|
| 8 |
+
#--with-nplm=/path/to/nplm
|
| 9 |
+
#--with-randlm=/path/to/randlm
|
| 10 |
+
#KenLM is always compiled.
|
| 11 |
+
#
|
| 12 |
+
#--with-boost=/path/to/boost
|
| 13 |
+
#If Boost is in a non-standard location, specify it here. This directory is
|
| 14 |
+
#expected to contain include and lib or lib64.
|
| 15 |
+
#
|
| 16 |
+
#--with-xmlrpc-c=/path/to/xmlrpc-c for libxmlrpc-c (used by server)
|
| 17 |
+
#Note that, like language models, this is the --prefix where the library was
|
| 18 |
+
#installed, not some executable within the library.
|
| 19 |
+
#
|
| 20 |
+
#--no-xmlrpc-c
|
| 21 |
+
# Don't use xmlrpc-c library, even if it exists. Don't build moses server
|
| 22 |
+
#
|
| 23 |
+
#Compact phrase table and compact lexical reordering table
|
| 24 |
+
#--with-cmph=/path/to/cmph
|
| 25 |
+
#
|
| 26 |
+
#Thread-caching malloc (if present, used for multi-threaded builds by default)
|
| 27 |
+
#--without-tcmalloc does not compile with tcmalloc even if present
|
| 28 |
+
#--full-tcmalloc links against the full version (useful for memory profiling)
|
| 29 |
+
#
|
| 30 |
+
#REGRESSION TESTING
|
| 31 |
+
#--with-regtest=/path/to/moses-reg-test-data
|
| 32 |
+
#
|
| 33 |
+
#INSTALLATION
|
| 34 |
+
#--prefix=/path/to/prefix sets the install prefix [default is source root].
|
| 35 |
+
#--bindir=/path/to/prefix/bin sets the bin directory [PREFIX/bin]
|
| 36 |
+
#--libdir=/path/to/prefix/lib sets the lib directory [PREFIX/lib]
|
| 37 |
+
#--includedir=/path/to/prefix/include installs headers.
|
| 38 |
+
# Does not install if missing. No argument defaults to PREFIX/include .
|
| 39 |
+
#--install-scripts=/path/to/scripts copies scripts into a directory.
|
| 40 |
+
# Does not install if missing. No argument defaults to PREFIX/scripts .
|
| 41 |
+
#--git appends the git revision to the prefix directory.
|
| 42 |
+
#
|
| 43 |
+
#
|
| 44 |
+
#BUILD OPTIONS
|
| 45 |
+
# By default, the build is multi-threaded, optimized, and statically linked.
|
| 46 |
+
# Pass these to change the build:
|
| 47 |
+
#
|
| 48 |
+
# threading=single|multi controls threading (default multi)
|
| 49 |
+
#
|
| 50 |
+
# variant=release|debug|profile builds optimized (default), for debug, or for
|
| 51 |
+
# profiling
|
| 52 |
+
#
|
| 53 |
+
# link=static|shared controls preferred linking (default static)
|
| 54 |
+
# --static forces static linking (the default will fall
|
| 55 |
+
# back to shared)
|
| 56 |
+
#
|
| 57 |
+
# debug-symbols=on|off include or exclude (default) debugging
|
| 58 |
+
# information also known as -g
|
| 59 |
+
# --notrace compiles without TRACE macros
|
| 60 |
+
#
|
| 61 |
+
# --enable-boost-pool uses Boost pools for the memory SCFG table
|
| 62 |
+
#
|
| 63 |
+
# --enable-mpi switch on mpi
|
| 64 |
+
# --without-libsegfault does not link with libSegFault
|
| 65 |
+
#
|
| 66 |
+
# --max-kenlm-order maximum ngram order that kenlm can process (default 6)
|
| 67 |
+
#
|
| 68 |
+
# --max-factors maximum number of factors (default 4)
|
| 69 |
+
#
|
| 70 |
+
# --unlabelled-source ignore source labels (redundant in hiero or string-to-tree system)
|
| 71 |
+
# for better performance
|
| 72 |
+
#CONTROLLING THE BUILD
|
| 73 |
+
#-a to build from scratch
|
| 74 |
+
#-j$NCPUS to compile in parallel
|
| 75 |
+
#--clean to clean
|
| 76 |
+
#--debug-build to build with -Og. Only available with gcc 4.8+
|
| 77 |
+
|
| 78 |
+
import os ;
|
| 79 |
+
import option ;
|
| 80 |
+
import modules ;
|
| 81 |
+
import path ;
|
| 82 |
+
path-constant TOP : . ;
|
| 83 |
+
|
| 84 |
+
include $(TOP)/jam-files/sanity.jam ;
|
| 85 |
+
|
| 86 |
+
home = [ os.environ "HOME" ] ;
|
| 87 |
+
if [ path.exists $(home)/moses-environment.jam ]
|
| 88 |
+
{
|
| 89 |
+
# for those of us who don't like typing in command line bjam options all day long
|
| 90 |
+
include $(home)/moses-environment.jam ;
|
| 91 |
+
}
|
| 92 |
+
include $(TOP)/jam-files/check-environment.jam ; # get resource locations
|
| 93 |
+
# from environment variables
|
| 94 |
+
include $(TOP)/jam-files/xmlrpc-c.jam ; # xmlrpc-c stuff for the server
|
| 95 |
+
# include $(TOP)/jam-files/curlpp.jam ; # curlpp stuff for bias lookup (MMT only)
|
| 96 |
+
|
| 97 |
+
# exit "done" : 0 ;
|
| 98 |
+
|
| 99 |
+
max-order = [ option.get "max-kenlm-order" : 6 : 6 ] ;
|
| 100 |
+
if ! [ option.get "max-kenlm-order" ]
|
| 101 |
+
{
|
| 102 |
+
# some classes in Moses pull in header files from KenLM, so this needs to be
|
| 103 |
+
# defined here, not in moses/lm/Jamfile
|
| 104 |
+
option.set "max-kenlm-order" : 6 ;
|
| 105 |
+
requirements += <define>KENLM_MAX_ORDER=$(max-order) ;
|
| 106 |
+
}
|
| 107 |
+
# exit "all done" : 0 ;
|
| 108 |
+
|
| 109 |
+
boost 104400 ;
|
| 110 |
+
external-lib z ;
|
| 111 |
+
|
| 112 |
+
#lib dl : : <runtime-link>static:<link>static <runtime-link>shared:<link>shared ;
|
| 113 |
+
#requirements += <library>dl ;
|
| 114 |
+
requirements += <cxxflags>-std=c++0x ;
|
| 115 |
+
|
| 116 |
+
# Allow moses to report the git commit hash of the version used for compilation
|
| 117 |
+
moses_githash = [ _shell "git describe --dirty" ] ;
|
| 118 |
+
requirements += <define>MOSES_VERSION_ID=\\\"$(moses_githash)\\\" ;
|
| 119 |
+
|
| 120 |
+
if ! [ option.get "without-tcmalloc" : : "yes" ] && [ test_library "tcmalloc_minimal" ] {
|
| 121 |
+
if [ option.get "full-tcmalloc" : : "yes" ] {
|
| 122 |
+
external-lib unwind ;
|
| 123 |
+
external-lib tcmalloc_and_profiler : : unwind ;
|
| 124 |
+
requirements += <library>tcmalloc_and_profiler <library>unwind <cflags>-fno-omit-frame-pointer <cxxflags>-fno-omit-frame-pointer ;
|
| 125 |
+
} else {
|
| 126 |
+
external-lib tcmalloc_minimal ;
|
| 127 |
+
requirements += <threading>multi:<library>tcmalloc_minimal ;
|
| 128 |
+
}
|
| 129 |
+
} else {
|
| 130 |
+
echo "Tip: install tcmalloc for faster threading. See BUILD-INSTRUCTIONS.txt for more information." ;
|
| 131 |
+
}
|
| 132 |
+
|
| 133 |
+
if [ option.get "filter-warnings" : : "yes" ] {
|
| 134 |
+
# given the low coding standards in Moses, we may want to filter out
|
| 135 |
+
# warnings about poor coding practice that no-one is ever going to fix
|
| 136 |
+
# anyway ...
|
| 137 |
+
requirements += <cxxflags>-Wno-deprecated ;
|
| 138 |
+
requirements += <cxxflags>-Wno-reorder ;
|
| 139 |
+
requirements += <cxxflags>-Wno-sign-compare ;
|
| 140 |
+
requirements += <cxxflags>-Wno-unused-but-set-variable ;
|
| 141 |
+
requirements += <cxxflags>-Wno-unused-result ;
|
| 142 |
+
requirements += <cxxflags>-Wno-unused-variable ;
|
| 143 |
+
requirements += <cxxflags>-Wno-comment ;
|
| 144 |
+
requirements += <cxxflags>-Wno-strict-aliasing ;
|
| 145 |
+
requirements += <cxxflags>-Wno-overloaded-virtual ;
|
| 146 |
+
}
|
| 147 |
+
|
| 148 |
+
if [ option.get "debug-build" : : "yes" ] {
|
| 149 |
+
requirements += <cxxflags>-Og ;
|
| 150 |
+
echo "Building with -Og to enable easier profiling and debugging. Only available on gcc 4.8+." ;
|
| 151 |
+
}
|
| 152 |
+
|
| 153 |
+
if [ option.get "with-address-sanitizer" : : "yes" ] {
|
| 154 |
+
requirements += <cxxflags>-fsanitize=address ;
|
| 155 |
+
requirements += <cxxflags>-fno-omit-frame-pointer ;
|
| 156 |
+
requirements += <linkflags>-fsanitize=address ;
|
| 157 |
+
echo "Building with AddressSanitizer to enable debugging of memory errors. Only available on gcc 4.8+." ;
|
| 158 |
+
}
|
| 159 |
+
|
| 160 |
+
if [ option.get "enable-mpi" : : "yes" ] {
|
| 161 |
+
import mpi ;
|
| 162 |
+
using mpi ;
|
| 163 |
+
external-lib boost_mpi ;
|
| 164 |
+
external-lib boost_serialization ;
|
| 165 |
+
requirements += <define>MPI_ENABLE ;
|
| 166 |
+
requirements += <library>mpi ;
|
| 167 |
+
requirements += <library>boost_mpi ;
|
| 168 |
+
requirements += <library>boost_serialization ;
|
| 169 |
+
}
|
| 170 |
+
|
| 171 |
+
mmt = [ option.get "mmt" ] ;
|
| 172 |
+
if $(mmt) {
|
| 173 |
+
requirements += <define>MMT ;
|
| 174 |
+
requirements += <include>$(mmt) ;
|
| 175 |
+
mmt_githash = [ _shell "cd $(mmt) && git describe --dirty" ] ;
|
| 176 |
+
requirements += <define>MMT_VERSION_ID=\\\"$(mmt_githash)\\\" ;
|
| 177 |
+
}
|
| 178 |
+
|
| 179 |
+
requirements += [ option.get "notrace" : <define>TRACE_ENABLE=1 ] ;
|
| 180 |
+
requirements += [ option.get "enable-boost-pool" : : <define>USE_BOOST_POOL ] ;
|
| 181 |
+
requirements += [ option.get "with-mm" : : <define>PT_UG ] ;
|
| 182 |
+
requirements += [ option.get "with-mm" : : <define>MAX_NUM_FACTORS=4 ] ;
|
| 183 |
+
requirements += [ option.get "unlabelled-source" : : <define>UNLABELLED_SOURCE ] ;
|
| 184 |
+
|
| 185 |
+
if [ option.get "with-oxlm" ] {
|
| 186 |
+
external-lib gomp ;
|
| 187 |
+
requirements += <library>boost_serialization ;
|
| 188 |
+
requirements += <library>gomp ;
|
| 189 |
+
}
|
| 190 |
+
|
| 191 |
+
if [ option.get "with-cmph" : : "yes" ] {
|
| 192 |
+
requirements += <define>HAVE_CMPH ;
|
| 193 |
+
}
|
| 194 |
+
|
| 195 |
+
if [ option.get "with-icu" : : "yes" ]
|
| 196 |
+
{
|
| 197 |
+
external-lib icuuc ;
|
| 198 |
+
external-lib icuio ;
|
| 199 |
+
external-lib icui18n ;
|
| 200 |
+
requirements += <library>icuuc/<link>shared ;
|
| 201 |
+
requirements += <library>icuio/<link>shared ;
|
| 202 |
+
requirements += <library>icui18n/<link>shared ;
|
| 203 |
+
requirements += <cxxflags>-fPIC ;
|
| 204 |
+
requirements += <address-model>64 ;
|
| 205 |
+
# requirements += <runtime-link>shared ;
|
| 206 |
+
}
|
| 207 |
+
|
| 208 |
+
# for probing pt
|
| 209 |
+
external-lib boost_serialization ;
|
| 210 |
+
requirements += <library>boost_serialization/<runtime-link>static ;
|
| 211 |
+
|
| 212 |
+
if [ option.get "with-vw" ] {
|
| 213 |
+
requirements += <define>HAVE_VW ;
|
| 214 |
+
}
|
| 215 |
+
|
| 216 |
+
project : default-build
|
| 217 |
+
<threading>multi
|
| 218 |
+
<warnings>on
|
| 219 |
+
<debug-symbols>off
|
| 220 |
+
<variant>release
|
| 221 |
+
<link>static
|
| 222 |
+
;
|
| 223 |
+
|
| 224 |
+
#Apparently OS X likes to link against iconv for fgetsUTF8.
|
| 225 |
+
lib iconv ;
|
| 226 |
+
requirements += <os>MACOSX:<library>iconv ;
|
| 227 |
+
|
| 228 |
+
project : requirements
|
| 229 |
+
<threading>multi:<define>WITH_THREADS
|
| 230 |
+
<threading>multi:<library>boost_thread
|
| 231 |
+
<library>boost_system
|
| 232 |
+
<library>boost_program_options
|
| 233 |
+
<define>_FILE_OFFSET_BITS=64 <define>_LARGE_FILES
|
| 234 |
+
$(requirements)
|
| 235 |
+
<include>.
|
| 236 |
+
;
|
| 237 |
+
|
| 238 |
+
|
| 239 |
+
#Add directories here if you want their incidental targets too (i.e. tests).
|
| 240 |
+
build-projects lm util phrase-extract phrase-extract/syntax-common search moses moses/LM mert moses-cmd scripts regression-testing ;
|
| 241 |
+
# contrib/mira
|
| 242 |
+
|
| 243 |
+
if [ option.get "with-mm-extras" : : "yes" ]
|
| 244 |
+
{
|
| 245 |
+
alias mm-extras :
|
| 246 |
+
moses/TranslationModel/UG//bitext-find
|
| 247 |
+
moses/TranslationModel/UG//ptable-describe-features
|
| 248 |
+
moses/TranslationModel/UG//count-ptable-features
|
| 249 |
+
moses/TranslationModel/UG//ptable-sigtest-filter
|
| 250 |
+
moses/TranslationModel/UG//ptable-lookup
|
| 251 |
+
moses/TranslationModel/UG//ptable-lookup-corpus
|
| 252 |
+
moses/TranslationModel/UG//check-coverage
|
| 253 |
+
moses/TranslationModel/UG/mm//mtt-demo1
|
| 254 |
+
moses/TranslationModel/UG/mm//mtt-dump
|
| 255 |
+
moses/TranslationModel/UG/mm//mam2symal
|
| 256 |
+
moses/TranslationModel/UG/mm//mam_verify
|
| 257 |
+
moses/TranslationModel/UG/mm//mmlex-lookup
|
| 258 |
+
moses/TranslationModel/UG/mm//mtt-count-words
|
| 259 |
+
moses/TranslationModel/UG/mm//calc-coverage
|
| 260 |
+
moses/TranslationModel/UG//try-align
|
| 261 |
+
;
|
| 262 |
+
}
|
| 263 |
+
else
|
| 264 |
+
{
|
| 265 |
+
alias mm-extras ;
|
| 266 |
+
}
|
| 267 |
+
|
| 268 |
+
if [ option.get "with-mm" : : "yes" ]
|
| 269 |
+
{
|
| 270 |
+
alias mm :
|
| 271 |
+
moses/TranslationModel/UG/mm//mtt-build
|
| 272 |
+
moses/TranslationModel/UG/mm//symal2mam
|
| 273 |
+
moses/TranslationModel/UG/mm//mmlex-build
|
| 274 |
+
;
|
| 275 |
+
}
|
| 276 |
+
else
|
| 277 |
+
{
|
| 278 |
+
alias mm ;
|
| 279 |
+
}
|
| 280 |
+
|
| 281 |
+
if [ option.get "with-rephraser" : : "yes" ]
|
| 282 |
+
{
|
| 283 |
+
alias rephraser :
|
| 284 |
+
contrib/rephraser//paraphrase
|
| 285 |
+
;
|
| 286 |
+
}
|
| 287 |
+
else
|
| 288 |
+
{
|
| 289 |
+
alias rephraser ;
|
| 290 |
+
}
|
| 291 |
+
|
| 292 |
+
alias programs :
|
| 293 |
+
lm//programs
|
| 294 |
+
moses-cmd//programs
|
| 295 |
+
OnDiskPt//CreateOnDiskPt
|
| 296 |
+
OnDiskPt//queryOnDiskPt
|
| 297 |
+
mert//programs
|
| 298 |
+
misc//programs
|
| 299 |
+
symal
|
| 300 |
+
phrase-extract
|
| 301 |
+
phrase-extract//lexical-reordering
|
| 302 |
+
phrase-extract//extract-ghkm
|
| 303 |
+
phrase-extract//pcfg-extract
|
| 304 |
+
phrase-extract//pcfg-score
|
| 305 |
+
phrase-extract//extract-mixed-syntax
|
| 306 |
+
phrase-extract//score-stsg
|
| 307 |
+
phrase-extract//filter-rule-table
|
| 308 |
+
phrase-extract//postprocess-egret-forests
|
| 309 |
+
biconcor
|
| 310 |
+
# contrib/mira//mira
|
| 311 |
+
contrib/server//mosesserver
|
| 312 |
+
mm
|
| 313 |
+
mm-extras
|
| 314 |
+
rephraser
|
| 315 |
+
contrib/c++tokenizer//tokenizer
|
| 316 |
+
contrib/expected-bleu-training//train-expected-bleu
|
| 317 |
+
contrib/expected-bleu-training//prepare-expected-bleu-training
|
| 318 |
+
|
| 319 |
+
probingpt//programs
|
| 320 |
+
moses2//programs
|
| 321 |
+
;
|
| 322 |
+
|
| 323 |
+
|
| 324 |
+
install-bin-libs programs ;
|
| 325 |
+
install-headers headers-base : [ path.glob-tree biconcor contrib lm mert misc moses-cmd OnDiskPt phrase-extract symal util : *.hh *.h ] : . ;
|
| 326 |
+
install-headers headers-moses : moses//headers-to-install : moses ;
|
| 327 |
+
|
| 328 |
+
alias install : prefix-bin prefix-lib headers-base headers-moses ;
|
| 329 |
+
|
| 330 |
+
if ! [ option.get "includedir" : : $(prefix)/include ] {
|
| 331 |
+
explicit install headers-base headers-moses ;
|
| 332 |
+
}
|
| 333 |
+
|
| 334 |
+
if [ path.exists $(TOP)/dist ] && $(prefix) != dist {
|
| 335 |
+
echo "You have a $(TOP)/dist directory, but the build system now places files directly in the root i.e. $(TOP)/bin ." ;
|
| 336 |
+
echo "To disable this message, delete $(TOP)/dist ." ;
|
| 337 |
+
echo ;
|
| 338 |
+
}
|
| 339 |
+
|
| 340 |
+
#local temp = [ _shell "bash source ./s.sh" ] ;
|
| 341 |
+
local temp = [ _shell "mkdir -p $(PREFIX)/bin" ] ;
|
| 342 |
+
local temp = [ _shell "rm -f $(PREFIX)/bin/moses_chart" ] ;
|
| 343 |
+
local temp = [ _shell "cd $(PREFIX)/bin && ln -sf moses moses_chart" ] ;
|
| 344 |
+
local temp = [ _shell "cd $(PREFIX)/bin && ln -sf CreateProbingPT CreateProbingPT2" ] ;
|
| 345 |
+
|
mosesdecoder/OnDiskPt/Jamfile
ADDED
|
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
fakelib OnDiskPt : OnDiskWrapper.cpp SourcePhrase.cpp TargetPhrase.cpp Word.cpp Phrase.cpp PhraseNode.cpp TargetPhraseCollection.cpp Vocab.cpp OnDiskQuery.cpp ../moses//headers ;
|
| 2 |
+
|
| 3 |
+
exe CreateOnDiskPt : Main.cpp ..//boost_filesystem ../moses//moses OnDiskPt ;
|
| 4 |
+
exe queryOnDiskPt : queryOnDiskPt.cpp ..//boost_filesystem ../moses//moses OnDiskPt ;
|
| 5 |
+
|
mosesdecoder/OnDiskPt/Main.cpp
ADDED
|
@@ -0,0 +1,273 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
// $Id$
|
| 2 |
+
/***********************************************************************
|
| 3 |
+
Moses - factored phrase-based, hierarchical and syntactic language decoder
|
| 4 |
+
Copyright (C) 2009 Hieu Hoang
|
| 5 |
+
|
| 6 |
+
This library is free software; you can redistribute it and/or
|
| 7 |
+
modify it under the terms of the GNU Lesser General Public
|
| 8 |
+
License as published by the Free Software Foundation; either
|
| 9 |
+
version 2.1 of the License, or (at your option) any later version.
|
| 10 |
+
|
| 11 |
+
This library is distributed in the hope that it will be useful,
|
| 12 |
+
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
| 13 |
+
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
| 14 |
+
Lesser General Public License for more details.
|
| 15 |
+
|
| 16 |
+
You should have received a copy of the GNU Lesser General Public
|
| 17 |
+
License along with this library; if not, write to the Free Software
|
| 18 |
+
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
| 19 |
+
***********************************************************************/
|
| 20 |
+
|
| 21 |
+
#include <algorithm>
|
| 22 |
+
#include <iostream>
|
| 23 |
+
#include <string>
|
| 24 |
+
#include <vector>
|
| 25 |
+
#include <iterator>
|
| 26 |
+
#include <cassert>
|
| 27 |
+
#include "moses/InputFileStream.h"
|
| 28 |
+
#include "moses/Timer.h"
|
| 29 |
+
#include "moses/Util.h"
|
| 30 |
+
#include "OnDiskWrapper.h"
|
| 31 |
+
#include "SourcePhrase.h"
|
| 32 |
+
#include "TargetPhrase.h"
|
| 33 |
+
#include "TargetPhraseCollection.h"
|
| 34 |
+
#include "Word.h"
|
| 35 |
+
#include "Vocab.h"
|
| 36 |
+
#include "Main.h"
|
| 37 |
+
|
| 38 |
+
using namespace std;
|
| 39 |
+
using namespace OnDiskPt;
|
| 40 |
+
|
| 41 |
+
int main (int argc, char * const argv[])
|
| 42 |
+
{
|
| 43 |
+
// insert code here...
|
| 44 |
+
Moses::ResetUserTime();
|
| 45 |
+
Moses::PrintUserTime("Starting");
|
| 46 |
+
|
| 47 |
+
if (argc != 8) {
|
| 48 |
+
std::cerr << "Usage: " << argv[0] << " numSourceFactors numTargetFactors numScores tableLimit sortScoreIndex inputPath outputPath" << std::endl;
|
| 49 |
+
return 1;
|
| 50 |
+
}
|
| 51 |
+
|
| 52 |
+
int numSourceFactors = Moses::Scan<int>(argv[1])
|
| 53 |
+
, numTargetFactors = Moses::Scan<int>(argv[2])
|
| 54 |
+
, numScores = Moses::Scan<int>(argv[3])
|
| 55 |
+
, tableLimit = Moses::Scan<int>(argv[4]);
|
| 56 |
+
TargetPhraseCollection::s_sortScoreInd = Moses::Scan<int>(argv[5]);
|
| 57 |
+
assert(TargetPhraseCollection::s_sortScoreInd < numScores);
|
| 58 |
+
|
| 59 |
+
const string filePath = argv[6]
|
| 60 |
+
,destPath = argv[7];
|
| 61 |
+
|
| 62 |
+
Moses::InputFileStream inStream(filePath);
|
| 63 |
+
|
| 64 |
+
OnDiskWrapper onDiskWrapper;
|
| 65 |
+
onDiskWrapper.BeginSave(destPath, numSourceFactors, numTargetFactors, numScores);
|
| 66 |
+
|
| 67 |
+
PhraseNode &rootNode = onDiskWrapper.GetRootSourceNode();
|
| 68 |
+
size_t lineNum = 0;
|
| 69 |
+
string line;
|
| 70 |
+
|
| 71 |
+
while(getline(inStream, line)) {
|
| 72 |
+
lineNum++;
|
| 73 |
+
if (lineNum%1000 == 0) cerr << "." << flush;
|
| 74 |
+
if (lineNum%10000 == 0) cerr << ":" << flush;
|
| 75 |
+
if (lineNum%100000 == 0) cerr << lineNum << flush;
|
| 76 |
+
//cerr << lineNum << " " << line << endl;
|
| 77 |
+
|
| 78 |
+
std::vector<float> misc(1);
|
| 79 |
+
SourcePhrase sourcePhrase;
|
| 80 |
+
TargetPhrase *targetPhrase = new TargetPhrase(numScores);
|
| 81 |
+
OnDiskPt::PhrasePtr spShort = Tokenize(sourcePhrase, *targetPhrase, line, onDiskWrapper, numScores, misc);
|
| 82 |
+
assert(misc.size() == onDiskWrapper.GetNumCounts());
|
| 83 |
+
|
| 84 |
+
rootNode.AddTargetPhrase(sourcePhrase, targetPhrase, onDiskWrapper, tableLimit, misc, spShort);
|
| 85 |
+
}
|
| 86 |
+
|
| 87 |
+
rootNode.Save(onDiskWrapper, 0, tableLimit);
|
| 88 |
+
onDiskWrapper.EndSave();
|
| 89 |
+
|
| 90 |
+
Moses::PrintUserTime("Finished");
|
| 91 |
+
|
| 92 |
+
//pause();
|
| 93 |
+
return 0;
|
| 94 |
+
|
| 95 |
+
} // main()
|
| 96 |
+
|
| 97 |
+
/**
 * Compares the current source phrase against the previous one.
 *
 * @param prevSourcePhrase previous source phrase, or NULL if there is none
 * @param currSourcePhrase current source phrase; must not be NULL when
 *                         prevSourcePhrase is non-NULL
 * @return false when prevSourcePhrase is NULL, otherwise the result of
 *         (*currSourcePhrase > *prevSourcePhrase)
 */
bool Flush(const OnDiskPt::SourcePhrase *prevSourcePhrase, const OnDiskPt::SourcePhrase *currSourcePhrase)
{
  // Without a previous phrase there is nothing to compare against.
  if (!prevSourcePhrase) {
    return false;
  }

  assert(currSourcePhrase);
  return *currSourcePhrase > *prevSourcePhrase;
}
|
| 108 |
+
|
| 109 |
+
/**
 * Parses one phrase-table line into a source phrase and a target phrase.
 *
 * The line is split on single spaces; each "|||" token advances to the next
 * field. Fields are handled in this order:
 *   0 = source phrase
 *   1 = target phrase
 *   2 = scores
 *   3 = alignment
 *   4 = counts
 *   5 = sparse features
 *   6 = properties
 * Any token in a later field is reported on stderr and trips an assert.
 *
 * @param sourcePhrase  receives the words of the source field
 * @param targetPhrase  receives target words, scores, alignment, sparse
 *                      features and properties
 * @param lineStr       the raw input line (left untouched)
 * @param onDiskWrapper forwarded to the per-token Tokenize() overload
 * @param numScores     number of score tokens the line is expected to carry
 *                      (checked with assert)
 * @param misc          misc[0] receives the count value; the caller must
 *                      supply at least one element
 * @return a phrase holding the non-NULL words returned by the per-token
 *         Tokenize() for the source field
 */
OnDiskPt::PhrasePtr Tokenize(SourcePhrase &sourcePhrase, TargetPhrase &targetPhrase, const std::string &lineStr, OnDiskWrapper &onDiskWrapper, int numScores, vector<float> &misc)
{
  // strtok() writes NUL bytes into its input, so work on a private,
  // NUL-terminated copy. The copy lives in a vector instead of a
  // variable-length array: VLAs are a compiler extension in C++ and would
  // place the whole line on the stack.
  std::vector<char> lineBuf(lineStr.begin(), lineStr.end());
  lineBuf.push_back('\0');
  char *line = &lineBuf[0];

  stringstream sparseFeatures, property;

  size_t scoreInd = 0;

  // Index of the field currently being read; see the list in the header
  // comment.
  size_t stage = 0;

  char *tok = strtok (line," ");
  OnDiskPt::PhrasePtr out(new Phrase());
  while (tok != NULL) {
    if (0 == strcmp(tok, "|||")) {
      ++stage;
    } else {
      switch (stage) {
      case 0: {
        WordPtr w = Tokenize(sourcePhrase, tok, true, true, onDiskWrapper, 1);
        if (w != NULL)
          out->AddWord(w);

        break;
      }
      case 1: {
        Tokenize(targetPhrase, tok, false, true, onDiskWrapper, 0);
        break;
      }
      case 2: {
        float score = Moses::Scan<float>(tok);
        targetPhrase.SetScore(score, scoreInd);
        ++scoreInd;
        break;
      }
      case 3: {
        targetPhrase.CreateAlignFromString(tok);
        break;
      }
      case 4: {
        // Every count token overwrites misc[0], so the last count on the
        // line is the one that is kept (presumably the rule count when
        // three counts are present - confirm against the table format).
        float val = Moses::Scan<float>(tok);
        misc[0] = val;
        break;
      }
      case 5: {
        // sparse features
        sparseFeatures << tok << " ";
        break;
      }
      case 6: {
        // properties
        property << tok << " ";
        break;
      }
      default:
        // Report the untouched input: the working buffer has been cut at
        // the first separator by strtok() and would show one token only.
        cerr << "ERROR in line " << lineStr << endl;
        assert(false);
        break;
      }
    }

    tok = strtok (NULL, " ");
  } // while (tok != NULL)

  assert(scoreInd == static_cast<size_t>(numScores));
  targetPhrase.SetSparseFeatures(Moses::Trim(sparseFeatures.str()));
  targetPhrase.SetProperty(Moses::Trim(property.str()));
  targetPhrase.SortAlign();
  return out;
} // Tokenize()
|
| 187 |
+
|
| 188 |
+
/**
 * Converts a single token into one or two words and appends them to phrase.
 *
 * A token that starts with '[' and ends with ']' is treated as a
 * non-terminal. If it contains a second '[' at index 2 or later it is split
 * there into a source part and a target part; otherwise the whole token
 * becomes a single word. Any other token is an ordinary terminal word.
 *
 * @param phrase           phrase the created words are appended to
 * @param token            the token to convert
 * @param addSourceNonTerm append the source part of a split non-terminal
 * @param addTargetNonTerm append the target part of a split non-terminal
 * @param onDiskWrapper    supplies the vocabulary used to create words
 * @param retSourceTarget  for split non-terminals: 1 returns the source
 *                         word, 2 returns the target word, anything else
 *                         returns nothing
 * @return the terminal word; for a split non-terminal the word selected by
 *         retSourceTarget (if it was added); an empty pointer otherwise
 */
OnDiskPt::WordPtr Tokenize(OnDiskPt::Phrase &phrase
                           , const std::string &token, bool addSourceNonTerm, bool addTargetNonTerm
                           , OnDiskPt::OnDiskWrapper &onDiskWrapper, int retSourceTarget)
{
  const size_t length = token.size();
  const bool isNonTerm = length > 0 && token[0] == '[' && token[length - 1] == ']';

  // Ordinary terminal: one word, which is also the return value.
  if (!isNonTerm) {
    WordPtr terminal(new Word());
    terminal->CreateFromString(token, onDiskWrapper.GetVocab());
    phrase.AddWord(terminal);
    return terminal;
  }

  const size_t secondBracket = token.find('[', 2);

  // Single-bracket non-terminal (the lhs): the whole token is one word and
  // nothing is returned.
  if (secondBracket == string::npos) {
    WordPtr lhs(new Word());
    lhs->CreateFromString(token, onDiskWrapper.GetVocab());
    phrase.AddWord(lhs);
    return OnDiskPt::WordPtr();
  }

  // Combined source & target non-terminal.
  OnDiskPt::WordPtr selected;

  if (addSourceNonTerm) {
    const string sourcePart = token.substr(0, secondBracket);
    WordPtr sourceWord(new Word());
    sourceWord->CreateFromString(sourcePart, onDiskWrapper.GetVocab());
    phrase.AddWord(sourceWord);
    if (retSourceTarget == 1) {
      selected = sourceWord;
    }
  }

  if (addTargetNonTerm) {
    const string targetPart = token.substr(secondBracket);
    WordPtr targetWord(new Word());
    targetWord->CreateFromString(targetPart, onDiskWrapper.GetVocab());
    phrase.AddWord(targetWord);
    if (retSourceTarget == 2) {
      selected = targetWord;
    }
  }

  return selected;
}
|
| 248 |
+
|
| 249 |
+
void InsertTargetNonTerminals(std::vector<std::string> &sourceToks, const std::vector<std::string> &targetToks, const ::AlignType &alignments)
|
| 250 |
+
{
|
| 251 |
+
for (int ind = alignments.size() - 1; ind >= 0; --ind) {
|
| 252 |
+
const ::AlignPair &alignPair = alignments[ind];
|
| 253 |
+
size_t sourcePos = alignPair.first
|
| 254 |
+
,targetPos = alignPair.second;
|
| 255 |
+
|
| 256 |
+
const string &target = targetToks[targetPos];
|
| 257 |
+
sourceToks.insert(sourceToks.begin() + sourcePos + 1, target);
|
| 258 |
+
|
| 259 |
+
}
|
| 260 |
+
}
|
| 261 |
+
|
| 262 |
+
class AlignOrderer
|
| 263 |
+
{
|
| 264 |
+
public:
|
| 265 |
+
bool operator()(const ::AlignPair &a, const ::AlignPair &b) const {
|
| 266 |
+
return a.first < b.first;
|
| 267 |
+
}
|
| 268 |
+
};
|
| 269 |
+
|
| 270 |
+
/**
 * Sorts the alignment pairs in place with AlignOrderer, i.e. ascending by
 * the first element of each pair. std::sort is used, so the relative order
 * of pairs with equal first elements is unspecified.
 */
void SortAlign(::AlignType &alignments)
{
  const AlignOrderer byFirstElement = AlignOrderer();
  std::sort(alignments.begin(), alignments.end(), byFirstElement);
}
|
mosesdecoder/OnDiskPt/Main.h
ADDED
|
@@ -0,0 +1,39 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#pragma once
|
| 2 |
+
// $Id$
|
| 3 |
+
/***********************************************************************
|
| 4 |
+
Moses - factored phrase-based, hierarchical and syntactic language decoder
|
| 5 |
+
Copyright (C) 2009 Hieu Hoang
|
| 6 |
+
|
| 7 |
+
This library is free software; you can redistribute it and/or
|
| 8 |
+
modify it under the terms of the GNU Lesser General Public
|
| 9 |
+
License as published by the Free Software Foundation; either
|
| 10 |
+
version 2.1 of the License, or (at your option) any later version.
|
| 11 |
+
|
| 12 |
+
This library is distributed in the hope that it will be useful,
|
| 13 |
+
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
| 14 |
+
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
| 15 |
+
Lesser General Public License for more details.
|
| 16 |
+
|
| 17 |
+
You should have received a copy of the GNU Lesser General Public
|
| 18 |
+
License along with this library; if not, write to the Free Software
|
| 19 |
+
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
| 20 |
+
***********************************************************************/
|
| 21 |
+
#include <string>
|
| 22 |
+
#include "SourcePhrase.h"
|
| 23 |
+
#include "TargetPhrase.h"
|
| 24 |
+
|
| 25 |
+
typedef std::pair<size_t, size_t> AlignPair;
|
| 26 |
+
typedef std::vector<AlignPair> AlignType;
|
| 27 |
+
|
| 28 |
+
OnDiskPt::WordPtr Tokenize(OnDiskPt::Phrase &phrase
|
| 29 |
+
, const std::string &token, bool addSourceNonTerm, bool addTargetNonTerm
|
| 30 |
+
, OnDiskPt::OnDiskWrapper &onDiskWrapper, int retSourceTarget);
|
| 31 |
+
OnDiskPt::PhrasePtr Tokenize(OnDiskPt::SourcePhrase &sourcePhrase, OnDiskPt::TargetPhrase &targetPhrase
|
| 32 |
+
, const std::string &lineStr, OnDiskPt::OnDiskWrapper &onDiskWrapper
|
| 33 |
+
, int numScores
|
| 34 |
+
, std::vector<float> &misc);
|
| 35 |
+
|
| 36 |
+
void InsertTargetNonTerminals(std::vector<std::string> &sourceToks, const std::vector<std::string> &targetToks, const AlignType &alignments);
|
| 37 |
+
void SortAlign(AlignType &alignments);
|
| 38 |
+
bool Flush(const OnDiskPt::SourcePhrase *prevSource, const OnDiskPt::SourcePhrase *currSource);
|
| 39 |
+
|
mosesdecoder/OnDiskPt/OnDiskQuery.cpp
ADDED
|
@@ -0,0 +1,83 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#include "OnDiskQuery.h"
|
| 2 |
+
|
| 3 |
+
namespace OnDiskPt
|
| 4 |
+
{
|
| 5 |
+
|
| 6 |
+
/**
 * Converts a single token into one or two words and appends them to phrase,
 * using the vocabulary of the wrapped OnDiskWrapper.
 *
 * A token that starts with '[' and ends with ']' is a non-terminal. If it
 * contains a second '[' at index 2 or later, it is split there into a source
 * part and a target part, each appended only when the matching flag is set.
 * A non-terminal without a second '[' and any other token are appended as a
 * single word.
 *
 * @param phrase           phrase the created words are appended to
 * @param token            the token to convert
 * @param addSourceNonTerm append the source part of a split non-terminal
 * @param addTargetNonTerm append the target part of a split non-terminal
 */
void OnDiskQuery::Tokenize(Phrase &phrase,
                           const std::string &token,
                           bool addSourceNonTerm,
                           bool addTargetNonTerm)
{
  const size_t length = token.size();
  const bool bracketed = length > 0 && token[0] == '[' && token[length - 1] == ']';

  // Position of the second '[' when the token is a combined
  // source/target non-terminal; npos in every other case.
  const size_t secondBracket = bracketed ? token.find('[', 2) : std::string::npos;

  if (secondBracket == std::string::npos) {
    // Terminal, or a single-bracket (lhs) non-terminal: the token is one word.
    WordPtr single(new Word());
    single->CreateFromString(token, m_wrapper.GetVocab());
    phrase.AddWord(single);
    return;
  }

  if (addSourceNonTerm) {
    const std::string sourcePart = token.substr(0, secondBracket);
    WordPtr sourceWord(new Word());
    sourceWord->CreateFromString(sourcePart, m_wrapper.GetVocab());
    phrase.AddWord(sourceWord);
  }

  if (addTargetNonTerm) {
    const std::string targetPart = token.substr(secondBracket);
    WordPtr targetWord(new Word());
    targetWord->CreateFromString(targetPart, m_wrapper.GetVocab());
    phrase.AddWord(targetWord);
  }
}
|
| 53 |
+
|
| 54 |
+
/**
 * Builds a source phrase from a list of tokens.
 *
 * Every token except the last is tokenized with both non-terminal flags set;
 * the last token is tokenized with addSourceNonTerm switched off (the
 * original code labels this position the LHS non-terminal). An empty token
 * list yields an empty phrase.
 */
SourcePhrase OnDiskQuery::Tokenize(const std::vector<std::string>& tokens)
{
  SourcePhrase sourcePhrase;

  if (!tokens.empty()) {
    const size_t lastIndex = tokens.size() - 1;

    for (size_t index = 0; index < lastIndex; ++index) {
      Tokenize(sourcePhrase, tokens[index], true, true);
    }

    // last position. LHS non-term
    Tokenize(sourcePhrase, tokens[lastIndex], false, true);
  }

  return sourcePhrase;
}
|
| 67 |
+
|
| 68 |
+
/**
 * Looks up the node reached by following sourcePhrase word by word.
 *
 * Starts at the wrapper's root source node and descends one child per word
 * via GetChild().
 *
 * @return the node reached after the last word (the root node for an empty
 *         phrase), or NULL as soon as a word has no matching child
 */
const PhraseNode* OnDiskQuery::Query(const SourcePhrase& sourcePhrase)
{
  const PhraseNode *current = &m_wrapper.GetRootSourceNode();
  assert(current);

  // Stop descending once a lookup fails; the NULL is handed back as is.
  for (size_t wordIndex = 0; current != NULL && wordIndex < sourcePhrase.GetSize(); ++wordIndex) {
    current = current->GetChild(sourcePhrase.GetWord(wordIndex), m_wrapper);
  }

  return current;
}
|
| 82 |
+
|
| 83 |
+
}
|
mosesdecoder/OnDiskPt/OnDiskQuery.h
ADDED
|
@@ -0,0 +1,39 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#pragma once
|
| 2 |
+
#include <string>
|
| 3 |
+
#include <vector>
|
| 4 |
+
#include "OnDiskWrapper.h"
|
| 5 |
+
#include "Phrase.h"
|
| 6 |
+
#include "SourcePhrase.h"
|
| 7 |
+
#include "Word.h"
|
| 8 |
+
#include "PhraseNode.h"
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
namespace OnDiskPt
|
| 12 |
+
{
|
| 13 |
+
|
| 14 |
+
class OnDiskQuery
|
| 15 |
+
{
|
| 16 |
+
private:
|
| 17 |
+
OnDiskWrapper &m_wrapper;
|
| 18 |
+
|
| 19 |
+
public:
|
| 20 |
+
|
| 21 |
+
OnDiskQuery(OnDiskWrapper &wrapper):m_wrapper(wrapper) {}
|
| 22 |
+
|
| 23 |
+
void Tokenize(Phrase &phrase,
|
| 24 |
+
const std::string &token,
|
| 25 |
+
bool addSourceNonTerm,
|
| 26 |
+
bool addTargetNonTerm);
|
| 27 |
+
|
| 28 |
+
SourcePhrase Tokenize(const std::vector<std::string>& tokens);
|
| 29 |
+
|
| 30 |
+
const PhraseNode *Query(const SourcePhrase& sourcePhrase);
|
| 31 |
+
|
| 32 |
+
inline const PhraseNode *Query(const std::vector<std::string>& tokens) {
|
| 33 |
+
return Query(Tokenize(tokens));
|
| 34 |
+
}
|
| 35 |
+
|
| 36 |
+
};
|
| 37 |
+
|
| 38 |
+
|
| 39 |
+
}
|
mosesdecoder/OnDiskPt/OnDiskWrapper.cpp
ADDED
|
@@ -0,0 +1,223 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
// $Id$
|
| 2 |
+
/***********************************************************************
|
| 3 |
+
Moses - factored phrase-based, hierarchical and syntactic language decoder
|
| 4 |
+
Copyright (C) 2009 Hieu Hoang
|
| 5 |
+
|
| 6 |
+
This library is free software; you can redistribute it and/or
|
| 7 |
+
modify it under the terms of the GNU Lesser General Public
|
| 8 |
+
License as published by the Free Software Foundation; either
|
| 9 |
+
version 2.1 of the License, or (at your option) any later version.
|
| 10 |
+
|
| 11 |
+
This library is distributed in the hope that it will be useful,
|
| 12 |
+
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
| 13 |
+
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
| 14 |
+
Lesser General Public License for more details.
|
| 15 |
+
|
| 16 |
+
You should have received a copy of the GNU Lesser General Public
|
| 17 |
+
License along with this library; if not, write to the Free Software
|
| 18 |
+
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
| 19 |
+
***********************************************************************/
|
| 20 |
+
#ifdef WIN32
|
| 21 |
+
#include <direct.h>
|
| 22 |
+
#endif
|
| 23 |
+
#include <sys/stat.h>
|
| 24 |
+
#include <string>
|
| 25 |
+
#include "OnDiskWrapper.h"
|
| 26 |
+
#include "moses/Util.h"
|
| 27 |
+
#include "util/exception.hh"
|
| 28 |
+
#include "util/string_stream.hh"
|
| 29 |
+
|
| 30 |
+
using namespace std;
|
| 31 |
+
|
| 32 |
+
namespace OnDiskPt
|
| 33 |
+
{
|
| 34 |
+
|
| 35 |
+
int OnDiskWrapper::VERSION_NUM = 7;
|
| 36 |
+
|
| 37 |
+
OnDiskWrapper::OnDiskWrapper()
|
| 38 |
+
{
|
| 39 |
+
}
|
| 40 |
+
|
| 41 |
+
//! Releases the root source node allocated by BeginLoad() or BeginSave().
OnDiskWrapper::~OnDiskWrapper()
{
  // the wrapper is the sole owner of the root node
  delete m_rootSourceNode;
}
|
| 45 |
+
|
| 46 |
+
void OnDiskWrapper::BeginLoad(const std::string &filePath)
|
| 47 |
+
{
|
| 48 |
+
if (!OpenForLoad(filePath)) {
|
| 49 |
+
UTIL_THROW(util::FileOpenException, "Couldn't open for loading: " << filePath);
|
| 50 |
+
}
|
| 51 |
+
|
| 52 |
+
if (!m_vocab.Load(*this))
|
| 53 |
+
UTIL_THROW(util::FileOpenException, "Couldn't load vocab");
|
| 54 |
+
|
| 55 |
+
uint64_t rootFilePos = GetMisc("RootNodeOffset");
|
| 56 |
+
m_rootSourceNode = new PhraseNode(rootFilePos, *this);
|
| 57 |
+
}
|
| 58 |
+
|
| 59 |
+
bool OnDiskWrapper::OpenForLoad(const std::string &filePath)
|
| 60 |
+
{
|
| 61 |
+
m_fileSource.open((filePath + "/Source.dat").c_str(), ios::in | ios::binary);
|
| 62 |
+
UTIL_THROW_IF(!m_fileSource.is_open(),
|
| 63 |
+
util::FileOpenException,
|
| 64 |
+
"Couldn't open file " << filePath << "/Source.dat");
|
| 65 |
+
|
| 66 |
+
m_fileTargetInd.open((filePath + "/TargetInd.dat").c_str(), ios::in | ios::binary);
|
| 67 |
+
UTIL_THROW_IF(!m_fileTargetInd.is_open(),
|
| 68 |
+
util::FileOpenException,
|
| 69 |
+
"Couldn't open file " << filePath << "/TargetInd.dat");
|
| 70 |
+
|
| 71 |
+
m_fileTargetColl.open((filePath + "/TargetColl.dat").c_str(), ios::in | ios::binary);
|
| 72 |
+
UTIL_THROW_IF(!m_fileTargetColl.is_open(),
|
| 73 |
+
util::FileOpenException,
|
| 74 |
+
"Couldn't open file " << filePath << "/TargetColl.dat");
|
| 75 |
+
|
| 76 |
+
m_fileVocab.open((filePath + "/Vocab.dat").c_str(), ios::in);
|
| 77 |
+
UTIL_THROW_IF(!m_fileVocab.is_open(),
|
| 78 |
+
util::FileOpenException,
|
| 79 |
+
"Couldn't open file " << filePath << "/Vocab.dat");
|
| 80 |
+
|
| 81 |
+
m_fileMisc.open((filePath + "/Misc.dat").c_str(), ios::in);
|
| 82 |
+
UTIL_THROW_IF(!m_fileMisc.is_open(),
|
| 83 |
+
util::FileOpenException,
|
| 84 |
+
"Couldn't open file " << filePath << "/Misc.dat");
|
| 85 |
+
|
| 86 |
+
// set up root node
|
| 87 |
+
LoadMisc();
|
| 88 |
+
m_numSourceFactors = GetMisc("NumSourceFactors");
|
| 89 |
+
m_numTargetFactors = GetMisc("NumTargetFactors");
|
| 90 |
+
m_numScores = GetMisc("NumScores");
|
| 91 |
+
|
| 92 |
+
return true;
|
| 93 |
+
}
|
| 94 |
+
|
| 95 |
+
bool OnDiskWrapper::LoadMisc()
|
| 96 |
+
{
|
| 97 |
+
char line[100000];
|
| 98 |
+
|
| 99 |
+
while(m_fileMisc.getline(line, 100000)) {
|
| 100 |
+
vector<string> tokens;
|
| 101 |
+
Moses::Tokenize(tokens, line);
|
| 102 |
+
UTIL_THROW_IF2(tokens.size() != 2, "Except key value. Found " << line);
|
| 103 |
+
|
| 104 |
+
|
| 105 |
+
const string &key = tokens[0];
|
| 106 |
+
m_miscInfo[key] = Moses::Scan<uint64_t>(tokens[1]);
|
| 107 |
+
}
|
| 108 |
+
|
| 109 |
+
return true;
|
| 110 |
+
}
|
| 111 |
+
|
| 112 |
+
void OnDiskWrapper::BeginSave(const std::string &filePath
|
| 113 |
+
, int numSourceFactors, int numTargetFactors, int numScores)
|
| 114 |
+
{
|
| 115 |
+
m_numSourceFactors = numSourceFactors;
|
| 116 |
+
m_numTargetFactors = numTargetFactors;
|
| 117 |
+
m_numScores = numScores;
|
| 118 |
+
m_filePath = filePath;
|
| 119 |
+
|
| 120 |
+
#ifdef WIN32
|
| 121 |
+
mkdir(filePath.c_str());
|
| 122 |
+
#else
|
| 123 |
+
mkdir(filePath.c_str(), 0777);
|
| 124 |
+
#endif
|
| 125 |
+
|
| 126 |
+
m_fileSource.open((filePath + "/Source.dat").c_str(), ios::out | ios::in | ios::binary | ios::ate | ios::trunc);
|
| 127 |
+
UTIL_THROW_IF(!m_fileSource.is_open(),
|
| 128 |
+
util::FileOpenException,
|
| 129 |
+
"Couldn't open file " << filePath << "/Source.dat");
|
| 130 |
+
|
| 131 |
+
m_fileTargetInd.open((filePath + "/TargetInd.dat").c_str(), ios::out | ios::binary | ios::ate | ios::trunc);
|
| 132 |
+
UTIL_THROW_IF(!m_fileTargetInd.is_open(),
|
| 133 |
+
util::FileOpenException,
|
| 134 |
+
"Couldn't open file " << filePath << "/TargetInd.dat");
|
| 135 |
+
|
| 136 |
+
m_fileTargetColl.open((filePath + "/TargetColl.dat").c_str(), ios::out | ios::binary | ios::ate | ios::trunc);
|
| 137 |
+
UTIL_THROW_IF(!m_fileTargetColl.is_open(),
|
| 138 |
+
util::FileOpenException,
|
| 139 |
+
"Couldn't open file " << filePath << "/TargetColl.dat");
|
| 140 |
+
|
| 141 |
+
m_fileVocab.open((filePath + "/Vocab.dat").c_str(), ios::out | ios::ate | ios::trunc);
|
| 142 |
+
UTIL_THROW_IF(!m_fileVocab.is_open(),
|
| 143 |
+
util::FileOpenException,
|
| 144 |
+
"Couldn't open file " << filePath << "/Vocab.dat");
|
| 145 |
+
|
| 146 |
+
m_fileMisc.open((filePath + "/Misc.dat").c_str(), ios::out | ios::ate | ios::trunc);
|
| 147 |
+
UTIL_THROW_IF(!m_fileMisc.is_open(),
|
| 148 |
+
util::FileOpenException,
|
| 149 |
+
"Couldn't open file " << filePath << "/Misc.dat");
|
| 150 |
+
|
| 151 |
+
// offset by 1. 0 offset is reserved
|
| 152 |
+
char c = 0xff;
|
| 153 |
+
m_fileSource.write(&c, 1);
|
| 154 |
+
UTIL_THROW_IF2(1 != m_fileSource.tellp(),
|
| 155 |
+
"Couldn't write to stream m_fileSource");
|
| 156 |
+
|
| 157 |
+
m_fileTargetInd.write(&c, 1);
|
| 158 |
+
UTIL_THROW_IF2(1 != m_fileTargetInd.tellp(),
|
| 159 |
+
"Couldn't write to stream m_fileTargetInd");
|
| 160 |
+
|
| 161 |
+
m_fileTargetColl.write(&c, 1);
|
| 162 |
+
UTIL_THROW_IF2(1 != m_fileTargetColl.tellp(),
|
| 163 |
+
"Couldn't write to stream m_fileTargetColl");
|
| 164 |
+
|
| 165 |
+
// set up root node
|
| 166 |
+
UTIL_THROW_IF2(GetNumCounts() != 1,
|
| 167 |
+
"Not sure what this is...");
|
| 168 |
+
|
| 169 |
+
vector<float> counts(GetNumCounts());
|
| 170 |
+
counts[0] = DEFAULT_COUNT;
|
| 171 |
+
m_rootSourceNode = new PhraseNode();
|
| 172 |
+
m_rootSourceNode->AddCounts(counts);
|
| 173 |
+
}
|
| 174 |
+
|
| 175 |
+
void OnDiskWrapper::EndSave()
|
| 176 |
+
{
|
| 177 |
+
bool ret = m_rootSourceNode->Saved();
|
| 178 |
+
UTIL_THROW_IF2(!ret, "Root node not saved");
|
| 179 |
+
|
| 180 |
+
GetVocab().Save(*this);
|
| 181 |
+
|
| 182 |
+
SaveMisc();
|
| 183 |
+
|
| 184 |
+
m_fileMisc.close();
|
| 185 |
+
m_fileVocab.close();
|
| 186 |
+
m_fileSource.close();
|
| 187 |
+
m_fileTarget.close();
|
| 188 |
+
m_fileTargetInd.close();
|
| 189 |
+
m_fileTargetColl.close();
|
| 190 |
+
}
|
| 191 |
+
|
| 192 |
+
void OnDiskWrapper::SaveMisc()
|
| 193 |
+
{
|
| 194 |
+
m_fileMisc << "Version " << VERSION_NUM << endl;
|
| 195 |
+
m_fileMisc << "NumSourceFactors " << m_numSourceFactors << endl;
|
| 196 |
+
m_fileMisc << "NumTargetFactors " << m_numTargetFactors << endl;
|
| 197 |
+
m_fileMisc << "NumScores " << m_numScores << endl;
|
| 198 |
+
m_fileMisc << "RootNodeOffset " << m_rootSourceNode->GetFilePos() << endl;
|
| 199 |
+
}
|
| 200 |
+
|
| 201 |
+
size_t OnDiskWrapper::GetSourceWordSize() const
|
| 202 |
+
{
|
| 203 |
+
return sizeof(uint64_t) + sizeof(char);
|
| 204 |
+
}
|
| 205 |
+
|
| 206 |
+
size_t OnDiskWrapper::GetTargetWordSize() const
|
| 207 |
+
{
|
| 208 |
+
return sizeof(uint64_t) + sizeof(char);
|
| 209 |
+
}
|
| 210 |
+
|
| 211 |
+
uint64_t OnDiskWrapper::GetMisc(const std::string &key) const
|
| 212 |
+
{
|
| 213 |
+
std::map<std::string, uint64_t>::const_iterator iter;
|
| 214 |
+
iter = m_miscInfo.find(key);
|
| 215 |
+
UTIL_THROW_IF2(iter == m_miscInfo.end()
|
| 216 |
+
, "Couldn't find value for key " << key
|
| 217 |
+
);
|
| 218 |
+
|
| 219 |
+
return iter->second;
|
| 220 |
+
}
|
| 221 |
+
|
| 222 |
+
|
| 223 |
+
}
|
mosesdecoder/OnDiskPt/OnDiskWrapper.h
ADDED
|
@@ -0,0 +1,111 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#pragma once
|
| 2 |
+
// $Id$
|
| 3 |
+
/***********************************************************************
|
| 4 |
+
Moses - factored phrase-based, hierarchical and syntactic language decoder
|
| 5 |
+
Copyright (C) 2009 Hieu Hoang
|
| 6 |
+
|
| 7 |
+
This library is free software; you can redistribute it and/or
|
| 8 |
+
modify it under the terms of the GNU Lesser General Public
|
| 9 |
+
License as published by the Free Software Foundation; either
|
| 10 |
+
version 2.1 of the License, or (at your option) any later version.
|
| 11 |
+
|
| 12 |
+
This library is distributed in the hope that it will be useful,
|
| 13 |
+
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
| 14 |
+
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
| 15 |
+
Lesser General Public License for more details.
|
| 16 |
+
|
| 17 |
+
You should have received a copy of the GNU Lesser General Public
|
| 18 |
+
License along with this library; if not, write to the Free Software
|
| 19 |
+
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
| 20 |
+
***********************************************************************/
|
| 21 |
+
#include <string>
|
| 22 |
+
#include <fstream>
|
| 23 |
+
#include "Vocab.h"
|
| 24 |
+
#include "PhraseNode.h"
|
| 25 |
+
|
| 26 |
+
namespace OnDiskPt
|
| 27 |
+
{
|
| 28 |
+
const float DEFAULT_COUNT = 66666;
|
| 29 |
+
|
| 30 |
+
/** Global class with misc information need to create and use the on-disk rule table.
|
| 31 |
+
* 1 object of this class should be instantiated per rule table.
|
| 32 |
+
* Currently only hierarchical/syntax models use this, but can & should be used with pb models too
|
| 33 |
+
*/
|
| 34 |
+
class OnDiskWrapper
|
| 35 |
+
{
|
| 36 |
+
protected:
|
| 37 |
+
Vocab m_vocab;
|
| 38 |
+
std::string m_filePath;
|
| 39 |
+
int m_numSourceFactors, m_numTargetFactors, m_numScores;
|
| 40 |
+
std::fstream m_fileMisc, m_fileVocab, m_fileSource, m_fileTarget, m_fileTargetInd, m_fileTargetColl;
|
| 41 |
+
|
| 42 |
+
size_t m_defaultNodeSize;
|
| 43 |
+
PhraseNode *m_rootSourceNode;
|
| 44 |
+
|
| 45 |
+
std::map<std::string, uint64_t> m_miscInfo;
|
| 46 |
+
|
| 47 |
+
void SaveMisc();
|
| 48 |
+
bool OpenForLoad(const std::string &filePath);
|
| 49 |
+
bool LoadMisc();
|
| 50 |
+
|
| 51 |
+
public:
|
| 52 |
+
static int VERSION_NUM;
|
| 53 |
+
|
| 54 |
+
OnDiskWrapper();
|
| 55 |
+
~OnDiskWrapper();
|
| 56 |
+
|
| 57 |
+
void BeginLoad(const std::string &filePath);
|
| 58 |
+
|
| 59 |
+
void BeginSave(const std::string &filePath
|
| 60 |
+
, int numSourceFactors, int numTargetFactors, int numScores);
|
| 61 |
+
void EndSave();
|
| 62 |
+
|
| 63 |
+
Vocab &GetVocab() {
|
| 64 |
+
return m_vocab;
|
| 65 |
+
}
|
| 66 |
+
const Vocab &GetVocab() const {
|
| 67 |
+
return m_vocab;
|
| 68 |
+
}
|
| 69 |
+
|
| 70 |
+
size_t GetSourceWordSize() const;
|
| 71 |
+
size_t GetTargetWordSize() const;
|
| 72 |
+
|
| 73 |
+
std::fstream &GetFileSource() {
|
| 74 |
+
return m_fileSource;
|
| 75 |
+
}
|
| 76 |
+
std::fstream &GetFileTargetInd() {
|
| 77 |
+
return m_fileTargetInd;
|
| 78 |
+
}
|
| 79 |
+
std::fstream &GetFileTargetColl() {
|
| 80 |
+
return m_fileTargetColl;
|
| 81 |
+
}
|
| 82 |
+
std::fstream &GetFileVocab() {
|
| 83 |
+
return m_fileVocab;
|
| 84 |
+
}
|
| 85 |
+
|
| 86 |
+
size_t GetNumSourceFactors() const {
|
| 87 |
+
return m_numSourceFactors;
|
| 88 |
+
}
|
| 89 |
+
size_t GetNumTargetFactors() const {
|
| 90 |
+
return m_numTargetFactors;
|
| 91 |
+
}
|
| 92 |
+
|
| 93 |
+
size_t GetNumScores() const {
|
| 94 |
+
return m_numScores;
|
| 95 |
+
}
|
| 96 |
+
size_t GetNumCounts() const {
|
| 97 |
+
return 1;
|
| 98 |
+
}
|
| 99 |
+
|
| 100 |
+
PhraseNode &GetRootSourceNode() {
|
| 101 |
+
return *m_rootSourceNode;
|
| 102 |
+
}
|
| 103 |
+
const PhraseNode &GetRootSourceNode() const {
|
| 104 |
+
return *m_rootSourceNode;
|
| 105 |
+
}
|
| 106 |
+
|
| 107 |
+
uint64_t GetMisc(const std::string &key) const;
|
| 108 |
+
|
| 109 |
+
};
|
| 110 |
+
|
| 111 |
+
}
|
mosesdecoder/OnDiskPt/Phrase.cpp
ADDED
|
@@ -0,0 +1,108 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
// $Id$
|
| 2 |
+
/***********************************************************************
|
| 3 |
+
Moses - factored phrase-based, hierarchical and syntactic language decoder
|
| 4 |
+
Copyright (C) 2009 Hieu Hoang
|
| 5 |
+
|
| 6 |
+
This library is free software; you can redistribute it and/or
|
| 7 |
+
modify it under the terms of the GNU Lesser General Public
|
| 8 |
+
License as published by the Free Software Foundation; either
|
| 9 |
+
version 2.1 of the License, or (at your option) any later version.
|
| 10 |
+
|
| 11 |
+
This library is distributed in the hope that it will be useful,
|
| 12 |
+
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
| 13 |
+
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
| 14 |
+
Lesser General Public License for more details.
|
| 15 |
+
|
| 16 |
+
You should have received a copy of the GNU Lesser General Public
|
| 17 |
+
License along with this library; if not, write to the Free Software
|
| 18 |
+
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
| 19 |
+
***********************************************************************/
|
| 20 |
+
#include <iostream>
|
| 21 |
+
#include "moses/Util.h"
|
| 22 |
+
#include "Phrase.h"
|
| 23 |
+
#include "util/exception.hh"
|
| 24 |
+
|
| 25 |
+
using namespace std;
|
| 26 |
+
|
| 27 |
+
namespace OnDiskPt
|
| 28 |
+
{
|
| 29 |
+
|
| 30 |
+
|
| 31 |
+
//! Append word to the end of the phrase.
void Phrase::AddWord(WordPtr word)
{
  m_words.insert(m_words.end(), word);
}
|
| 35 |
+
|
| 36 |
+
/** Insert word immediately after the word at index pos.
 *  pos must be the index of an existing word, otherwise an exception is thrown.
 */
void Phrase::AddWord(WordPtr word, size_t pos)
{
  UTIL_THROW_IF2(!(pos < m_words.size()),
                 "Trying to get word " << pos << " when phrase size is " << m_words.size());

  // the new word goes into the slot after pos
  const size_t insertInd = pos + 1;
  m_words.insert(m_words.begin() + insertInd, word);
}
|
| 42 |
+
|
| 43 |
+
int Phrase::Compare(const Phrase &compare) const
|
| 44 |
+
{
|
| 45 |
+
int ret = 0;
|
| 46 |
+
for (size_t pos = 0; pos < GetSize(); ++pos) {
|
| 47 |
+
if (pos >= compare.GetSize()) {
|
| 48 |
+
// we're bigger than the other. Put 1st
|
| 49 |
+
ret = -1;
|
| 50 |
+
break;
|
| 51 |
+
}
|
| 52 |
+
|
| 53 |
+
const Word &thisWord = GetWord(pos)
|
| 54 |
+
,&compareWord = compare.GetWord(pos);
|
| 55 |
+
int wordRet = thisWord.Compare(compareWord);
|
| 56 |
+
if (wordRet != 0) {
|
| 57 |
+
ret = wordRet;
|
| 58 |
+
break;
|
| 59 |
+
}
|
| 60 |
+
}
|
| 61 |
+
|
| 62 |
+
if (ret == 0) {
|
| 63 |
+
assert(compare.GetSize() >= GetSize());
|
| 64 |
+
ret = (compare.GetSize() > GetSize()) ? 1 : 0;
|
| 65 |
+
}
|
| 66 |
+
return ret;
|
| 67 |
+
}
|
| 68 |
+
|
| 69 |
+
//! transitive comparison
|
| 70 |
+
bool Phrase::operator<(const Phrase &compare) const
|
| 71 |
+
{
|
| 72 |
+
int ret = Compare(compare);
|
| 73 |
+
return ret < 0;
|
| 74 |
+
}
|
| 75 |
+
|
| 76 |
+
bool Phrase::operator>(const Phrase &compare) const
|
| 77 |
+
{
|
| 78 |
+
int ret = Compare(compare);
|
| 79 |
+
return ret > 0;
|
| 80 |
+
}
|
| 81 |
+
|
| 82 |
+
bool Phrase::operator==(const Phrase &compare) const
|
| 83 |
+
{
|
| 84 |
+
int ret = Compare(compare);
|
| 85 |
+
return ret == 0;
|
| 86 |
+
}
|
| 87 |
+
|
| 88 |
+
//! Print every word via Word::DebugPrint(), each followed by a single space.
void Phrase::DebugPrint(ostream &out, const Vocab &vocab) const
{
  const size_t numWords = GetSize();
  for (size_t pos = 0; pos != numWords; ++pos) {
    GetWord(pos).DebugPrint(out, vocab);
    out << " ";
  }
}
|
| 96 |
+
|
| 97 |
+
std::ostream& operator<<(std::ostream &out, const Phrase &phrase)
|
| 98 |
+
{
|
| 99 |
+
for (size_t pos = 0; pos < phrase.GetSize(); ++pos) {
|
| 100 |
+
const Word &word = phrase.GetWord(pos);
|
| 101 |
+
out << word << " ";
|
| 102 |
+
}
|
| 103 |
+
|
| 104 |
+
return out;
|
| 105 |
+
}
|
| 106 |
+
|
| 107 |
+
}
|
| 108 |
+
|
mosesdecoder/OnDiskPt/Phrase.h
ADDED
|
@@ -0,0 +1,66 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#pragma once
|
| 2 |
+
// $Id$
|
| 3 |
+
/***********************************************************************
|
| 4 |
+
Moses - factored phrase-based, hierarchical and syntactic language decoder
|
| 5 |
+
Copyright (C) 2009 Hieu Hoang
|
| 6 |
+
|
| 7 |
+
This library is free software; you can redistribute it and/or
|
| 8 |
+
modify it under the terms of the GNU Lesser General Public
|
| 9 |
+
License as published by the Free Software Foundation; either
|
| 10 |
+
version 2.1 of the License, or (at your option) any later version.
|
| 11 |
+
|
| 12 |
+
This library is distributed in the hope that it will be useful,
|
| 13 |
+
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
| 14 |
+
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
| 15 |
+
Lesser General Public License for more details.
|
| 16 |
+
|
| 17 |
+
You should have received a copy of the GNU Lesser General Public
|
| 18 |
+
License along with this library; if not, write to the Free Software
|
| 19 |
+
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
| 20 |
+
***********************************************************************/
|
| 21 |
+
#include <vector>
|
| 22 |
+
#include <iostream>
|
| 23 |
+
#include <boost/shared_ptr.hpp>
|
| 24 |
+
#include "Word.h"
|
| 25 |
+
|
| 26 |
+
namespace OnDiskPt
|
| 27 |
+
{
|
| 28 |
+
class Vocab;
|
| 29 |
+
|
| 30 |
+
|
| 31 |
+
/** A contiguous phrase. SourcePhrase & TargetPhrase inherit from this and add the on-disk functionality
|
| 32 |
+
*/
|
| 33 |
+
class Phrase
|
| 34 |
+
{
|
| 35 |
+
friend std::ostream& operator<<(std::ostream&, const Phrase&);
|
| 36 |
+
|
| 37 |
+
protected:
|
| 38 |
+
std::vector<WordPtr> m_words;
|
| 39 |
+
|
| 40 |
+
public:
|
| 41 |
+
Phrase() {
|
| 42 |
+
}
|
| 43 |
+
|
| 44 |
+
virtual ~Phrase() {}
|
| 45 |
+
|
| 46 |
+
void AddWord(WordPtr word);
|
| 47 |
+
void AddWord(WordPtr word, size_t pos);
|
| 48 |
+
|
| 49 |
+
const Word &GetWord(size_t pos) const {
|
| 50 |
+
return *m_words[pos];
|
| 51 |
+
}
|
| 52 |
+
size_t GetSize() const {
|
| 53 |
+
return m_words.size();
|
| 54 |
+
}
|
| 55 |
+
|
| 56 |
+
virtual void DebugPrint(std::ostream &out, const Vocab &vocab) const;
|
| 57 |
+
|
| 58 |
+
int Compare(const Phrase &compare) const;
|
| 59 |
+
bool operator<(const Phrase &compare) const;
|
| 60 |
+
bool operator>(const Phrase &compare) const;
|
| 61 |
+
bool operator==(const Phrase &compare) const;
|
| 62 |
+
};
|
| 63 |
+
|
| 64 |
+
typedef boost::shared_ptr<Phrase> PhrasePtr;
|
| 65 |
+
|
| 66 |
+
}
|
mosesdecoder/OnDiskPt/PhraseNode.cpp
ADDED
|
@@ -0,0 +1,268 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
// $Id$
|
| 2 |
+
/***********************************************************************
|
| 3 |
+
Moses - factored phrase-based, hierarchical and syntactic language decoder
|
| 4 |
+
Copyright (C) 2009 Hieu Hoang
|
| 5 |
+
|
| 6 |
+
This library is free software; you can redistribute it and/or
|
| 7 |
+
modify it under the terms of the GNU Lesser General Public
|
| 8 |
+
License as published by the Free Software Foundation; either
|
| 9 |
+
version 2.1 of the License, or (at your option) any later version.
|
| 10 |
+
|
| 11 |
+
This library is distributed in the hope that it will be useful,
|
| 12 |
+
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
| 13 |
+
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
| 14 |
+
Lesser General Public License for more details.
|
| 15 |
+
|
| 16 |
+
You should have received a copy of the GNU Lesser General Public
|
| 17 |
+
License along with this library; if not, write to the Free Software
|
| 18 |
+
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
| 19 |
+
***********************************************************************/
|
| 20 |
+
#include "PhraseNode.h"
|
| 21 |
+
#include "OnDiskWrapper.h"
|
| 22 |
+
#include "TargetPhraseCollection.h"
|
| 23 |
+
#include "SourcePhrase.h"
|
| 24 |
+
#include "moses/Util.h"
|
| 25 |
+
#include "util/exception.hh"
|
| 26 |
+
|
| 27 |
+
using namespace std;
|
| 28 |
+
|
| 29 |
+
namespace OnDiskPt
|
| 30 |
+
{
|
| 31 |
+
|
| 32 |
+
/** Number of bytes one node occupies in its serialised form.
 *  @param numChildren number of child entries
 *  @param wordSize    bytes per stored word
 *  @param countSize   number of float counts
 */
size_t PhraseNode::GetNodeSize(size_t numChildren, size_t wordSize, size_t countSize)
{
  const size_t headerBytes = sizeof(uint64_t) * 2; // num children, value
  const size_t childBytes = (wordSize + sizeof(uint64_t)) * numChildren; // word + ptr to next source node
  const size_t countBytes = sizeof(float) * countSize; // count info
  return headerBytes + childBytes + countBytes;
}
|
| 39 |
+
|
| 40 |
+
/** Create an empty, unsaved, in-memory node (the kind used while building a table).
 *  NOTE(review): only the members listed here are initialised; any other
 *  members (e.g. m_filePos) are presumably set later - confirm against PhraseNode.h.
 */
PhraseNode::PhraseNode()
  : m_value(0),
    m_currChild(NULL),
    m_saved(false),
    m_memLoad(NULL)
{
}
|
| 47 |
+
|
| 48 |
+
/** Load a previously saved node from the source file.
 *  Reads the child count at filePos, allocates a buffer big enough for the
 *  whole node, re-reads the complete node into that buffer and extracts the
 *  value and the (single) count from it.
 *  Throws if the node can't be read or the buffer can't be allocated, instead
 *  of carrying on with garbage when asserts are compiled out.
 *  NOTE(review): m_currChild and m_saved are not set by this constructor - confirm
 *  that loaded nodes never rely on them.
 */
PhraseNode::PhraseNode(uint64_t filePos, OnDiskWrapper &onDiskWrapper)
  :m_counts(onDiskWrapper.GetNumCounts())
{
  // load saved node
  m_filePos = filePos;

  size_t countSize = onDiskWrapper.GetNumCounts();

  std::fstream &file = onDiskWrapper.GetFileSource();
  file.seekg(filePos);
  assert(filePos == (uint64_t)file.tellg());

  // the 1st field of a node is its number of children, which determines the node size
  file.read((char*) &m_numChildrenLoad, sizeof(uint64_t));
  UTIL_THROW_IF2(file.fail(), "Couldn't read node header at file position " << filePos);

  size_t memAlloc = GetNodeSize(m_numChildrenLoad, onDiskWrapper.GetSourceWordSize(), countSize);
  m_memLoad = (char*) malloc(memAlloc);
  UTIL_THROW_IF2(m_memLoad == NULL, "Couldn't allocate " << memAlloc << " bytes for node at file position " << filePos);

  // go to start of node again
  file.seekg(filePos);
  assert(filePos == (uint64_t)file.tellg());

  // read everything into memory
  file.read(m_memLoad, memAlloc);
  if (file.fail()) {
    // the destructor doesn't run when a constructor throws, so release the buffer here
    free(m_memLoad);
    m_memLoad = NULL;
    UTIL_THROW(util::Exception, "Couldn't read " << memAlloc << " bytes of node at file position " << filePos);
  }
  assert(filePos + memAlloc == (uint64_t)file.tellg());

  // get value
  m_value = ((uint64_t*)m_memLoad)[1];

  // get counts. They follow the 2 uint64_t header fields
  float *memFloat = (float*) (m_memLoad + sizeof(uint64_t) * 2);

  assert(countSize == 1);
  m_counts[0] = memFloat[0];

  m_memLoadLast = m_memLoad + memAlloc;
}
|
| 84 |
+
|
| 85 |
+
//! Releases the buffer holding the loaded node. m_memLoad is NULL for nodes made with the default constructor.
PhraseNode::~PhraseNode()
{
  // allocated with malloc, so it is released with free
  free(m_memLoad);
}
|
| 89 |
+
|
| 90 |
+
float PhraseNode::GetCount(size_t ind) const
|
| 91 |
+
{
|
| 92 |
+
return m_counts[ind];
|
| 93 |
+
}
|
| 94 |
+
|
| 95 |
+
/** Write this node, and recursively any unsaved children, to the source file.
 *  The target phrases are sorted and saved first. The node is then serialised
 *  as: number of children, file pos of the target phrases, 1 float count, and
 *  for each child its word followed by the child's file pos.
 *  The node is appended at the end of the file and its position is recorded
 *  in m_filePos. Afterwards the in-memory children are dropped and the node
 *  is marked as saved; saving twice throws.
 *  @param pos        depth of this node; passed on as pos + 1 to the children
 *  @param tableLimit passed on to the sort of the target phrases
 */
void PhraseNode::Save(OnDiskWrapper &onDiskWrapper, size_t pos, size_t tableLimit)
{
  UTIL_THROW_IF2(m_saved, "Already saved");

  // save the target phrases of this node
  m_targetPhraseColl.Sort(tableLimit);
  m_targetPhraseColl.Save(onDiskWrapper);
  m_value = m_targetPhraseColl.GetFilePos();

  size_t numCounts = onDiskWrapper.GetNumCounts();

  size_t memAlloc = GetNodeSize(GetSize(), onDiskWrapper.GetSourceWordSize(), numCounts);

  // owned by the vector, so it is released even if something below throws
  std::vector<char> buffer(memAlloc);
  char *mem = &buffer[0];

  size_t memUsed = 0;
  uint64_t *memArray = (uint64_t*) mem;
  memArray[0] = GetSize(); // num of children
  memArray[1] = m_value; // file pos of corresponding target phrases
  memUsed += 2 * sizeof(uint64_t);

  // count info
  float *memFloat = (float*) (mem + memUsed);
  UTIL_THROW_IF2(numCounts != 1, "Can only store 1 phrase count");
  memFloat[0] = (m_counts.size() == 0) ? DEFAULT_COUNT : m_counts[0]; // if count = 0, put in very large num to make sure its still used. HACK
  memUsed += sizeof(float) * numCounts;

  // recursively save children
  ChildColl::iterator iter;
  for (iter = m_children.begin(); iter != m_children.end(); ++iter) {
    const Word &childWord = iter->first;
    PhraseNode &childNode = iter->second;

    // recursive. A child must be on disk before its file pos can be recorded
    if (!childNode.Saved())
      childNode.Save(onDiskWrapper, pos + 1, tableLimit);

    char *currMem = mem + memUsed;
    size_t wordMemUsed = childWord.WriteToMemory(currMem);
    memUsed += wordMemUsed;

    uint64_t *childPosSlot = (uint64_t*) (mem + memUsed);
    childPosSlot[0] = childNode.GetFilePos();
    memUsed += sizeof(uint64_t);
  }

  // save this node
  assert(memUsed == memAlloc);

  // move to the end of the file BEFORE asking for the position, so that
  // m_filePos is always where the node is actually written
  std::fstream &file = onDiskWrapper.GetFileSource();
  file.seekp(0, ios::end);
  m_filePos = file.tellp();
  file.write(mem, memUsed);

  uint64_t endPos = file.tellp();
  assert(m_filePos + memUsed == endPos);

  m_children.clear();
  m_saved = true;
}
|
| 159 |
+
|
| 160 |
+
void PhraseNode::AddTargetPhrase(const SourcePhrase &sourcePhrase, TargetPhrase *targetPhrase
|
| 161 |
+
, OnDiskWrapper &onDiskWrapper, size_t tableLimit
|
| 162 |
+
, const std::vector<float> &counts, OnDiskPt::PhrasePtr spShort)
|
| 163 |
+
{
|
| 164 |
+
AddTargetPhrase(0, sourcePhrase, targetPhrase, onDiskWrapper, tableLimit, counts, spShort);
|
| 165 |
+
}
|
| 166 |
+
|
| 167 |
+
void PhraseNode::AddTargetPhrase(size_t pos, const SourcePhrase &sourcePhrase
|
| 168 |
+
, TargetPhrase *targetPhrase, OnDiskWrapper &onDiskWrapper
|
| 169 |
+
, size_t tableLimit, const std::vector<float> &counts, OnDiskPt::PhrasePtr spShort)
|
| 170 |
+
{
|
| 171 |
+
size_t phraseSize = sourcePhrase.GetSize();
|
| 172 |
+
if (pos < phraseSize) {
|
| 173 |
+
const Word &word = sourcePhrase.GetWord(pos);
|
| 174 |
+
|
| 175 |
+
PhraseNode &node = m_children[word];
|
| 176 |
+
if (m_currChild != &node) {
|
| 177 |
+
// new node
|
| 178 |
+
node.SetPos(pos);
|
| 179 |
+
|
| 180 |
+
if (m_currChild) {
|
| 181 |
+
m_currChild->Save(onDiskWrapper, pos, tableLimit);
|
| 182 |
+
}
|
| 183 |
+
|
| 184 |
+
m_currChild = &node;
|
| 185 |
+
}
|
| 186 |
+
|
| 187 |
+
// keep searching for target phrase node..
|
| 188 |
+
node.AddTargetPhrase(pos + 1, sourcePhrase, targetPhrase, onDiskWrapper, tableLimit, counts, spShort);
|
| 189 |
+
} else {
|
| 190 |
+
// drilled down to the right node
|
| 191 |
+
m_counts = counts;
|
| 192 |
+
targetPhrase->SetSourcePhrase(spShort);
|
| 193 |
+
m_targetPhraseColl.AddTargetPhrase(targetPhrase);
|
| 194 |
+
}
|
| 195 |
+
}
|
| 196 |
+
|
| 197 |
+
/** Finds the child of this loaded node that is keyed by wordSought.
 *
 *  Performs a binary search over the m_numChildrenLoad child records, reading
 *  each probed record with the indexed GetChild() overload.
 *  NOTE(review): this relies on the records being stored in Word order -
 *  presumably guaranteed by Save() iterating a std::map; confirm.
 *
 *  \return a PhraseNode allocated with new for the matching child (the raw
 *          pointer is handed to the caller), or NULL if no record matches.
 */
const PhraseNode *PhraseNode::GetChild(const Word &wordSought, OnDiskWrapper &onDiskWrapper) const
{
  // The child count is a uint64_t; keep the bounds 64-bit (signed, so that
  // hi can become -1 for an empty node) instead of truncating them to int.
  int64_t lo = 0;
  int64_t hi = static_cast<int64_t>(m_numChildrenLoad) - 1;

  while (lo <= hi) {
    // midpoint written so that it cannot overflow
    const int64_t mid = lo + (hi - lo) / 2;

    Word wordFound;
    uint64_t childFilePos;
    GetChild(wordFound, childFilePos, static_cast<size_t>(mid), onDiskWrapper);

    if (wordSought == wordFound) {
      return new PhraseNode(childFilePos, onDiskWrapper);
    }

    if (wordSought < wordFound)
      hi = mid - 1;
    else
      lo = mid + 1;
  }

  return NULL;
}
|
| 224 |
+
|
| 225 |
+
/** Reads the ind-th child record of the loaded node.
 *
 *  \param wordFound    receives the word stored in the record
 *  \param childFilePos receives the file position stored in the record
 *  \param ind          zero-based index of the record inside m_memLoad
 */
void PhraseNode::GetChild(Word &wordFound, uint64_t &childFilePos, size_t ind, OnDiskWrapper &onDiskWrapper) const
{
  // a child record is one source word followed by a 64-bit file position
  const size_t childSize = onDiskWrapper.GetSourceWordSize() + sizeof(uint64_t);

  // what precedes the records: size & file pos of target phrase coll, then count info
  const size_t headerSize = sizeof(uint64_t) * 2
                            + sizeof(float) * onDiskWrapper.GetNumCounts();

  const char *childMem = m_memLoad + headerSize + childSize * ind;

  const size_t memRead = ReadChild(wordFound, childFilePos, childMem);
  assert(memRead == childSize);
}
|
| 239 |
+
|
| 240 |
+
/** Decodes one child record starting at mem: a word, immediately followed by
 *  the 64-bit file position of the child node.
 *
 *  \return number of bytes consumed (word bytes + sizeof(uint64_t)).
 */
size_t PhraseNode::ReadChild(Word &wordFound, uint64_t &childFilePos, const char *mem) const
{
  // the word comes first and reports how many bytes it occupied
  const size_t wordBytes = wordFound.ReadFromMemory(mem);

  // the file position sits directly behind the word
  const uint64_t *filePosField = (uint64_t*) (mem + wordBytes);
  childFilePos = *filePosField;

  return wordBytes + sizeof(uint64_t);
}
|
| 251 |
+
|
| 252 |
+
/** Returns the target phrases attached to this node.
 *  A fresh collection is always returned; it is only filled from file when
 *  m_value (the file position of the collection) is non-zero.
 */
TargetPhraseCollection::shared_ptr
PhraseNode::
GetTargetPhraseCollection(size_t tableLimit, OnDiskWrapper &onDiskWrapper) const
{
  TargetPhraseCollection::shared_ptr coll(new TargetPhraseCollection);

  // m_value is unsigned, so "no collection" is exactly the value 0
  if (m_value == 0) {
    return coll;
  }

  coll->ReadFromFile(tableLimit, m_value, onDiskWrapper);
  return coll;
}
|
| 260 |
+
|
| 261 |
+
std::ostream& operator<<(std::ostream &out, const PhraseNode &node)
|
| 262 |
+
{
|
| 263 |
+
out << "node (" << node.GetFilePos() << "," << node.GetValue() << "," << node.m_pos << ")";
|
| 264 |
+
return out;
|
| 265 |
+
}
|
| 266 |
+
|
| 267 |
+
}
|
| 268 |
+
|
mosesdecoder/OnDiskPt/PhraseNode.h
ADDED
|
@@ -0,0 +1,108 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#pragma once
|
| 2 |
+
// $Id$
|
| 3 |
+
/***********************************************************************
|
| 4 |
+
Moses - factored phrase-based, hierarchical and syntactic language decoder
|
| 5 |
+
Copyright (C) 2009 Hieu Hoang
|
| 6 |
+
|
| 7 |
+
This library is free software; you can redistribute it and/or
|
| 8 |
+
modify it under the terms of the GNU Lesser General Public
|
| 9 |
+
License as published by the Free Software Foundation; either
|
| 10 |
+
version 2.1 of the License, or (at your option) any later version.
|
| 11 |
+
|
| 12 |
+
This library is distributed in the hope that it will be useful,
|
| 13 |
+
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
| 14 |
+
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
| 15 |
+
Lesser General Public License for more details.
|
| 16 |
+
|
| 17 |
+
You should have received a copy of the GNU Lesser General Public
|
| 18 |
+
License along with this library; if not, write to the Free Software
|
| 19 |
+
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
| 20 |
+
***********************************************************************/
|
| 21 |
+
#include <fstream>
|
| 22 |
+
#include <vector>
|
| 23 |
+
#include <map>
|
| 24 |
+
#include "Word.h"
|
| 25 |
+
#include "TargetPhraseCollection.h"
|
| 26 |
+
#include "Phrase.h"
|
| 27 |
+
|
| 28 |
+
namespace OnDiskPt
|
| 29 |
+
{
|
| 30 |
+
|
| 31 |
+
class OnDiskWrapper;
|
| 32 |
+
class SourcePhrase;
|
| 33 |
+
|
| 34 |
+
/** A node in the source tree trie */
|
| 35 |
+
class PhraseNode
|
| 36 |
+
{
|
| 37 |
+
friend std::ostream& operator<<(std::ostream&, const PhraseNode&);
|
| 38 |
+
protected:
|
| 39 |
+
uint64_t m_filePos, m_value;
|
| 40 |
+
|
| 41 |
+
typedef std::map<Word, PhraseNode> ChildColl;
|
| 42 |
+
ChildColl m_children;
|
| 43 |
+
PhraseNode *m_currChild;
|
| 44 |
+
bool m_saved;
|
| 45 |
+
size_t m_pos;
|
| 46 |
+
std::vector<float> m_counts;
|
| 47 |
+
|
| 48 |
+
TargetPhraseCollection m_targetPhraseColl;
|
| 49 |
+
|
| 50 |
+
char *m_memLoad, *m_memLoadLast;
|
| 51 |
+
uint64_t m_numChildrenLoad;
|
| 52 |
+
|
| 53 |
+
void AddTargetPhrase(size_t pos, const SourcePhrase &sourcePhrase
|
| 54 |
+
, TargetPhrase *targetPhrase, OnDiskWrapper &onDiskWrapper
|
| 55 |
+
, size_t tableLimit, const std::vector<float> &counts, OnDiskPt::PhrasePtr spShort);
|
| 56 |
+
size_t ReadChild(Word &wordFound, uint64_t &childFilePos, const char *mem) const;
|
| 57 |
+
void GetChild(Word &wordFound, uint64_t &childFilePos, size_t ind, OnDiskWrapper &onDiskWrapper) const;
|
| 58 |
+
|
| 59 |
+
public:
|
| 60 |
+
static size_t GetNodeSize(size_t numChildren, size_t wordSize, size_t countSize);
|
| 61 |
+
|
| 62 |
+
PhraseNode(); // unsaved node
|
| 63 |
+
PhraseNode(uint64_t filePos, OnDiskWrapper &onDiskWrapper); // load saved node
|
| 64 |
+
~PhraseNode();
|
| 65 |
+
|
| 66 |
+
void Add(const Word &word, uint64_t nextFilePos, size_t wordSize);
|
| 67 |
+
void Save(OnDiskWrapper &onDiskWrapper, size_t pos, size_t tableLimit);
|
| 68 |
+
|
| 69 |
+
void AddTargetPhrase(const SourcePhrase &sourcePhrase, TargetPhrase *targetPhrase
|
| 70 |
+
, OnDiskWrapper &onDiskWrapper, size_t tableLimit
|
| 71 |
+
, const std::vector<float> &counts, OnDiskPt::PhrasePtr spShort);
|
| 72 |
+
|
| 73 |
+
uint64_t GetFilePos() const {
|
| 74 |
+
return m_filePos;
|
| 75 |
+
}
|
| 76 |
+
uint64_t GetValue() const {
|
| 77 |
+
return m_value;
|
| 78 |
+
}
|
| 79 |
+
void SetValue(uint64_t value) {
|
| 80 |
+
m_value = value;
|
| 81 |
+
}
|
| 82 |
+
size_t GetSize() const {
|
| 83 |
+
return m_children.size();
|
| 84 |
+
}
|
| 85 |
+
|
| 86 |
+
bool Saved() const {
|
| 87 |
+
return m_saved;
|
| 88 |
+
}
|
| 89 |
+
|
| 90 |
+
void SetPos(size_t pos) {
|
| 91 |
+
m_pos = pos;
|
| 92 |
+
}
|
| 93 |
+
|
| 94 |
+
const PhraseNode *GetChild(const Word &wordSought, OnDiskWrapper &onDiskWrapper) const;
|
| 95 |
+
|
| 96 |
+
TargetPhraseCollection::shared_ptr
|
| 97 |
+
GetTargetPhraseCollection(size_t tableLimit,
|
| 98 |
+
OnDiskWrapper &onDiskWrapper) const;
|
| 99 |
+
|
| 100 |
+
void AddCounts(const std::vector<float> &counts) {
|
| 101 |
+
m_counts = counts;
|
| 102 |
+
}
|
| 103 |
+
float GetCount(size_t ind) const;
|
| 104 |
+
|
| 105 |
+
};
|
| 106 |
+
|
| 107 |
+
}
|
| 108 |
+
|
mosesdecoder/OnDiskPt/SourcePhrase.cpp
ADDED
|
@@ -0,0 +1,27 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
// $Id$
|
| 2 |
+
/***********************************************************************
|
| 3 |
+
Moses - factored phrase-based, hierarchical and syntactic language decoder
|
| 4 |
+
Copyright (C) 2009 Hieu Hoang
|
| 5 |
+
|
| 6 |
+
This library is free software; you can redistribute it and/or
|
| 7 |
+
modify it under the terms of the GNU Lesser General Public
|
| 8 |
+
License as published by the Free Software Foundation; either
|
| 9 |
+
version 2.1 of the License, or (at your option) any later version.
|
| 10 |
+
|
| 11 |
+
This library is distributed in the hope that it will be useful,
|
| 12 |
+
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
| 13 |
+
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
| 14 |
+
Lesser General Public License for more details.
|
| 15 |
+
|
| 16 |
+
You should have received a copy of the GNU Lesser General Public
|
| 17 |
+
License along with this library; if not, write to the Free Software
|
| 18 |
+
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
| 19 |
+
***********************************************************************/
|
| 20 |
+
#include "SourcePhrase.h"
|
| 21 |
+
|
| 22 |
+
namespace OnDiskPt
|
| 23 |
+
{
|
| 24 |
+
|
| 25 |
+
}
|
| 26 |
+
|
| 27 |
+
|
mosesdecoder/OnDiskPt/SourcePhrase.h
ADDED
|
@@ -0,0 +1,38 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#pragma once
|
| 2 |
+
// $Id$
|
| 3 |
+
/***********************************************************************
|
| 4 |
+
Moses - factored phrase-based, hierarchical and syntactic language decoder
|
| 5 |
+
Copyright (C) 2009 Hieu Hoang
|
| 6 |
+
|
| 7 |
+
This library is free software; you can redistribute it and/or
|
| 8 |
+
modify it under the terms of the GNU Lesser General Public
|
| 9 |
+
License as published by the Free Software Foundation; either
|
| 10 |
+
version 2.1 of the License, or (at your option) any later version.
|
| 11 |
+
|
| 12 |
+
This library is distributed in the hope that it will be useful,
|
| 13 |
+
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
| 14 |
+
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
| 15 |
+
Lesser General Public License for more details.
|
| 16 |
+
|
| 17 |
+
You should have received a copy of the GNU Lesser General Public
|
| 18 |
+
License along with this library; if not, write to the Free Software
|
| 19 |
+
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
| 20 |
+
***********************************************************************/
|
| 21 |
+
#include <vector>
|
| 22 |
+
#include "Phrase.h"
|
| 23 |
+
#include "Word.h"
|
| 24 |
+
|
| 25 |
+
namespace OnDiskPt
|
| 26 |
+
{
|
| 27 |
+
|
| 28 |
+
/** A source phrase. No extension of a norm Phrase class because source phrases are saved as tries.
|
| 29 |
+
*/
|
| 30 |
+
class SourcePhrase: public Phrase
|
| 31 |
+
{
|
| 32 |
+
protected:
|
| 33 |
+
|
| 34 |
+
public:
|
| 35 |
+
};
|
| 36 |
+
|
| 37 |
+
|
| 38 |
+
}
|
mosesdecoder/OnDiskPt/TargetPhrase.cpp
ADDED
|
@@ -0,0 +1,402 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
// $Id$
|
| 2 |
+
/***********************************************************************
|
| 3 |
+
Moses - factored phrase-based, hierarchical and syntactic language decoder
|
| 4 |
+
Copyright (C) 2009 Hieu Hoang
|
| 5 |
+
|
| 6 |
+
This library is free software; you can redistribute it and/or
|
| 7 |
+
modify it under the terms of the GNU Lesser General Public
|
| 8 |
+
License as published by the Free Software Foundation; either
|
| 9 |
+
version 2.1 of the License, or (at your option) any later version.
|
| 10 |
+
|
| 11 |
+
This library is distributed in the hope that it will be useful,
|
| 12 |
+
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
| 13 |
+
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
| 14 |
+
Lesser General Public License for more details.
|
| 15 |
+
|
| 16 |
+
You should have received a copy of the GNU Lesser General Public
|
| 17 |
+
License along with this library; if not, write to the Free Software
|
| 18 |
+
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
| 19 |
+
***********************************************************************/
|
| 20 |
+
|
| 21 |
+
#include <algorithm>
|
| 22 |
+
#include <iostream>
|
| 23 |
+
#include "moses/Util.h"
|
| 24 |
+
#include "TargetPhrase.h"
|
| 25 |
+
#include "OnDiskWrapper.h"
|
| 26 |
+
#include "util/exception.hh"
|
| 27 |
+
|
| 28 |
+
#include <boost/algorithm/string.hpp>
|
| 29 |
+
|
| 30 |
+
using namespace std;
|
| 31 |
+
|
| 32 |
+
namespace OnDiskPt
|
| 33 |
+
{
|
| 34 |
+
|
| 35 |
+
/** Creates a target phrase with room for numScores scores (value-initialised
 *  by std::vector).
 *  m_filePos starts at 0 rather than being left indeterminate; it receives its
 *  real value in Save() or ReadOtherInfoFromFile().
 */
TargetPhrase::TargetPhrase(size_t numScores)
  :m_scores(numScores)
  ,m_filePos(0)
{
}
|
| 39 |
+
|
| 40 |
+
/** Copy constructor.
 *  Copies the Phrase base and the scores only.
 *  NOTE(review): alignment, source phrase, sparse-feature and property
 *  strings are NOT copied - presumably intentional; confirm against callers.
 *  m_filePos is set to 0 so that it is never left indeterminate.
 */
TargetPhrase::TargetPhrase(const TargetPhrase& copy)
  :Phrase(copy)
  ,m_scores(copy.m_scores)
  ,m_filePos(0)
{
}
|
| 46 |
+
|
| 47 |
+
/** Nothing to release explicitly; members clean up after themselves. */
TargetPhrase::~TargetPhrase() {}
|
| 50 |
+
|
| 51 |
+
/** Stores the left-hand side by appending it as a further word of the phrase. */
void TargetPhrase::SetLHS(WordPtr lhs)
{
  // the LHS is kept in the word list itself, after the words added so far
  AddWord(lhs);
}
|
| 55 |
+
|
| 56 |
+
void TargetPhrase::Create1AlignFromString(const std::string &align1Str)
|
| 57 |
+
{
|
| 58 |
+
vector<size_t> alignPoints;
|
| 59 |
+
Moses::Tokenize<size_t>(alignPoints, align1Str, "-");
|
| 60 |
+
UTIL_THROW_IF2(alignPoints.size() != 2, "Incorrectly formatted word alignment: " << align1Str);
|
| 61 |
+
m_align.push_back(pair<size_t, size_t>(alignPoints[0], alignPoints[1]) );
|
| 62 |
+
}
|
| 63 |
+
|
| 64 |
+
void TargetPhrase::CreateAlignFromString(const std::string &alignStr)
|
| 65 |
+
{
|
| 66 |
+
vector<std::string> alignPairs;
|
| 67 |
+
boost::split(alignPairs, alignStr, boost::is_any_of("\t "));
|
| 68 |
+
for (size_t i = 0; i < alignPairs.size(); ++i) {
|
| 69 |
+
vector<size_t> alignPoints;
|
| 70 |
+
Moses::Tokenize<size_t>(alignPoints, alignPairs[i], "-");
|
| 71 |
+
m_align.push_back(pair<size_t, size_t>(alignPoints[0], alignPoints[1]) );
|
| 72 |
+
}
|
| 73 |
+
}
|
| 74 |
+
|
| 75 |
+
|
| 76 |
+
void TargetPhrase::SetScore(float score, size_t ind)
|
| 77 |
+
{
|
| 78 |
+
assert(ind < m_scores.size());
|
| 79 |
+
m_scores[ind] = score;
|
| 80 |
+
}
|
| 81 |
+
|
| 82 |
+
class AlignOrderer
|
| 83 |
+
{
|
| 84 |
+
public:
|
| 85 |
+
bool operator()(const AlignPair &a, const AlignPair &b) const {
|
| 86 |
+
return a.first < b.first;
|
| 87 |
+
}
|
| 88 |
+
};
|
| 89 |
+
|
| 90 |
+
void TargetPhrase::SortAlign()
|
| 91 |
+
{
|
| 92 |
+
std::sort(m_align.begin(), m_align.end(), AlignOrderer());
|
| 93 |
+
}
|
| 94 |
+
|
| 95 |
+
/** Serialises the phrase into a freshly malloc'ed buffer.
 *
 *  Layout: [uint64 number of target words][target words, lhs as last word]
 *          [uint64 number of source words][source words]
 *
 *  \param memUsed receives the number of bytes written
 *  \return the buffer; the caller has to release it with free()
 */
char *TargetPhrase::WriteToMemory(OnDiskWrapper &onDiskWrapper, size_t &memUsed) const
{
  const size_t phraseSize = GetSize();
  const size_t targetWordSize = onDiskWrapper.GetTargetWordSize();

  const PhrasePtr sp = GetSourcePhrase();
  const size_t spSize = sp->GetSize();
  const size_t sourceWordSize = onDiskWrapper.GetSourceWordSize();

  // each half is a 64-bit word count followed by fixed-size words
  const size_t targetBytes = sizeof(uint64_t) + targetWordSize * phraseSize;
  const size_t sourceBytes = sizeof(uint64_t) + sourceWordSize * spSize;
  const size_t memNeeded = targetBytes + sourceBytes;

  char *buffer = (char*) malloc(memNeeded);
  memUsed = 0;

  // target side: count, then every word
  *(uint64_t*) buffer = phraseSize;
  memUsed += sizeof(uint64_t);
  for (size_t pos = 0; pos != phraseSize; ++pos) {
    memUsed += GetWord(pos).WriteToMemory(buffer + memUsed);
  }

  // source side: count, then every word
  *(uint64_t*) (buffer + memUsed) = spSize;
  memUsed += sizeof(uint64_t);
  for (size_t pos = 0; pos != spSize; ++pos) {
    memUsed += sp->GetWord(pos).WriteToMemory(buffer + memUsed);
  }

  assert(memUsed == memNeeded);
  return buffer;
}
|
| 137 |
+
|
| 138 |
+
/** Appends this phrase (as laid out by WriteToMemory()) to the target-index
 *  file and records in m_filePos the offset at which the record starts.
 */
void TargetPhrase::Save(OnDiskWrapper &onDiskWrapper)
{
  // save in target ind
  size_t memUsed;
  char *mem = WriteToMemory(onDiskWrapper, memUsed);

  std::fstream &file = onDiskWrapper.GetFileTargetInd();

  // Seek to the end BEFORE sampling the offset: the record is always
  // appended, so the offset must be the end of file even when the put
  // pointer was left somewhere else by an earlier operation.
  file.seekp(0, ios::end);
  const uint64_t startPos = file.tellp();

  file.write(mem, memUsed);

#ifndef NDEBUG
  uint64_t endPos = file.tellp();
  assert(startPos + memUsed == endPos);
#endif

  m_filePos = startPos;
  free(mem);
}
|
| 159 |
+
|
| 160 |
+
/** Serialises everything except the words into a freshly malloc'ed buffer.
 *
 *  Layout: [uint64 file pos (phrase id)][alignments][scores]
 *          [sparse features string][property string]
 *
 *  \param memUsed receives the number of bytes written
 *  \return the buffer; the caller has to release it with free()
 */
char *TargetPhrase::WriteOtherInfoToMemory(OnDiskWrapper &onDiskWrapper, size_t &memUsed) const
{
  const size_t numScores = onDiskWrapper.GetNumScores();
  const size_t numAlign = GetAlign().size();
  const size_t sparseFeatureSize = m_sparseFeatures.size();
  const size_t propSize = m_property.size();

  // size of every section, in the order in which it is written
  const size_t idBytes = sizeof(uint64_t);                                     // file pos (phrase id)
  const size_t alignBytes = sizeof(uint64_t) + 2 * sizeof(uint64_t) * numAlign; // align
  const size_t scoreBytes = sizeof(float) * numScores;                         // scores
  const size_t sparseBytes = sizeof(uint64_t) + sparseFeatureSize;             // sparse features string
  const size_t propBytes = sizeof(uint64_t) + propSize;                        // property string

  const size_t memNeeded = idBytes + alignBytes + scoreBytes + sparseBytes + propBytes;

  char *mem = (char*) malloc(memNeeded);
  memUsed = 0;

  // phrase id
  memcpy(mem, &m_filePos, sizeof(uint64_t));
  memUsed += sizeof(uint64_t);

  // each writer returns the number of bytes it produced
  memUsed += WriteAlignToMemory(mem + memUsed);
  memUsed += WriteScoresToMemory(mem + memUsed);
  memUsed += WriteStringToMemory(mem + memUsed, m_sparseFeatures);
  memUsed += WriteStringToMemory(mem + memUsed, m_property);

  assert(memNeeded == memUsed);
  return mem;
}
|
| 200 |
+
|
| 201 |
+
size_t TargetPhrase::WriteStringToMemory(char *mem, const std::string &str) const
|
| 202 |
+
{
|
| 203 |
+
size_t memUsed = 0;
|
| 204 |
+
uint64_t *memTmp = (uint64_t*) mem;
|
| 205 |
+
|
| 206 |
+
size_t strSize = str.size();
|
| 207 |
+
memTmp[0] = strSize;
|
| 208 |
+
memUsed += sizeof(uint64_t);
|
| 209 |
+
|
| 210 |
+
const char *charStr = str.c_str();
|
| 211 |
+
memcpy(mem + memUsed, charStr, strSize);
|
| 212 |
+
memUsed += strSize;
|
| 213 |
+
|
| 214 |
+
return memUsed;
|
| 215 |
+
}
|
| 216 |
+
|
| 217 |
+
size_t TargetPhrase::WriteAlignToMemory(char *mem) const
|
| 218 |
+
{
|
| 219 |
+
size_t memUsed = 0;
|
| 220 |
+
|
| 221 |
+
// num of alignments
|
| 222 |
+
uint64_t numAlign = m_align.size();
|
| 223 |
+
memcpy(mem, &numAlign, sizeof(numAlign));
|
| 224 |
+
memUsed += sizeof(numAlign);
|
| 225 |
+
|
| 226 |
+
// actual alignments
|
| 227 |
+
AlignType::const_iterator iter;
|
| 228 |
+
for (iter = m_align.begin(); iter != m_align.end(); ++iter) {
|
| 229 |
+
const AlignPair &alignPair = *iter;
|
| 230 |
+
|
| 231 |
+
memcpy(mem + memUsed, &alignPair.first, sizeof(alignPair.first));
|
| 232 |
+
memUsed += sizeof(alignPair.first);
|
| 233 |
+
|
| 234 |
+
memcpy(mem + memUsed, &alignPair.second, sizeof(alignPair.second));
|
| 235 |
+
memUsed += sizeof(alignPair.second);
|
| 236 |
+
}
|
| 237 |
+
|
| 238 |
+
return memUsed;
|
| 239 |
+
}
|
| 240 |
+
|
| 241 |
+
size_t TargetPhrase::WriteScoresToMemory(char *mem) const
|
| 242 |
+
{
|
| 243 |
+
float *scoreMem = (float*) mem;
|
| 244 |
+
|
| 245 |
+
for (size_t ind = 0; ind < m_scores.size(); ++ind)
|
| 246 |
+
scoreMem[ind] = m_scores[ind];
|
| 247 |
+
|
| 248 |
+
size_t memUsed = sizeof(float) * m_scores.size();
|
| 249 |
+
return memUsed;
|
| 250 |
+
}
|
| 251 |
+
|
| 252 |
+
uint64_t TargetPhrase::ReadOtherInfoFromFile(uint64_t filePos, std::fstream &fileTPColl)
|
| 253 |
+
{
|
| 254 |
+
assert(filePos == (uint64_t)fileTPColl.tellg());
|
| 255 |
+
|
| 256 |
+
uint64_t memUsed = 0;
|
| 257 |
+
fileTPColl.read((char*) &m_filePos, sizeof(uint64_t));
|
| 258 |
+
memUsed += sizeof(uint64_t);
|
| 259 |
+
assert(m_filePos != 0);
|
| 260 |
+
|
| 261 |
+
memUsed += ReadAlignFromFile(fileTPColl);
|
| 262 |
+
assert((memUsed + filePos) == (uint64_t)fileTPColl.tellg());
|
| 263 |
+
|
| 264 |
+
memUsed += ReadScoresFromFile(fileTPColl);
|
| 265 |
+
assert((memUsed + filePos) == (uint64_t)fileTPColl.tellg());
|
| 266 |
+
|
| 267 |
+
// sparse features
|
| 268 |
+
memUsed += ReadStringFromFile(fileTPColl, m_sparseFeatures);
|
| 269 |
+
|
| 270 |
+
// properties
|
| 271 |
+
memUsed += ReadStringFromFile(fileTPColl, m_property);
|
| 272 |
+
|
| 273 |
+
return memUsed;
|
| 274 |
+
}
|
| 275 |
+
|
| 276 |
+
/** Reads a string stored as [uint64 length][length bytes] from fileTPColl.
 *
 *  The bytes are read straight into a std::string of the stored length, so
 *  embedded NUL bytes are preserved and no manually managed buffer is needed.
 *  A stored length of 0 yields an empty outStr.
 *
 *  \param outStr receives the string
 *  \return number of bytes consumed (sizeof(uint64_t) + length)
 */
uint64_t TargetPhrase::ReadStringFromFile(std::fstream &fileTPColl, std::string &outStr)
{
  uint64_t bytesRead = 0;

  uint64_t strSize;
  fileTPColl.read((char*) &strSize, sizeof(uint64_t));
  bytesRead += sizeof(uint64_t);

  if (strSize) {
    std::string buffer(strSize, '\0');
    fileTPColl.read(&buffer[0], strSize);
    outStr.swap(buffer);

    bytesRead += strSize;
  } else {
    // nothing stored: do not leave earlier content behind
    outStr.clear();
  }

  return bytesRead;
}
|
| 296 |
+
|
| 297 |
+
uint64_t TargetPhrase::ReadFromFile(std::fstream &fileTP)
|
| 298 |
+
{
|
| 299 |
+
uint64_t bytesRead = 0;
|
| 300 |
+
|
| 301 |
+
fileTP.seekg(m_filePos);
|
| 302 |
+
|
| 303 |
+
uint64_t numWords;
|
| 304 |
+
fileTP.read((char*) &numWords, sizeof(uint64_t));
|
| 305 |
+
bytesRead += sizeof(uint64_t);
|
| 306 |
+
|
| 307 |
+
for (size_t ind = 0; ind < numWords; ++ind) {
|
| 308 |
+
WordPtr word(new Word());
|
| 309 |
+
bytesRead += word->ReadFromFile(fileTP);
|
| 310 |
+
AddWord(word);
|
| 311 |
+
}
|
| 312 |
+
|
| 313 |
+
// read source words
|
| 314 |
+
uint64_t numSourceWords;
|
| 315 |
+
fileTP.read((char*) &numSourceWords, sizeof(uint64_t));
|
| 316 |
+
bytesRead += sizeof(uint64_t);
|
| 317 |
+
|
| 318 |
+
PhrasePtr sp(new SourcePhrase());
|
| 319 |
+
for (size_t ind = 0; ind < numSourceWords; ++ind) {
|
| 320 |
+
WordPtr word( new Word());
|
| 321 |
+
bytesRead += word->ReadFromFile(fileTP);
|
| 322 |
+
sp->AddWord(word);
|
| 323 |
+
}
|
| 324 |
+
SetSourcePhrase(sp);
|
| 325 |
+
|
| 326 |
+
return bytesRead;
|
| 327 |
+
}
|
| 328 |
+
|
| 329 |
+
uint64_t TargetPhrase::ReadAlignFromFile(std::fstream &fileTPColl)
|
| 330 |
+
{
|
| 331 |
+
uint64_t bytesRead = 0;
|
| 332 |
+
|
| 333 |
+
uint64_t numAlign;
|
| 334 |
+
fileTPColl.read((char*) &numAlign, sizeof(uint64_t));
|
| 335 |
+
bytesRead += sizeof(uint64_t);
|
| 336 |
+
|
| 337 |
+
for (size_t ind = 0; ind < numAlign; ++ind) {
|
| 338 |
+
AlignPair alignPair;
|
| 339 |
+
fileTPColl.read((char*) &alignPair.first, sizeof(uint64_t));
|
| 340 |
+
fileTPColl.read((char*) &alignPair.second, sizeof(uint64_t));
|
| 341 |
+
m_align.push_back(alignPair);
|
| 342 |
+
|
| 343 |
+
bytesRead += sizeof(uint64_t) * 2;
|
| 344 |
+
}
|
| 345 |
+
|
| 346 |
+
return bytesRead;
|
| 347 |
+
}
|
| 348 |
+
|
| 349 |
+
uint64_t TargetPhrase::ReadScoresFromFile(std::fstream &fileTPColl)
|
| 350 |
+
{
|
| 351 |
+
UTIL_THROW_IF2(m_scores.size() == 0, "Translation rules must must have some scores");
|
| 352 |
+
|
| 353 |
+
uint64_t bytesRead = 0;
|
| 354 |
+
|
| 355 |
+
for (size_t ind = 0; ind < m_scores.size(); ++ind) {
|
| 356 |
+
fileTPColl.read((char*) &m_scores[ind], sizeof(float));
|
| 357 |
+
|
| 358 |
+
bytesRead += sizeof(float);
|
| 359 |
+
}
|
| 360 |
+
|
| 361 |
+
std::transform(m_scores.begin(),m_scores.end(),m_scores.begin(), Moses::TransformScore);
|
| 362 |
+
std::transform(m_scores.begin(),m_scores.end(),m_scores.begin(), Moses::FloorScore);
|
| 363 |
+
|
| 364 |
+
return bytesRead;
|
| 365 |
+
}
|
| 366 |
+
|
| 367 |
+
/** Prints the words (via Phrase::DebugPrint), then every alignment point as
 *  "first-second ", then ", ", then every score followed by a space.
 */
void TargetPhrase::DebugPrint(ostream &out, const Vocab &vocab) const
{
  Phrase::DebugPrint(out, vocab);

  for (AlignType::const_iterator point = m_align.begin(); point != m_align.end(); ++point) {
    out << point->first << "-" << point->second << " ";
  }
  out << ", ";

  for (std::vector<float>::const_iterator score = m_scores.begin(); score != m_scores.end(); ++score) {
    out << *score << " ";
  }
}
|
| 383 |
+
|
| 384 |
+
std::ostream& operator<<(std::ostream &out, const TargetPhrase &phrase)
|
| 385 |
+
{
|
| 386 |
+
out << (const Phrase&) phrase << ", " ;
|
| 387 |
+
|
| 388 |
+
for (size_t ind = 0; ind < phrase.m_align.size(); ++ind) {
|
| 389 |
+
const AlignPair &alignPair = phrase.m_align[ind];
|
| 390 |
+
out << alignPair.first << "-" << alignPair.second << " ";
|
| 391 |
+
}
|
| 392 |
+
out << ", ";
|
| 393 |
+
|
| 394 |
+
for (size_t ind = 0; ind < phrase.m_scores.size(); ++ind) {
|
| 395 |
+
out << phrase.m_scores[ind] << " ";
|
| 396 |
+
}
|
| 397 |
+
|
| 398 |
+
return out;
|
| 399 |
+
}
|
| 400 |
+
|
| 401 |
+
} // namespace
|
| 402 |
+
|
mosesdecoder/OnDiskPt/TargetPhrase.h
ADDED
|
@@ -0,0 +1,127 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#pragma once
|
| 2 |
+
// $Id$
|
| 3 |
+
/***********************************************************************
|
| 4 |
+
Moses - factored phrase-based, hierarchical and syntactic language decoder
|
| 5 |
+
Copyright (C) 2009 Hieu Hoang
|
| 6 |
+
|
| 7 |
+
This library is free software; you can redistribute it and/or
|
| 8 |
+
modify it under the terms of the GNU Lesser General Public
|
| 9 |
+
License as published by the Free Software Foundation; either
|
| 10 |
+
version 2.1 of the License, or (at your option) any later version.
|
| 11 |
+
|
| 12 |
+
This library is distributed in the hope that it will be useful,
|
| 13 |
+
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
| 14 |
+
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
| 15 |
+
Lesser General Public License for more details.
|
| 16 |
+
|
| 17 |
+
You should have received a copy of the GNU Lesser General Public
|
| 18 |
+
License along with this library; if not, write to the Free Software
|
| 19 |
+
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
| 20 |
+
***********************************************************************/
|
| 21 |
+
|
| 22 |
+
#include <fstream>
|
| 23 |
+
#include <string>
|
| 24 |
+
#include <vector>
|
| 25 |
+
#include "Word.h"
|
| 26 |
+
#include "Phrase.h"
|
| 27 |
+
#include "SourcePhrase.h"
|
| 28 |
+
|
| 29 |
+
namespace Moses
|
| 30 |
+
{
|
| 31 |
+
class PhraseDictionary;
|
| 32 |
+
class TargetPhrase;
|
| 33 |
+
class Phrase;
|
| 34 |
+
}
|
| 35 |
+
|
| 36 |
+
namespace OnDiskPt
|
| 37 |
+
{
|
| 38 |
+
|
| 39 |
+
// One alignment point stored as two 64-bit indices
// (presumably source position, target position -- TODO confirm in TargetPhrase.cpp).
typedef std::pair<uint64_t, uint64_t> AlignPair;
|
| 40 |
+
// A list of alignment points; this is the type of TargetPhrase::m_align.
typedef std::vector<AlignPair> AlignType;
|
| 41 |
+
|
| 42 |
+
class Vocab;
|
| 43 |
+
|
| 44 |
+
/** A target phrase, with the score breakdowns, alignment info and assorted other information it need.
|
| 45 |
+
* Readable and writeable to disk
|
| 46 |
+
*/
|
| 47 |
+
class TargetPhrase: public Phrase
|
| 48 |
+
{
|
| 49 |
+
friend std::ostream& operator<<(std::ostream&, const TargetPhrase&);
|
| 50 |
+
protected:
|
| 51 |
+
AlignType m_align;
|
| 52 |
+
PhrasePtr m_sourcePhrase;
|
| 53 |
+
std::string m_sparseFeatures, m_property;
|
| 54 |
+
|
| 55 |
+
std::vector<float> m_scores;
|
| 56 |
+
uint64_t m_filePos;
|
| 57 |
+
|
| 58 |
+
size_t WriteAlignToMemory(char *mem) const;
|
| 59 |
+
size_t WriteScoresToMemory(char *mem) const;
|
| 60 |
+
size_t WriteStringToMemory(char *mem, const std::string &str) const;
|
| 61 |
+
|
| 62 |
+
uint64_t ReadAlignFromFile(std::fstream &fileTPColl);
|
| 63 |
+
uint64_t ReadScoresFromFile(std::fstream &fileTPColl);
|
| 64 |
+
uint64_t ReadStringFromFile(std::fstream &fileTPColl, std::string &outStr);
|
| 65 |
+
|
| 66 |
+
public:
|
| 67 |
+
TargetPhrase() {
|
| 68 |
+
}
|
| 69 |
+
TargetPhrase(size_t numScores);
|
| 70 |
+
TargetPhrase(const TargetPhrase ©);
|
| 71 |
+
virtual ~TargetPhrase();
|
| 72 |
+
|
| 73 |
+
void SetSourcePhrase(PhrasePtr p) {
|
| 74 |
+
m_sourcePhrase = p;
|
| 75 |
+
}
|
| 76 |
+
const PhrasePtr GetSourcePhrase() const {
|
| 77 |
+
return m_sourcePhrase;
|
| 78 |
+
}
|
| 79 |
+
const std::vector<float> &GetScores() const {
|
| 80 |
+
return m_scores;
|
| 81 |
+
}
|
| 82 |
+
|
| 83 |
+
void SetLHS(WordPtr lhs);
|
| 84 |
+
|
| 85 |
+
void Create1AlignFromString(const std::string &align1Str);
|
| 86 |
+
void CreateAlignFromString(const std::string &align1Str);
|
| 87 |
+
void SetScore(float score, size_t ind);
|
| 88 |
+
|
| 89 |
+
const AlignType &GetAlign() const {
|
| 90 |
+
return m_align;
|
| 91 |
+
}
|
| 92 |
+
void SortAlign();
|
| 93 |
+
|
| 94 |
+
char *WriteToMemory(OnDiskWrapper &onDiskWrapper, size_t &memUsed) const;
|
| 95 |
+
char *WriteOtherInfoToMemory(OnDiskWrapper &onDiskWrapper, size_t &memUsed) const;
|
| 96 |
+
void Save(OnDiskWrapper &onDiskWrapper);
|
| 97 |
+
|
| 98 |
+
uint64_t GetFilePos() const {
|
| 99 |
+
return m_filePos;
|
| 100 |
+
}
|
| 101 |
+
float GetScore(size_t ind) const {
|
| 102 |
+
return m_scores[ind];
|
| 103 |
+
}
|
| 104 |
+
|
| 105 |
+
uint64_t ReadOtherInfoFromFile(uint64_t filePos, std::fstream &fileTPColl);
|
| 106 |
+
uint64_t ReadFromFile(std::fstream &fileTP);
|
| 107 |
+
|
| 108 |
+
virtual void DebugPrint(std::ostream &out, const Vocab &vocab) const;
|
| 109 |
+
|
| 110 |
+
const std::string &GetProperty() const {
|
| 111 |
+
return m_property;
|
| 112 |
+
}
|
| 113 |
+
|
| 114 |
+
void SetProperty(const std::string &value) {
|
| 115 |
+
m_property = value;
|
| 116 |
+
}
|
| 117 |
+
|
| 118 |
+
const std::string &GetSparseFeatures() const {
|
| 119 |
+
return m_sparseFeatures;
|
| 120 |
+
}
|
| 121 |
+
|
| 122 |
+
void SetSparseFeatures(const std::string &value) {
|
| 123 |
+
m_sparseFeatures = value;
|
| 124 |
+
}
|
| 125 |
+
};
|
| 126 |
+
|
| 127 |
+
}
|
mosesdecoder/OnDiskPt/TargetPhraseCollection.cpp
ADDED
|
@@ -0,0 +1,171 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
// $Id$
|
| 2 |
+
/***********************************************************************
|
| 3 |
+
Moses - factored phrase-based, hierarchical and syntactic language decoder
|
| 4 |
+
Copyright (C) 2009 Hieu Hoang
|
| 5 |
+
|
| 6 |
+
This library is free software; you can redistribute it and/or
|
| 7 |
+
modify it under the terms of the GNU Lesser General Public
|
| 8 |
+
License as published by the Free Software Foundation; either
|
| 9 |
+
version 2.1 of the License, or (at your option) any later version.
|
| 10 |
+
|
| 11 |
+
This library is distributed in the hope that it will be useful,
|
| 12 |
+
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
| 13 |
+
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
| 14 |
+
Lesser General Public License for more details.
|
| 15 |
+
|
| 16 |
+
You should have received a copy of the GNU Lesser General Public
|
| 17 |
+
License along with this library; if not, write to the Free Software
|
| 18 |
+
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
| 19 |
+
***********************************************************************/
|
| 20 |
+
|
| 21 |
+
#include <algorithm>
|
| 22 |
+
#include <iostream>
|
| 23 |
+
#include "moses/Util.h"
|
| 24 |
+
#include "TargetPhraseCollection.h"
|
| 25 |
+
#include "Vocab.h"
|
| 26 |
+
#include "OnDiskWrapper.h"
|
| 27 |
+
|
| 28 |
+
using namespace std;
|
| 29 |
+
|
| 30 |
+
namespace OnDiskPt
|
| 31 |
+
{
|
| 32 |
+
|
| 33 |
+
// Definition of the static score index that TargetPhraseOrderByScore uses as
// the sort key; as a static it is zero-initialised.
size_t TargetPhraseCollection::s_sortScoreInd;
|
| 34 |
+
|
| 35 |
+
// Creates an empty collection. m_filePos starts at the placeholder value 777;
// Save() overwrites it with the real file offset.
TargetPhraseCollection::TargetPhraseCollection()
  : m_filePos(777)
{
}
|
| 38 |
+
|
| 39 |
+
// Copies only the file position and the debug string. The phrases held in
// m_coll are NOT copied, so the new collection starts out empty.
TargetPhraseCollection::TargetPhraseCollection(const TargetPhraseCollection &copy)
  :m_filePos(copy.m_filePos)
  ,m_debugStr(copy.m_debugStr)
{
}
|
| 44 |
+
|
| 45 |
+
// Hands m_coll to Moses::RemoveAllInColl for clean-up
// (presumably deletes every stored pointer -- confirm in moses/Util.h).
TargetPhraseCollection::~TargetPhraseCollection()
{
  Moses::RemoveAllInColl(m_coll);
}
|
| 49 |
+
|
| 50 |
+
// Appends the given phrase pointer to the end of the collection.
void TargetPhraseCollection::AddTargetPhrase(TargetPhrase *targetPhrase)
{
  m_coll.push_back(targetPhrase);
}
|
| 54 |
+
|
| 55 |
+
void TargetPhraseCollection::Sort(size_t tableLimit)
|
| 56 |
+
{
|
| 57 |
+
std::sort(m_coll.begin(), m_coll.end(), TargetPhraseOrderByScore());
|
| 58 |
+
|
| 59 |
+
if (tableLimit && m_coll.size() > tableLimit) {
|
| 60 |
+
CollType::iterator iter;
|
| 61 |
+
for (iter = m_coll.begin() + tableLimit ; iter != m_coll.end(); ++iter) {
|
| 62 |
+
delete *iter;
|
| 63 |
+
}
|
| 64 |
+
m_coll.resize(tableLimit);
|
| 65 |
+
}
|
| 66 |
+
}
|
| 67 |
+
|
| 68 |
+
// Saves every phrase of the collection, then appends one record for the
// collection itself to the target-collection file and remembers where that
// record starts in m_filePos.
//
// Record layout: the number of phrases as a native uint64_t, followed by the
// "other info" blob of each phrase in collection order.
void TargetPhraseCollection::Save(OnDiskWrapper &onDiskWrapper)
{
  std::fstream &file = onDiskWrapper.GetFileTargetColl();

  // Start the record with the phrase count. A growable buffer replaces the
  // malloc/realloc chain, whose realloc result was never checked.
  const uint64_t numPhrases = GetSize();
  const char *countBytes = reinterpret_cast<const char*>(&numPhrases);
  std::vector<char> record(countBytes, countBytes + sizeof(uint64_t));

  // MAIN LOOP
  for (CollType::iterator iter = m_coll.begin(); iter != m_coll.end(); ++iter) {
    // save phrase
    TargetPhrase &targetPhrase = **iter;
    targetPhrase.Save(onDiskWrapper);

    // append this phrase's other-info blob to the collection record
    size_t memUsedTPOtherInfo = 0;
    char *memTPOtherInfo = targetPhrase.WriteOtherInfoToMemory(onDiskWrapper, memUsedTPOtherInfo);
    record.insert(record.end(), memTPOtherInfo, memTPOtherInfo + memUsedTPOtherInfo);

    // the blob is released with free(), exactly as before
    free(memTPOtherInfo);
  }

  // Seek to the end BEFORE sampling the position, so that startPos is the
  // real offset of the record even if the put pointer was elsewhere.
  file.seekp(0, std::ios::end);
  const uint64_t startPos = file.tellp();
  file.write(&record[0], record.size());

#ifndef NDEBUG
  const uint64_t endPos = file.tellp();
  assert(startPos + record.size() == endPos);
#endif
  m_filePos = startPos;
}
|
| 114 |
+
|
| 115 |
+
void TargetPhraseCollection::ReadFromFile(size_t tableLimit, uint64_t filePos, OnDiskWrapper &onDiskWrapper)
|
| 116 |
+
{
|
| 117 |
+
fstream &fileTPColl = onDiskWrapper.GetFileTargetColl();
|
| 118 |
+
fstream &fileTP = onDiskWrapper.GetFileTargetInd();
|
| 119 |
+
|
| 120 |
+
size_t numScores = onDiskWrapper.GetNumScores();
|
| 121 |
+
|
| 122 |
+
|
| 123 |
+
uint64_t numPhrases;
|
| 124 |
+
|
| 125 |
+
uint64_t currFilePos = filePos;
|
| 126 |
+
fileTPColl.seekg(filePos);
|
| 127 |
+
fileTPColl.read((char*) &numPhrases, sizeof(uint64_t));
|
| 128 |
+
|
| 129 |
+
// table limit
|
| 130 |
+
if (tableLimit) {
|
| 131 |
+
numPhrases = std::min(numPhrases, (uint64_t) tableLimit);
|
| 132 |
+
}
|
| 133 |
+
|
| 134 |
+
currFilePos += sizeof(uint64_t);
|
| 135 |
+
|
| 136 |
+
for (size_t ind = 0; ind < numPhrases; ++ind) {
|
| 137 |
+
TargetPhrase *tp = new TargetPhrase(numScores);
|
| 138 |
+
|
| 139 |
+
uint64_t sizeOtherInfo = tp->ReadOtherInfoFromFile(currFilePos, fileTPColl);
|
| 140 |
+
tp->ReadFromFile(fileTP);
|
| 141 |
+
|
| 142 |
+
currFilePos += sizeOtherInfo;
|
| 143 |
+
|
| 144 |
+
m_coll.push_back(tp);
|
| 145 |
+
}
|
| 146 |
+
}
|
| 147 |
+
|
| 148 |
+
uint64_t TargetPhraseCollection::GetFilePos() const
|
| 149 |
+
{
|
| 150 |
+
return m_filePos;
|
| 151 |
+
}
|
| 152 |
+
|
| 153 |
+
const std::string TargetPhraseCollection::GetDebugStr() const
|
| 154 |
+
{
|
| 155 |
+
return m_debugStr;
|
| 156 |
+
}
|
| 157 |
+
|
| 158 |
+
void TargetPhraseCollection::SetDebugStr(const std::string &str)
|
| 159 |
+
{
|
| 160 |
+
m_debugStr = str;
|
| 161 |
+
}
|
| 162 |
+
|
| 163 |
+
// Returns the phrase stored at position ind.
// The index is only checked by an assert, i.e. not at all in NDEBUG builds.
const TargetPhrase &TargetPhraseCollection::GetTargetPhrase(size_t ind) const
{
  assert(ind < GetSize());
  const TargetPhrase *phrase = m_coll[ind];
  return *phrase;
}
|
| 168 |
+
|
| 169 |
+
}
|
| 170 |
+
|
| 171 |
+
|
mosesdecoder/OnDiskPt/TargetPhraseCollection.h
ADDED
|
@@ -0,0 +1,84 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
// $Id$
|
| 2 |
+
/***********************************************************************
|
| 3 |
+
Moses - factored phrase-based, hierarchical and syntactic language decoder
|
| 4 |
+
Copyright (C) 2009 Hieu Hoang
|
| 5 |
+
|
| 6 |
+
This library is free software; you can redistribute it and/or
|
| 7 |
+
modify it under the terms of the GNU Lesser General Public
|
| 8 |
+
License as published by the Free Software Foundation; either
|
| 9 |
+
version 2.1 of the License, or (at your option) any later version.
|
| 10 |
+
|
| 11 |
+
This library is distributed in the hope that it will be useful,
|
| 12 |
+
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
| 13 |
+
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
| 14 |
+
Lesser General Public License for more details.
|
| 15 |
+
|
| 16 |
+
You should have received a copy of the GNU Lesser General Public
|
| 17 |
+
License along with this library; if not, write to the Free Software
|
| 18 |
+
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
| 19 |
+
***********************************************************************/
|
| 20 |
+
#pragma once
|
| 21 |
+
|
| 22 |
+
#include "TargetPhrase.h"
|
| 23 |
+
#include "Vocab.h"
|
| 24 |
+
#include <boost/shared_ptr.hpp>
|
| 25 |
+
|
| 26 |
+
namespace Moses
|
| 27 |
+
{
|
| 28 |
+
class TargetPhraseCollection;
|
| 29 |
+
class PhraseDictionary;
|
| 30 |
+
}
|
| 31 |
+
|
| 32 |
+
namespace OnDiskPt
|
| 33 |
+
{
|
| 34 |
+
|
| 35 |
+
/** A vector of target phrases
|
| 36 |
+
*/
|
| 37 |
+
class TargetPhraseCollection
|
| 38 |
+
{
|
| 39 |
+
class TargetPhraseOrderByScore
|
| 40 |
+
{
|
| 41 |
+
public:
|
| 42 |
+
bool operator()(const TargetPhrase* a, const TargetPhrase *b) const {
|
| 43 |
+
return a->GetScore(s_sortScoreInd) > b->GetScore(s_sortScoreInd);
|
| 44 |
+
}
|
| 45 |
+
};
|
| 46 |
+
|
| 47 |
+
protected:
|
| 48 |
+
typedef std::vector<TargetPhrase*> CollType;
|
| 49 |
+
CollType m_coll;
|
| 50 |
+
uint64_t m_filePos;
|
| 51 |
+
std::string m_debugStr;
|
| 52 |
+
|
| 53 |
+
public:
|
| 54 |
+
typedef boost::shared_ptr<TargetPhraseCollection const> shared_const_ptr;
|
| 55 |
+
typedef boost::shared_ptr<TargetPhraseCollection> shared_ptr;
|
| 56 |
+
|
| 57 |
+
static size_t s_sortScoreInd;
|
| 58 |
+
|
| 59 |
+
TargetPhraseCollection();
|
| 60 |
+
TargetPhraseCollection(const TargetPhraseCollection ©);
|
| 61 |
+
|
| 62 |
+
~TargetPhraseCollection();
|
| 63 |
+
void AddTargetPhrase(TargetPhrase *targetPhrase);
|
| 64 |
+
void Sort(size_t tableLimit);
|
| 65 |
+
|
| 66 |
+
void Save(OnDiskWrapper &onDiskWrapper);
|
| 67 |
+
|
| 68 |
+
size_t GetSize() const {
|
| 69 |
+
return m_coll.size();
|
| 70 |
+
}
|
| 71 |
+
|
| 72 |
+
const TargetPhrase &GetTargetPhrase(size_t ind) const;
|
| 73 |
+
|
| 74 |
+
uint64_t GetFilePos() const;
|
| 75 |
+
|
| 76 |
+
void ReadFromFile(size_t tableLimit, uint64_t filePos, OnDiskWrapper &onDiskWrapper);
|
| 77 |
+
|
| 78 |
+
const std::string GetDebugStr() const;
|
| 79 |
+
void SetDebugStr(const std::string &str);
|
| 80 |
+
|
| 81 |
+
};
|
| 82 |
+
|
| 83 |
+
}
|
| 84 |
+
|
mosesdecoder/OnDiskPt/Vocab.cpp
ADDED
|
@@ -0,0 +1,101 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
// $Id$
|
| 2 |
+
/***********************************************************************
|
| 3 |
+
Moses - factored phrase-based, hierarchical and syntactic language decoder
|
| 4 |
+
Copyright (C) 2009 Hieu Hoang
|
| 5 |
+
|
| 6 |
+
This library is free software; you can redistribute it and/or
|
| 7 |
+
modify it under the terms of the GNU Lesser General Public
|
| 8 |
+
License as published by the Free Software Foundation; either
|
| 9 |
+
version 2.1 of the License, or (at your option) any later version.
|
| 10 |
+
|
| 11 |
+
This library is distributed in the hope that it will be useful,
|
| 12 |
+
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
| 13 |
+
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
| 14 |
+
Lesser General Public License for more details.
|
| 15 |
+
|
| 16 |
+
You should have received a copy of the GNU Lesser General Public
|
| 17 |
+
License along with this library; if not, write to the Free Software
|
| 18 |
+
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
| 19 |
+
***********************************************************************/
|
| 20 |
+
#include <string>
|
| 21 |
+
#include <fstream>
|
| 22 |
+
#include "OnDiskWrapper.h"
|
| 23 |
+
#include "Vocab.h"
|
| 24 |
+
#include "moses/Util.h"
|
| 25 |
+
#include "util/exception.hh"
|
| 26 |
+
|
| 27 |
+
using namespace std;
|
| 28 |
+
|
| 29 |
+
namespace OnDiskPt
|
| 30 |
+
{
|
| 31 |
+
|
| 32 |
+
// Reads the vocabulary file (one "<word> <id>" pair per line) into
// m_vocabColl, then builds the reverse id -> word table m_lookup.
// Throws (via UTIL_THROW_IF2) when a line does not have exactly two tokens or
// when an id does not fit the contiguous-id assumption. Always returns true
// otherwise.
bool Vocab::Load(OnDiskWrapper &onDiskWrapper)
{
  std::fstream &file = onDiskWrapper.GetFileVocab();

  std::string line;
  while (std::getline(file, line)) {
    std::vector<std::string> tokens;
    Moses::Tokenize(tokens, line);
    UTIL_THROW_IF2(tokens.size() != 2, "Vocab file corrupted");
    m_vocabColl[tokens[0]] = Moses::Scan<uint64_t>(tokens[1]);
  }

  // create lookup
  // assume contiguous vocab id; slot 0 is left empty
  m_lookup.resize(m_vocabColl.size() + 1);
  m_nextId = m_lookup.size();

  for (CollType::const_iterator iter = m_vocabColl.begin(); iter != m_vocabColl.end(); ++iter) {
    // Keep the full 64-bit id (it used to be truncated to 32 bits) and reject
    // ids that would index past the end of the lookup table.
    const uint64_t vocabId = iter->second;
    UTIL_THROW_IF2(vocabId >= m_lookup.size(), "Vocab file corrupted: vocab id out of range");

    m_lookup[vocabId] = iter->first;
  }

  return true;
}
|
| 60 |
+
|
| 61 |
+
// Writes every vocabulary entry to the vocab file as "<word> <id>", one entry
// per line, in the iteration order of m_vocabColl.
void Vocab::Save(OnDiskWrapper &onDiskWrapper)
{
  std::fstream &file = onDiskWrapper.GetFileVocab();

  for (CollType::const_iterator iterVocab = m_vocabColl.begin(); iterVocab != m_vocabColl.end(); ++iterVocab) {
    const std::string &word = iterVocab->first;
    // Keep the id at its full 64-bit width; it used to be narrowed to 32 bits
    // before being written.
    const uint64_t vocabId = iterVocab->second;

    file << word << " " << vocabId << '\n';
  }

  // One flush for the whole table instead of one per line.
  file.flush();
}
|
| 72 |
+
|
| 73 |
+
uint64_t Vocab::AddVocabId(const std::string &str)
|
| 74 |
+
{
|
| 75 |
+
// find string id
|
| 76 |
+
CollType::const_iterator iter = m_vocabColl.find(str);
|
| 77 |
+
if (iter == m_vocabColl.end()) {
|
| 78 |
+
// add new vocab entry
|
| 79 |
+
m_vocabColl[str] = m_nextId;
|
| 80 |
+
return m_nextId++;
|
| 81 |
+
} else {
|
| 82 |
+
// return existing entry
|
| 83 |
+
return iter->second;
|
| 84 |
+
}
|
| 85 |
+
}
|
| 86 |
+
|
| 87 |
+
uint64_t Vocab::GetVocabId(const std::string &str, bool &found) const
|
| 88 |
+
{
|
| 89 |
+
// find string id
|
| 90 |
+
CollType::const_iterator iter = m_vocabColl.find(str);
|
| 91 |
+
if (iter == m_vocabColl.end()) {
|
| 92 |
+
found = false;
|
| 93 |
+
return 0; //return whatever
|
| 94 |
+
} else {
|
| 95 |
+
// return existing entry
|
| 96 |
+
found = true;
|
| 97 |
+
return iter->second;
|
| 98 |
+
}
|
| 99 |
+
}
|
| 100 |
+
|
| 101 |
+
}
|
mosesdecoder/OnDiskPt/Vocab.h
ADDED
|
@@ -0,0 +1,58 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#pragma once
|
| 2 |
+
// $Id$
|
| 3 |
+
/***********************************************************************
|
| 4 |
+
Moses - factored phrase-based, hierarchical and syntactic language decoder
|
| 5 |
+
Copyright (C) 2009 Hieu Hoang
|
| 6 |
+
|
| 7 |
+
This library is free software; you can redistribute it and/or
|
| 8 |
+
modify it under the terms of the GNU Lesser General Public
|
| 9 |
+
License as published by the Free Software Foundation; either
|
| 10 |
+
version 2.1 of the License, or (at your option) any later version.
|
| 11 |
+
|
| 12 |
+
This library is distributed in the hope that it will be useful,
|
| 13 |
+
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
| 14 |
+
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
| 15 |
+
Lesser General Public License for more details.
|
| 16 |
+
|
| 17 |
+
You should have received a copy of the GNU Lesser General Public
|
| 18 |
+
License along with this library; if not, write to the Free Software
|
| 19 |
+
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
| 20 |
+
***********************************************************************/
|
| 21 |
+
#include <string>
|
| 22 |
+
#include <map>
|
| 23 |
+
#include "moses/TypeDef.h"
|
| 24 |
+
|
| 25 |
+
|
| 26 |
+
namespace OnDiskPt
|
| 27 |
+
{
|
| 28 |
+
|
| 29 |
+
class OnDiskWrapper;

/* A bidirectional map of string<->contiguous id:
 * m_vocabColl maps a word to its id, m_lookup maps an id back to its word.
 * No distinction between source and target language
 */
class Vocab
{
protected:
  typedef std::map<std::string, uint64_t> CollType;
  CollType m_vocabColl;              // word -> id

  std::vector<std::string> m_lookup; // id -> word, opposite of m_vocabColl
  uint64_t m_nextId;                 // next id to hand out, starts @ 1

public:
  Vocab()
    : m_nextId(1)
  {
  }

  // Returns the id of str, adding str to the vocabulary if necessary.
  uint64_t AddVocabId(const std::string &str);
  // Read-only lookup; found reports whether str is known.
  uint64_t GetVocabId(const std::string &str, bool &found) const;
  // No bounds check: vocabId must be a valid index into m_lookup.
  const std::string &GetString(uint64_t vocabId) const {
    return m_lookup[vocabId];
  }

  bool Load(OnDiskWrapper &onDiskWrapper);
  void Save(OnDiskWrapper &onDiskWrapper);
};
|
| 56 |
+
|
| 57 |
+
}
|
| 58 |
+
|
mosesdecoder/OnDiskPt/Word.cpp
ADDED
|
@@ -0,0 +1,144 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
// $Id$
|
| 2 |
+
/***********************************************************************
|
| 3 |
+
Moses - factored phrase-based, hierarchical and syntactic language decoder
|
| 4 |
+
Copyright (C) 2009 Hieu Hoang
|
| 5 |
+
|
| 6 |
+
This library is free software; you can redistribute it and/or
|
| 7 |
+
modify it under the terms of the GNU Lesser General Public
|
| 8 |
+
License as published by the Free Software Foundation; either
|
| 9 |
+
version 2.1 of the License, or (at your option) any later version.
|
| 10 |
+
|
| 11 |
+
This library is distributed in the hope that it will be useful,
|
| 12 |
+
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
| 13 |
+
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
| 14 |
+
Lesser General Public License for more details.
|
| 15 |
+
|
| 16 |
+
You should have received a copy of the GNU Lesser General Public
|
| 17 |
+
License along with this library; if not, write to the Free Software
|
| 18 |
+
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
| 19 |
+
***********************************************************************/
|
| 20 |
+
|
| 21 |
+
#include <boost/algorithm/string/predicate.hpp>
|
| 22 |
+
#include "moses/Util.h"
|
| 23 |
+
#include "Word.h"
|
| 24 |
+
|
| 25 |
+
#include "util/tokenize_piece.hh"
|
| 26 |
+
#include "util/exception.hh"
|
| 27 |
+
|
| 28 |
+
using namespace std;
|
| 29 |
+
using namespace boost::algorithm;
|
| 30 |
+
|
| 31 |
+
namespace OnDiskPt
|
| 32 |
+
{
|
| 33 |
+
|
| 34 |
+
// Copy constructor: duplicates the non-terminal flag and the vocab id.
Word::Word(const Word &copy)
  :m_isNonTerminal(copy.m_isNonTerminal)
  ,m_vocabId(copy.m_vocabId)
{}
|
| 38 |
+
|
| 39 |
+
// Nothing to release: a Word holds only a flag and an id.
Word::~Word()
{
}
|
| 41 |
+
|
| 42 |
+
void Word::CreateFromString(const std::string &inString, Vocab &vocab)
|
| 43 |
+
{
|
| 44 |
+
if (starts_with(inString, "[") && ends_with(inString, "]")) {
|
| 45 |
+
// non-term
|
| 46 |
+
m_isNonTerminal = true;
|
| 47 |
+
string str = inString.substr(1, inString.size() - 2);
|
| 48 |
+
m_vocabId = vocab.AddVocabId(str);
|
| 49 |
+
} else {
|
| 50 |
+
m_isNonTerminal = false;
|
| 51 |
+
m_vocabId = vocab.AddVocabId(inString);
|
| 52 |
+
}
|
| 53 |
+
|
| 54 |
+
}
|
| 55 |
+
|
| 56 |
+
size_t Word::WriteToMemory(char *mem) const
|
| 57 |
+
{
|
| 58 |
+
uint64_t *vocabMem = (uint64_t*) mem;
|
| 59 |
+
vocabMem[0] = m_vocabId;
|
| 60 |
+
|
| 61 |
+
size_t size = sizeof(uint64_t);
|
| 62 |
+
|
| 63 |
+
// is non-term
|
| 64 |
+
char bNonTerm = (char) m_isNonTerminal;
|
| 65 |
+
mem[size] = bNonTerm;
|
| 66 |
+
++size;
|
| 67 |
+
|
| 68 |
+
return size;
|
| 69 |
+
}
|
| 70 |
+
|
| 71 |
+
size_t Word::ReadFromMemory(const char *mem)
|
| 72 |
+
{
|
| 73 |
+
uint64_t *vocabMem = (uint64_t*) mem;
|
| 74 |
+
m_vocabId = vocabMem[0];
|
| 75 |
+
|
| 76 |
+
size_t memUsed = sizeof(uint64_t);
|
| 77 |
+
|
| 78 |
+
// is non-term
|
| 79 |
+
char bNonTerm;
|
| 80 |
+
bNonTerm = mem[memUsed];
|
| 81 |
+
m_isNonTerminal = (bool) bNonTerm;
|
| 82 |
+
++memUsed;
|
| 83 |
+
|
| 84 |
+
return memUsed;
|
| 85 |
+
}
|
| 86 |
+
|
| 87 |
+
size_t Word::ReadFromFile(std::fstream &file)
|
| 88 |
+
{
|
| 89 |
+
const size_t memAlloc = sizeof(uint64_t) + sizeof(char);
|
| 90 |
+
char mem[sizeof(uint64_t) + sizeof(char)];
|
| 91 |
+
file.read(mem, memAlloc);
|
| 92 |
+
|
| 93 |
+
size_t memUsed = ReadFromMemory(mem);
|
| 94 |
+
assert(memAlloc == memUsed);
|
| 95 |
+
|
| 96 |
+
return memAlloc;
|
| 97 |
+
}
|
| 98 |
+
|
| 99 |
+
int Word::Compare(const Word &compare) const
|
| 100 |
+
{
|
| 101 |
+
int ret;
|
| 102 |
+
|
| 103 |
+
if (m_isNonTerminal != compare.m_isNonTerminal)
|
| 104 |
+
return m_isNonTerminal ?-1 : 1;
|
| 105 |
+
|
| 106 |
+
if (m_vocabId < compare.m_vocabId)
|
| 107 |
+
ret = -1;
|
| 108 |
+
else if (m_vocabId > compare.m_vocabId)
|
| 109 |
+
ret = 1;
|
| 110 |
+
else
|
| 111 |
+
ret = 0;
|
| 112 |
+
|
| 113 |
+
return ret;
|
| 114 |
+
}
|
| 115 |
+
|
| 116 |
+
bool Word::operator<(const Word &compare) const
|
| 117 |
+
{
|
| 118 |
+
int ret = Compare(compare);
|
| 119 |
+
return ret < 0;
|
| 120 |
+
}
|
| 121 |
+
|
| 122 |
+
bool Word::operator==(const Word &compare) const
|
| 123 |
+
{
|
| 124 |
+
int ret = Compare(compare);
|
| 125 |
+
return ret == 0;
|
| 126 |
+
}
|
| 127 |
+
|
| 128 |
+
// Writes the string that vocab associates with this word's id to out.
void Word::DebugPrint(ostream &out, const Vocab &vocab) const
{
  out << vocab.GetString(m_vocabId);
}
|
| 133 |
+
|
| 134 |
+
std::ostream& operator<<(std::ostream &out, const Word &word)
|
| 135 |
+
{
|
| 136 |
+
out << "(";
|
| 137 |
+
out << word.m_vocabId;
|
| 138 |
+
|
| 139 |
+
out << (word.m_isNonTerminal ? "n" : "t");
|
| 140 |
+
out << ")";
|
| 141 |
+
|
| 142 |
+
return out;
|
| 143 |
+
}
|
| 144 |
+
}
|
mosesdecoder/OnDiskPt/Word.h
ADDED
|
@@ -0,0 +1,91 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#pragma once
|
| 2 |
+
// $Id$
|
| 3 |
+
/***********************************************************************
|
| 4 |
+
Moses - factored phrase-based, hierarchical and syntactic language decoder
|
| 5 |
+
Copyright (C) 2009 Hieu Hoang
|
| 6 |
+
|
| 7 |
+
This library is free software; you can redistribute it and/or
|
| 8 |
+
modify it under the terms of the GNU Lesser General Public
|
| 9 |
+
License as published by the Free Software Foundation; either
|
| 10 |
+
version 2.1 of the License, or (at your option) any later version.
|
| 11 |
+
|
| 12 |
+
This library is distributed in the hope that it will be useful,
|
| 13 |
+
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
| 14 |
+
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
| 15 |
+
Lesser General Public License for more details.
|
| 16 |
+
|
| 17 |
+
You should have received a copy of the GNU Lesser General Public
|
| 18 |
+
License along with this library; if not, write to the Free Software
|
| 19 |
+
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
| 20 |
+
***********************************************************************/
|
| 21 |
+
#include <string>
|
| 22 |
+
#include <vector>
|
| 23 |
+
#include <iostream>
|
| 24 |
+
#include <fstream>
|
| 25 |
+
#include <boost/shared_ptr.hpp>
|
| 26 |
+
#include "Vocab.h"
|
| 27 |
+
|
| 28 |
+
namespace Moses
|
| 29 |
+
{
|
| 30 |
+
class Word;
|
| 31 |
+
}
|
| 32 |
+
|
| 33 |
+
namespace OnDiskPt
|
| 34 |
+
{
|
| 35 |
+
class Vocab;
|
| 36 |
+
|
| 37 |
+
/* A wrapper around a vocab id, and a boolean indicating whther it is a term or non-term.
|
| 38 |
+
* Factors can be represented by using a vocab string with | character, eg go|VB
|
| 39 |
+
*/
|
| 40 |
+
class Word
|
| 41 |
+
{
|
| 42 |
+
friend std::ostream& operator<<(std::ostream&, const Word&);
|
| 43 |
+
|
| 44 |
+
private:
|
| 45 |
+
bool m_isNonTerminal;
|
| 46 |
+
uint64_t m_vocabId;
|
| 47 |
+
|
| 48 |
+
public:
|
| 49 |
+
explicit Word() {
|
| 50 |
+
}
|
| 51 |
+
|
| 52 |
+
explicit Word(bool isNonTerminal)
|
| 53 |
+
:m_isNonTerminal(isNonTerminal)
|
| 54 |
+
,m_vocabId(0) {
|
| 55 |
+
}
|
| 56 |
+
|
| 57 |
+
Word(const Word ©);
|
| 58 |
+
~Word();
|
| 59 |
+
|
| 60 |
+
|
| 61 |
+
void CreateFromString(const std::string &inString, Vocab &vocab);
|
| 62 |
+
bool IsNonTerminal() const {
|
| 63 |
+
return m_isNonTerminal;
|
| 64 |
+
}
|
| 65 |
+
|
| 66 |
+
size_t WriteToMemory(char *mem) const;
|
| 67 |
+
size_t ReadFromMemory(const char *mem);
|
| 68 |
+
size_t ReadFromFile(std::fstream &file);
|
| 69 |
+
|
| 70 |
+
uint64_t GetVocabId() const {
|
| 71 |
+
return m_vocabId;
|
| 72 |
+
}
|
| 73 |
+
|
| 74 |
+
void SetVocabId(uint64_t vocabId) {
|
| 75 |
+
m_vocabId = vocabId;
|
| 76 |
+
}
|
| 77 |
+
|
| 78 |
+
void DebugPrint(std::ostream &out, const Vocab &vocab) const;
|
| 79 |
+
inline const std::string &GetString(const Vocab &vocab) const {
|
| 80 |
+
return vocab.GetString(m_vocabId);
|
| 81 |
+
}
|
| 82 |
+
|
| 83 |
+
int Compare(const Word &compare) const;
|
| 84 |
+
bool operator<(const Word &compare) const;
|
| 85 |
+
bool operator==(const Word &compare) const;
|
| 86 |
+
|
| 87 |
+
};
|
| 88 |
+
|
| 89 |
+
// Shared-ownership handle to a Word.
typedef boost::shared_ptr<Word> WordPtr;
|
| 90 |
+
}
|
| 91 |
+
|
mosesdecoder/OnDiskPt/bin/gcc-9/release/link-static/threading-multi/CreateOnDiskPt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:924fb66d9f0e64d799938679376511b174bd77ad4cbe1d218e33a9c3278402a3
|
| 3 |
+
size 9824568
|
mosesdecoder/OnDiskPt/bin/gcc-9/release/link-static/threading-multi/Main.o
ADDED
|
Binary file (67.2 kB). View file
|
|
|
mosesdecoder/OnDiskPt/bin/gcc-9/release/link-static/threading-multi/OnDiskQuery.o
ADDED
|
Binary file (23.2 kB). View file
|
|
|
mosesdecoder/OnDiskPt/bin/gcc-9/release/link-static/threading-multi/OnDiskWrapper.o
ADDED
|
Binary file (115 kB). View file
|
|
|
mosesdecoder/OnDiskPt/bin/gcc-9/release/link-static/threading-multi/Phrase.o
ADDED
|
Binary file (26 kB). View file
|
|
|
mosesdecoder/OnDiskPt/bin/gcc-9/release/link-static/threading-multi/PhraseNode.o
ADDED
|
Binary file (37.6 kB). View file
|
|
|
mosesdecoder/OnDiskPt/bin/gcc-9/release/link-static/threading-multi/SourcePhrase.o
ADDED
|
Binary file (2.19 kB). View file
|
|
|
mosesdecoder/OnDiskPt/bin/gcc-9/release/link-static/threading-multi/TargetPhrase.o
ADDED
|
Binary file (115 kB). View file
|
|
|
mosesdecoder/OnDiskPt/bin/gcc-9/release/link-static/threading-multi/TargetPhraseCollection.o
ADDED
|
Binary file (18.1 kB). View file
|
|
|
mosesdecoder/OnDiskPt/bin/gcc-9/release/link-static/threading-multi/Vocab.o
ADDED
|
Binary file (28.1 kB). View file
|
|
|
mosesdecoder/OnDiskPt/bin/gcc-9/release/link-static/threading-multi/Word.o
ADDED
|
Binary file (6.87 kB). View file
|
|
|
mosesdecoder/OnDiskPt/bin/gcc-9/release/link-static/threading-multi/queryOnDiskPt
ADDED
|
Binary file (977 kB). View file
|
|
|
mosesdecoder/OnDiskPt/bin/gcc-9/release/link-static/threading-multi/queryOnDiskPt.o
ADDED
|
Binary file (18.3 kB). View file
|
|
|
mosesdecoder/OnDiskPt/queryOnDiskPt.cpp
ADDED
|
@@ -0,0 +1,86 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
// Query binary phrase tables.
|
| 2 |
+
// Christian Hardmeier, 16 May 2010
|
| 3 |
+
|
| 4 |
+
#include <cstdlib>
|
| 5 |
+
#include <cstring>
|
| 6 |
+
#include <string>
|
| 7 |
+
#include <vector>
|
| 8 |
+
|
| 9 |
+
#include "moses/Util.h"
|
| 10 |
+
#include "OnDiskWrapper.h"
|
| 11 |
+
#include "SourcePhrase.h"
|
| 12 |
+
#include "OnDiskQuery.h"
|
| 13 |
+
|
| 14 |
+
using namespace std;
|
| 15 |
+
using namespace OnDiskPt;
|
| 16 |
+
|
| 17 |
+
void usage();
|
| 18 |
+
|
| 19 |
+
typedef unsigned int uint;
|
| 20 |
+
|
| 21 |
+
int main(int argc, char **argv)
|
| 22 |
+
{
|
| 23 |
+
int tableLimit = 20;
|
| 24 |
+
std::string ttable = "";
|
| 25 |
+
// bool useAlignments = false;
|
| 26 |
+
|
| 27 |
+
for(int i = 1; i < argc; i++) {
|
| 28 |
+
if(!strcmp(argv[i], "-tlimit")) {
|
| 29 |
+
if(i + 1 == argc)
|
| 30 |
+
usage();
|
| 31 |
+
tableLimit = atoi(argv[++i]);
|
| 32 |
+
} else if(!strcmp(argv[i], "-t")) {
|
| 33 |
+
if(i + 1 == argc)
|
| 34 |
+
usage();
|
| 35 |
+
ttable = argv[++i];
|
| 36 |
+
} else
|
| 37 |
+
usage();
|
| 38 |
+
}
|
| 39 |
+
|
| 40 |
+
if(ttable == "")
|
| 41 |
+
usage();
|
| 42 |
+
|
| 43 |
+
OnDiskWrapper onDiskWrapper;
|
| 44 |
+
onDiskWrapper.BeginLoad(ttable);
|
| 45 |
+
OnDiskQuery onDiskQuery(onDiskWrapper);
|
| 46 |
+
|
| 47 |
+
cerr << "Ready..." << endl;
|
| 48 |
+
|
| 49 |
+
std::string line;
|
| 50 |
+
while(getline(std::cin, line)) {
|
| 51 |
+
std::vector<std::string> tokens;
|
| 52 |
+
tokens = Moses::Tokenize(line, " ");
|
| 53 |
+
|
| 54 |
+
cerr << "line: " << line << endl;
|
| 55 |
+
const PhraseNode* node = onDiskQuery.Query(tokens);
|
| 56 |
+
|
| 57 |
+
if (node) {
|
| 58 |
+
// source phrase points to a bunch of rules
|
| 59 |
+
TargetPhraseCollection::shared_ptr coll = node->GetTargetPhraseCollection(tableLimit, onDiskWrapper);
|
| 60 |
+
string str = coll->GetDebugStr();
|
| 61 |
+
cout << "Found " << coll->GetSize() << endl;
|
| 62 |
+
|
| 63 |
+
for (size_t ind = 0; ind < coll->GetSize(); ++ind) {
|
| 64 |
+
const TargetPhrase &targetPhrase = coll->GetTargetPhrase(ind);
|
| 65 |
+
cerr << " ";
|
| 66 |
+
targetPhrase.DebugPrint(cerr, onDiskWrapper.GetVocab());
|
| 67 |
+
cerr << endl;
|
| 68 |
+
}
|
| 69 |
+
} else {
|
| 70 |
+
cout << "Not found" << endl;
|
| 71 |
+
}
|
| 72 |
+
|
| 73 |
+
std::cout << '\n';
|
| 74 |
+
std::cout.flush();
|
| 75 |
+
}
|
| 76 |
+
|
| 77 |
+
cerr << "Finished." << endl;
|
| 78 |
+
}
|
| 79 |
+
|
| 80 |
+
// Prints the command-line synopsis to stderr and terminates the process
// with exit status 1. Only the options that main() actually parses are listed.
void usage()
{
  std::cerr << "Usage: queryOnDiskPt [-tlimit <table limit>] -t <ttable>\n"
            "-tlimit <table limit> max number of rules per source phrase (default: 20)\n"
            "-t <ttable> phrase table\n";
  exit(1);
}
|
mosesdecoder/README
ADDED
|
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
Instructions for building and installing Moses are online:
|
| 2 |
+
|
| 3 |
+
http://www.statmt.org/moses/?n=Development.GetStarted
|
| 4 |
+
|
| 5 |
+
Questions should be directed to the mailing list (don't forget to register before sending emails):
|
| 6 |
+
http://mailman.mit.edu/mailman/listinfo/moses-support
|
| 7 |
+
|
| 8 |
+
Some of the code is not originally part of Moses, but is periodically copied
|
| 9 |
+
into the source tree from elsewhere:
|
| 10 |
+
|
| 11 |
+
* "bjam-files" is taken from Boost.
|
| 12 |
+
* "util" and "lm" are taken from KenLM: https://github.com/kpu/kenlm
|
| 13 |
+
|
| 14 |
+
|
mosesdecoder/azure-pipelines.yml
ADDED
|
@@ -0,0 +1,100 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Starter pipeline
|
| 2 |
+
# Start with a minimal pipeline that you can customize to build and deploy your code.
|
| 3 |
+
# Add steps that build, run tests, deploy, and more:
|
| 4 |
+
# https://aka.ms/yaml
|
| 5 |
+
|
| 6 |
+
trigger:
|
| 7 |
+
- master
|
| 8 |
+
|
| 9 |
+
pool:
|
| 10 |
+
#vmImage: 'ubuntu-latest'
|
| 11 |
+
vmImage: 'ubuntu-16.04'
|
| 12 |
+
|
| 13 |
+
steps:
|
| 14 |
+
|
| 15 |
+
- script: |
|
| 16 |
+
echo Printing some environment information
|
| 17 |
+
echo HOME: $HOME
|
| 18 |
+
echo
|
| 19 |
+
echo UBUNTU VERSION:
|
| 20 |
+
cat /etc/lsb-release
|
| 21 |
+
echo
|
| 22 |
+
echo CPU INFO
|
| 23 |
+
cat /proc/cpuinfo
|
| 24 |
+
echo
|
| 25 |
+
echo MEM INFO
|
| 26 |
+
cat /proc/meminfo
|
| 27 |
+
echo
|
| 28 |
+
echo DISK INFO
|
| 29 |
+
df -h
|
| 30 |
+
echo
|
| 31 |
+
echo PWD: $PWD
|
| 32 |
+
echo
|
| 33 |
+
ls
|
| 34 |
+
displayName: 'Printing some environment information'
|
| 35 |
+
|
| 36 |
+
|
| 37 |
+
## Installation commands for Ubuntu
|
| 38 |
+
- script: |
|
| 39 |
+
sudo apt-get install \
|
| 40 |
+
g++ \
|
| 41 |
+
git \
|
| 42 |
+
subversion \
|
| 43 |
+
automake \
|
| 44 |
+
libtool \
|
| 45 |
+
zlib1g-dev \
|
| 46 |
+
libicu-dev \
|
| 47 |
+
libboost-all-dev \
|
| 48 |
+
libssl-dev \
|
| 49 |
+
libbz2-dev \
|
| 50 |
+
liblzma-dev \
|
| 51 |
+
python-dev \
|
| 52 |
+
graphviz \
|
| 53 |
+
imagemagick \
|
| 54 |
+
make \
|
| 55 |
+
cmake \
|
| 56 |
+
libgoogle-perftools-dev \
|
| 57 |
+
autoconf \
|
| 58 |
+
doxygen
|
| 59 |
+
displayName: 'Install Ubuntu packages'
|
| 60 |
+
|
| 61 |
+
- script: |
|
| 62 |
+
wget "https://sourceforge.net/projects/cmph/files/v2.0.2/cmph-2.0.2.tar.gz/download"
|
| 63 |
+
mv download cmph-2.0.2.tar.gz
|
| 64 |
+
tar xvzf cmph-2.0.2.tar.gz
|
| 65 |
+
cd cmph-2.0.2
|
| 66 |
+
./configure --prefix=$PWD
|
| 67 |
+
make
|
| 68 |
+
make install
|
| 69 |
+
cd ..
|
| 70 |
+
displayName: 'Build and Install cmph'
|
| 71 |
+
|
| 72 |
+
- script: |
|
| 73 |
+
wget "https://sourceforge.net/projects/xmlrpc-c/files/Xmlrpc-c%20Super%20Stable/1.51.06/xmlrpc-c-1.51.06.tgz/download"
|
| 74 |
+
mv download xmlrpc-c-1.51.06.tgz
|
| 75 |
+
tar xvzf xmlrpc-c-1.51.06.tgz
|
| 76 |
+
cd xmlrpc-c-1.51.06
|
| 77 |
+
./configure --prefix=$PWD
|
| 78 |
+
make
|
| 79 |
+
make install
|
| 80 |
+
sudo ldconfig
|
| 81 |
+
cd ..
|
| 82 |
+
displayName: 'Build and Install xmlrpc-c'
|
| 83 |
+
|
| 84 |
+
- script: |
|
| 85 |
+
./bjam \
|
| 86 |
+
--with-cmph=$PWD/cmph-2.0.2 \
|
| 87 |
+
--with-xmlrpc-c=$PWD/xmlrpc-c-1.51.06 \
|
| 88 |
+
-j2
|
| 89 |
+
displayName: 'Build Moses'
|
| 90 |
+
|
| 91 |
+
# - script: |
|
| 92 |
+
# ./bjam \
|
| 93 |
+
# -j2
|
| 94 |
+
# displayName: 'Build Moses'
|
| 95 |
+
|
| 96 |
+
# - task: ComponentGovernanceComponentDetection@0
|
| 97 |
+
# inputs:
|
| 98 |
+
# scanType: 'Register'
|
| 99 |
+
# verbosity: 'Verbose'
|
| 100 |
+
# alertWarningLevel: 'High'
|
mosesdecoder/biconcor/Alignment.cpp
ADDED
|
@@ -0,0 +1,222 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#include "Alignment.h"
|
| 2 |
+
|
| 3 |
+
#include <fstream>
|
| 4 |
+
#include <string>
|
| 5 |
+
#include <cstdlib>
|
| 6 |
+
#include <cstring>
|
| 7 |
+
|
| 8 |
+
namespace
|
| 9 |
+
{
|
| 10 |
+
|
| 11 |
+
const int LINE_MAX_LENGTH = 10000;
|
| 12 |
+
|
| 13 |
+
} // namespace
|
| 14 |
+
|
| 15 |
+
using namespace std;
|
| 16 |
+
|
| 17 |
+
void Alignment::Create(const string& fileName)
|
| 18 |
+
{
|
| 19 |
+
ifstream textFile;
|
| 20 |
+
char line[LINE_MAX_LENGTH];
|
| 21 |
+
|
| 22 |
+
// count the number of words first;
|
| 23 |
+
textFile.open(fileName.c_str());
|
| 24 |
+
|
| 25 |
+
if (!textFile) {
|
| 26 |
+
cerr << "No such file or directory: " << fileName << endl;
|
| 27 |
+
exit(1);
|
| 28 |
+
}
|
| 29 |
+
|
| 30 |
+
istream *fileP = &textFile;
|
| 31 |
+
m_size = 0;
|
| 32 |
+
m_sentenceCount = 0;
|
| 33 |
+
while(!fileP->eof()) {
|
| 34 |
+
SAFE_GETLINE((*fileP), line, LINE_MAX_LENGTH, '\n');
|
| 35 |
+
if (fileP->eof()) break;
|
| 36 |
+
vector<string> alignmentSequence = Tokenize( line );
|
| 37 |
+
m_size += alignmentSequence.size();
|
| 38 |
+
m_sentenceCount++;
|
| 39 |
+
}
|
| 40 |
+
textFile.close();
|
| 41 |
+
cerr << m_size << " alignment points" << endl;
|
| 42 |
+
|
| 43 |
+
// allocate memory
|
| 44 |
+
m_array = (int*) calloc( sizeof(int), m_size*2 );
|
| 45 |
+
m_sentenceEnd = (INDEX*) calloc( sizeof( INDEX ), m_sentenceCount );
|
| 46 |
+
|
| 47 |
+
if (m_array == NULL) {
|
| 48 |
+
cerr << "Error: cannot allocate memory to m_array" << endl;
|
| 49 |
+
exit(1);
|
| 50 |
+
}
|
| 51 |
+
|
| 52 |
+
if (m_sentenceEnd == NULL) {
|
| 53 |
+
cerr << "Error: cannot allocate memory to m_sentenceEnd" << endl;
|
| 54 |
+
exit(1);
|
| 55 |
+
}
|
| 56 |
+
|
| 57 |
+
// fill the array
|
| 58 |
+
int alignmentPointIndex = 0;
|
| 59 |
+
int sentenceId = 0;
|
| 60 |
+
|
| 61 |
+
textFile.open(fileName.c_str());
|
| 62 |
+
if (!textFile) {
|
| 63 |
+
cerr << "Failed to open " << fileName << endl;
|
| 64 |
+
exit(1);
|
| 65 |
+
}
|
| 66 |
+
|
| 67 |
+
fileP = &textFile;
|
| 68 |
+
while(!fileP->eof()) {
|
| 69 |
+
SAFE_GETLINE((*fileP), line, LINE_MAX_LENGTH, '\n');
|
| 70 |
+
if (fileP->eof()) break;
|
| 71 |
+
vector<string> alignmentSequence = Tokenize( line );
|
| 72 |
+
for(size_t i=0; i<alignmentSequence.size(); i++) {
|
| 73 |
+
int s,t;
|
| 74 |
+
// cout << "scaning " << alignmentSequence[i].c_str() << endl;
|
| 75 |
+
if (! sscanf(alignmentSequence[i].c_str(), "%d-%d", &s, &t)) {
|
| 76 |
+
cerr << "WARNING: " << alignmentSequence[i] << " is a bad alignment point in sentence " << sentenceId << endl;
|
| 77 |
+
}
|
| 78 |
+
m_array[alignmentPointIndex++] = (char) s;
|
| 79 |
+
m_array[alignmentPointIndex++] = (char) t;
|
| 80 |
+
}
|
| 81 |
+
m_sentenceEnd[ sentenceId++ ] = alignmentPointIndex - 2;
|
| 82 |
+
}
|
| 83 |
+
textFile.close();
|
| 84 |
+
cerr << "done reading " << (alignmentPointIndex/2) << " alignment points, " << sentenceId << " sentences." << endl;
|
| 85 |
+
}
|
| 86 |
+
|
| 87 |
+
// Creates an empty alignment: no arrays allocated, zero points, zero sentences.
Alignment::Alignment()
  : m_array(NULL)
  , m_sentenceEnd(NULL)
  , m_size(0)
  , m_sentenceCount(0)
{
}
|
| 92 |
+
|
| 93 |
+
// Releases the two calloc'ed arrays. free() accepts NULL, so no explicit
// null checks are required.
Alignment::~Alignment()
{
  free(m_array);
  free(m_sentenceEnd);
}
|
| 102 |
+
|
| 103 |
+
// Splits a NUL-terminated C string into tokens separated by runs of spaces
// and/or tabs. Leading and trailing separators produce no empty tokens.
vector<string> Alignment::Tokenize( const char input[] )
{
  vector< string > pieces;
  const char *cursor = input;

  while (*cursor != '\0') {
    // skip the separator run in front of the next token
    while (*cursor == ' ' || *cursor == '\t') {
      ++cursor;
    }
    if (*cursor == '\0') {
      break;
    }

    // consume the token itself
    const char *wordBegin = cursor;
    while (*cursor != '\0' && *cursor != ' ' && *cursor != '\t') {
      ++cursor;
    }
    pieces.push_back( string( wordBegin, cursor - wordBegin ) );
  }

  return pieces;
}
|
| 124 |
+
|
| 125 |
+
// Finds the target span aligned to the source span [source_start, source_end]
// of the given sentence.
//
// Outputs (by reference):
//   target_start / target_end  smallest / largest target position aligned to
//                              a source word inside the span
//   pre_null / post_null       number of consecutive unaligned target words
//                              directly before / after that target span
//
// Returns false when no source word of the span is aligned, when a target
// word inside the resulting span is aligned to a source word outside the
// source span, or when target_length exceeds the capacity of the
// m_unaligned scratch buffer. Returns true otherwise.
bool Alignment::PhraseAlignment( INDEX sentence, int target_length,
                                 int source_start, int source_end,
                                 int &target_start, int &target_end,
                                 int &pre_null, int &post_null )
{
  // get index for first alignment point
  INDEX sentenceStart = 0;
  if (sentence > 0) {
    sentenceStart = m_sentenceEnd[ sentence-1 ] + 2;
  }
  // One-past-the-end index. m_sentenceEnd holds "last point index", which
  // for a first sentence without points is (INDEX)-2; adding 2 wraps that
  // back to 0, so the loops below simply do not run. Comparing with
  // "ap <= m_sentenceEnd[sentence]" would iterate far out of bounds there.
  const INDEX sentenceStop = m_sentenceEnd[ sentence ] + 2;

  // get target phrase boundaries
  target_start = target_length;
  target_end = 0;
  for(INDEX ap = sentenceStart; ap < sentenceStop; ap += 2 ) {
    int source = m_array[ ap ];
    if (source >= source_start && source <= source_end ) {
      int target = m_array[ ap+1 ];
      if (target < target_start) target_start = target;
      if (target > target_end ) target_end = target;
    }
  }
  if (target_start == target_length) {
    return false; // done if no alignment points
  }

  // check consistency
  for(INDEX ap = sentenceStart; ap < sentenceStop; ap += 2 ) {
    int target = m_array[ ap+1 ];
    if (target >= target_start && target <= target_end ) {
      int source = m_array[ ap ];
      if (source < source_start || source > source_end) {
        return false; // alignment point out of range
      }
    }
  }

  // The scratch buffer has a fixed size; refuse sentences that do not fit
  // instead of writing past its end.
  if (target_length > static_cast<int>( sizeof(m_unaligned) )) {
    return false;
  }

  // create array for unaligned words
  for( int i=0; i<target_length; i++ ) {
    m_unaligned[i] = true;
  }
  for(INDEX ap = sentenceStart; ap < sentenceStop; ap += 2 ) {
    int target = m_array[ ap+1 ];
    // only positions below target_length are ever read back, so anything
    // else (including negative values) is skipped rather than written
    if (target >= 0 && target < target_length) {
      m_unaligned[ target ] = false;
    }
  }

  // prior unaligned words
  pre_null = 0;
  for(int target = target_start-1; target >= 0 && m_unaligned[ target ]; target--) {
    pre_null++;
  }

  // post unaligned words;
  post_null = 0;
  for(int target = target_end+1; target < target_length && m_unaligned[ target ]; target++) {
    post_null++;
  }
  return true;
}
|
| 184 |
+
|
| 185 |
+
void Alignment::Save(const string& fileName ) const
|
| 186 |
+
{
|
| 187 |
+
FILE *pFile = fopen ( (fileName + ".align").c_str() , "w" );
|
| 188 |
+
if (pFile == NULL) {
|
| 189 |
+
cerr << "Cannot open " << fileName << ".align" << endl;
|
| 190 |
+
exit(1);
|
| 191 |
+
}
|
| 192 |
+
|
| 193 |
+
fwrite( &m_size, sizeof(INDEX), 1, pFile );
|
| 194 |
+
fwrite( m_array, sizeof(int), m_size*2, pFile ); // corpus
|
| 195 |
+
|
| 196 |
+
fwrite( &m_sentenceCount, sizeof(INDEX), 1, pFile );
|
| 197 |
+
fwrite( m_sentenceEnd, sizeof(INDEX), m_sentenceCount, pFile); // sentence index
|
| 198 |
+
fclose( pFile );
|
| 199 |
+
}
|
| 200 |
+
|
| 201 |
+
void Alignment::Load(const string& fileName )
|
| 202 |
+
{
|
| 203 |
+
FILE *pFile = fopen ( (fileName + ".align").c_str() , "r" );
|
| 204 |
+
if (pFile == NULL) {
|
| 205 |
+
cerr << "no such file or directory: " << fileName << ".align" << endl;
|
| 206 |
+
exit(1);
|
| 207 |
+
}
|
| 208 |
+
|
| 209 |
+
cerr << "loading from " << fileName << ".align" << endl;
|
| 210 |
+
|
| 211 |
+
fread( &m_size, sizeof(INDEX), 1, pFile );
|
| 212 |
+
cerr << "alignment points in corpus: " << m_size << endl;
|
| 213 |
+
m_array = (int*) calloc( sizeof(int), m_size*2 );
|
| 214 |
+
fread( m_array, sizeof(int), m_size*2, pFile ); // corpus
|
| 215 |
+
|
| 216 |
+
fread( &m_sentenceCount, sizeof(INDEX), 1, pFile );
|
| 217 |
+
cerr << "sentences in corpus: " << m_sentenceCount << endl;
|
| 218 |
+
m_sentenceEnd = (INDEX*) calloc( sizeof(INDEX), m_sentenceCount );
|
| 219 |
+
fread( m_sentenceEnd, sizeof(INDEX), m_sentenceCount, pFile); // sentence index
|
| 220 |
+
fclose( pFile );
|
| 221 |
+
cerr << "done loading\n";
|
| 222 |
+
}
|
mosesdecoder/biconcor/Alignment.h
ADDED
|
@@ -0,0 +1,47 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#pragma once
|
| 2 |
+
|
| 3 |
+
#include "Vocabulary.h"
|
| 4 |
+
|
| 5 |
+
// Word alignment of a parallel corpus, stored as flat arrays.
//
// m_array holds every alignment point as two consecutive ints
// (source position, target position). m_sentenceEnd[k] is the m_array index
// of the source entry of the last point of sentence k; a sentence without
// any points therefore has an end index two below its start.
class Alignment
{
public:
  typedef unsigned int INDEX;

private:
  int *m_array;            // (source, target) pairs, 2 ints per alignment point
  INDEX *m_sentenceEnd;    // per sentence: index of its last point in m_array
  INDEX m_size;            // total number of alignment points
  INDEX m_sentenceCount;   // number of sentences
  char m_unaligned[ 256 ]; // here for speed (local to PhraseAlignment)

  // No copying allowed.
  Alignment(const Alignment&);
  void operator=(const Alignment&);

public:
  Alignment();
  ~Alignment();

  // Builds the tables from a text alignment file.
  void Create(const std::string& fileName );
  // Computes the target span aligned to a source span; see Alignment.cpp.
  bool PhraseAlignment( INDEX sentence, int target_length,
                        int source_start, int source_end,
                        int &target_start, int &target_end,
                        int &pre_null, int &post_null );
  // Reads / writes the binary "<fileName>.align" representation.
  void Load(const std::string& fileName );
  void Save(const std::string& fileName ) const;
  // Splits a line on spaces and tabs.
  std::vector<std::string> Tokenize( const char input[] );

  // Index in m_array of the first alignment point of the sentence.
  INDEX GetSentenceStart( INDEX sentence ) const {
    if (sentence == 0) return 0;
    return m_sentenceEnd[ sentence-1 ] + 2;
  }
  // Number of alignment points of the sentence. m_sentenceEnd is the index
  // of the LAST point (not one past it), hence the "+ 2" before dividing:
  // without it the count is one too small, and a sentence with no points
  // underflows to a huge unsigned value. Unsigned wrap-around makes the
  // expression yield 0 for an empty sentence.
  INDEX GetNumberOfAlignmentPoints( INDEX sentence ) const {
    return ( m_sentenceEnd[ sentence ] + 2 - GetSentenceStart( sentence ) ) / 2;
  }
  // Source word position of the given alignment point of the sentence.
  int GetSourceWord( INDEX sentence, INDEX alignment_point ) const {
    return m_array[ GetSentenceStart( sentence ) + alignment_point*2 ];
  }
  // Target word position of the given alignment point of the sentence.
  int GetTargetWord( INDEX sentence, INDEX alignment_point ) const {
    return m_array[ GetSentenceStart( sentence ) + alignment_point*2 + 1 ];
  }
};
|
mosesdecoder/biconcor/CMakeLists.txt
ADDED
|
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
project(biconcor)
|
| 2 |
+
|
| 3 |
+
FILE(GLOB biconcor_source *.cpp)
|
| 4 |
+
|
| 5 |
+
add_executable(biconcor ${biconcor_source})
|
mosesdecoder/biconcor/Jamfile
ADDED
|
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
|
|
|
| 1 |
+
exe biconcor : Vocabulary.cpp SuffixArray.cpp TargetCorpus.cpp Alignment.cpp Mismatch.cpp PhrasePair.cpp PhrasePairCollection.cpp biconcor.cpp base64.cpp ;
|
| 2 |
+
exe phrase-lookup : Vocabulary.cpp SuffixArray.cpp phrase-lookup.cpp ;
|
mosesdecoder/biconcor/Mismatch.cpp
ADDED
|
@@ -0,0 +1,292 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#include "Mismatch.h"
|
| 2 |
+
|
| 3 |
+
#include <fstream>
|
| 4 |
+
#include <iostream>
|
| 5 |
+
#include <cstring>
|
| 6 |
+
#include <string>
|
| 7 |
+
#include <cstdlib>
|
| 8 |
+
|
| 9 |
+
#include "SuffixArray.h"
|
| 10 |
+
#include "TargetCorpus.h"
|
| 11 |
+
#include "Alignment.h"
|
| 12 |
+
#include "Vocabulary.h"
|
| 13 |
+
|
| 14 |
+
using namespace std;
|
| 15 |
+
|
| 16 |
+
// Annotation labels assigned to source/target words when rendering a
// mismatch. The numeric values double as indexes into the label_class
// table built in PrintClippedHTML, so the order must not change.
enum {
  UNANNOTATED = 0, // remaining values follow consecutively
  PRE_ALIGNED,     // 1
  POST_ALIGNED,    // 2
  UNALIGNED,       // 3
  MISALIGNED,      // 4
  ALIGNED          // 5
};
|
| 24 |
+
|
| 25 |
+
// Records the location of a source phrase occurrence and precomputes which
// source and target words of the sentence carry no alignment point.
// m_unaligned ends up true only if every word in
// [source_start, source_end] is unaligned.
Mismatch::Mismatch( SuffixArray *sa, TargetCorpus *tc, Alignment *a, INDEX sentence_id, INDEX position, int source_length, int target_length, int source_start, int source_end )
  :m_suffixArray(sa)
  ,m_targetCorpus(tc)
  ,m_alignment(a)
  ,m_sentence_id(sentence_id)
  ,m_source_length(source_length)
  ,m_target_length(target_length)
  ,m_source_position(position)
  ,m_source_start(source_start)
  ,m_source_end(source_end)
  ,m_unaligned(true)
{
  // start from "everything unaligned" ...
  for (int pos = 0; pos < m_source_length; ++pos) {
    m_source_unaligned[pos] = true;
  }
  for (int pos = 0; pos < m_target_length; ++pos) {
    m_target_unaligned[pos] = true;
  }

  // ... then clear the flag for both ends of every alignment point
  m_num_alignment_points =
    m_alignment->GetNumberOfAlignmentPoints( sentence_id );
  for (INDEX point = 0; point < m_num_alignment_points; ++point) {
    const int alignedSource = m_alignment->GetSourceWord( sentence_id, point );
    const int alignedTarget = m_alignment->GetTargetWord( sentence_id, point );
    m_source_unaligned[ alignedSource ] = false;
    m_target_unaligned[ alignedTarget ] = false;
  }

  // a single aligned word inside the phrase is enough to settle the flag
  for (int pos = source_start; pos <= source_end; ++pos) {
    if (!m_source_unaligned[ pos ]) {
      m_unaligned = false;
      break;
    }
  }
}
|
| 56 |
+
|
| 57 |
+
// Empty destructor: nothing is released here, in particular not the
// SuffixArray / TargetCorpus / Alignment pointers passed to the constructor.
Mismatch::~Mismatch () {}
|
| 58 |
+
|
| 59 |
+
// Writes one HTML table row (<tr>...</tr>) for this mismatch to *out.
//
// The row has four cells: left source context, the source phrase, right
// source context, and the target sentence excerpt. Words are wrapped in
// <span class="..."> elements according to an annotation computed here:
//  - if the source phrase is entirely unaligned, the nearest aligned source
//    words before/after it are labelled PRE_/POST_ALIGNED together with
//    their target words, and the words in between are labelled UNALIGNED;
//  - otherwise the target words aligned to the phrase are labelled ALIGNED,
//    other words enclosed by them UNALIGNED or MISALIGNED, and MISALIGNED is
//    propagated along alignment points until nothing changes.
//
// width is the character budget; half of it is used for the source side and
// half for the target side, and context is clipped ("... ") to fit.
void Mismatch::PrintClippedHTML( ostream* out, int width )
{
  int source_annotation[256], target_annotation[256];
  // CSS class per annotation label; indexed by the label enum values
  vector< string > label_class;
  label_class.push_back( "" );
  label_class.push_back( "mismatch_pre_aligned" );
  label_class.push_back( "mismatch_post_aligned" );
  label_class.push_back( "null_aligned" );
  label_class.push_back( "mismatch_misaligned" );
  label_class.push_back( "mismatch_aligned" );

  for(int i=0; i<m_source_length; i++) source_annotation[i] = UNANNOTATED;
  for(int i=0; i<m_target_length; i++) target_annotation[i] = UNANNOTATED;

  if (m_unaligned) {
    // find alignment points for prior and next word(s) and
    // center target phrase around those.
    bool found_aligned = false;
    for(int i=1; i<m_source_length && !found_aligned; i++) {
      if (m_source_start-i >= 0) {
        int word_id = m_source_start-i;
        source_annotation[ word_id ] = UNALIGNED;
        if (!m_source_unaligned[ word_id ]) {
          found_aligned = true;
          LabelSourceMatches( source_annotation, target_annotation, word_id, PRE_ALIGNED );
        }
      }

      if (m_source_end+i < m_source_length) {
        int word_id = m_source_end+i;
        source_annotation[ word_id ] = UNALIGNED;
        if (!m_source_unaligned[ word_id ]) {
          found_aligned = true;
          LabelSourceMatches( source_annotation, target_annotation, word_id, POST_ALIGNED );
        }
      }
    }

  }
  // misalignment
  else {
    // label aligned output words
    for(int i=m_source_start; i<=m_source_end; i++)
      LabelSourceMatches( source_annotation, target_annotation, i, ALIGNED );

    // find first and last
    int target_start = -1;
    int target_end = -1;
    for(int i=0; i<m_target_length; i++)
      if (target_annotation[i] == ALIGNED) {
        if (target_start == -1)
          target_start = i;
        target_end = i;
      }
    // go over all enclosed target words; skipped entirely when no target
    // word was labelled, because the -1 sentinels would otherwise be used
    // as array indexes
    if (target_start != -1) {
      for(int i=target_start; i<=target_end; i++) {
        // label other target words as unaligned or misaligned
        if (m_target_unaligned[ i ])
          target_annotation[ i ] = UNALIGNED;
        else {
          if (target_annotation[ i ] != ALIGNED)
            target_annotation[ i ] = MISALIGNED;
          // loop over aligned source words
          for(INDEX ap=0; ap<m_num_alignment_points; ap++) {
            if (m_alignment->GetTargetWord( m_sentence_id, ap ) == i) {
              int source_word = m_alignment->GetSourceWord( m_sentence_id, ap );
              // if not part of the source phrase -> also misaligned
              if (source_word < m_source_start || source_word > m_source_end)
                source_annotation[ source_word ] = MISALIGNED;
            }
          }
        }
      }
    }
    // closure
    bool change = true;
    while(change) {
      change = false;
      for(INDEX ap=0; ap<m_num_alignment_points; ap++) {
        int source_word = m_alignment->GetSourceWord( m_sentence_id, ap );
        int target_word = m_alignment->GetTargetWord( m_sentence_id, ap );
        if (source_annotation[source_word] != UNANNOTATED &&
            target_annotation[target_word] == UNANNOTATED) {
          target_annotation[target_word] = MISALIGNED;
          change = true;
        }
        if (source_annotation[source_word] == UNANNOTATED &&
            target_annotation[target_word] != UNANNOTATED) {
          source_annotation[source_word] = MISALIGNED;
          change = true;
        }
      }
    }
  }

  // print source
  // shorten source context if too long
  int sentence_start = m_source_position - m_source_start;
  int context_space = width/2;
  for(int i=m_source_start; i<=m_source_end; i++)
    context_space -= m_suffixArray->GetWord( sentence_start + i ).size() + 1;
  context_space /= 2;

  // extend to the left while budget remains; step back once if the last
  // word overshot the budget
  int remaining = context_space;
  int start_word = m_source_start;
  for(; start_word>0 && remaining>0; start_word--)
    remaining -= m_suffixArray->GetWord( sentence_start + start_word-1 ).size() + 1;
  if (remaining<0 || start_word == -1) start_word++;

  // extend to the right; the loop always advances one past the last word
  // it accounted for, hence the decrement
  remaining = context_space;
  int end_word = m_source_end;
  for(; end_word<m_source_length && remaining>0; end_word++)
    remaining -= m_suffixArray->GetWord( sentence_start + end_word ).size() + 1;
  end_word--;

  // output with markup
  *out << "<tr><td class=\"pp_source_left\">";
  char current_label = UNANNOTATED;
  if (start_word>0) {
    current_label = source_annotation[start_word-1];
    *out << "... ";
  }
  for(int i=start_word; i<=end_word; i++) {
    // change to phrase block
    if (i == m_source_start) {
      if (current_label != UNANNOTATED && i!=start_word)
        *out << "</span>";
      *out << "</td><td class=\"pp_source\">";
      current_label = UNANNOTATED;
    }

    // change to labeled word
    else if (source_annotation[i] != current_label &&
             source_annotation[i] != ALIGNED) {
      if (current_label != UNANNOTATED && i!=start_word)
        *out << "</span>";
      if (source_annotation[i] != UNANNOTATED)
        *out << "<span class=\""
             << label_class[ source_annotation[i] ]
             << "\">";
      current_label = source_annotation[i];
    }

    // output word
    *out << m_suffixArray->GetWord( sentence_start + i ) << " ";

    // change to right context block
    if (i == m_source_end) {
      *out << "</td><td class=\"pp_source_right\">";
      current_label = UNANNOTATED;
    }
  }

  if (current_label != UNANNOTATED && end_word>m_source_end)
    *out << "</span>";
  if (end_word<m_source_length-1)
    *out << "... ";

  // print target
  // shorten target context if too long
  int target_start = -1;
  int target_end=0;
  for(int i=0; i<m_target_length; i++)
    if (target_annotation[i] != UNANNOTATED) {
      if (target_start == -1)
        target_start = i;
      target_end = i;
    }
  // No annotated target word at all: anchor the excerpt at the first word
  // instead of asking the corpus for word -1.
  // NOTE(review): presumably the intended fallback, since target_end
  // already defaults to 0 - confirm.
  if (target_start == -1)
    target_start = 0;

  context_space = width/2;
  // the extra bound only matters for the fallback above with an empty target
  for(int i=target_start; i<=target_end && i<m_target_length; i++)
    context_space -= m_targetCorpus->GetWord( m_sentence_id, i ).size() + 1;
  while (context_space < 0) { // shorten matched part, if too long
    context_space +=
      m_targetCorpus->GetWord( m_sentence_id, target_start ).size() +
      m_targetCorpus->GetWord( m_sentence_id, target_end ).size() + 2;
    target_start++;
    target_end--;
  }
  context_space /= 2;

  remaining = context_space;
  start_word = target_start;
  for(; start_word>0 && remaining>0; start_word--) {
    //cerr << "remaining: " << remaining << ", start_word: " << start_word << endl;
    remaining -= m_targetCorpus->GetWord( m_sentence_id, start_word-1 ).size() + 1;
  }
  if (remaining<0 || start_word == -1) start_word++;

  remaining = context_space;
  end_word = target_end;
  for(; end_word<m_target_length && remaining>0; end_word++) {
    //cerr << "remaining: " << remaining << ", end_word: " << end_word << endl;
    remaining -= m_targetCorpus->GetWord( m_sentence_id, end_word ).size() + 1;
  }
  end_word--;

  // output with markup
  *out << "</td><td class=\"mismatch_target\">";
  current_label = UNANNOTATED;
  if (start_word>0) {
    current_label = target_annotation[start_word-1];
    *out << "... ";
  }
  for(int i=start_word; i<=end_word; i++) {
    if (target_annotation[i] != current_label) {
      if (current_label != UNANNOTATED && i!=start_word)
        *out << "</span>";
      if (target_annotation[i] != UNANNOTATED)
        *out << "<span class=\""
             << label_class[ target_annotation[i] ]
             << "\">";
      current_label = target_annotation[i];
    }

    // output word
    *out << m_targetCorpus->GetWord( m_sentence_id, i ) << " ";
  }

  if (current_label != UNANNOTATED && end_word>target_end)
    *out << "</span>";
  if (end_word<m_target_length-1)
    *out << "... ";
  *out << "</td></tr>";
}
|
| 283 |
+
|
| 284 |
+
void Mismatch::LabelSourceMatches(int *source_annotation, int *target_annotation, int source_id, int label )
|
| 285 |
+
{
|
| 286 |
+
for(INDEX ap=0; ap<m_num_alignment_points; ap++) {
|
| 287 |
+
if (m_alignment->GetSourceWord( m_sentence_id, ap ) == source_id) {
|
| 288 |
+
source_annotation[ source_id ] = label;
|
| 289 |
+
target_annotation[ m_alignment->GetTargetWord( m_sentence_id, ap ) ] = label;
|
| 290 |
+
}
|
| 291 |
+
}
|
| 292 |
+
}
|
mosesdecoder/biconcor/Mismatch.h
ADDED
|
@@ -0,0 +1,42 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#pragma once
|
| 2 |
+
|
| 3 |
+
#include <iosfwd>
|
| 4 |
+
|
| 5 |
+
// Collaborating types are only held and passed by pointer in this header,
// so forward declarations are sufficient.
class Alignment;
class SuffixArray;
class TargetCorpus;

// A single mismatch record tied to one sentence (m_sentence_id) and a source
// span (m_source_start .. m_source_end), with access to the suffix array,
// target corpus and alignment it was created from. Offers HTML output via
// PrintClippedHTML() and alignment-based labelling via LabelSourceMatches().
// Instances cannot be copied.
class Mismatch
{
public:
  // index type used for sentence ids, positions and alignment-point counts
  typedef unsigned int INDEX;

  Mismatch( SuffixArray *sa, TargetCorpus *tc, Alignment *a, INDEX sentence_id, INDEX position, int source_length, int target_length, int source_start, int source_end );
  ~Mismatch();

  // Returns the stored m_unaligned flag.
  bool Unaligned() const { return m_unaligned; }

  // Writes an HTML rendering to *out; `width` is the display width budget.
  void PrintClippedHTML(std::ostream* out, int width );

  // Stores `label` in source_annotation[source_id] and in target_annotation
  // for each target word aligned to source word source_id.
  void LabelSourceMatches(int *source_annotation, int *target_annotation, int source_id, int label );

private:
  // Copy construction and assignment are private so outside code cannot copy.
  Mismatch(const Mismatch&);
  void operator=(const Mismatch&);

  // Data members: declaration order is kept as-is because it determines
  // the order in which the constructor initializes them.
  SuffixArray *m_suffixArray;
  TargetCorpus *m_targetCorpus;
  Alignment *m_alignment;
  INDEX m_sentence_id;
  INDEX m_num_alignment_points;
  int m_source_length;
  int m_target_length;
  INDEX m_source_position;
  int m_source_start;
  int m_source_end;
  // Fixed-capacity flag arrays (256 entries each).
  // NOTE(review): presumably one flag per word position - confirm in
  // Mismatch.cpp that sentence lengths can never exceed this capacity.
  bool m_source_unaligned[ 256 ];
  bool m_target_unaligned[ 256 ];
  bool m_unaligned;
};
|