suricodes committed on
Commit
0a39881
·
verified ·
1 Parent(s): ac1e89a

Upload 172 files

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .gitattributes +11 -0
  2. mosesdecoder/lm/CMakeLists.txt +90 -0
  3. mosesdecoder/lm/Jamfile +40 -0
  4. mosesdecoder/lm/bhiksha.cc +94 -0
  5. mosesdecoder/lm/bhiksha.hh +122 -0
  6. mosesdecoder/lm/bin/gcc-9/release/link-static/threading-multi/bhiksha.o +0 -0
  7. mosesdecoder/lm/bin/gcc-9/release/link-static/threading-multi/binary_format.o +0 -0
  8. mosesdecoder/lm/bin/gcc-9/release/link-static/threading-multi/build_binary +3 -0
  9. mosesdecoder/lm/bin/gcc-9/release/link-static/threading-multi/build_binary_main.o +0 -0
  10. mosesdecoder/lm/bin/gcc-9/release/link-static/threading-multi/config.o +0 -0
  11. mosesdecoder/lm/bin/gcc-9/release/link-static/threading-multi/fragment +3 -0
  12. mosesdecoder/lm/bin/gcc-9/release/link-static/threading-multi/fragment_main.o +0 -0
  13. mosesdecoder/lm/bin/gcc-9/release/link-static/threading-multi/kenlm_benchmark +3 -0
  14. mosesdecoder/lm/bin/gcc-9/release/link-static/threading-multi/kenlm_benchmark_main.o +0 -0
  15. mosesdecoder/lm/bin/gcc-9/release/link-static/threading-multi/lm_exception.o +0 -0
  16. mosesdecoder/lm/bin/gcc-9/release/link-static/threading-multi/model.o +0 -0
  17. mosesdecoder/lm/bin/gcc-9/release/link-static/threading-multi/quantize.o +0 -0
  18. mosesdecoder/lm/bin/gcc-9/release/link-static/threading-multi/query +3 -0
  19. mosesdecoder/lm/bin/gcc-9/release/link-static/threading-multi/query_main.o +0 -0
  20. mosesdecoder/lm/bin/gcc-9/release/link-static/threading-multi/read_arpa.o +0 -0
  21. mosesdecoder/lm/bin/gcc-9/release/link-static/threading-multi/search_hashed.o +0 -0
  22. mosesdecoder/lm/bin/gcc-9/release/link-static/threading-multi/search_trie.o +0 -0
  23. mosesdecoder/lm/bin/gcc-9/release/link-static/threading-multi/sizes.o +0 -0
  24. mosesdecoder/lm/bin/gcc-9/release/link-static/threading-multi/trie.o +0 -0
  25. mosesdecoder/lm/bin/gcc-9/release/link-static/threading-multi/trie_sort.o +0 -0
  26. mosesdecoder/lm/bin/gcc-9/release/link-static/threading-multi/value_build.o +0 -0
  27. mosesdecoder/lm/bin/gcc-9/release/link-static/threading-multi/virtual_interface.o +0 -0
  28. mosesdecoder/lm/bin/gcc-9/release/link-static/threading-multi/vocab.o +0 -0
  29. mosesdecoder/lm/bin/left_test.test/gcc-9/release/link-static/threading-multi/left_test +3 -0
  30. mosesdecoder/lm/bin/left_test.test/gcc-9/release/link-static/threading-multi/left_test.o +0 -0
  31. mosesdecoder/lm/bin/left_test.test/gcc-9/release/link-static/threading-multi/left_test.output +8 -0
  32. mosesdecoder/lm/bin/left_test.test/gcc-9/release/link-static/threading-multi/left_test.run +8 -0
  33. mosesdecoder/lm/bin/left_test.test/gcc-9/release/link-static/threading-multi/left_test.test +1 -0
  34. mosesdecoder/lm/bin/model_test.test/gcc-9/release/link-static/threading-multi/model_test +3 -0
  35. mosesdecoder/lm/bin/model_test.test/gcc-9/release/link-static/threading-multi/model_test.o +3 -0
  36. mosesdecoder/lm/bin/model_test.test/gcc-9/release/link-static/threading-multi/model_test.output +11 -0
  37. mosesdecoder/lm/bin/model_test.test/gcc-9/release/link-static/threading-multi/model_test.run +11 -0
  38. mosesdecoder/lm/bin/model_test.test/gcc-9/release/link-static/threading-multi/model_test.test +1 -0
  39. mosesdecoder/lm/bin/order.log +1 -0
  40. mosesdecoder/lm/bin/partial_test.test/gcc-9/release/link-static/threading-multi/partial_test +3 -0
  41. mosesdecoder/lm/bin/partial_test.test/gcc-9/release/link-static/threading-multi/partial_test.o +0 -0
  42. mosesdecoder/lm/bin/partial_test.test/gcc-9/release/link-static/threading-multi/partial_test.output +8 -0
  43. mosesdecoder/lm/bin/partial_test.test/gcc-9/release/link-static/threading-multi/partial_test.run +8 -0
  44. mosesdecoder/lm/bin/partial_test.test/gcc-9/release/link-static/threading-multi/partial_test.test +1 -0
  45. mosesdecoder/lm/binary_format.cc +302 -0
  46. mosesdecoder/lm/binary_format.hh +106 -0
  47. mosesdecoder/lm/blank.hh +42 -0
  48. mosesdecoder/lm/build_binary_main.cc +234 -0
  49. mosesdecoder/lm/builder/CMakeLists.txt +67 -0
  50. mosesdecoder/lm/builder/Jamfile +13 -0
.gitattributes CHANGED
@@ -49,3 +49,14 @@ mosesdecoder/lib/libmoses2_lib.a filter=lfs diff=lfs merge=lfs -text
49
  mosesdecoder/lib/libmoses2decoder.a filter=lfs diff=lfs merge=lfs -text
50
  mosesdecoder/lib/libprobingpt.a filter=lfs diff=lfs merge=lfs -text
51
  mosesdecoder/lib/libsyntax_common.a filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
 
 
 
 
 
49
  mosesdecoder/lib/libmoses2decoder.a filter=lfs diff=lfs merge=lfs -text
50
  mosesdecoder/lib/libprobingpt.a filter=lfs diff=lfs merge=lfs -text
51
  mosesdecoder/lib/libsyntax_common.a filter=lfs diff=lfs merge=lfs -text
52
+ mosesdecoder/lm/bin/gcc-9/release/link-static/threading-multi/build_binary filter=lfs diff=lfs merge=lfs -text
53
+ mosesdecoder/lm/bin/gcc-9/release/link-static/threading-multi/fragment filter=lfs diff=lfs merge=lfs -text
54
+ mosesdecoder/lm/bin/gcc-9/release/link-static/threading-multi/kenlm_benchmark filter=lfs diff=lfs merge=lfs -text
55
+ mosesdecoder/lm/bin/gcc-9/release/link-static/threading-multi/query filter=lfs diff=lfs merge=lfs -text
56
+ mosesdecoder/lm/bin/left_test.test/gcc-9/release/link-static/threading-multi/left_test filter=lfs diff=lfs merge=lfs -text
57
+ mosesdecoder/lm/bin/model_test.test/gcc-9/release/link-static/threading-multi/model_test filter=lfs diff=lfs merge=lfs -text
58
+ mosesdecoder/lm/bin/model_test.test/gcc-9/release/link-static/threading-multi/model_test.o filter=lfs diff=lfs merge=lfs -text
59
+ mosesdecoder/lm/bin/partial_test.test/gcc-9/release/link-static/threading-multi/partial_test filter=lfs diff=lfs merge=lfs -text
60
+ mosesdecoder/lm/builder/bin/gcc-9/release/link-static/threading-multi/dump_counts filter=lfs diff=lfs merge=lfs -text
61
+ mosesdecoder/lm/builder/bin/gcc-9/release/link-static/threading-multi/lmplz filter=lfs diff=lfs merge=lfs -text
62
+ mosesdecoder/lm/filter/bin/gcc-9/release/link-static/threading-multi/filter filter=lfs diff=lfs merge=lfs -text
mosesdecoder/lm/CMakeLists.txt ADDED
@@ -0,0 +1,90 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ cmake_minimum_required(VERSION 2.8.8)
2
+ #
3
+ # The KenLM cmake files make use of add_library(... OBJECT ...)
4
+ #
5
+ # This syntax allows grouping of source files when compiling
6
+ # (effectively creating "fake" libraries based on source subdirs).
7
+ #
8
+ # This syntax was only added in cmake version 2.8.8
9
+ #
10
+ # see http://www.cmake.org/Wiki/CMake/Tutorials/Object_Library
11
+
12
+
13
+ # This CMake file was created by Lane Schwartz <dowobeha@gmail.com>
14
+
15
+
16
+ set(KENLM_MAX_ORDER 6 CACHE STRING "Maximum supported ngram order")
17
+
18
+ add_definitions(-DKENLM_MAX_ORDER=${KENLM_MAX_ORDER})
19
+
20
+
21
+ # Explicitly list the source files for this subdirectory
22
+ #
23
+ # If you add any source files to this subdirectory
24
+ # that should be included in the kenlm library,
25
+ # (this excludes any unit test files)
26
+ # you should add them to the following list:
27
+ set(KENLM_SOURCE
28
+ bhiksha.cc
29
+ binary_format.cc
30
+ config.cc
31
+ lm_exception.cc
32
+ model.cc
33
+ quantize.cc
34
+ read_arpa.cc
35
+ search_hashed.cc
36
+ search_trie.cc
37
+ sizes.cc
38
+ trie.cc
39
+ trie_sort.cc
40
+ value_build.cc
41
+ virtual_interface.cc
42
+ vocab.cc
43
+ )
44
+
45
+
46
+ # Group these objects together for later use.
47
+ #
48
+ # Given add_library(foo OBJECT ${my_foo_sources}),
49
+ # refer to these objects as $<TARGET_OBJECTS:foo>
50
+ #
51
+ add_library(kenlm OBJECT ${KENLM_SOURCE})
52
+
53
+ # This directory has children that need to be processed
54
+ add_subdirectory(builder)
55
+ add_subdirectory(common)
56
+ add_subdirectory(filter)
57
+
58
+
59
+
60
+ # Explicitly list the executable files to be compiled
61
+ set(EXE_LIST
62
+ query
63
+ fragment
64
+ build_binary
65
+ )
66
+
67
+ AddExes(EXES ${EXE_LIST}
68
+ DEPENDS $<TARGET_OBJECTS:kenlm> $<TARGET_OBJECTS:kenlm_util>
69
+ LIBRARIES ${Boost_LIBRARIES} pthread)
70
+
71
+ # Conditionally build the interpolation code
72
+ if(BUILD_INTERPOLATE)
73
+ add_subdirectory(interpolate)
74
+ endif()
75
+
76
+ if(BUILD_TESTING)
77
+
78
+ set(KENLM_BOOST_TESTS_LIST left_test partial_test)
79
+ AddTests(TESTS ${KENLM_BOOST_TESTS_LIST}
80
+ DEPENDS $<TARGET_OBJECTS:kenlm> $<TARGET_OBJECTS:kenlm_util>
81
+ LIBRARIES ${Boost_LIBRARIES} pthread
82
+ TEST_ARGS ${CMAKE_CURRENT_SOURCE_DIR}/test.arpa)
83
+
84
+ # model_test requires an extra command line parameter
85
+ KenLMAddTest(TEST model_test
86
+ DEPENDS $<TARGET_OBJECTS:kenlm> $<TARGET_OBJECTS:kenlm_util>
87
+ LIBRARIES ${Boost_LIBRARIES} pthread
88
+ TEST_ARGS ${CMAKE_CURRENT_SOURCE_DIR}/test.arpa
89
+ ${CMAKE_CURRENT_SOURCE_DIR}/test_nounk.arpa)
90
+ endif()
mosesdecoder/lm/Jamfile ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # If you need higher order, change this option
2
+ # Having this limit means that State can be
3
+ # (KENLM_MAX_ORDER - 1) * sizeof(float) bytes instead of
4
+ # sizeof(float*) + (KENLM_MAX_ORDER - 1) * sizeof(float) + malloc overhead
5
+ max-order = [ option.get "max-kenlm-order" : 6 : 6 ] ;
6
+ if ( $(max-order) != 6 ) {
7
+ echo "Setting KenLM maximum n-gram order to $(max-order)" ;
8
+ }
9
+ max-order = <define>KENLM_MAX_ORDER=$(max-order) ;
10
+
11
+ path-constant ORDER-LOG : bin/order.log ;
12
+ update-if-changed $(ORDER-LOG) $(max-order) ;
13
+
14
+ max-order += <dependency>$(ORDER-LOG) ;
15
+
16
+ wrappers = ;
17
+ local with-nplm = [ option.get "with-nplm" ] ;
18
+ if $(with-nplm) {
19
+ lib nplm : : <search>$(with-nplm)/src ;
20
+ obj nplm.o : wrappers/nplm.cc : <include>.. <include>$(with-nplm)/src <cxxflags>-fopenmp <include>$(with-nplm)/3rdparty/eigen <define>NPLM_DOUBLE_PRECISION=0 ;
21
+ alias nplm-all : nplm.o nplm ..//boost_thread : : : <cxxflags>-fopenmp <linkflags>-fopenmp <define>WITH_NPLM <library>..//boost_thread ;
22
+ wrappers += nplm-all ;
23
+ }
24
+
25
+ fakelib kenlm : $(wrappers) [ glob *.cc : *main.cc *test.cc ] ../util//kenutil : <include>.. $(max-order) : : <include>.. $(max-order) ;
26
+
27
+ import testing ;
28
+
29
+ run left_test.cc kenlm /top//boost_unit_test_framework : : test.arpa ;
30
+ run model_test.cc kenlm /top//boost_unit_test_framework : : test.arpa test_nounk.arpa ;
31
+ run partial_test.cc kenlm /top//boost_unit_test_framework : : test.arpa ;
32
+
33
+ exes = ;
34
+ for local p in [ glob *_main.cc ] {
35
+ local name = [ MATCH "(.*)\_main.cc" : $(p) ] ;
36
+ exe $(name) : $(p) kenlm ;
37
+ exes += $(name) ;
38
+ }
39
+
40
+ alias programs : $(exes) filter//filter filter//phrase_table_vocab builder//dump_counts : <threading>multi:<source>builder//lmplz ;
mosesdecoder/lm/bhiksha.cc ADDED
@@ -0,0 +1,94 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #include "lm/bhiksha.hh"
2
+
3
+ #include "lm/binary_format.hh"
4
+ #include "lm/config.hh"
5
+ #include "util/file.hh"
6
+ #include "util/exception.hh"
7
+
8
+ #include <limits>
9
+
10
+ namespace lm {
11
+ namespace ngram {
12
+ namespace trie {
13
+
14
+ DontBhiksha::DontBhiksha(const void * /*base*/, uint64_t /*max_offset*/, uint64_t max_next, const Config &/*config*/) :
15
+ next_(util::BitsMask::ByMax(max_next)) {}
16
+
17
+ const uint8_t kArrayBhikshaVersion = 0;
18
+
19
+ // TODO: put this in binary file header instead when I change the binary file format again.
20
+ void ArrayBhiksha::UpdateConfigFromBinary(const BinaryFormat &file, uint64_t offset, Config &config) {
21
+ uint8_t buffer[2];
22
+ file.ReadForConfig(buffer, 2, offset);
23
+ uint8_t version = buffer[0];
24
+ uint8_t configured_bits = buffer[1];
25
+ if (version != kArrayBhikshaVersion) UTIL_THROW(FormatLoadException, "This file has sorted array compression version " << (unsigned) version << " but the code expects version " << (unsigned)kArrayBhikshaVersion);
26
+ config.pointer_bhiksha_bits = configured_bits;
27
+ }
28
+
29
+ namespace {
30
+
31
+ // Find argmin_{chopped \in [0, min(RequiredBits(max_next), config.pointer_bhiksha_bits)]} ChoppedDelta(max_offset)
32
+ uint8_t ChopBits(uint64_t max_offset, uint64_t max_next, const Config &config) {
33
+ uint8_t required = util::RequiredBits(max_next);
34
+ uint8_t best_chop = 0;
35
+ int64_t lowest_change = std::numeric_limits<int64_t>::max();
36
+ // There are probably faster ways but I don't care because this is only done once per order at construction time.
37
+ for (uint8_t chop = 0; chop <= std::min(required, config.pointer_bhiksha_bits); ++chop) {
38
+ int64_t change = (max_next >> (required - chop)) * 64 /* table cost in bits */
39
+ - max_offset * static_cast<int64_t>(chop); /* savings in bits*/
40
+ if (change < lowest_change) {
41
+ lowest_change = change;
42
+ best_chop = chop;
43
+ }
44
+ }
45
+ return best_chop;
46
+ }
47
+
48
+ std::size_t ArrayCount(uint64_t max_offset, uint64_t max_next, const Config &config) {
49
+ uint8_t required = util::RequiredBits(max_next);
50
+ uint8_t chopping = ChopBits(max_offset, max_next, config);
51
+ return (max_next >> (required - chopping)) + 1 /* we store 0 too */;
52
+ }
53
+ } // namespace
54
+
55
+ uint64_t ArrayBhiksha::Size(uint64_t max_offset, uint64_t max_next, const Config &config) {
56
+ return sizeof(uint64_t) * (1 /* header */ + ArrayCount(max_offset, max_next, config)) + 7 /* 8-byte alignment */;
57
+ }
58
+
59
+ uint8_t ArrayBhiksha::InlineBits(uint64_t max_offset, uint64_t max_next, const Config &config) {
60
+ return util::RequiredBits(max_next) - ChopBits(max_offset, max_next, config);
61
+ }
62
+
63
+ namespace {
64
+
65
+ void *AlignTo8(void *from) {
66
+ uint8_t *val = reinterpret_cast<uint8_t*>(from);
67
+ std::size_t remainder = reinterpret_cast<std::size_t>(val) & 7;
68
+ if (!remainder) return val;
69
+ return val + 8 - remainder;
70
+ }
71
+
72
+ } // namespace
73
+
74
+ ArrayBhiksha::ArrayBhiksha(void *base, uint64_t max_offset, uint64_t max_next, const Config &config)
75
+ : next_inline_(util::BitsMask::ByBits(InlineBits(max_offset, max_next, config))),
76
+ offset_begin_(reinterpret_cast<const uint64_t*>(AlignTo8(base)) + 1 /* 8-byte header */),
77
+ offset_end_(offset_begin_ + ArrayCount(max_offset, max_next, config)),
78
+ write_to_(reinterpret_cast<uint64_t*>(AlignTo8(base)) + 1 /* 8-byte header */ + 1 /* first entry is 0 */),
79
+ original_base_(base) {}
80
+
81
+ void ArrayBhiksha::FinishedLoading(const Config &config) {
82
+ // *offset_begin_ = 0 but without a const_cast.
83
+ *(write_to_ - (write_to_ - offset_begin_)) = 0;
84
+
85
+ if (write_to_ != offset_end_) UTIL_THROW(util::Exception, "Did not get all the array entries that were expected.");
86
+
87
+ uint8_t *head_write = reinterpret_cast<uint8_t*>(original_base_);
88
+ *(head_write++) = kArrayBhikshaVersion;
89
+ *(head_write++) = config.pointer_bhiksha_bits;
90
+ }
91
+
92
+ } // namespace trie
93
+ } // namespace ngram
94
+ } // namespace lm
mosesdecoder/lm/bhiksha.hh ADDED
@@ -0,0 +1,122 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /* Simple implementation of
2
+ * @inproceedings{bhikshacompression,
3
+ * author={Bhiksha Raj and Ed Whittaker},
4
+ * year={2003},
5
+ * title={Lossless Compression of Language Model Structure and Word Identifiers},
6
+ * booktitle={Proceedings of IEEE International Conference on Acoustics, Speech and Signal Processing},
7
+ * pages={388--391},
8
+ * }
9
+ *
10
+ * Currently only used for next pointers.
11
+ */
12
+
13
+ #ifndef LM_BHIKSHA_H
14
+ #define LM_BHIKSHA_H
15
+
16
+ #include "lm/model_type.hh"
17
+ #include "lm/trie.hh"
18
+ #include "util/bit_packing.hh"
19
+ #include "util/sorted_uniform.hh"
20
+
21
+ #include <algorithm>
22
+ #include <stdint.h>
23
+ #include <cassert>
24
+
25
+ namespace lm {
26
+ namespace ngram {
27
+ struct Config;
28
+ class BinaryFormat;
29
+
30
+ namespace trie {
31
+
32
+ class DontBhiksha {
33
+ public:
34
+ static const ModelType kModelTypeAdd = static_cast<ModelType>(0);
35
+
36
+ static void UpdateConfigFromBinary(const BinaryFormat &, uint64_t, Config &/*config*/) {}
37
+
38
+ static uint64_t Size(uint64_t /*max_offset*/, uint64_t /*max_next*/, const Config &/*config*/) { return 0; }
39
+
40
+ static uint8_t InlineBits(uint64_t /*max_offset*/, uint64_t max_next, const Config &/*config*/) {
41
+ return util::RequiredBits(max_next);
42
+ }
43
+
44
+ DontBhiksha(const void *base, uint64_t max_offset, uint64_t max_next, const Config &config);
45
+
46
+ void ReadNext(const void *base, uint64_t bit_offset, uint64_t /*index*/, uint8_t total_bits, NodeRange &out) const {
47
+ out.begin = util::ReadInt57(base, bit_offset, next_.bits, next_.mask);
48
+ out.end = util::ReadInt57(base, bit_offset + total_bits, next_.bits, next_.mask);
49
+ //assert(out.end >= out.begin);
50
+ }
51
+
52
+ void WriteNext(void *base, uint64_t bit_offset, uint64_t /*index*/, uint64_t value) {
53
+ util::WriteInt57(base, bit_offset, next_.bits, value);
54
+ }
55
+
56
+ void FinishedLoading(const Config &/*config*/) {}
57
+
58
+ uint8_t InlineBits() const { return next_.bits; }
59
+
60
+ private:
61
+ util::BitsMask next_;
62
+ };
63
+
64
+ class ArrayBhiksha {
65
+ public:
66
+ static const ModelType kModelTypeAdd = kArrayAdd;
67
+
68
+ static void UpdateConfigFromBinary(const BinaryFormat &file, uint64_t offset, Config &config);
69
+
70
+ static uint64_t Size(uint64_t max_offset, uint64_t max_next, const Config &config);
71
+
72
+ static uint8_t InlineBits(uint64_t max_offset, uint64_t max_next, const Config &config);
73
+
74
+ ArrayBhiksha(void *base, uint64_t max_offset, uint64_t max_value, const Config &config);
75
+
76
+ void ReadNext(const void *base, uint64_t bit_offset, uint64_t index, uint8_t total_bits, NodeRange &out) const {
77
+ // Some assertions are commented out because they are expensive.
78
+ // assert(*offset_begin_ == 0);
79
+ // std::upper_bound returns the first element that is greater. Want the
80
+ // last element that is <= the index.
81
+ const uint64_t *begin_it = std::upper_bound(offset_begin_, offset_end_, index) - 1;
82
+ // Since *offset_begin_ == 0, the position should be in range.
83
+ // assert(begin_it >= offset_begin_);
84
+ const uint64_t *end_it;
85
+ for (end_it = begin_it + 1; (end_it < offset_end_) && (*end_it <= index + 1); ++end_it) {}
86
+ // assert(end_it == std::upper_bound(offset_begin_, offset_end_, index + 1));
87
+ --end_it;
88
+ // assert(end_it >= begin_it);
89
+ out.begin = ((begin_it - offset_begin_) << next_inline_.bits) |
90
+ util::ReadInt57(base, bit_offset, next_inline_.bits, next_inline_.mask);
91
+ out.end = ((end_it - offset_begin_) << next_inline_.bits) |
92
+ util::ReadInt57(base, bit_offset + total_bits, next_inline_.bits, next_inline_.mask);
93
+ // If this fails, consider rebuilding your model using KenLM after 1e333d786b748555e8f368d2bbba29a016c98052
94
+ assert(out.end >= out.begin);
95
+ }
96
+
97
+ void WriteNext(void *base, uint64_t bit_offset, uint64_t index, uint64_t value) {
98
+ uint64_t encode = value >> next_inline_.bits;
99
+ for (; write_to_ <= offset_begin_ + encode; ++write_to_) *write_to_ = index;
100
+ util::WriteInt57(base, bit_offset, next_inline_.bits, value & next_inline_.mask);
101
+ }
102
+
103
+ void FinishedLoading(const Config &config);
104
+
105
+ uint8_t InlineBits() const { return next_inline_.bits; }
106
+
107
+ private:
108
+ const util::BitsMask next_inline_;
109
+
110
+ const uint64_t *const offset_begin_;
111
+ const uint64_t *const offset_end_;
112
+
113
+ uint64_t *write_to_;
114
+
115
+ void *original_base_;
116
+ };
117
+
118
+ } // namespace trie
119
+ } // namespace ngram
120
+ } // namespace lm
121
+
122
+ #endif // LM_BHIKSHA_H
mosesdecoder/lm/bin/gcc-9/release/link-static/threading-multi/bhiksha.o ADDED
Binary file (24.4 kB). View file
 
mosesdecoder/lm/bin/gcc-9/release/link-static/threading-multi/binary_format.o ADDED
Binary file (87 kB). View file
 
mosesdecoder/lm/bin/gcc-9/release/link-static/threading-multi/build_binary ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:89bb1a5052a26025dee0f23bf7492c60881ae5b02ceb378b78905a1e166926cc
3
+ size 1367920
mosesdecoder/lm/bin/gcc-9/release/link-static/threading-multi/build_binary_main.o ADDED
Binary file (127 kB). View file
 
mosesdecoder/lm/bin/gcc-9/release/link-static/threading-multi/config.o ADDED
Binary file (2.63 kB). View file
 
mosesdecoder/lm/bin/gcc-9/release/link-static/threading-multi/fragment ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2e3bb1750b2c843ff2ad2b81b01b70f7e52c34abd8ce296575a19e10f9769b31
3
+ size 1367912
mosesdecoder/lm/bin/gcc-9/release/link-static/threading-multi/fragment_main.o ADDED
Binary file (55 kB). View file
 
mosesdecoder/lm/bin/gcc-9/release/link-static/threading-multi/kenlm_benchmark ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b00c5cb3fc290d10f4dd59c4e3c3472199ebe32cb9dcd25963e07a5e3227af89
3
+ size 1412248
mosesdecoder/lm/bin/gcc-9/release/link-static/threading-multi/kenlm_benchmark_main.o ADDED
Binary file (211 kB). View file
 
mosesdecoder/lm/bin/gcc-9/release/link-static/threading-multi/lm_exception.o ADDED
Binary file (11.7 kB). View file
 
mosesdecoder/lm/bin/gcc-9/release/link-static/threading-multi/model.o ADDED
Binary file (297 kB). View file
 
mosesdecoder/lm/bin/gcc-9/release/link-static/threading-multi/quantize.o ADDED
Binary file (42.2 kB). View file
 
mosesdecoder/lm/bin/gcc-9/release/link-static/threading-multi/query ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:dee549913c814cfa80d753c6dd7cb494099bd5c7da5cda664eedd73f7acc8f72
3
+ size 1388928
mosesdecoder/lm/bin/gcc-9/release/link-static/threading-multi/query_main.o ADDED
Binary file (167 kB). View file
 
mosesdecoder/lm/bin/gcc-9/release/link-static/threading-multi/read_arpa.o ADDED
Binary file (105 kB). View file
 
mosesdecoder/lm/bin/gcc-9/release/link-static/threading-multi/search_hashed.o ADDED
Binary file (169 kB). View file
 
mosesdecoder/lm/bin/gcc-9/release/link-static/threading-multi/search_trie.o ADDED
Binary file (196 kB). View file
 
mosesdecoder/lm/bin/gcc-9/release/link-static/threading-multi/sizes.o ADDED
Binary file (10.8 kB). View file
 
mosesdecoder/lm/bin/gcc-9/release/link-static/threading-multi/trie.o ADDED
Binary file (35.5 kB). View file
 
mosesdecoder/lm/bin/gcc-9/release/link-static/threading-multi/trie_sort.o ADDED
Binary file (118 kB). View file
 
mosesdecoder/lm/bin/gcc-9/release/link-static/threading-multi/value_build.o ADDED
Binary file (81.7 kB). View file
 
mosesdecoder/lm/bin/gcc-9/release/link-static/threading-multi/virtual_interface.o ADDED
Binary file (5.9 kB). View file
 
mosesdecoder/lm/bin/gcc-9/release/link-static/threading-multi/vocab.o ADDED
Binary file (124 kB). View file
 
mosesdecoder/lm/bin/left_test.test/gcc-9/release/link-static/threading-multi/left_test ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c226dcb9b7f76bb74bb4114ca69a955381ad9790c361f5ef642b772bb9b0b434
3
+ size 2458688
mosesdecoder/lm/bin/left_test.test/gcc-9/release/link-static/threading-multi/left_test.o ADDED
Binary file (698 kB). View file
 
mosesdecoder/lm/bin/left_test.test/gcc-9/release/link-static/threading-multi/left_test.output ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ Boost.Test WARNING: token "lm/test.arpa" does not correspond to the Boost.Test argument
2
+ and should be placed after all Boost.Test arguments and the -- separator.
3
+ For example: left_test --random -- lm/test.arpa
4
+ Running 6 test cases...
5
+
6
+ *** No errors detected
7
+
8
+ EXIT STATUS: 0
mosesdecoder/lm/bin/left_test.test/gcc-9/release/link-static/threading-multi/left_test.run ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ Boost.Test WARNING: token "lm/test.arpa" does not correspond to the Boost.Test argument
2
+ and should be placed after all Boost.Test arguments and the -- separator.
3
+ For example: left_test --random -- lm/test.arpa
4
+ Running 6 test cases...
5
+
6
+ *** No errors detected
7
+
8
+ EXIT STATUS: 0
mosesdecoder/lm/bin/left_test.test/gcc-9/release/link-static/threading-multi/left_test.test ADDED
@@ -0,0 +1 @@
 
 
1
+ passed
mosesdecoder/lm/bin/model_test.test/gcc-9/release/link-static/threading-multi/model_test ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c5f8ac4d5e3d965ef934f3075a93f9c59093e1cb6ee44ef27d71542a43cbb50f
3
+ size 2890976
mosesdecoder/lm/bin/model_test.test/gcc-9/release/link-static/threading-multi/model_test.o ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:db0276d839aa0274ab47aed2de1e3d1089c83eedfbb92ac58e9b17b6096940e1
3
+ size 1513008
mosesdecoder/lm/bin/model_test.test/gcc-9/release/link-static/threading-multi/model_test.output ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Boost.Test WARNING: token "lm/test.arpa" does not correspond to the Boost.Test argument
2
+ and should be placed after all Boost.Test arguments and the -- separator.
3
+ For example: model_test --random -- lm/test.arpa
4
+ Boost.Test WARNING: token "lm/test_nounk.arpa" does not correspond to the Boost.Test argument
5
+ and should be placed after all Boost.Test arguments and the -- separator.
6
+ For example: model_test --random -- lm/test_nounk.arpa
7
+ Running 12 test cases...
8
+
9
+ *** No errors detected
10
+
11
+ EXIT STATUS: 0
mosesdecoder/lm/bin/model_test.test/gcc-9/release/link-static/threading-multi/model_test.run ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Boost.Test WARNING: token "lm/test.arpa" does not correspond to the Boost.Test argument
2
+ and should be placed after all Boost.Test arguments and the -- separator.
3
+ For example: model_test --random -- lm/test.arpa
4
+ Boost.Test WARNING: token "lm/test_nounk.arpa" does not correspond to the Boost.Test argument
5
+ and should be placed after all Boost.Test arguments and the -- separator.
6
+ For example: model_test --random -- lm/test_nounk.arpa
7
+ Running 12 test cases...
8
+
9
+ *** No errors detected
10
+
11
+ EXIT STATUS: 0
mosesdecoder/lm/bin/model_test.test/gcc-9/release/link-static/threading-multi/model_test.test ADDED
@@ -0,0 +1 @@
 
 
1
+ passed
mosesdecoder/lm/bin/order.log ADDED
@@ -0,0 +1 @@
 
 
1
+ <define>KENLM_MAX_ORDER=6
mosesdecoder/lm/bin/partial_test.test/gcc-9/release/link-static/threading-multi/partial_test ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:50e0176970c570eea64c3cf5ce6ac7f3432f91e605676d7cbbf8e055a1e307b4
3
+ size 2254664
mosesdecoder/lm/bin/partial_test.test/gcc-9/release/link-static/threading-multi/partial_test.o ADDED
Binary file (211 kB). View file
 
mosesdecoder/lm/bin/partial_test.test/gcc-9/release/link-static/threading-multi/partial_test.output ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ Boost.Test WARNING: token "lm/test.arpa" does not correspond to the Boost.Test argument
2
+ and should be placed after all Boost.Test arguments and the -- separator.
3
+ For example: partial_test --random -- lm/test.arpa
4
+ Running 4 test cases...
5
+
6
+ *** No errors detected
7
+
8
+ EXIT STATUS: 0
mosesdecoder/lm/bin/partial_test.test/gcc-9/release/link-static/threading-multi/partial_test.run ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ Boost.Test WARNING: token "lm/test.arpa" does not correspond to the Boost.Test argument
2
+ and should be placed after all Boost.Test arguments and the -- separator.
3
+ For example: partial_test --random -- lm/test.arpa
4
+ Running 4 test cases...
5
+
6
+ *** No errors detected
7
+
8
+ EXIT STATUS: 0
mosesdecoder/lm/bin/partial_test.test/gcc-9/release/link-static/threading-multi/partial_test.test ADDED
@@ -0,0 +1 @@
 
 
1
+ passed
mosesdecoder/lm/binary_format.cc ADDED
@@ -0,0 +1,302 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #include "lm/binary_format.hh"
2
+
3
+ #include "lm/lm_exception.hh"
4
+ #include "util/file.hh"
5
+ #include "util/file_piece.hh"
6
+
7
+ #include <cstddef>
8
+ #include <cstring>
9
+ #include <limits>
10
+ #include <string>
11
+ #include <cstdlib>
12
+
13
+ #include <stdint.h>
14
+
15
+ namespace lm {
16
+ namespace ngram {
17
+
18
+ const char *kModelNames[6] = {"probing hash tables", "probing hash tables with rest costs", "trie", "trie with quantization", "trie with array-compressed pointers", "trie with quantization and array-compressed pointers"};
19
+
20
+ namespace {
21
+ const char kMagicBeforeVersion[] = "mmap lm http://kheafield.com/code format version";
22
+ const char kMagicBytes[] = "mmap lm http://kheafield.com/code format version 5\n\0";
23
+ // This must be shorter than kMagicBytes and indicates an incomplete binary file (i.e. build failed).
24
+ const char kMagicIncomplete[] = "mmap lm http://kheafield.com/code incomplete\n";
25
+ const long int kMagicVersion = 5;
26
+
27
+ // Old binary files built on 32-bit machines have this header.
28
+ // TODO: eliminate with next binary release.
29
+ struct OldSanity {
30
+ char magic[sizeof(kMagicBytes)];
31
+ float zero_f, one_f, minus_half_f;
32
+ WordIndex one_word_index, max_word_index;
33
+ uint64_t one_uint64;
34
+
35
+ void SetToReference() {
36
+ std::memset(this, 0, sizeof(OldSanity));
37
+ std::memcpy(magic, kMagicBytes, sizeof(magic));
38
+ zero_f = 0.0; one_f = 1.0; minus_half_f = -0.5;
39
+ one_word_index = 1;
40
+ max_word_index = std::numeric_limits<WordIndex>::max();
41
+ one_uint64 = 1;
42
+ }
43
+ };
44
+
45
+
46
+ // Test values aligned to 8 bytes.
47
+ struct Sanity {
48
+ char magic[ALIGN8(sizeof(kMagicBytes))];
49
+ float zero_f, one_f, minus_half_f;
50
+ WordIndex one_word_index, max_word_index, padding_to_8;
51
+ uint64_t one_uint64;
52
+
53
+ void SetToReference() {
54
+ std::memset(this, 0, sizeof(Sanity));
55
+ std::memcpy(magic, kMagicBytes, sizeof(kMagicBytes));
56
+ zero_f = 0.0; one_f = 1.0; minus_half_f = -0.5;
57
+ one_word_index = 1;
58
+ max_word_index = std::numeric_limits<WordIndex>::max();
59
+ padding_to_8 = 0;
60
+ one_uint64 = 1;
61
+ }
62
+ };
63
+
64
+ std::size_t TotalHeaderSize(unsigned char order) {
65
+ return ALIGN8(sizeof(Sanity) + sizeof(FixedWidthParameters) + sizeof(uint64_t) * order);
66
+ }
67
+
68
+ void WriteHeader(void *to, const Parameters &params) {
69
+ Sanity header = Sanity();
70
+ header.SetToReference();
71
+ std::memcpy(to, &header, sizeof(Sanity));
72
+ char *out = reinterpret_cast<char*>(to) + sizeof(Sanity);
73
+
74
+ *reinterpret_cast<FixedWidthParameters*>(out) = params.fixed;
75
+ out += sizeof(FixedWidthParameters);
76
+
77
+ uint64_t *counts = reinterpret_cast<uint64_t*>(out);
78
+ for (std::size_t i = 0; i < params.counts.size(); ++i) {
79
+ counts[i] = params.counts[i];
80
+ }
81
+ }
82
+
83
+ } // namespace
84
+
85
+ bool IsBinaryFormat(int fd) {
86
+ const uint64_t size = util::SizeFile(fd);
87
+ if (size == util::kBadSize || (size <= static_cast<uint64_t>(sizeof(Sanity)))) return false;
88
+ // Try reading the header.
89
+ util::scoped_memory memory;
90
+ try {
91
+ util::MapRead(util::LAZY, fd, 0, sizeof(Sanity), memory);
92
+ } catch (const util::Exception &e) {
93
+ return false;
94
+ }
95
+ Sanity reference_header = Sanity();
96
+ reference_header.SetToReference();
97
+ if (!std::memcmp(memory.get(), &reference_header, sizeof(Sanity))) return true;
98
+ if (!std::memcmp(memory.get(), kMagicIncomplete, strlen(kMagicIncomplete))) {
99
+ UTIL_THROW(FormatLoadException, "This binary file did not finish building");
100
+ }
101
+ if (!std::memcmp(memory.get(), kMagicBeforeVersion, strlen(kMagicBeforeVersion))) {
102
+ char *end_ptr;
103
+ const char *begin_version = static_cast<const char*>(memory.get()) + strlen(kMagicBeforeVersion);
104
+ long int version = std::strtol(begin_version, &end_ptr, 10);
105
+ if ((end_ptr != begin_version) && version != kMagicVersion) {
106
+ UTIL_THROW(FormatLoadException, "Binary file has version " << version << " but this implementation expects version " << kMagicVersion << " so you'll have to use the ARPA to rebuild your binary");
107
+ }
108
+
109
+ OldSanity old_sanity = OldSanity();
110
+ old_sanity.SetToReference();
111
+ UTIL_THROW_IF(!std::memcmp(memory.get(), &old_sanity, sizeof(OldSanity)), FormatLoadException, "Looks like this is an old 32-bit format. The old 32-bit format has been removed so that 64-bit and 32-bit files are exchangeable.");
112
+ UTIL_THROW(FormatLoadException, "File looks like it should be loaded with mmap, but the test values don't match. Try rebuilding the binary format LM using the same code revision, compiler, and architecture");
113
+ }
114
+ return false;
115
+ }
116
+
117
+ void ReadHeader(int fd, Parameters &out) {
118
+ util::SeekOrThrow(fd, sizeof(Sanity));
119
+ util::ReadOrThrow(fd, &out.fixed, sizeof(out.fixed));
120
+ if (out.fixed.probing_multiplier < 1.0)
121
+ UTIL_THROW(FormatLoadException, "Binary format claims to have a probing multiplier of " << out.fixed.probing_multiplier << " which is < 1.0.");
122
+
123
+ out.counts.resize(static_cast<std::size_t>(out.fixed.order));
124
+ if (out.fixed.order) util::ReadOrThrow(fd, &*out.counts.begin(), sizeof(uint64_t) * out.fixed.order);
125
+ }
126
+
127
+ void MatchCheck(ModelType model_type, unsigned int search_version, const Parameters &params) {
128
+ if (params.fixed.model_type != model_type) {
129
+ if (static_cast<unsigned int>(params.fixed.model_type) >= (sizeof(kModelNames) / sizeof(const char *)))
130
+ UTIL_THROW(FormatLoadException, "The binary file claims to be model type " << static_cast<unsigned int>(params.fixed.model_type) << " but this is not implemented for in this inference code.");
131
+ UTIL_THROW(FormatLoadException, "The binary file was built for " << kModelNames[params.fixed.model_type] << " but the inference code is trying to load " << kModelNames[model_type]);
132
+ }
133
+ UTIL_THROW_IF(search_version != params.fixed.search_version, FormatLoadException, "The binary file has " << kModelNames[params.fixed.model_type] << " version " << params.fixed.search_version << " but this code expects " << kModelNames[params.fixed.model_type] << " version " << search_version);
134
+ }
135
+
136
+ const std::size_t kInvalidSize = static_cast<std::size_t>(-1);
137
+
138
+ BinaryFormat::BinaryFormat(const Config &config)
139
+ : write_method_(config.write_method), write_mmap_(config.write_mmap), load_method_(config.load_method),
140
+ header_size_(kInvalidSize), vocab_size_(kInvalidSize), vocab_string_offset_(kInvalidOffset) {}
141
+
142
+ void BinaryFormat::InitializeBinary(int fd, ModelType model_type, unsigned int search_version, Parameters &params) {
143
+ file_.reset(fd);
144
+ write_mmap_ = NULL; // Ignore write requests; this is already in binary format.
145
+ ReadHeader(fd, params);
146
+ MatchCheck(model_type, search_version, params);
147
+ header_size_ = TotalHeaderSize(params.counts.size());
148
+ }
149
+
150
+ void BinaryFormat::ReadForConfig(void *to, std::size_t amount, uint64_t offset_excluding_header) const {
151
+ assert(header_size_ != kInvalidSize);
152
+ util::ErsatzPRead(file_.get(), to, amount, offset_excluding_header + header_size_);
153
+ }
154
+
155
+ void *BinaryFormat::LoadBinary(std::size_t size) {
156
+ assert(header_size_ != kInvalidSize);
157
+ const uint64_t file_size = util::SizeFile(file_.get());
158
+ // The header is smaller than a page, so we have to map the whole header as well.
159
+ uint64_t total_map = static_cast<uint64_t>(header_size_) + static_cast<uint64_t>(size);
160
+ UTIL_THROW_IF(file_size != util::kBadSize && file_size < total_map, FormatLoadException, "Binary file has size " << file_size << " but the headers say it should be at least " << total_map);
161
+
162
+ util::MapRead(load_method_, file_.get(), 0, util::CheckOverflow(total_map), mapping_);
163
+
164
+ vocab_string_offset_ = total_map;
165
+ return reinterpret_cast<uint8_t*>(mapping_.get()) + header_size_;
166
+ }
167
+
168
+ void *BinaryFormat::SetupJustVocab(std::size_t memory_size, uint8_t order) {
169
+ vocab_size_ = memory_size;
170
+ if (!write_mmap_) {
171
+ header_size_ = 0;
172
+ util::HugeMalloc(memory_size, true, memory_vocab_);
173
+ return reinterpret_cast<uint8_t*>(memory_vocab_.get());
174
+ }
175
+ header_size_ = TotalHeaderSize(order);
176
+ std::size_t total = util::CheckOverflow(static_cast<uint64_t>(header_size_) + static_cast<uint64_t>(memory_size));
177
+ file_.reset(util::CreateOrThrow(write_mmap_));
178
+ // some gccs complain about uninitialized variables even though all enum values are covered.
179
+ void *vocab_base = NULL;
180
+ switch (write_method_) {
181
+ case Config::WRITE_MMAP:
182
+ mapping_.reset(util::MapZeroedWrite(file_.get(), total), total, util::scoped_memory::MMAP_ALLOCATED);
183
+ util::AdviseHugePages(vocab_base, total);
184
+ vocab_base = mapping_.get();
185
+ break;
186
+ case Config::WRITE_AFTER:
187
+ util::ResizeOrThrow(file_.get(), 0);
188
+ util::HugeMalloc(total, true, memory_vocab_);
189
+ vocab_base = memory_vocab_.get();
190
+ break;
191
+ }
192
+ strncpy(reinterpret_cast<char*>(vocab_base), kMagicIncomplete, header_size_);
193
+ return reinterpret_cast<uint8_t*>(vocab_base) + header_size_;
194
+ }
195
+
196
+ void *BinaryFormat::GrowForSearch(std::size_t memory_size, std::size_t vocab_pad, void *&vocab_base) {
197
+ assert(vocab_size_ != kInvalidSize);
198
+ vocab_pad_ = vocab_pad;
199
+ std::size_t new_size = header_size_ + vocab_size_ + vocab_pad_ + memory_size;
200
+ vocab_string_offset_ = new_size;
201
+ if (!write_mmap_ || write_method_ == Config::WRITE_AFTER) {
202
+ util::HugeMalloc(memory_size, true, memory_search_);
203
+ assert(header_size_ == 0 || write_mmap_);
204
+ vocab_base = reinterpret_cast<uint8_t*>(memory_vocab_.get()) + header_size_;
205
+ util::AdviseHugePages(memory_search_.get(), memory_size);
206
+ return reinterpret_cast<uint8_t*>(memory_search_.get());
207
+ }
208
+
209
+ assert(write_method_ == Config::WRITE_MMAP);
210
+ // Also known as total size without vocab words.
211
+ // Grow the file to accomodate the search, using zeros.
212
+ // According to man mmap, behavior is undefined when the file is resized
213
+ // underneath a mmap that is not a multiple of the page size. So to be
214
+ // safe, we'll unmap it and map it again.
215
+ mapping_.reset();
216
+ util::ResizeOrThrow(file_.get(), new_size);
217
+ void *ret;
218
+ MapFile(vocab_base, ret);
219
+ util::AdviseHugePages(ret, new_size);
220
+ return ret;
221
+ }
222
+
223
+ void BinaryFormat::WriteVocabWords(const std::string &buffer, void *&vocab_base, void *&search_base) {
224
+ // Checking Config's include_vocab is the responsibility of the caller.
225
+ assert(header_size_ != kInvalidSize && vocab_size_ != kInvalidSize);
226
+ if (!write_mmap_) {
227
+ // Unchanged base.
228
+ vocab_base = reinterpret_cast<uint8_t*>(memory_vocab_.get());
229
+ search_base = reinterpret_cast<uint8_t*>(memory_search_.get());
230
+ return;
231
+ }
232
+ if (write_method_ == Config::WRITE_MMAP) {
233
+ mapping_.reset();
234
+ }
235
+ util::SeekOrThrow(file_.get(), VocabStringReadingOffset());
236
+ util::WriteOrThrow(file_.get(), &buffer[0], buffer.size());
237
+ if (write_method_ == Config::WRITE_MMAP) {
238
+ MapFile(vocab_base, search_base);
239
+ } else {
240
+ vocab_base = reinterpret_cast<uint8_t*>(memory_vocab_.get()) + header_size_;
241
+ search_base = reinterpret_cast<uint8_t*>(memory_search_.get());
242
+ }
243
+ }
244
+
245
+ void BinaryFormat::FinishFile(const Config &config, ModelType model_type, unsigned int search_version, const std::vector<uint64_t> &counts) {
246
+ if (!write_mmap_) return;
247
+ switch (write_method_) {
248
+ case Config::WRITE_MMAP:
249
+ util::SyncOrThrow(mapping_.get(), mapping_.size());
250
+ break;
251
+ case Config::WRITE_AFTER:
252
+ util::SeekOrThrow(file_.get(), 0);
253
+ util::WriteOrThrow(file_.get(), memory_vocab_.get(), memory_vocab_.size());
254
+ util::SeekOrThrow(file_.get(), header_size_ + vocab_size_ + vocab_pad_);
255
+ util::WriteOrThrow(file_.get(), memory_search_.get(), memory_search_.size());
256
+ util::FSyncOrThrow(file_.get());
257
+ break;
258
+ }
259
+ // header and vocab share the same mmap.
260
+ Parameters params = Parameters();
261
+ memset(&params, 0, sizeof(Parameters));
262
+ params.counts = counts;
263
+ params.fixed.order = counts.size();
264
+ params.fixed.probing_multiplier = config.probing_multiplier;
265
+ params.fixed.model_type = model_type;
266
+ params.fixed.has_vocabulary = config.include_vocab;
267
+ params.fixed.search_version = search_version;
268
+ switch (write_method_) {
269
+ case Config::WRITE_MMAP:
270
+ WriteHeader(mapping_.get(), params);
271
+ util::SyncOrThrow(mapping_.get(), mapping_.size());
272
+ break;
273
+ case Config::WRITE_AFTER:
274
+ {
275
+ std::vector<uint8_t> buffer(TotalHeaderSize(counts.size()));
276
+ WriteHeader(&buffer[0], params);
277
+ util::SeekOrThrow(file_.get(), 0);
278
+ util::WriteOrThrow(file_.get(), &buffer[0], buffer.size());
279
+ }
280
+ break;
281
+ }
282
+ }
283
+
284
+ void BinaryFormat::MapFile(void *&vocab_base, void *&search_base) {
285
+ mapping_.reset(util::MapOrThrow(vocab_string_offset_, true, util::kFileFlags, false, file_.get()), vocab_string_offset_, util::scoped_memory::MMAP_ALLOCATED);
286
+ vocab_base = reinterpret_cast<uint8_t*>(mapping_.get()) + header_size_;
287
+ search_base = reinterpret_cast<uint8_t*>(mapping_.get()) + header_size_ + vocab_size_ + vocab_pad_;
288
+ }
289
+
290
+ bool RecognizeBinary(const char *file, ModelType &recognized) {
291
+ util::scoped_fd fd(util::OpenReadOrThrow(file));
292
+ if (!IsBinaryFormat(fd.get())) {
293
+ return false;
294
+ }
295
+ Parameters params;
296
+ ReadHeader(fd.get(), params);
297
+ recognized = params.fixed.model_type;
298
+ return true;
299
+ }
300
+
301
+ } // namespace ngram
302
+ } // namespace lm
mosesdecoder/lm/binary_format.hh ADDED
@@ -0,0 +1,106 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #ifndef LM_BINARY_FORMAT_H
2
+ #define LM_BINARY_FORMAT_H
3
+
4
+ #include "lm/config.hh"
5
+ #include "lm/model_type.hh"
6
+ #include "lm/read_arpa.hh"
7
+
8
+ #include "util/file_piece.hh"
9
+ #include "util/mmap.hh"
10
+ #include "util/scoped.hh"
11
+
12
+ #include <cstddef>
13
+ #include <vector>
14
+
15
+ #include <stdint.h>
16
+
17
+ namespace lm {
18
+ namespace ngram {
19
+
20
+ extern const char *kModelNames[6];
21
+
22
+ /*Inspect a file to determine if it is a binary lm. If not, return false.
23
+ * If so, return true and set recognized to the type. This is the only API in
24
+ * this header designed for use by decoder authors.
25
+ */
26
+ bool RecognizeBinary(const char *file, ModelType &recognized);
27
+
28
+ struct FixedWidthParameters {
29
+ unsigned char order;
30
+ float probing_multiplier;
31
+ // What type of model is this?
32
+ ModelType model_type;
33
+ // Does the end of the file have the actual strings in the vocabulary?
34
+ bool has_vocabulary;
35
+ unsigned int search_version;
36
+ };
37
+
38
// Round a positive size up to the next multiple of 8.  Kept as a macro rather
// than an inline function so that it can be used where a constant is needed,
// such as an array bound.  The result has type std::ptrdiff_t.
#define ALIGN8(a) ((std::ptrdiff_t(((a) - 1) / 8) + 1) * 8)
40
+
41
+ // Parameters stored in the header of a binary file.
42
+ struct Parameters {
43
+ FixedWidthParameters fixed;
44
+ std::vector<uint64_t> counts;
45
+ };
46
+
47
+ class BinaryFormat {
48
+ public:
49
+ explicit BinaryFormat(const Config &config);
50
+
51
+ // Reading a binary file:
52
+ // Takes ownership of fd
53
+ void InitializeBinary(int fd, ModelType model_type, unsigned int search_version, Parameters &params);
54
+ // Used to read parts of the file to update the config object before figuring out full size.
55
+ void ReadForConfig(void *to, std::size_t amount, uint64_t offset_excluding_header) const;
56
+ // Actually load the binary file and return a pointer to the beginning of the search area.
57
+ void *LoadBinary(std::size_t size);
58
+
59
+ uint64_t VocabStringReadingOffset() const {
60
+ assert(vocab_string_offset_ != kInvalidOffset);
61
+ return vocab_string_offset_;
62
+ }
63
+
64
+ // Writing a binary file or initializing in RAM from ARPA:
65
+ // Size for vocabulary.
66
+ void *SetupJustVocab(std::size_t memory_size, uint8_t order);
67
+ // Warning: can change the vocaulary base pointer.
68
+ void *GrowForSearch(std::size_t memory_size, std::size_t vocab_pad, void *&vocab_base);
69
+ // Warning: can change vocabulary and search base addresses.
70
+ void WriteVocabWords(const std::string &buffer, void *&vocab_base, void *&search_base);
71
+ // Write the header at the beginning of the file.
72
+ void FinishFile(const Config &config, ModelType model_type, unsigned int search_version, const std::vector<uint64_t> &counts);
73
+
74
+ private:
75
+ void MapFile(void *&vocab_base, void *&search_base);
76
+
77
+ // Copied from configuration.
78
+ const Config::WriteMethod write_method_;
79
+ const char *write_mmap_;
80
+ util::LoadMethod load_method_;
81
+
82
+ // File behind memory, if any.
83
+ util::scoped_fd file_;
84
+
85
+ // If there is a file involved, a single mapping.
86
+ util::scoped_memory mapping_;
87
+
88
+ // If the data is only in memory, separately allocate each because the trie
89
+ // knows vocab's size before it knows search's size (because SRILM might
90
+ // have pruned).
91
+ util::scoped_memory memory_vocab_, memory_search_;
92
+
93
+ // Memory ranges. Note that these may not be contiguous and may not all
94
+ // exist.
95
+ std::size_t header_size_, vocab_size_, vocab_pad_;
96
+ // aka end of search.
97
+ uint64_t vocab_string_offset_;
98
+
99
+ static const uint64_t kInvalidOffset = (uint64_t)-1;
100
+ };
101
+
102
+ bool IsBinaryFormat(int fd);
103
+
104
+ } // namespace ngram
105
+ } // namespace lm
106
+ #endif // LM_BINARY_FORMAT_H
mosesdecoder/lm/blank.hh ADDED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #ifndef LM_BLANK_H
2
+ #define LM_BLANK_H
3
+
4
+ #include <limits>
5
+ #include <stdint.h>
6
+ #include <cmath>
7
+
8
namespace lm {
namespace ngram {

/* Suppose "foo bar" appears with zero backoff but there is no trigram
 * beginning with these words.  Then, when scoring "foo bar", the model could
 * return out_state containing "bar" or even null context if "bar" also has no
 * backoff and is never followed by another word.  In that case the backoff is
 * set to kNoExtensionBackoff.  If the n-gram might be extended, out_state must
 * contain the full n-gram, and kExtensionBackoff is set instead.  In any case,
 * an n-gram with non-zero backoff gets the full state returned so that backoff
 * can be properly charged.
 * The two markers differ only in their sign bit because the backoff is in
 * fact zero either way.
 */
const float kNoExtensionBackoff = -0.0;
const float kExtensionBackoff = 0.0;
const uint64_t kNoExtensionQuant = 0;
const uint64_t kExtensionQuant = 1;

// Mark a zero backoff as extendable.  Any non-zero backoff is left alone.
inline void SetExtension(float &backoff) {
  if (backoff != kNoExtensionBackoff) return;
  backoff = kExtensionBackoff;
}

// True unless backoff is bit-for-bit equal to kNoExtensionBackoff.  The
// comparison is done on the raw bits because -0.0 and 0.0 compare equal as
// floats.  This compiles down nicely.
inline bool HasExtension(const float &backoff) {
  typedef union { float f; uint32_t i; } FloatBits;
  FloatBits marker, given;
  marker.f = kNoExtensionBackoff;
  given.f = backoff;
  return !(marker.i == given.i);
}

} // namespace ngram
} // namespace lm
42
+ #endif // LM_BLANK_H
mosesdecoder/lm/build_binary_main.cc ADDED
@@ -0,0 +1,234 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #include "lm/model.hh"
2
+ #include "lm/sizes.hh"
3
+ #include "util/file_piece.hh"
4
+ #include "util/usage.hh"
5
+
6
+ #include <algorithm>
7
+ #include <cstdlib>
8
+ #include <exception>
9
+ #include <iostream>
10
+ #include <iomanip>
11
+ #include <limits>
12
+ #include <cmath>
13
+ #include <cstdlib>
14
+
15
+ #ifdef WIN32
16
+ #include "util/getopt.hh"
17
+ #else
18
+ #include <unistd.h>
19
+ #endif
20
+
21
+ namespace lm {
22
+ namespace ngram {
23
+ namespace {
24
+
25
// Print the command-line help to stderr and terminate the process with exit
// status 1.  This function does not return.
//   name:        program name to show in the synopsis (normally argv[0]).
//   default_mem: text shown as the default value of -S.
void Usage(const char *name, const char *default_mem) {
  std::cerr << "Usage: " << name << " [-u log10_unknown_probability] [-s] [-i] [-w mmap|after] [-p probing_multiplier] [-T trie_temporary] [-S trie_building_mem] [-q bits] [-b bits] [-a bits] [type] input.arpa [output.mmap]\n\n"
"-u sets the log10 probability for <unk> if the ARPA file does not have one.\n"
" Default is -100. The ARPA file will always take precedence.\n"
"-s allows models to be built even if they do not have <s> and </s>.\n"
"-i allows buggy models from IRSTLM by mapping positive log probability to 0.\n"
"-w mmap|after determines how writing is done.\n"
" mmap maps the binary file and writes to it. Default for trie.\n"
" after allocates anonymous memory, builds, and writes. Default for probing.\n"
"-r \"order1.arpa order2 order3 order4\" adds lower-order rest costs from these\n"
" model files. order1.arpa must be an ARPA file. All others may be ARPA or\n"
" the same data structure as being built. All files must have the same\n"
" vocabulary. For probing, the unigrams must be in the same order.\n\n"
"type is either probing or trie. Default is probing.\n\n"
"probing uses a probing hash table. It is the fastest but uses the most memory.\n"
"-p sets the space multiplier and must be >1.0. The default is 1.5.\n\n"
"trie is a straightforward trie with bit-level packing. It uses the least\n"
"memory and is still faster than SRI or IRST. Building the trie format uses an\n"
"on-disk sort to save memory.\n"
"-T is the temporary directory prefix. Default is the output file name.\n"
"-S determines memory use for sorting. Default is " << default_mem << ". This is compatible\n"
// A plain '%' is used here: "\%" is not a valid escape sequence in C++.
" with GNU sort. The number is followed by a unit: % for percent of physical\n"
" memory, b for bytes, K for Kilobytes, M for megabytes, then G,T,P,E,Z,Y. \n"
" Default unit is K for Kilobytes.\n"
"-q turns quantization on and sets the number of bits (e.g. -q 8).\n"
"-b sets backoff quantization bits. Requires -q and defaults to that value.\n"
"-a compresses pointers using an array of offsets. The parameter is the\n"
" maximum number of bits encoded by the array. Memory is minimized subject\n"
" to the maximum, so pick 255 to minimize memory.\n\n"
"-h print this help message.\n\n"
"Get a memory estimate by passing an ARPA file without an output file name.\n";
  exit(1);
}
58
+
59
+ // I could really use boost::lexical_cast right about now.
60
+ float ParseFloat(const char *from) {
61
+ char *end;
62
+ float ret = strtod(from, &end);
63
+ if (*end) throw util::ParseNumberException(from);
64
+ return ret;
65
+ }
66
+ unsigned long int ParseUInt(const char *from) {
67
+ char *end;
68
+ unsigned long int ret = strtoul(from, &end, 10);
69
+ if (*end) throw util::ParseNumberException(from);
70
+ return ret;
71
+ }
72
+
73
+ uint8_t ParseBitCount(const char *from) {
74
+ unsigned long val = ParseUInt(from);
75
+ if (val > 25) {
76
+ util::ParseNumberException e(from);
77
+ e << " bit counts are limited to 25.";
78
+ }
79
+ return val;
80
+ }
81
+
82
// Split `from` at every single space character and store the pieces in `to`,
// replacing its previous contents.  Consecutive spaces produce empty entries,
// and an empty input produces exactly one empty entry.
void ParseFileList(const char *from, std::vector<std::string> &to) {
  to.clear();
  const char *token_begin = from;
  for (const char *cursor = from; ; ++cursor) {
    const bool at_end = (*cursor == '\0');
    if (!at_end && *cursor != ' ') continue;
    // cursor is on a separator or on the terminator: emit the token before it.
    to.push_back(std::string(token_begin, cursor));
    if (at_end) return;
    token_begin = cursor + 1;
  }
}
92
+
93
// Report on stderr that quantization was requested for a data structure that
// does not implement it, then terminate the process with exit status 1.
void ProbingQuantizationUnsupported() {
  const char *const message = "Quantization is only implemented in the trie data structure.";
  std::cerr << message << std::endl;
  exit(1);
}
97
+
98
+ } // namespace
99
+ } // namespace ngram
100
+ } // namespace lm
101
+
102
+ int main(int argc, char *argv[]) {
103
+ using namespace lm::ngram;
104
+
105
+ const char *default_mem = util::GuessPhysicalMemory() ? "80%" : "1G";
106
+
107
+ if (argc == 2 && !strcmp(argv[1], "--help"))
108
+ Usage(argv[0], default_mem);
109
+
110
+ try {
111
+ bool quantize = false, set_backoff_bits = false, bhiksha = false, set_write_method = false, rest = false;
112
+ lm::ngram::Config config;
113
+ config.building_memory = util::ParseSize(default_mem);
114
+ int opt;
115
+ while ((opt = getopt(argc, argv, "q:b:a:u:p:t:T:m:S:w:sir:h")) != -1) {
116
+ switch(opt) {
117
+ case 'q':
118
+ config.prob_bits = ParseBitCount(optarg);
119
+ if (!set_backoff_bits) config.backoff_bits = config.prob_bits;
120
+ quantize = true;
121
+ break;
122
+ case 'b':
123
+ config.backoff_bits = ParseBitCount(optarg);
124
+ set_backoff_bits = true;
125
+ break;
126
+ case 'a':
127
+ config.pointer_bhiksha_bits = ParseBitCount(optarg);
128
+ bhiksha = true;
129
+ break;
130
+ case 'u':
131
+ config.unknown_missing_logprob = ParseFloat(optarg);
132
+ break;
133
+ case 'p':
134
+ config.probing_multiplier = ParseFloat(optarg);
135
+ break;
136
+ case 't': // legacy
137
+ case 'T':
138
+ config.temporary_directory_prefix = optarg;
139
+ util::NormalizeTempPrefix(config.temporary_directory_prefix);
140
+ break;
141
+ case 'm': // legacy
142
+ config.building_memory = ParseUInt(optarg) * 1048576;
143
+ break;
144
+ case 'S':
145
+ config.building_memory = std::min(static_cast<uint64_t>(std::numeric_limits<std::size_t>::max()), util::ParseSize(optarg));
146
+ break;
147
+ case 'w':
148
+ set_write_method = true;
149
+ if (!strcmp(optarg, "mmap")) {
150
+ config.write_method = Config::WRITE_MMAP;
151
+ } else if (!strcmp(optarg, "after")) {
152
+ config.write_method = Config::WRITE_AFTER;
153
+ } else {
154
+ Usage(argv[0], default_mem);
155
+ }
156
+ break;
157
+ case 's':
158
+ config.sentence_marker_missing = lm::SILENT;
159
+ break;
160
+ case 'i':
161
+ config.positive_log_probability = lm::SILENT;
162
+ break;
163
+ case 'r':
164
+ rest = true;
165
+ ParseFileList(optarg, config.rest_lower_files);
166
+ config.rest_function = Config::REST_LOWER;
167
+ break;
168
+ case 'h': // help
169
+ default:
170
+ Usage(argv[0], default_mem);
171
+ }
172
+ }
173
+ if (!quantize && set_backoff_bits) {
174
+ std::cerr << "You specified backoff quantization (-b) but not probability quantization (-q)" << std::endl;
175
+ abort();
176
+ }
177
+ if (optind + 1 == argc) {
178
+ ShowSizes(argv[optind], config);
179
+ return 0;
180
+ }
181
+ const char *model_type;
182
+ const char *from_file;
183
+
184
+ if (optind + 2 == argc) {
185
+ model_type = "probing";
186
+ from_file = argv[optind];
187
+ config.write_mmap = argv[optind + 1];
188
+ } else if (optind + 3 == argc) {
189
+ model_type = argv[optind];
190
+ from_file = argv[optind + 1];
191
+ config.write_mmap = argv[optind + 2];
192
+ } else {
193
+ Usage(argv[0], default_mem);
194
+ return 1;
195
+ }
196
+ if (!strcmp(model_type, "probing")) {
197
+ if (!set_write_method) config.write_method = Config::WRITE_AFTER;
198
+ if (quantize || set_backoff_bits) ProbingQuantizationUnsupported();
199
+ if (rest) {
200
+ RestProbingModel(from_file, config);
201
+ } else {
202
+ ProbingModel(from_file, config);
203
+ }
204
+ } else if (!strcmp(model_type, "trie")) {
205
+ if (rest) {
206
+ std::cerr << "Rest + trie is not supported yet." << std::endl;
207
+ return 1;
208
+ }
209
+ if (!set_write_method) config.write_method = Config::WRITE_MMAP;
210
+ if (quantize) {
211
+ if (bhiksha) {
212
+ QuantArrayTrieModel(from_file, config);
213
+ } else {
214
+ QuantTrieModel(from_file, config);
215
+ }
216
+ } else {
217
+ if (bhiksha) {
218
+ ArrayTrieModel(from_file, config);
219
+ } else {
220
+ TrieModel(from_file, config);
221
+ }
222
+ }
223
+ } else {
224
+ Usage(argv[0], default_mem);
225
+ }
226
+ }
227
+ catch (const std::exception &e) {
228
+ std::cerr << e.what() << std::endl;
229
+ std::cerr << "ERROR" << std::endl;
230
+ return 1;
231
+ }
232
+ std::cerr << "SUCCESS" << std::endl;
233
+ return 0;
234
+ }
mosesdecoder/lm/builder/CMakeLists.txt ADDED
@@ -0,0 +1,67 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
cmake_minimum_required(VERSION 2.8.8)
#
# The KenLM CMake files make use of object libraries, i.e.
#   add_library(<name> OBJECT <sources>...)
#
# An object library groups source files when compiling, effectively creating
# a "fake" library from the sources of one subdirectory.
#
# This syntax was only added in CMake 2.8.8, hence the minimum version above.
#
# See http://www.cmake.org/Wiki/CMake/Tutorials/Object_Library

# This CMake file was created by Lane Schwartz <dowobeha@gmail.com>

# Explicit list of the source files in this subdirectory.
#
# Every source file that should become part of the kenlm library must be
# added here by hand; unit test files do not belong in this list.
#
# Each entry is prefixed with ${CMAKE_CURRENT_SOURCE_DIR} so that the paths
# remain correct if the variable is referenced by CMake files in the parent
# directory.
set(KENLM_BUILDER_SOURCE
  ${CMAKE_CURRENT_SOURCE_DIR}/adjust_counts.cc
  ${CMAKE_CURRENT_SOURCE_DIR}/corpus_count.cc
  ${CMAKE_CURRENT_SOURCE_DIR}/initial_probabilities.cc
  ${CMAKE_CURRENT_SOURCE_DIR}/interpolate.cc
  ${CMAKE_CURRENT_SOURCE_DIR}/output.cc
  ${CMAKE_CURRENT_SOURCE_DIR}/pipeline.cc
)

# Group these objects together for later use.
# After add_library(foo OBJECT ${my_foo_sources}) the objects are referred to
# as $<TARGET_OBJECTS:foo>.
add_library(kenlm_builder OBJECT ${KENLM_BUILDER_SOURCE})

# The lmplz executable: its own main file plus the objects of every object
# library it depends on.
add_executable(lmplz
  lmplz_main.cc
  $<TARGET_OBJECTS:kenlm>
  $<TARGET_OBJECTS:kenlm_common>
  $<TARGET_OBJECTS:kenlm_builder>
  $<TARGET_OBJECTS:kenlm_util>
)

# Link the executable against Boost and pthread.
target_link_libraries(lmplz ${Boost_LIBRARIES} pthread)

# Group executables together.
set_target_properties(lmplz PROPERTIES FOLDER executables)

if(BUILD_TESTING)
  # Explicit list of the Boost unit tests to compile.
  set(KENLM_BOOST_TESTS_LIST
    adjust_counts_test
    corpus_count_test
  )

  AddTests(
    TESTS ${KENLM_BOOST_TESTS_LIST}
    DEPENDS
      $<TARGET_OBJECTS:kenlm>
      $<TARGET_OBJECTS:kenlm_common>
      $<TARGET_OBJECTS:kenlm_util>
      $<TARGET_OBJECTS:kenlm_builder>
    LIBRARIES ${Boost_LIBRARIES} pthread
  )
endif()
mosesdecoder/lm/builder/Jamfile ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Library made of every .cc file in this directory except the unit tests
# (*test.cc) and the program entry points (*main.cc).
fakelib builder
  : [ glob *.cc : *test.cc *main.cc ]
    ../../util//kenutil
    ../../util/stream//stream
    ../../util/double-conversion//double-conversion
    ..//kenlm
    ../common//common
  : : : <library>/top//boost_thread $(timer-link) ;

# Command-line programs.
exe lmplz : lmplz_main.cc builder /top//boost_program_options ;

exe dump_counts : dump_counts_main.cc builder ;

alias programs : lmplz dump_counts ;

# Unit tests.
import testing ;
unit-test corpus_count_test : corpus_count_test.cc builder /top//boost_unit_test_framework ;
unit-test adjust_counts_test : adjust_counts_test.cc builder /top//boost_unit_test_framework ;