Upload 172 files
Browse files
This view is limited to 50 files because it contains too many changes. See raw diff
- .gitattributes +11 -0
- mosesdecoder/lm/CMakeLists.txt +90 -0
- mosesdecoder/lm/Jamfile +40 -0
- mosesdecoder/lm/bhiksha.cc +94 -0
- mosesdecoder/lm/bhiksha.hh +122 -0
- mosesdecoder/lm/bin/gcc-9/release/link-static/threading-multi/bhiksha.o +0 -0
- mosesdecoder/lm/bin/gcc-9/release/link-static/threading-multi/binary_format.o +0 -0
- mosesdecoder/lm/bin/gcc-9/release/link-static/threading-multi/build_binary +3 -0
- mosesdecoder/lm/bin/gcc-9/release/link-static/threading-multi/build_binary_main.o +0 -0
- mosesdecoder/lm/bin/gcc-9/release/link-static/threading-multi/config.o +0 -0
- mosesdecoder/lm/bin/gcc-9/release/link-static/threading-multi/fragment +3 -0
- mosesdecoder/lm/bin/gcc-9/release/link-static/threading-multi/fragment_main.o +0 -0
- mosesdecoder/lm/bin/gcc-9/release/link-static/threading-multi/kenlm_benchmark +3 -0
- mosesdecoder/lm/bin/gcc-9/release/link-static/threading-multi/kenlm_benchmark_main.o +0 -0
- mosesdecoder/lm/bin/gcc-9/release/link-static/threading-multi/lm_exception.o +0 -0
- mosesdecoder/lm/bin/gcc-9/release/link-static/threading-multi/model.o +0 -0
- mosesdecoder/lm/bin/gcc-9/release/link-static/threading-multi/quantize.o +0 -0
- mosesdecoder/lm/bin/gcc-9/release/link-static/threading-multi/query +3 -0
- mosesdecoder/lm/bin/gcc-9/release/link-static/threading-multi/query_main.o +0 -0
- mosesdecoder/lm/bin/gcc-9/release/link-static/threading-multi/read_arpa.o +0 -0
- mosesdecoder/lm/bin/gcc-9/release/link-static/threading-multi/search_hashed.o +0 -0
- mosesdecoder/lm/bin/gcc-9/release/link-static/threading-multi/search_trie.o +0 -0
- mosesdecoder/lm/bin/gcc-9/release/link-static/threading-multi/sizes.o +0 -0
- mosesdecoder/lm/bin/gcc-9/release/link-static/threading-multi/trie.o +0 -0
- mosesdecoder/lm/bin/gcc-9/release/link-static/threading-multi/trie_sort.o +0 -0
- mosesdecoder/lm/bin/gcc-9/release/link-static/threading-multi/value_build.o +0 -0
- mosesdecoder/lm/bin/gcc-9/release/link-static/threading-multi/virtual_interface.o +0 -0
- mosesdecoder/lm/bin/gcc-9/release/link-static/threading-multi/vocab.o +0 -0
- mosesdecoder/lm/bin/left_test.test/gcc-9/release/link-static/threading-multi/left_test +3 -0
- mosesdecoder/lm/bin/left_test.test/gcc-9/release/link-static/threading-multi/left_test.o +0 -0
- mosesdecoder/lm/bin/left_test.test/gcc-9/release/link-static/threading-multi/left_test.output +8 -0
- mosesdecoder/lm/bin/left_test.test/gcc-9/release/link-static/threading-multi/left_test.run +8 -0
- mosesdecoder/lm/bin/left_test.test/gcc-9/release/link-static/threading-multi/left_test.test +1 -0
- mosesdecoder/lm/bin/model_test.test/gcc-9/release/link-static/threading-multi/model_test +3 -0
- mosesdecoder/lm/bin/model_test.test/gcc-9/release/link-static/threading-multi/model_test.o +3 -0
- mosesdecoder/lm/bin/model_test.test/gcc-9/release/link-static/threading-multi/model_test.output +11 -0
- mosesdecoder/lm/bin/model_test.test/gcc-9/release/link-static/threading-multi/model_test.run +11 -0
- mosesdecoder/lm/bin/model_test.test/gcc-9/release/link-static/threading-multi/model_test.test +1 -0
- mosesdecoder/lm/bin/order.log +1 -0
- mosesdecoder/lm/bin/partial_test.test/gcc-9/release/link-static/threading-multi/partial_test +3 -0
- mosesdecoder/lm/bin/partial_test.test/gcc-9/release/link-static/threading-multi/partial_test.o +0 -0
- mosesdecoder/lm/bin/partial_test.test/gcc-9/release/link-static/threading-multi/partial_test.output +8 -0
- mosesdecoder/lm/bin/partial_test.test/gcc-9/release/link-static/threading-multi/partial_test.run +8 -0
- mosesdecoder/lm/bin/partial_test.test/gcc-9/release/link-static/threading-multi/partial_test.test +1 -0
- mosesdecoder/lm/binary_format.cc +302 -0
- mosesdecoder/lm/binary_format.hh +106 -0
- mosesdecoder/lm/blank.hh +42 -0
- mosesdecoder/lm/build_binary_main.cc +234 -0
- mosesdecoder/lm/builder/CMakeLists.txt +67 -0
- mosesdecoder/lm/builder/Jamfile +13 -0
.gitattributes
CHANGED
|
@@ -49,3 +49,14 @@ mosesdecoder/lib/libmoses2_lib.a filter=lfs diff=lfs merge=lfs -text
|
|
| 49 |
mosesdecoder/lib/libmoses2decoder.a filter=lfs diff=lfs merge=lfs -text
|
| 50 |
mosesdecoder/lib/libprobingpt.a filter=lfs diff=lfs merge=lfs -text
|
| 51 |
mosesdecoder/lib/libsyntax_common.a filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 49 |
mosesdecoder/lib/libmoses2decoder.a filter=lfs diff=lfs merge=lfs -text
|
| 50 |
mosesdecoder/lib/libprobingpt.a filter=lfs diff=lfs merge=lfs -text
|
| 51 |
mosesdecoder/lib/libsyntax_common.a filter=lfs diff=lfs merge=lfs -text
|
| 52 |
+
mosesdecoder/lm/bin/gcc-9/release/link-static/threading-multi/build_binary filter=lfs diff=lfs merge=lfs -text
|
| 53 |
+
mosesdecoder/lm/bin/gcc-9/release/link-static/threading-multi/fragment filter=lfs diff=lfs merge=lfs -text
|
| 54 |
+
mosesdecoder/lm/bin/gcc-9/release/link-static/threading-multi/kenlm_benchmark filter=lfs diff=lfs merge=lfs -text
|
| 55 |
+
mosesdecoder/lm/bin/gcc-9/release/link-static/threading-multi/query filter=lfs diff=lfs merge=lfs -text
|
| 56 |
+
mosesdecoder/lm/bin/left_test.test/gcc-9/release/link-static/threading-multi/left_test filter=lfs diff=lfs merge=lfs -text
|
| 57 |
+
mosesdecoder/lm/bin/model_test.test/gcc-9/release/link-static/threading-multi/model_test filter=lfs diff=lfs merge=lfs -text
|
| 58 |
+
mosesdecoder/lm/bin/model_test.test/gcc-9/release/link-static/threading-multi/model_test.o filter=lfs diff=lfs merge=lfs -text
|
| 59 |
+
mosesdecoder/lm/bin/partial_test.test/gcc-9/release/link-static/threading-multi/partial_test filter=lfs diff=lfs merge=lfs -text
|
| 60 |
+
mosesdecoder/lm/builder/bin/gcc-9/release/link-static/threading-multi/dump_counts filter=lfs diff=lfs merge=lfs -text
|
| 61 |
+
mosesdecoder/lm/builder/bin/gcc-9/release/link-static/threading-multi/lmplz filter=lfs diff=lfs merge=lfs -text
|
| 62 |
+
mosesdecoder/lm/filter/bin/gcc-9/release/link-static/threading-multi/filter filter=lfs diff=lfs merge=lfs -text
|
mosesdecoder/lm/CMakeLists.txt
ADDED
|
@@ -0,0 +1,90 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
cmake_minimum_required(VERSION 2.8.8)
#
# The KenLM CMake files rely on object libraries:
#
#   add_library(<name> OBJECT <sources>...)
#
# An object library compiles a group of sources without archiving or linking
# them, which lets source files be grouped per subdirectory (effectively
# "fake" libraries). The syntax first appeared in CMake 2.8.8, hence the
# minimum version requested above.
#
# see http://www.cmake.org/Wiki/CMake/Tutorials/Object_Library

# This CMake file was created by Lane Schwartz <dowobeha@gmail.com>

# Maximum n-gram order the library is compiled for. It is a cache entry, so it
# can be overridden at configure time with -DKENLM_MAX_ORDER=<n>.
set(KENLM_MAX_ORDER 6 CACHE STRING "Maximum supported ngram order")

# Pass the chosen order to the compiler as a preprocessor definition.
# add_definitions() applies to this directory and to the subdirectories added
# further down.
add_definitions(-DKENLM_MAX_ORDER=${KENLM_MAX_ORDER})

# Sources that make up the kenlm library, listed explicitly.
#
# When a new source file in this directory should become part of the library,
# append it here. Unit test sources do not belong in this list.
set(KENLM_SOURCE
  bhiksha.cc
  binary_format.cc
  config.cc
  lm_exception.cc
  model.cc
  quantize.cc
  read_arpa.cc
  search_hashed.cc
  search_trie.cc
  sizes.cc
  trie.cc
  trie_sort.cc
  value_build.cc
  virtual_interface.cc
  vocab.cc
)

# Compile the sources once as an object library. Other targets pull the
# resulting object files in through $<TARGET_OBJECTS:kenlm>.
add_library(kenlm OBJECT ${KENLM_SOURCE})

# Child directories with their own CMakeLists.txt.
add_subdirectory(builder)
add_subdirectory(common)
add_subdirectory(filter)

# Command-line programs built from this directory, listed explicitly.
# NOTE(review): the Jamfile next to this file builds one program per
# *_main.cc, which also yields kenlm_benchmark; that program is not listed
# here -- confirm whether the omission is intentional.
set(EXE_LIST
  query
  fragment
  build_binary
)

# AddExes is a helper defined outside this file. Each program is given the
# kenlm and kenlm_util object files plus Boost and pthread.
AddExes(
  EXES ${EXE_LIST}
  DEPENDS
    $<TARGET_OBJECTS:kenlm>
    $<TARGET_OBJECTS:kenlm_util>
  LIBRARIES
    ${Boost_LIBRARIES}
    pthread
)

# The interpolation code is optional.
if(BUILD_INTERPOLATE)
  add_subdirectory(interpolate)
endif()

if(BUILD_TESTING)

  # Boost unit tests that take a single ARPA file as their argument.
  set(KENLM_BOOST_TESTS_LIST
    left_test
    partial_test
  )
  AddTests(
    TESTS ${KENLM_BOOST_TESTS_LIST}
    DEPENDS
      $<TARGET_OBJECTS:kenlm>
      $<TARGET_OBJECTS:kenlm_util>
    LIBRARIES
      ${Boost_LIBRARIES}
      pthread
    TEST_ARGS
      ${CMAKE_CURRENT_SOURCE_DIR}/test.arpa
  )

  # model_test is registered separately because it takes a second ARPA file
  # on its command line.
  KenLMAddTest(
    TEST model_test
    DEPENDS
      $<TARGET_OBJECTS:kenlm>
      $<TARGET_OBJECTS:kenlm_util>
    LIBRARIES
      ${Boost_LIBRARIES}
      pthread
    TEST_ARGS
      ${CMAKE_CURRENT_SOURCE_DIR}/test.arpa
      ${CMAKE_CURRENT_SOURCE_DIR}/test_nounk.arpa
  )
endif()
|
mosesdecoder/lm/Jamfile
ADDED
|
@@ -0,0 +1,40 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# KenLM is compiled for a fixed maximum n-gram order, 6 unless the
# "max-kenlm-order" option says otherwise. Change that option if you need a
# higher order. Having a compile-time limit means that State can be
#   (KENLM_MAX_ORDER - 1) * sizeof(float) bytes
# instead of
#   sizeof(float*) + (KENLM_MAX_ORDER - 1) * sizeof(float) + malloc overhead
max-order = [ option.get "max-kenlm-order" : 6 : 6 ] ;
if ( $(max-order) != 6 ) {
  echo "Setting KenLM maximum n-gram order to $(max-order)" ;
}
# From here on max-order holds build properties rather than the bare number.
max-order = <define>KENLM_MAX_ORDER=$(max-order) ;

# Record the define in bin/order.log and make that file a dependency of
# everything that uses $(max-order).
# NOTE(review): update-if-changed is defined outside this file; presumably it
# rewrites the log only when the value differs, so that changing the order
# triggers a rebuild -- confirm.
path-constant ORDER-LOG : bin/order.log ;
update-if-changed $(ORDER-LOG) $(max-order) ;

max-order += <dependency>$(ORDER-LOG) ;

# Optional NPLM wrapper, compiled only when "with-nplm" is given.
wrappers = ;
local with-nplm = [ option.get "with-nplm" ] ;
if $(with-nplm) {
  lib nplm : : <search>$(with-nplm)/src ;
  obj nplm.o
    : wrappers/nplm.cc
    : <include>..
      <include>$(with-nplm)/src
      <cxxflags>-fopenmp
      <include>$(with-nplm)/3rdparty/eigen
      <define>NPLM_DOUBLE_PRECISION=0
    ;
  alias nplm-all
    : nplm.o nplm ..//boost_thread
    :
    :
    : <cxxflags>-fopenmp
      <linkflags>-fopenmp
      <define>WITH_NPLM
      <library>..//boost_thread
    ;
  wrappers += nplm-all ;
}

# The library proper: every .cc file in this directory except program entry
# points (*main.cc) and unit tests (*test.cc), plus any wrappers and kenutil.
fakelib kenlm
  : $(wrappers)
    [ glob *.cc : *main.cc *test.cc ]
    ../util//kenutil
  : <include>.. $(max-order)
  :
  : <include>.. $(max-order)
  ;

import testing ;

# Unit tests; each one receives its ARPA fixture(s) as input files.
run left_test.cc kenlm /top//boost_unit_test_framework : : test.arpa ;
run model_test.cc kenlm /top//boost_unit_test_framework : : test.arpa test_nounk.arpa ;
run partial_test.cc kenlm /top//boost_unit_test_framework : : test.arpa ;

# One executable per *_main.cc, named after the part before "_main.cc".
exes = ;
for local p in [ glob *_main.cc ] {
  local name = [ MATCH "(.*)\_main.cc" : $(p) ] ;
  exe $(name) : $(p) kenlm ;
  exes += $(name) ;
}

# Everything built by the "programs" alias. lmplz is added as a source only
# for <threading>multi builds.
alias programs
  : $(exes)
    filter//filter
    filter//phrase_table_vocab
    builder//dump_counts
  : <threading>multi:<source>builder//lmplz
  ;
|
mosesdecoder/lm/bhiksha.cc
ADDED
|
@@ -0,0 +1,94 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#include "lm/bhiksha.hh"
|
| 2 |
+
|
| 3 |
+
#include "lm/binary_format.hh"
|
| 4 |
+
#include "lm/config.hh"
|
| 5 |
+
#include "util/file.hh"
|
| 6 |
+
#include "util/exception.hh"
|
| 7 |
+
|
| 8 |
+
#include <limits>
|
| 9 |
+
|
| 10 |
+
namespace lm {
|
| 11 |
+
namespace ngram {
|
| 12 |
+
namespace trie {
|
| 13 |
+
|
| 14 |
+
// Construct the variant that stores next pointers without compression.
// Only max_next is used: next_ is derived from it with util::BitsMask::ByMax.
// The other parameters are ignored; they keep the signature parallel to
// ArrayBhiksha's constructor.
DontBhiksha::DontBhiksha(
    const void * /*base*/,
    uint64_t /*max_offset*/,
    uint64_t max_next,
    const Config & /*config*/)
  : next_(util::BitsMask::ByMax(max_next)) {}
|
| 16 |
+
|
| 17 |
+
const uint8_t kArrayBhikshaVersion = 0;
|
| 18 |
+
|
| 19 |
+
// TODO: put this in binary file header instead when I change the binary file format again.
|
| 20 |
+
void ArrayBhiksha::UpdateConfigFromBinary(const BinaryFormat &file, uint64_t offset, Config &config) {
|
| 21 |
+
uint8_t buffer[2];
|
| 22 |
+
file.ReadForConfig(buffer, 2, offset);
|
| 23 |
+
uint8_t version = buffer[0];
|
| 24 |
+
uint8_t configured_bits = buffer[1];
|
| 25 |
+
if (version != kArrayBhikshaVersion) UTIL_THROW(FormatLoadException, "This file has sorted array compression version " << (unsigned) version << " but the code expects version " << (unsigned)kArrayBhikshaVersion);
|
| 26 |
+
config.pointer_bhiksha_bits = configured_bits;
|
| 27 |
+
}
|
| 28 |
+
|
| 29 |
+
namespace {
|
| 30 |
+
|
| 31 |
+
// Find argmin_{chopped \in [0, RequiredBits(max_next)]} ChoppedDelta(max_offset)
|
| 32 |
+
uint8_t ChopBits(uint64_t max_offset, uint64_t max_next, const Config &config) {
|
| 33 |
+
uint8_t required = util::RequiredBits(max_next);
|
| 34 |
+
uint8_t best_chop = 0;
|
| 35 |
+
int64_t lowest_change = std::numeric_limits<int64_t>::max();
|
| 36 |
+
// There are probably faster ways but I don't care because this is only done once per order at construction time.
|
| 37 |
+
for (uint8_t chop = 0; chop <= std::min(required, config.pointer_bhiksha_bits); ++chop) {
|
| 38 |
+
int64_t change = (max_next >> (required - chop)) * 64 /* table cost in bits */
|
| 39 |
+
- max_offset * static_cast<int64_t>(chop); /* savings in bits*/
|
| 40 |
+
if (change < lowest_change) {
|
| 41 |
+
lowest_change = change;
|
| 42 |
+
best_chop = chop;
|
| 43 |
+
}
|
| 44 |
+
}
|
| 45 |
+
return best_chop;
|
| 46 |
+
}
|
| 47 |
+
|
| 48 |
+
std::size_t ArrayCount(uint64_t max_offset, uint64_t max_next, const Config &config) {
|
| 49 |
+
uint8_t required = util::RequiredBits(max_next);
|
| 50 |
+
uint8_t chopping = ChopBits(max_offset, max_next, config);
|
| 51 |
+
return (max_next >> (required - chopping)) + 1 /* we store 0 too */;
|
| 52 |
+
}
|
| 53 |
+
} // namespace
|
| 54 |
+
|
| 55 |
+
uint64_t ArrayBhiksha::Size(uint64_t max_offset, uint64_t max_next, const Config &config) {
|
| 56 |
+
return sizeof(uint64_t) * (1 /* header */ + ArrayCount(max_offset, max_next, config)) + 7 /* 8-byte alignment */;
|
| 57 |
+
}
|
| 58 |
+
|
| 59 |
+
uint8_t ArrayBhiksha::InlineBits(uint64_t max_offset, uint64_t max_next, const Config &config) {
|
| 60 |
+
return util::RequiredBits(max_next) - ChopBits(max_offset, max_next, config);
|
| 61 |
+
}
|
| 62 |
+
|
| 63 |
+
namespace {
|
| 64 |
+
|
| 65 |
+
// Round `from` up to the next address that is a multiple of 8.
// An address that is already 8-byte aligned is returned unchanged.
//
// The numeric value of the pointer is taken as uintptr_t, the integer type
// intended for holding pointer values; std::size_t is not guaranteed to be
// wide enough. The result is formed by pointer arithmetic on the original
// pointer rather than by casting an integer back to a pointer.
void *AlignTo8(void *from) {
  uint8_t *bytes = static_cast<uint8_t*>(from);
  const uintptr_t misalignment = reinterpret_cast<uintptr_t>(from) & 7;
  // (8 - misalignment) & 7 is 0 when already aligned, otherwise the number of
  // bytes up to the next boundary.
  return bytes + ((8 - misalignment) & 7);
}
|
| 71 |
+
|
| 72 |
+
} // namespace
|
| 73 |
+
|
| 74 |
+
// Lay the offset table out inside the memory at `base`.
//
// After `base` is rounded up to an 8-byte boundary, the first 64-bit word is
// the header and the table entries follow it:
//   offset_begin_  first table entry
//   offset_end_    one past the last entry (ArrayCount entries in total)
//   write_to_      second entry; the first entry is always 0 and is filled
//                  in by FinishedLoading
// original_base_ keeps the unaligned pointer because FinishedLoading writes
// the header bytes there.
ArrayBhiksha::ArrayBhiksha(void *base, uint64_t max_offset, uint64_t max_next, const Config &config)
  : next_inline_(util::BitsMask::ByBits(InlineBits(max_offset, max_next, config))),
    offset_begin_(static_cast<const uint64_t*>(AlignTo8(base)) + 1 /* 8-byte header */),
    offset_end_(offset_begin_ + ArrayCount(max_offset, max_next, config)),
    write_to_(static_cast<uint64_t*>(AlignTo8(base)) + 2 /* 8-byte header, then the first entry (always 0) */),
    original_base_(base) {}
|
| 80 |
+
|
| 81 |
+
void ArrayBhiksha::FinishedLoading(const Config &config) {
|
| 82 |
+
// *offset_begin_ = 0 but without a const_cast.
|
| 83 |
+
*(write_to_ - (write_to_ - offset_begin_)) = 0;
|
| 84 |
+
|
| 85 |
+
if (write_to_ != offset_end_) UTIL_THROW(util::Exception, "Did not get all the array entries that were expected.");
|
| 86 |
+
|
| 87 |
+
uint8_t *head_write = reinterpret_cast<uint8_t*>(original_base_);
|
| 88 |
+
*(head_write++) = kArrayBhikshaVersion;
|
| 89 |
+
*(head_write++) = config.pointer_bhiksha_bits;
|
| 90 |
+
}
|
| 91 |
+
|
| 92 |
+
} // namespace trie
|
| 93 |
+
} // namespace ngram
|
| 94 |
+
} // namespace lm
|
mosesdecoder/lm/bhiksha.hh
ADDED
|
@@ -0,0 +1,122 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/* Simple implementation of
|
| 2 |
+
* @inproceedings{bhikshacompression,
|
| 3 |
+
* author={Bhiksha Raj and Ed Whittaker},
|
| 4 |
+
* year={2003},
|
| 5 |
+
* title={Lossless Compression of Language Model Structure and Word Identifiers},
|
| 6 |
+
* booktitle={Proceedings of IEEE International Conference on Acoustics, Speech and Signal Processing},
|
| 7 |
+
* pages={388--391},
|
| 8 |
+
* }
|
| 9 |
+
*
|
| 10 |
+
* Currently only used for next pointers.
|
| 11 |
+
*/
|
| 12 |
+
|
| 13 |
+
#ifndef LM_BHIKSHA_H
|
| 14 |
+
#define LM_BHIKSHA_H
|
| 15 |
+
|
| 16 |
+
#include "lm/model_type.hh"
|
| 17 |
+
#include "lm/trie.hh"
|
| 18 |
+
#include "util/bit_packing.hh"
|
| 19 |
+
#include "util/sorted_uniform.hh"
|
| 20 |
+
|
| 21 |
+
#include <algorithm>
|
| 22 |
+
#include <stdint.h>
|
| 23 |
+
#include <cassert>
|
| 24 |
+
|
| 25 |
+
namespace lm {
|
| 26 |
+
namespace ngram {
|
| 27 |
+
struct Config;
|
| 28 |
+
class BinaryFormat;
|
| 29 |
+
|
| 30 |
+
namespace trie {
|
| 31 |
+
|
| 32 |
+
class DontBhiksha {
|
| 33 |
+
public:
|
| 34 |
+
static const ModelType kModelTypeAdd = static_cast<ModelType>(0);
|
| 35 |
+
|
| 36 |
+
static void UpdateConfigFromBinary(const BinaryFormat &, uint64_t, Config &/*config*/) {}
|
| 37 |
+
|
| 38 |
+
static uint64_t Size(uint64_t /*max_offset*/, uint64_t /*max_next*/, const Config &/*config*/) { return 0; }
|
| 39 |
+
|
| 40 |
+
static uint8_t InlineBits(uint64_t /*max_offset*/, uint64_t max_next, const Config &/*config*/) {
|
| 41 |
+
return util::RequiredBits(max_next);
|
| 42 |
+
}
|
| 43 |
+
|
| 44 |
+
DontBhiksha(const void *base, uint64_t max_offset, uint64_t max_next, const Config &config);
|
| 45 |
+
|
| 46 |
+
void ReadNext(const void *base, uint64_t bit_offset, uint64_t /*index*/, uint8_t total_bits, NodeRange &out) const {
|
| 47 |
+
out.begin = util::ReadInt57(base, bit_offset, next_.bits, next_.mask);
|
| 48 |
+
out.end = util::ReadInt57(base, bit_offset + total_bits, next_.bits, next_.mask);
|
| 49 |
+
//assert(out.end >= out.begin);
|
| 50 |
+
}
|
| 51 |
+
|
| 52 |
+
void WriteNext(void *base, uint64_t bit_offset, uint64_t /*index*/, uint64_t value) {
|
| 53 |
+
util::WriteInt57(base, bit_offset, next_.bits, value);
|
| 54 |
+
}
|
| 55 |
+
|
| 56 |
+
void FinishedLoading(const Config &/*config*/) {}
|
| 57 |
+
|
| 58 |
+
uint8_t InlineBits() const { return next_.bits; }
|
| 59 |
+
|
| 60 |
+
private:
|
| 61 |
+
util::BitsMask next_;
|
| 62 |
+
};
|
| 63 |
+
|
| 64 |
+
class ArrayBhiksha {
|
| 65 |
+
public:
|
| 66 |
+
static const ModelType kModelTypeAdd = kArrayAdd;
|
| 67 |
+
|
| 68 |
+
static void UpdateConfigFromBinary(const BinaryFormat &file, uint64_t offset, Config &config);
|
| 69 |
+
|
| 70 |
+
static uint64_t Size(uint64_t max_offset, uint64_t max_next, const Config &config);
|
| 71 |
+
|
| 72 |
+
static uint8_t InlineBits(uint64_t max_offset, uint64_t max_next, const Config &config);
|
| 73 |
+
|
| 74 |
+
ArrayBhiksha(void *base, uint64_t max_offset, uint64_t max_value, const Config &config);
|
| 75 |
+
|
| 76 |
+
void ReadNext(const void *base, uint64_t bit_offset, uint64_t index, uint8_t total_bits, NodeRange &out) const {
|
| 77 |
+
// Some assertions are commented out because they are expensive.
|
| 78 |
+
// assert(*offset_begin_ == 0);
|
| 79 |
+
// std::upper_bound returns the first element that is greater. Want the
|
| 80 |
+
// last element that is <= to the index.
|
| 81 |
+
const uint64_t *begin_it = std::upper_bound(offset_begin_, offset_end_, index) - 1;
|
| 82 |
+
// Since *offset_begin_ == 0, the position should be in range.
|
| 83 |
+
// assert(begin_it >= offset_begin_);
|
| 84 |
+
const uint64_t *end_it;
|
| 85 |
+
for (end_it = begin_it + 1; (end_it < offset_end_) && (*end_it <= index + 1); ++end_it) {}
|
| 86 |
+
// assert(end_it == std::upper_bound(offset_begin_, offset_end_, index + 1));
|
| 87 |
+
--end_it;
|
| 88 |
+
// assert(end_it >= begin_it);
|
| 89 |
+
out.begin = ((begin_it - offset_begin_) << next_inline_.bits) |
|
| 90 |
+
util::ReadInt57(base, bit_offset, next_inline_.bits, next_inline_.mask);
|
| 91 |
+
out.end = ((end_it - offset_begin_) << next_inline_.bits) |
|
| 92 |
+
util::ReadInt57(base, bit_offset + total_bits, next_inline_.bits, next_inline_.mask);
|
| 93 |
+
// If this fails, consider rebuilding your model using KenLM after 1e333d786b748555e8f368d2bbba29a016c98052
|
| 94 |
+
assert(out.end >= out.begin);
|
| 95 |
+
}
|
| 96 |
+
|
| 97 |
+
void WriteNext(void *base, uint64_t bit_offset, uint64_t index, uint64_t value) {
|
| 98 |
+
uint64_t encode = value >> next_inline_.bits;
|
| 99 |
+
for (; write_to_ <= offset_begin_ + encode; ++write_to_) *write_to_ = index;
|
| 100 |
+
util::WriteInt57(base, bit_offset, next_inline_.bits, value & next_inline_.mask);
|
| 101 |
+
}
|
| 102 |
+
|
| 103 |
+
void FinishedLoading(const Config &config);
|
| 104 |
+
|
| 105 |
+
uint8_t InlineBits() const { return next_inline_.bits; }
|
| 106 |
+
|
| 107 |
+
private:
|
| 108 |
+
const util::BitsMask next_inline_;
|
| 109 |
+
|
| 110 |
+
const uint64_t *const offset_begin_;
|
| 111 |
+
const uint64_t *const offset_end_;
|
| 112 |
+
|
| 113 |
+
uint64_t *write_to_;
|
| 114 |
+
|
| 115 |
+
void *original_base_;
|
| 116 |
+
};
|
| 117 |
+
|
| 118 |
+
} // namespace trie
|
| 119 |
+
} // namespace ngram
|
| 120 |
+
} // namespace lm
|
| 121 |
+
|
| 122 |
+
#endif // LM_BHIKSHA_H
|
mosesdecoder/lm/bin/gcc-9/release/link-static/threading-multi/bhiksha.o
ADDED
|
Binary file (24.4 kB). View file
|
|
|
mosesdecoder/lm/bin/gcc-9/release/link-static/threading-multi/binary_format.o
ADDED
|
Binary file (87 kB). View file
|
|
|
mosesdecoder/lm/bin/gcc-9/release/link-static/threading-multi/build_binary
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:89bb1a5052a26025dee0f23bf7492c60881ae5b02ceb378b78905a1e166926cc
|
| 3 |
+
size 1367920
|
mosesdecoder/lm/bin/gcc-9/release/link-static/threading-multi/build_binary_main.o
ADDED
|
Binary file (127 kB). View file
|
|
|
mosesdecoder/lm/bin/gcc-9/release/link-static/threading-multi/config.o
ADDED
|
Binary file (2.63 kB). View file
|
|
|
mosesdecoder/lm/bin/gcc-9/release/link-static/threading-multi/fragment
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:2e3bb1750b2c843ff2ad2b81b01b70f7e52c34abd8ce296575a19e10f9769b31
|
| 3 |
+
size 1367912
|
mosesdecoder/lm/bin/gcc-9/release/link-static/threading-multi/fragment_main.o
ADDED
|
Binary file (55 kB). View file
|
|
|
mosesdecoder/lm/bin/gcc-9/release/link-static/threading-multi/kenlm_benchmark
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:b00c5cb3fc290d10f4dd59c4e3c3472199ebe32cb9dcd25963e07a5e3227af89
|
| 3 |
+
size 1412248
|
mosesdecoder/lm/bin/gcc-9/release/link-static/threading-multi/kenlm_benchmark_main.o
ADDED
|
Binary file (211 kB). View file
|
|
|
mosesdecoder/lm/bin/gcc-9/release/link-static/threading-multi/lm_exception.o
ADDED
|
Binary file (11.7 kB). View file
|
|
|
mosesdecoder/lm/bin/gcc-9/release/link-static/threading-multi/model.o
ADDED
|
Binary file (297 kB). View file
|
|
|
mosesdecoder/lm/bin/gcc-9/release/link-static/threading-multi/quantize.o
ADDED
|
Binary file (42.2 kB). View file
|
|
|
mosesdecoder/lm/bin/gcc-9/release/link-static/threading-multi/query
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:dee549913c814cfa80d753c6dd7cb494099bd5c7da5cda664eedd73f7acc8f72
|
| 3 |
+
size 1388928
|
mosesdecoder/lm/bin/gcc-9/release/link-static/threading-multi/query_main.o
ADDED
|
Binary file (167 kB). View file
|
|
|
mosesdecoder/lm/bin/gcc-9/release/link-static/threading-multi/read_arpa.o
ADDED
|
Binary file (105 kB). View file
|
|
|
mosesdecoder/lm/bin/gcc-9/release/link-static/threading-multi/search_hashed.o
ADDED
|
Binary file (169 kB). View file
|
|
|
mosesdecoder/lm/bin/gcc-9/release/link-static/threading-multi/search_trie.o
ADDED
|
Binary file (196 kB). View file
|
|
|
mosesdecoder/lm/bin/gcc-9/release/link-static/threading-multi/sizes.o
ADDED
|
Binary file (10.8 kB). View file
|
|
|
mosesdecoder/lm/bin/gcc-9/release/link-static/threading-multi/trie.o
ADDED
|
Binary file (35.5 kB). View file
|
|
|
mosesdecoder/lm/bin/gcc-9/release/link-static/threading-multi/trie_sort.o
ADDED
|
Binary file (118 kB). View file
|
|
|
mosesdecoder/lm/bin/gcc-9/release/link-static/threading-multi/value_build.o
ADDED
|
Binary file (81.7 kB). View file
|
|
|
mosesdecoder/lm/bin/gcc-9/release/link-static/threading-multi/virtual_interface.o
ADDED
|
Binary file (5.9 kB). View file
|
|
|
mosesdecoder/lm/bin/gcc-9/release/link-static/threading-multi/vocab.o
ADDED
|
Binary file (124 kB). View file
|
|
|
mosesdecoder/lm/bin/left_test.test/gcc-9/release/link-static/threading-multi/left_test
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:c226dcb9b7f76bb74bb4114ca69a955381ad9790c361f5ef642b772bb9b0b434
|
| 3 |
+
size 2458688
|
mosesdecoder/lm/bin/left_test.test/gcc-9/release/link-static/threading-multi/left_test.o
ADDED
|
Binary file (698 kB). View file
|
|
|
mosesdecoder/lm/bin/left_test.test/gcc-9/release/link-static/threading-multi/left_test.output
ADDED
|
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
Boost.Test WARNING: token "lm/test.arpa" does not correspond to the Boost.Test argument
|
| 2 |
+
and should be placed after all Boost.Test arguments and the -- separator.
|
| 3 |
+
For example: left_test --random -- lm/test.arpa
|
| 4 |
+
Running 6 test cases...
|
| 5 |
+
|
| 6 |
+
*** No errors detected
|
| 7 |
+
|
| 8 |
+
EXIT STATUS: 0
|
mosesdecoder/lm/bin/left_test.test/gcc-9/release/link-static/threading-multi/left_test.run
ADDED
|
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
Boost.Test WARNING: token "lm/test.arpa" does not correspond to the Boost.Test argument
|
| 2 |
+
and should be placed after all Boost.Test arguments and the -- separator.
|
| 3 |
+
For example: left_test --random -- lm/test.arpa
|
| 4 |
+
Running 6 test cases...
|
| 5 |
+
|
| 6 |
+
*** No errors detected
|
| 7 |
+
|
| 8 |
+
EXIT STATUS: 0
|
mosesdecoder/lm/bin/left_test.test/gcc-9/release/link-static/threading-multi/left_test.test
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
passed
|
mosesdecoder/lm/bin/model_test.test/gcc-9/release/link-static/threading-multi/model_test
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:c5f8ac4d5e3d965ef934f3075a93f9c59093e1cb6ee44ef27d71542a43cbb50f
|
| 3 |
+
size 2890976
|
mosesdecoder/lm/bin/model_test.test/gcc-9/release/link-static/threading-multi/model_test.o
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:db0276d839aa0274ab47aed2de1e3d1089c83eedfbb92ac58e9b17b6096940e1
|
| 3 |
+
size 1513008
|
mosesdecoder/lm/bin/model_test.test/gcc-9/release/link-static/threading-multi/model_test.output
ADDED
|
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
Boost.Test WARNING: token "lm/test.arpa" does not correspond to the Boost.Test argument
|
| 2 |
+
and should be placed after all Boost.Test arguments and the -- separator.
|
| 3 |
+
For example: model_test --random -- lm/test.arpa
|
| 4 |
+
Boost.Test WARNING: token "lm/test_nounk.arpa" does not correspond to the Boost.Test argument
|
| 5 |
+
and should be placed after all Boost.Test arguments and the -- separator.
|
| 6 |
+
For example: model_test --random -- lm/test_nounk.arpa
|
| 7 |
+
Running 12 test cases...
|
| 8 |
+
|
| 9 |
+
*** No errors detected
|
| 10 |
+
|
| 11 |
+
EXIT STATUS: 0
|
mosesdecoder/lm/bin/model_test.test/gcc-9/release/link-static/threading-multi/model_test.run
ADDED
|
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
Boost.Test WARNING: token "lm/test.arpa" does not correspond to the Boost.Test argument
|
| 2 |
+
and should be placed after all Boost.Test arguments and the -- separator.
|
| 3 |
+
For example: model_test --random -- lm/test.arpa
|
| 4 |
+
Boost.Test WARNING: token "lm/test_nounk.arpa" does not correspond to the Boost.Test argument
|
| 5 |
+
and should be placed after all Boost.Test arguments and the -- separator.
|
| 6 |
+
For example: model_test --random -- lm/test_nounk.arpa
|
| 7 |
+
Running 12 test cases...
|
| 8 |
+
|
| 9 |
+
*** No errors detected
|
| 10 |
+
|
| 11 |
+
EXIT STATUS: 0
|
mosesdecoder/lm/bin/model_test.test/gcc-9/release/link-static/threading-multi/model_test.test
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
passed
|
mosesdecoder/lm/bin/order.log
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
<define>KENLM_MAX_ORDER=6
|
mosesdecoder/lm/bin/partial_test.test/gcc-9/release/link-static/threading-multi/partial_test
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:50e0176970c570eea64c3cf5ce6ac7f3432f91e605676d7cbbf8e055a1e307b4
|
| 3 |
+
size 2254664
|
mosesdecoder/lm/bin/partial_test.test/gcc-9/release/link-static/threading-multi/partial_test.o
ADDED
|
Binary file (211 kB). View file
|
|
|
mosesdecoder/lm/bin/partial_test.test/gcc-9/release/link-static/threading-multi/partial_test.output
ADDED
|
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
Boost.Test WARNING: token "lm/test.arpa" does not correspond to the Boost.Test argument
|
| 2 |
+
and should be placed after all Boost.Test arguments and the -- separator.
|
| 3 |
+
For example: partial_test --random -- lm/test.arpa
|
| 4 |
+
Running 4 test cases...
|
| 5 |
+
|
| 6 |
+
*** No errors detected
|
| 7 |
+
|
| 8 |
+
EXIT STATUS: 0
|
mosesdecoder/lm/bin/partial_test.test/gcc-9/release/link-static/threading-multi/partial_test.run
ADDED
|
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
Boost.Test WARNING: token "lm/test.arpa" does not correspond to the Boost.Test argument
|
| 2 |
+
and should be placed after all Boost.Test arguments and the -- separator.
|
| 3 |
+
For example: partial_test --random -- lm/test.arpa
|
| 4 |
+
Running 4 test cases...
|
| 5 |
+
|
| 6 |
+
*** No errors detected
|
| 7 |
+
|
| 8 |
+
EXIT STATUS: 0
|
mosesdecoder/lm/bin/partial_test.test/gcc-9/release/link-static/threading-multi/partial_test.test
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
passed
|
mosesdecoder/lm/binary_format.cc
ADDED
|
@@ -0,0 +1,302 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#include "lm/binary_format.hh"
|
| 2 |
+
|
| 3 |
+
#include "lm/lm_exception.hh"
|
| 4 |
+
#include "util/file.hh"
|
| 5 |
+
#include "util/file_piece.hh"
|
| 6 |
+
|
| 7 |
+
#include <cstddef>
|
| 8 |
+
#include <cstring>
|
| 9 |
+
#include <limits>
|
| 10 |
+
#include <string>
|
| 11 |
+
#include <cstdlib>
|
| 12 |
+
|
| 13 |
+
#include <stdint.h>
|
| 14 |
+
|
| 15 |
+
namespace lm {
|
| 16 |
+
namespace ngram {
|
| 17 |
+
|
| 18 |
+
// Human-readable name for each model type. MatchCheck indexes this table with
// ModelType values when it builds its error messages.
const char *kModelNames[6] = {"probing hash tables", "probing hash tables with rest costs", "trie", "trie with quantization", "trie with array-compressed pointers", "trie with quantization and array-compressed pointers"};
|
| 19 |
+
|
| 20 |
+
namespace {
|
| 21 |
+
// Prefix of the magic string; IsBinaryFormat parses the version number that follows it.
const char kMagicBeforeVersion[] = "mmap lm http://kheafield.com/code format version";
// Magic string that the Sanity/OldSanity reference headers start with.
const char kMagicBytes[] = "mmap lm http://kheafield.com/code format version 5\n\0";
// This must be shorter than kMagicBytes and indicates an incomplete binary file (i.e. build failed).
const char kMagicIncomplete[] = "mmap lm http://kheafield.com/code incomplete\n";
// Version this implementation expects; IsBinaryFormat rejects files whose parsed version differs.
const long int kMagicVersion = 5;
|
| 26 |
+
|
| 27 |
+
// Old binary files built on 32-bit machines have this header.
|
| 28 |
+
// TODO: eliminate with next binary release.
|
| 29 |
+
struct OldSanity {
|
| 30 |
+
char magic[sizeof(kMagicBytes)];
|
| 31 |
+
float zero_f, one_f, minus_half_f;
|
| 32 |
+
WordIndex one_word_index, max_word_index;
|
| 33 |
+
uint64_t one_uint64;
|
| 34 |
+
|
| 35 |
+
void SetToReference() {
|
| 36 |
+
std::memset(this, 0, sizeof(OldSanity));
|
| 37 |
+
std::memcpy(magic, kMagicBytes, sizeof(magic));
|
| 38 |
+
zero_f = 0.0; one_f = 1.0; minus_half_f = -0.5;
|
| 39 |
+
one_word_index = 1;
|
| 40 |
+
max_word_index = std::numeric_limits<WordIndex>::max();
|
| 41 |
+
one_uint64 = 1;
|
| 42 |
+
}
|
| 43 |
+
};
|
| 44 |
+
|
| 45 |
+
|
| 46 |
+
// Test values aligned to 8 bytes.
|
| 47 |
+
struct Sanity {
|
| 48 |
+
char magic[ALIGN8(sizeof(kMagicBytes))];
|
| 49 |
+
float zero_f, one_f, minus_half_f;
|
| 50 |
+
WordIndex one_word_index, max_word_index, padding_to_8;
|
| 51 |
+
uint64_t one_uint64;
|
| 52 |
+
|
| 53 |
+
void SetToReference() {
|
| 54 |
+
std::memset(this, 0, sizeof(Sanity));
|
| 55 |
+
std::memcpy(magic, kMagicBytes, sizeof(kMagicBytes));
|
| 56 |
+
zero_f = 0.0; one_f = 1.0; minus_half_f = -0.5;
|
| 57 |
+
one_word_index = 1;
|
| 58 |
+
max_word_index = std::numeric_limits<WordIndex>::max();
|
| 59 |
+
padding_to_8 = 0;
|
| 60 |
+
one_uint64 = 1;
|
| 61 |
+
}
|
| 62 |
+
};
|
| 63 |
+
|
| 64 |
+
std::size_t TotalHeaderSize(unsigned char order) {
|
| 65 |
+
return ALIGN8(sizeof(Sanity) + sizeof(FixedWidthParameters) + sizeof(uint64_t) * order);
|
| 66 |
+
}
|
| 67 |
+
|
| 68 |
+
void WriteHeader(void *to, const Parameters ¶ms) {
|
| 69 |
+
Sanity header = Sanity();
|
| 70 |
+
header.SetToReference();
|
| 71 |
+
std::memcpy(to, &header, sizeof(Sanity));
|
| 72 |
+
char *out = reinterpret_cast<char*>(to) + sizeof(Sanity);
|
| 73 |
+
|
| 74 |
+
*reinterpret_cast<FixedWidthParameters*>(out) = params.fixed;
|
| 75 |
+
out += sizeof(FixedWidthParameters);
|
| 76 |
+
|
| 77 |
+
uint64_t *counts = reinterpret_cast<uint64_t*>(out);
|
| 78 |
+
for (std::size_t i = 0; i < params.counts.size(); ++i) {
|
| 79 |
+
counts[i] = params.counts[i];
|
| 80 |
+
}
|
| 81 |
+
}
|
| 82 |
+
|
| 83 |
+
} // namespace
|
| 84 |
+
|
| 85 |
+
bool IsBinaryFormat(int fd) {
|
| 86 |
+
const uint64_t size = util::SizeFile(fd);
|
| 87 |
+
if (size == util::kBadSize || (size <= static_cast<uint64_t>(sizeof(Sanity)))) return false;
|
| 88 |
+
// Try reading the header.
|
| 89 |
+
util::scoped_memory memory;
|
| 90 |
+
try {
|
| 91 |
+
util::MapRead(util::LAZY, fd, 0, sizeof(Sanity), memory);
|
| 92 |
+
} catch (const util::Exception &e) {
|
| 93 |
+
return false;
|
| 94 |
+
}
|
| 95 |
+
Sanity reference_header = Sanity();
|
| 96 |
+
reference_header.SetToReference();
|
| 97 |
+
if (!std::memcmp(memory.get(), &reference_header, sizeof(Sanity))) return true;
|
| 98 |
+
if (!std::memcmp(memory.get(), kMagicIncomplete, strlen(kMagicIncomplete))) {
|
| 99 |
+
UTIL_THROW(FormatLoadException, "This binary file did not finish building");
|
| 100 |
+
}
|
| 101 |
+
if (!std::memcmp(memory.get(), kMagicBeforeVersion, strlen(kMagicBeforeVersion))) {
|
| 102 |
+
char *end_ptr;
|
| 103 |
+
const char *begin_version = static_cast<const char*>(memory.get()) + strlen(kMagicBeforeVersion);
|
| 104 |
+
long int version = std::strtol(begin_version, &end_ptr, 10);
|
| 105 |
+
if ((end_ptr != begin_version) && version != kMagicVersion) {
|
| 106 |
+
UTIL_THROW(FormatLoadException, "Binary file has version " << version << " but this implementation expects version " << kMagicVersion << " so you'll have to use the ARPA to rebuild your binary");
|
| 107 |
+
}
|
| 108 |
+
|
| 109 |
+
OldSanity old_sanity = OldSanity();
|
| 110 |
+
old_sanity.SetToReference();
|
| 111 |
+
UTIL_THROW_IF(!std::memcmp(memory.get(), &old_sanity, sizeof(OldSanity)), FormatLoadException, "Looks like this is an old 32-bit format. The old 32-bit format has been removed so that 64-bit and 32-bit files are exchangeable.");
|
| 112 |
+
UTIL_THROW(FormatLoadException, "File looks like it should be loaded with mmap, but the test values don't match. Try rebuilding the binary format LM using the same code revision, compiler, and architecture");
|
| 113 |
+
}
|
| 114 |
+
return false;
|
| 115 |
+
}
|
| 116 |
+
|
| 117 |
+
void ReadHeader(int fd, Parameters &out) {
|
| 118 |
+
util::SeekOrThrow(fd, sizeof(Sanity));
|
| 119 |
+
util::ReadOrThrow(fd, &out.fixed, sizeof(out.fixed));
|
| 120 |
+
if (out.fixed.probing_multiplier < 1.0)
|
| 121 |
+
UTIL_THROW(FormatLoadException, "Binary format claims to have a probing multiplier of " << out.fixed.probing_multiplier << " which is < 1.0.");
|
| 122 |
+
|
| 123 |
+
out.counts.resize(static_cast<std::size_t>(out.fixed.order));
|
| 124 |
+
if (out.fixed.order) util::ReadOrThrow(fd, &*out.counts.begin(), sizeof(uint64_t) * out.fixed.order);
|
| 125 |
+
}
|
| 126 |
+
|
| 127 |
+
// Verify that the header in `params` describes the model type and search
// version the caller expects.  Throws FormatLoadException on any mismatch.
void MatchCheck(ModelType model_type, unsigned int search_version, const Parameters &params) {
  if (params.fixed.model_type != model_type) {
    // The stored type comes from the file, so range-check it before using it
    // as an index into kModelNames.
    if (static_cast<unsigned int>(params.fixed.model_type) >= (sizeof(kModelNames) / sizeof(const char *)))
      UTIL_THROW(FormatLoadException, "The binary file claims to be model type " << static_cast<unsigned int>(params.fixed.model_type) << " but this is not implemented for in this inference code.");
    UTIL_THROW(FormatLoadException, "The binary file was built for " << kModelNames[params.fixed.model_type] << " but the inference code is trying to load " << kModelNames[model_type]);
  }
  UTIL_THROW_IF(search_version != params.fixed.search_version, FormatLoadException, "The binary file has " << kModelNames[params.fixed.model_type] << " version " << params.fixed.search_version << " but this code expects " << kModelNames[params.fixed.model_type] << " version " << search_version);
}
|
| 135 |
+
|
| 136 |
+
// Sentinel stored in header_size_ and vocab_size_ by the constructor; the
// asserts in the methods below check that it has been replaced.
const std::size_t kInvalidSize = static_cast<std::size_t>(-1);
|
| 137 |
+
|
| 138 |
+
// Copy the write/load settings from config and mark every size and offset as
// not yet known.  vocab_pad_ starts at 0 so that it is never read
// uninitialized; GrowForSearch overwrites it.
BinaryFormat::BinaryFormat(const Config &config)
  : write_method_(config.write_method), write_mmap_(config.write_mmap), load_method_(config.load_method),
    header_size_(kInvalidSize), vocab_size_(kInvalidSize), vocab_pad_(0), vocab_string_offset_(kInvalidOffset) {}
|
| 141 |
+
|
| 142 |
+
void BinaryFormat::InitializeBinary(int fd, ModelType model_type, unsigned int search_version, Parameters ¶ms) {
|
| 143 |
+
file_.reset(fd);
|
| 144 |
+
write_mmap_ = NULL; // Ignore write requests; this is already in binary format.
|
| 145 |
+
ReadHeader(fd, params);
|
| 146 |
+
MatchCheck(model_type, search_version, params);
|
| 147 |
+
header_size_ = TotalHeaderSize(params.counts.size());
|
| 148 |
+
}
|
| 149 |
+
|
| 150 |
+
// Read `amount` bytes into `to`, starting offset_excluding_header bytes past
// the end of the header.  The header size must already be known.
void BinaryFormat::ReadForConfig(void *to, std::size_t amount, uint64_t offset_excluding_header) const {
  assert(header_size_ != kInvalidSize);
  const uint64_t file_offset = offset_excluding_header + header_size_;
  util::ErsatzPRead(file_.get(), to, amount, file_offset);
}
|
| 154 |
+
|
| 155 |
+
void *BinaryFormat::LoadBinary(std::size_t size) {
|
| 156 |
+
assert(header_size_ != kInvalidSize);
|
| 157 |
+
const uint64_t file_size = util::SizeFile(file_.get());
|
| 158 |
+
// The header is smaller than a page, so we have to map the whole header as well.
|
| 159 |
+
uint64_t total_map = static_cast<uint64_t>(header_size_) + static_cast<uint64_t>(size);
|
| 160 |
+
UTIL_THROW_IF(file_size != util::kBadSize && file_size < total_map, FormatLoadException, "Binary file has size " << file_size << " but the headers say it should be at least " << total_map);
|
| 161 |
+
|
| 162 |
+
util::MapRead(load_method_, file_.get(), 0, util::CheckOverflow(total_map), mapping_);
|
| 163 |
+
|
| 164 |
+
vocab_string_offset_ = total_map;
|
| 165 |
+
return reinterpret_cast<uint8_t*>(mapping_.get()) + header_size_;
|
| 166 |
+
}
|
| 167 |
+
|
| 168 |
+
void *BinaryFormat::SetupJustVocab(std::size_t memory_size, uint8_t order) {
|
| 169 |
+
vocab_size_ = memory_size;
|
| 170 |
+
if (!write_mmap_) {
|
| 171 |
+
header_size_ = 0;
|
| 172 |
+
util::HugeMalloc(memory_size, true, memory_vocab_);
|
| 173 |
+
return reinterpret_cast<uint8_t*>(memory_vocab_.get());
|
| 174 |
+
}
|
| 175 |
+
header_size_ = TotalHeaderSize(order);
|
| 176 |
+
std::size_t total = util::CheckOverflow(static_cast<uint64_t>(header_size_) + static_cast<uint64_t>(memory_size));
|
| 177 |
+
file_.reset(util::CreateOrThrow(write_mmap_));
|
| 178 |
+
// some gccs complain about uninitialized variables even though all enum values are covered.
|
| 179 |
+
void *vocab_base = NULL;
|
| 180 |
+
switch (write_method_) {
|
| 181 |
+
case Config::WRITE_MMAP:
|
| 182 |
+
mapping_.reset(util::MapZeroedWrite(file_.get(), total), total, util::scoped_memory::MMAP_ALLOCATED);
|
| 183 |
+
util::AdviseHugePages(vocab_base, total);
|
| 184 |
+
vocab_base = mapping_.get();
|
| 185 |
+
break;
|
| 186 |
+
case Config::WRITE_AFTER:
|
| 187 |
+
util::ResizeOrThrow(file_.get(), 0);
|
| 188 |
+
util::HugeMalloc(total, true, memory_vocab_);
|
| 189 |
+
vocab_base = memory_vocab_.get();
|
| 190 |
+
break;
|
| 191 |
+
}
|
| 192 |
+
strncpy(reinterpret_cast<char*>(vocab_base), kMagicIncomplete, header_size_);
|
| 193 |
+
return reinterpret_cast<uint8_t*>(vocab_base) + header_size_;
|
| 194 |
+
}
|
| 195 |
+
|
| 196 |
+
void *BinaryFormat::GrowForSearch(std::size_t memory_size, std::size_t vocab_pad, void *&vocab_base) {
|
| 197 |
+
assert(vocab_size_ != kInvalidSize);
|
| 198 |
+
vocab_pad_ = vocab_pad;
|
| 199 |
+
std::size_t new_size = header_size_ + vocab_size_ + vocab_pad_ + memory_size;
|
| 200 |
+
vocab_string_offset_ = new_size;
|
| 201 |
+
if (!write_mmap_ || write_method_ == Config::WRITE_AFTER) {
|
| 202 |
+
util::HugeMalloc(memory_size, true, memory_search_);
|
| 203 |
+
assert(header_size_ == 0 || write_mmap_);
|
| 204 |
+
vocab_base = reinterpret_cast<uint8_t*>(memory_vocab_.get()) + header_size_;
|
| 205 |
+
util::AdviseHugePages(memory_search_.get(), memory_size);
|
| 206 |
+
return reinterpret_cast<uint8_t*>(memory_search_.get());
|
| 207 |
+
}
|
| 208 |
+
|
| 209 |
+
assert(write_method_ == Config::WRITE_MMAP);
|
| 210 |
+
// Also known as total size without vocab words.
|
| 211 |
+
// Grow the file to accomodate the search, using zeros.
|
| 212 |
+
// According to man mmap, behavior is undefined when the file is resized
|
| 213 |
+
// underneath a mmap that is not a multiple of the page size. So to be
|
| 214 |
+
// safe, we'll unmap it and map it again.
|
| 215 |
+
mapping_.reset();
|
| 216 |
+
util::ResizeOrThrow(file_.get(), new_size);
|
| 217 |
+
void *ret;
|
| 218 |
+
MapFile(vocab_base, ret);
|
| 219 |
+
util::AdviseHugePages(ret, new_size);
|
| 220 |
+
return ret;
|
| 221 |
+
}
|
| 222 |
+
|
| 223 |
+
void BinaryFormat::WriteVocabWords(const std::string &buffer, void *&vocab_base, void *&search_base) {
|
| 224 |
+
// Checking Config's include_vocab is the responsibility of the caller.
|
| 225 |
+
assert(header_size_ != kInvalidSize && vocab_size_ != kInvalidSize);
|
| 226 |
+
if (!write_mmap_) {
|
| 227 |
+
// Unchanged base.
|
| 228 |
+
vocab_base = reinterpret_cast<uint8_t*>(memory_vocab_.get());
|
| 229 |
+
search_base = reinterpret_cast<uint8_t*>(memory_search_.get());
|
| 230 |
+
return;
|
| 231 |
+
}
|
| 232 |
+
if (write_method_ == Config::WRITE_MMAP) {
|
| 233 |
+
mapping_.reset();
|
| 234 |
+
}
|
| 235 |
+
util::SeekOrThrow(file_.get(), VocabStringReadingOffset());
|
| 236 |
+
util::WriteOrThrow(file_.get(), &buffer[0], buffer.size());
|
| 237 |
+
if (write_method_ == Config::WRITE_MMAP) {
|
| 238 |
+
MapFile(vocab_base, search_base);
|
| 239 |
+
} else {
|
| 240 |
+
vocab_base = reinterpret_cast<uint8_t*>(memory_vocab_.get()) + header_size_;
|
| 241 |
+
search_base = reinterpret_cast<uint8_t*>(memory_search_.get());
|
| 242 |
+
}
|
| 243 |
+
}
|
| 244 |
+
|
| 245 |
+
void BinaryFormat::FinishFile(const Config &config, ModelType model_type, unsigned int search_version, const std::vector<uint64_t> &counts) {
|
| 246 |
+
if (!write_mmap_) return;
|
| 247 |
+
switch (write_method_) {
|
| 248 |
+
case Config::WRITE_MMAP:
|
| 249 |
+
util::SyncOrThrow(mapping_.get(), mapping_.size());
|
| 250 |
+
break;
|
| 251 |
+
case Config::WRITE_AFTER:
|
| 252 |
+
util::SeekOrThrow(file_.get(), 0);
|
| 253 |
+
util::WriteOrThrow(file_.get(), memory_vocab_.get(), memory_vocab_.size());
|
| 254 |
+
util::SeekOrThrow(file_.get(), header_size_ + vocab_size_ + vocab_pad_);
|
| 255 |
+
util::WriteOrThrow(file_.get(), memory_search_.get(), memory_search_.size());
|
| 256 |
+
util::FSyncOrThrow(file_.get());
|
| 257 |
+
break;
|
| 258 |
+
}
|
| 259 |
+
// header and vocab share the same mmap.
|
| 260 |
+
Parameters params = Parameters();
|
| 261 |
+
memset(¶ms, 0, sizeof(Parameters));
|
| 262 |
+
params.counts = counts;
|
| 263 |
+
params.fixed.order = counts.size();
|
| 264 |
+
params.fixed.probing_multiplier = config.probing_multiplier;
|
| 265 |
+
params.fixed.model_type = model_type;
|
| 266 |
+
params.fixed.has_vocabulary = config.include_vocab;
|
| 267 |
+
params.fixed.search_version = search_version;
|
| 268 |
+
switch (write_method_) {
|
| 269 |
+
case Config::WRITE_MMAP:
|
| 270 |
+
WriteHeader(mapping_.get(), params);
|
| 271 |
+
util::SyncOrThrow(mapping_.get(), mapping_.size());
|
| 272 |
+
break;
|
| 273 |
+
case Config::WRITE_AFTER:
|
| 274 |
+
{
|
| 275 |
+
std::vector<uint8_t> buffer(TotalHeaderSize(counts.size()));
|
| 276 |
+
WriteHeader(&buffer[0], params);
|
| 277 |
+
util::SeekOrThrow(file_.get(), 0);
|
| 278 |
+
util::WriteOrThrow(file_.get(), &buffer[0], buffer.size());
|
| 279 |
+
}
|
| 280 |
+
break;
|
| 281 |
+
}
|
| 282 |
+
}
|
| 283 |
+
|
| 284 |
+
// Map the first vocab_string_offset_ bytes of the file and point vocab_base
// and search_base at their regions inside the new mapping.
void BinaryFormat::MapFile(void *&vocab_base, void *&search_base) {
  mapping_.reset(
      util::MapOrThrow(vocab_string_offset_, true, util::kFileFlags, false, file_.get()),
      vocab_string_offset_,
      util::scoped_memory::MMAP_ALLOCATED);
  uint8_t *const mapped_begin = reinterpret_cast<uint8_t*>(mapping_.get());
  vocab_base = mapped_begin + header_size_;
  search_base = mapped_begin + header_size_ + vocab_size_ + vocab_pad_;
}
|
| 289 |
+
|
| 290 |
+
bool RecognizeBinary(const char *file, ModelType &recognized) {
|
| 291 |
+
util::scoped_fd fd(util::OpenReadOrThrow(file));
|
| 292 |
+
if (!IsBinaryFormat(fd.get())) {
|
| 293 |
+
return false;
|
| 294 |
+
}
|
| 295 |
+
Parameters params;
|
| 296 |
+
ReadHeader(fd.get(), params);
|
| 297 |
+
recognized = params.fixed.model_type;
|
| 298 |
+
return true;
|
| 299 |
+
}
|
| 300 |
+
|
| 301 |
+
} // namespace ngram
|
| 302 |
+
} // namespace lm
|
mosesdecoder/lm/binary_format.hh
ADDED
|
@@ -0,0 +1,106 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#ifndef LM_BINARY_FORMAT_H
|
| 2 |
+
#define LM_BINARY_FORMAT_H
|
| 3 |
+
|
| 4 |
+
#include "lm/config.hh"
|
| 5 |
+
#include "lm/model_type.hh"
|
| 6 |
+
#include "lm/read_arpa.hh"
|
| 7 |
+
|
| 8 |
+
#include "util/file_piece.hh"
|
| 9 |
+
#include "util/mmap.hh"
|
| 10 |
+
#include "util/scoped.hh"
|
| 11 |
+
|
| 12 |
+
#include <cstddef>
|
| 13 |
+
#include <vector>
|
| 14 |
+
|
| 15 |
+
#include <stdint.h>
|
| 16 |
+
|
| 17 |
+
namespace lm {
|
| 18 |
+
namespace ngram {
|
| 19 |
+
|
| 20 |
+
// Table of six human-readable model names, one per model type.
extern const char *kModelNames[6];
|
| 21 |
+
|
| 22 |
+
/*Inspect a file to determine if it is a binary lm. If not, return false.
|
| 23 |
+
* If so, return true and set recognized to the type. This is the only API in
|
| 24 |
+
* this header designed for use by decoder authors.
|
| 25 |
+
*/
|
| 26 |
+
bool RecognizeBinary(const char *file, ModelType &recognized);
|
| 27 |
+
|
| 28 |
+
// Fixed-size part of the binary file header.  Objects of this struct are
// passed to raw byte-level reads by address and size, so the member order
// and types must not be changed casually.
struct FixedWidthParameters {
  // Number of entries in Parameters::counts.
  unsigned char order;
  float probing_multiplier;
  // What type of model is this?
  ModelType model_type;
  // Does the end of the file have the actual strings in the vocabulary?
  bool has_vocabulary;
  unsigned int search_version;
};
|
| 37 |
+
|
| 38 |
+
// This is a macro instead of an inline function so constants can be assigned using it.
|
| 39 |
+
// Rounds a up to the next multiple of 8; a value that is already a multiple
// of 8 is returned unchanged.  NOTE(review): a == 0 with an unsigned
// argument makes (a)-1 wrap around -- confirm no caller passes 0.
#define ALIGN8(a) ((std::ptrdiff_t(((a)-1)/8)+1)*8)
|
| 40 |
+
|
| 41 |
+
// Parameters stored in the header of a binary file.
struct Parameters {
  // Fixed-size portion of the header.
  FixedWidthParameters fixed;
  // One 64-bit count per n-gram order; the size is recorded in fixed.order.
  std::vector<uint64_t> counts;
};
|
| 46 |
+
|
| 47 |
+
class BinaryFormat {
|
| 48 |
+
public:
|
| 49 |
+
explicit BinaryFormat(const Config &config);
|
| 50 |
+
|
| 51 |
+
// Reading a binary file:
|
| 52 |
+
// Takes ownership of fd
|
| 53 |
+
void InitializeBinary(int fd, ModelType model_type, unsigned int search_version, Parameters ¶ms);
|
| 54 |
+
// Used to read parts of the file to update the config object before figuring out full size.
|
| 55 |
+
void ReadForConfig(void *to, std::size_t amount, uint64_t offset_excluding_header) const;
|
| 56 |
+
// Actually load the binary file and return a pointer to the beginning of the search area.
|
| 57 |
+
void *LoadBinary(std::size_t size);
|
| 58 |
+
|
| 59 |
+
uint64_t VocabStringReadingOffset() const {
|
| 60 |
+
assert(vocab_string_offset_ != kInvalidOffset);
|
| 61 |
+
return vocab_string_offset_;
|
| 62 |
+
}
|
| 63 |
+
|
| 64 |
+
// Writing a binary file or initializing in RAM from ARPA:
|
| 65 |
+
// Size for vocabulary.
|
| 66 |
+
void *SetupJustVocab(std::size_t memory_size, uint8_t order);
|
| 67 |
+
// Warning: can change the vocaulary base pointer.
|
| 68 |
+
void *GrowForSearch(std::size_t memory_size, std::size_t vocab_pad, void *&vocab_base);
|
| 69 |
+
// Warning: can change vocabulary and search base addresses.
|
| 70 |
+
void WriteVocabWords(const std::string &buffer, void *&vocab_base, void *&search_base);
|
| 71 |
+
// Write the header at the beginning of the file.
|
| 72 |
+
void FinishFile(const Config &config, ModelType model_type, unsigned int search_version, const std::vector<uint64_t> &counts);
|
| 73 |
+
|
| 74 |
+
private:
|
| 75 |
+
void MapFile(void *&vocab_base, void *&search_base);
|
| 76 |
+
|
| 77 |
+
// Copied from configuration.
|
| 78 |
+
const Config::WriteMethod write_method_;
|
| 79 |
+
const char *write_mmap_;
|
| 80 |
+
util::LoadMethod load_method_;
|
| 81 |
+
|
| 82 |
+
// File behind memory, if any.
|
| 83 |
+
util::scoped_fd file_;
|
| 84 |
+
|
| 85 |
+
// If there is a file involved, a single mapping.
|
| 86 |
+
util::scoped_memory mapping_;
|
| 87 |
+
|
| 88 |
+
// If the data is only in memory, separately allocate each because the trie
|
| 89 |
+
// knows vocab's size before it knows search's size (because SRILM might
|
| 90 |
+
// have pruned).
|
| 91 |
+
util::scoped_memory memory_vocab_, memory_search_;
|
| 92 |
+
|
| 93 |
+
// Memory ranges. Note that these may not be contiguous and may not all
|
| 94 |
+
// exist.
|
| 95 |
+
std::size_t header_size_, vocab_size_, vocab_pad_;
|
| 96 |
+
// aka end of search.
|
| 97 |
+
uint64_t vocab_string_offset_;
|
| 98 |
+
|
| 99 |
+
static const uint64_t kInvalidOffset = (uint64_t)-1;
|
| 100 |
+
};
|
| 101 |
+
|
| 102 |
+
// Returns true if fd starts with the binary header this code expects and
// false if it does not look like a binary file at all.  Throws
// FormatLoadException for files that carry a known magic string but are
// incomplete, of a different version, or have mismatching test values.
bool IsBinaryFormat(int fd);
|
| 103 |
+
|
| 104 |
+
} // namespace ngram
|
| 105 |
+
} // namespace lm
|
| 106 |
+
#endif // LM_BINARY_FORMAT_H
|
mosesdecoder/lm/blank.hh
ADDED
|
@@ -0,0 +1,42 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#ifndef LM_BLANK_H
|
| 2 |
+
#define LM_BLANK_H
|
| 3 |
+
|
| 4 |
+
#include <limits>
|
| 5 |
+
#include <stdint.h>
|
| 6 |
+
#include <cmath>
|
| 7 |
+
|
| 8 |
+
namespace lm {
namespace ngram {

/* Marker values for n-grams whose backoff is zero.
 *
 * Suppose "foo bar" appears with zero backoff but no trigram begins with
 * these words.  When scoring "foo bar", the model could then return an
 * out_state containing only "bar", or even a null context if "bar" also has
 * no backoff and is never followed by another word.  In that case the backoff
 * is set to kNoExtensionBackoff.  If the n-gram might be extended, out_state
 * must contain the full n-gram, and kExtensionBackoff is set instead.  An
 * n-gram with non-zero backoff always returns the full state so that the
 * backoff can be properly charged.
 *
 * The two markers differ only in the sign bit, because the backoff is in
 * fact zero in either case.
 */
const float kNoExtensionBackoff = -0.0f;
const float kExtensionBackoff = 0.0f;
const uint64_t kNoExtensionQuant = 0;
const uint64_t kExtensionQuant = 1;

// Replace a zero backoff with the "has extension" marker; non-zero backoffs
// are left alone.
inline void SetExtension(float &backoff) {
  if (backoff != kNoExtensionBackoff) return;
  backoff = kExtensionBackoff;
}

// True unless backoff is bit-for-bit equal to kNoExtensionBackoff.  The
// comparison is done on the bit patterns because -0.0 == 0.0 as floats.
// This compiles down nicely.
inline bool HasExtension(const float &backoff) {
  union FloatBits { float value; uint32_t bits; };
  FloatBits sentinel;
  sentinel.value = kNoExtensionBackoff;
  FloatBits candidate;
  candidate.value = backoff;
  return !(candidate.bits == sentinel.bits);
}

} // namespace ngram
} // namespace lm
|
| 42 |
+
#endif // LM_BLANK_H
|
mosesdecoder/lm/build_binary_main.cc
ADDED
|
@@ -0,0 +1,234 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#include "lm/model.hh"
|
| 2 |
+
#include "lm/sizes.hh"
|
| 3 |
+
#include "util/file_piece.hh"
|
| 4 |
+
#include "util/usage.hh"
|
| 5 |
+
|
| 6 |
+
#include <algorithm>
|
| 7 |
+
#include <cstdlib>
|
| 8 |
+
#include <exception>
|
| 9 |
+
#include <iostream>
|
| 10 |
+
#include <iomanip>
|
| 11 |
+
#include <limits>
|
| 12 |
+
#include <cmath>
|
| 13 |
+
#include <cstdlib>
|
| 14 |
+
|
| 15 |
+
#ifdef WIN32
|
| 16 |
+
#include "util/getopt.hh"
|
| 17 |
+
#else
|
| 18 |
+
#include <unistd.h>
|
| 19 |
+
#endif
|
| 20 |
+
|
| 21 |
+
namespace lm {
|
| 22 |
+
namespace ngram {
|
| 23 |
+
namespace {
|
| 24 |
+
|
| 25 |
+
// Print the command-line help to stderr and terminate the process with exit
// status 1.  `name` is the program name shown in the synopsis; `default_mem`
// is the text shown as the default for -S.
void Usage(const char *name, const char *default_mem) {
  std::cerr << "Usage: " << name << " [-u log10_unknown_probability] [-s] [-i] [-w mmap|after] [-p probing_multiplier] [-T trie_temporary] [-S trie_building_mem] [-q bits] [-b bits] [-a bits] [type] input.arpa [output.mmap]\n\n"
"-u sets the log10 probability for <unk> if the ARPA file does not have one.\n"
" Default is -100. The ARPA file will always take precedence.\n"
"-s allows models to be built even if they do not have <s> and </s>.\n"
"-i allows buggy models from IRSTLM by mapping positive log probability to 0.\n"
"-w mmap|after determines how writing is done.\n"
" mmap maps the binary file and writes to it. Default for trie.\n"
" after allocates anonymous memory, builds, and writes. Default for probing.\n"
"-r \"order1.arpa order2 order3 order4\" adds lower-order rest costs from these\n"
" model files. order1.arpa must be an ARPA file. All others may be ARPA or\n"
" the same data structure as being built. All files must have the same\n"
" vocabulary. For probing, the unigrams must be in the same order.\n\n"
"type is either probing or trie. Default is probing.\n\n"
"probing uses a probing hash table. It is the fastest but uses the most memory.\n"
"-p sets the space multiplier and must be >1.0. The default is 1.5.\n\n"
"trie is a straightforward trie with bit-level packing. It uses the least\n"
"memory and is still faster than SRI or IRST. Building the trie format uses an\n"
"on-disk sort to save memory.\n"
"-T is the temporary directory prefix. Default is the output file name.\n"
"-S determines memory use for sorting. Default is " << default_mem << ". This is compatible\n"
// A plain '%' is used here: "\%" is not a valid escape sequence.
" with GNU sort. The number is followed by a unit: % for percent of physical\n"
" memory, b for bytes, K for Kilobytes, M for megabytes, then G,T,P,E,Z,Y. \n"
" Default unit is K for Kilobytes.\n"
"-q turns quantization on and sets the number of bits (e.g. -q 8).\n"
"-b sets backoff quantization bits. Requires -q and defaults to that value.\n"
"-a compresses pointers using an array of offsets. The parameter is the\n"
" maximum number of bits encoded by the array. Memory is minimized subject\n"
" to the maximum, so pick 255 to minimize memory.\n\n"
"-h print this help message.\n\n"
"Get a memory estimate by passing an ARPA file without an output file name.\n";
  exit(1);
}
|
| 58 |
+
|
| 59 |
+
// I could really use boost::lexical_cast right about now.
|
| 60 |
+
float ParseFloat(const char *from) {
|
| 61 |
+
char *end;
|
| 62 |
+
float ret = strtod(from, &end);
|
| 63 |
+
if (*end) throw util::ParseNumberException(from);
|
| 64 |
+
return ret;
|
| 65 |
+
}
|
| 66 |
+
unsigned long int ParseUInt(const char *from) {
|
| 67 |
+
char *end;
|
| 68 |
+
unsigned long int ret = strtoul(from, &end, 10);
|
| 69 |
+
if (*end) throw util::ParseNumberException(from);
|
| 70 |
+
return ret;
|
| 71 |
+
}
|
| 72 |
+
|
| 73 |
+
uint8_t ParseBitCount(const char *from) {
|
| 74 |
+
unsigned long val = ParseUInt(from);
|
| 75 |
+
if (val > 25) {
|
| 76 |
+
util::ParseNumberException e(from);
|
| 77 |
+
e << " bit counts are limited to 25.";
|
| 78 |
+
}
|
| 79 |
+
return val;
|
| 80 |
+
}
|
| 81 |
+
|
| 82 |
+
// Split `from` on single space characters and store the pieces in `to`,
// replacing its previous contents.  Consecutive, leading or trailing spaces
// produce empty entries; an empty input produces one empty entry.
void ParseFileList(const char *from, std::vector<std::string> &to) {
  to.clear();
  const std::string list(from);
  std::string::size_type start = 0;
  for (;;) {
    const std::string::size_type space = list.find(' ', start);
    if (space == std::string::npos) {
      // Last piece: everything up to the end of the string.
      to.push_back(list.substr(start));
      return;
    }
    to.push_back(list.substr(start, space - start));
    start = space + 1;
  }
}
|
| 92 |
+
|
| 93 |
+
// Report on stderr that quantization was requested for a data structure that
// does not implement it, then terminate with exit status 1.
void ProbingQuantizationUnsupported() {
  const char *const message = "Quantization is only implemented in the trie data structure.";
  std::cerr << message << std::endl;
  exit(1);
}
|
| 97 |
+
|
| 98 |
+
} // namespace
} // namespace ngram
} // namespace lm
|
| 101 |
+
|
| 102 |
+
int main(int argc, char *argv[]) {
|
| 103 |
+
using namespace lm::ngram;
|
| 104 |
+
|
| 105 |
+
const char *default_mem = util::GuessPhysicalMemory() ? "80%" : "1G";
|
| 106 |
+
|
| 107 |
+
if (argc == 2 && !strcmp(argv[1], "--help"))
|
| 108 |
+
Usage(argv[0], default_mem);
|
| 109 |
+
|
| 110 |
+
try {
|
| 111 |
+
bool quantize = false, set_backoff_bits = false, bhiksha = false, set_write_method = false, rest = false;
|
| 112 |
+
lm::ngram::Config config;
|
| 113 |
+
config.building_memory = util::ParseSize(default_mem);
|
| 114 |
+
int opt;
|
| 115 |
+
while ((opt = getopt(argc, argv, "q:b:a:u:p:t:T:m:S:w:sir:h")) != -1) {
|
| 116 |
+
switch(opt) {
|
| 117 |
+
case 'q':
|
| 118 |
+
config.prob_bits = ParseBitCount(optarg);
|
| 119 |
+
if (!set_backoff_bits) config.backoff_bits = config.prob_bits;
|
| 120 |
+
quantize = true;
|
| 121 |
+
break;
|
| 122 |
+
case 'b':
|
| 123 |
+
config.backoff_bits = ParseBitCount(optarg);
|
| 124 |
+
set_backoff_bits = true;
|
| 125 |
+
break;
|
| 126 |
+
case 'a':
|
| 127 |
+
config.pointer_bhiksha_bits = ParseBitCount(optarg);
|
| 128 |
+
bhiksha = true;
|
| 129 |
+
break;
|
| 130 |
+
case 'u':
|
| 131 |
+
config.unknown_missing_logprob = ParseFloat(optarg);
|
| 132 |
+
break;
|
| 133 |
+
case 'p':
|
| 134 |
+
config.probing_multiplier = ParseFloat(optarg);
|
| 135 |
+
break;
|
| 136 |
+
case 't': // legacy
|
| 137 |
+
case 'T':
|
| 138 |
+
config.temporary_directory_prefix = optarg;
|
| 139 |
+
util::NormalizeTempPrefix(config.temporary_directory_prefix);
|
| 140 |
+
break;
|
| 141 |
+
case 'm': // legacy
|
| 142 |
+
config.building_memory = ParseUInt(optarg) * 1048576;
|
| 143 |
+
break;
|
| 144 |
+
case 'S':
|
| 145 |
+
config.building_memory = std::min(static_cast<uint64_t>(std::numeric_limits<std::size_t>::max()), util::ParseSize(optarg));
|
| 146 |
+
break;
|
| 147 |
+
case 'w':
|
| 148 |
+
set_write_method = true;
|
| 149 |
+
if (!strcmp(optarg, "mmap")) {
|
| 150 |
+
config.write_method = Config::WRITE_MMAP;
|
| 151 |
+
} else if (!strcmp(optarg, "after")) {
|
| 152 |
+
config.write_method = Config::WRITE_AFTER;
|
| 153 |
+
} else {
|
| 154 |
+
Usage(argv[0], default_mem);
|
| 155 |
+
}
|
| 156 |
+
break;
|
| 157 |
+
case 's':
|
| 158 |
+
config.sentence_marker_missing = lm::SILENT;
|
| 159 |
+
break;
|
| 160 |
+
case 'i':
|
| 161 |
+
config.positive_log_probability = lm::SILENT;
|
| 162 |
+
break;
|
| 163 |
+
case 'r':
|
| 164 |
+
rest = true;
|
| 165 |
+
ParseFileList(optarg, config.rest_lower_files);
|
| 166 |
+
config.rest_function = Config::REST_LOWER;
|
| 167 |
+
break;
|
| 168 |
+
case 'h': // help
|
| 169 |
+
default:
|
| 170 |
+
Usage(argv[0], default_mem);
|
| 171 |
+
}
|
| 172 |
+
}
|
| 173 |
+
if (!quantize && set_backoff_bits) {
|
| 174 |
+
std::cerr << "You specified backoff quantization (-b) but not probability quantization (-q)" << std::endl;
|
| 175 |
+
abort();
|
| 176 |
+
}
|
| 177 |
+
if (optind + 1 == argc) {
|
| 178 |
+
ShowSizes(argv[optind], config);
|
| 179 |
+
return 0;
|
| 180 |
+
}
|
| 181 |
+
const char *model_type;
|
| 182 |
+
const char *from_file;
|
| 183 |
+
|
| 184 |
+
if (optind + 2 == argc) {
|
| 185 |
+
model_type = "probing";
|
| 186 |
+
from_file = argv[optind];
|
| 187 |
+
config.write_mmap = argv[optind + 1];
|
| 188 |
+
} else if (optind + 3 == argc) {
|
| 189 |
+
model_type = argv[optind];
|
| 190 |
+
from_file = argv[optind + 1];
|
| 191 |
+
config.write_mmap = argv[optind + 2];
|
| 192 |
+
} else {
|
| 193 |
+
Usage(argv[0], default_mem);
|
| 194 |
+
return 1;
|
| 195 |
+
}
|
| 196 |
+
if (!strcmp(model_type, "probing")) {
|
| 197 |
+
if (!set_write_method) config.write_method = Config::WRITE_AFTER;
|
| 198 |
+
if (quantize || set_backoff_bits) ProbingQuantizationUnsupported();
|
| 199 |
+
if (rest) {
|
| 200 |
+
RestProbingModel(from_file, config);
|
| 201 |
+
} else {
|
| 202 |
+
ProbingModel(from_file, config);
|
| 203 |
+
}
|
| 204 |
+
} else if (!strcmp(model_type, "trie")) {
|
| 205 |
+
if (rest) {
|
| 206 |
+
std::cerr << "Rest + trie is not supported yet." << std::endl;
|
| 207 |
+
return 1;
|
| 208 |
+
}
|
| 209 |
+
if (!set_write_method) config.write_method = Config::WRITE_MMAP;
|
| 210 |
+
if (quantize) {
|
| 211 |
+
if (bhiksha) {
|
| 212 |
+
QuantArrayTrieModel(from_file, config);
|
| 213 |
+
} else {
|
| 214 |
+
QuantTrieModel(from_file, config);
|
| 215 |
+
}
|
| 216 |
+
} else {
|
| 217 |
+
if (bhiksha) {
|
| 218 |
+
ArrayTrieModel(from_file, config);
|
| 219 |
+
} else {
|
| 220 |
+
TrieModel(from_file, config);
|
| 221 |
+
}
|
| 222 |
+
}
|
| 223 |
+
} else {
|
| 224 |
+
Usage(argv[0], default_mem);
|
| 225 |
+
}
|
| 226 |
+
}
|
| 227 |
+
catch (const std::exception &e) {
|
| 228 |
+
std::cerr << e.what() << std::endl;
|
| 229 |
+
std::cerr << "ERROR" << std::endl;
|
| 230 |
+
return 1;
|
| 231 |
+
}
|
| 232 |
+
std::cerr << "SUCCESS" << std::endl;
|
| 233 |
+
return 0;
|
| 234 |
+
}
|
mosesdecoder/lm/builder/CMakeLists.txt
ADDED
|
@@ -0,0 +1,67 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
cmake_minimum_required(VERSION 2.8.8)

# The KenLM CMake files rely on object libraries, i.e.
#   add_library(<name> OBJECT <sources>...)
# which group source files per subdirectory into "fake" libraries whose
# object files are later referenced as $<TARGET_OBJECTS:<name>>.
# That syntax was only added in CMake 2.8.8, hence the minimum version above.
#
# see http://www.cmake.org/Wiki/CMake/Tutorials/Object_Library
#
# This CMake file was created by Lane Schwartz <dowobeha@gmail.com>

# Source files of this subdirectory that belong to the kenlm library
# (unit test files are deliberately not listed).  Add new library sources
# to the list below.
#
# Each entry is stored with a ${CMAKE_CURRENT_SOURCE_DIR} prefix so the paths
# stay valid if the variable is referenced from a parent directory's CMake
# files.
set(KENLM_BUILDER_SOURCE "")
foreach(builder_source_name
    adjust_counts.cc
    corpus_count.cc
    initial_probabilities.cc
    interpolate.cc
    output.cc
    pipeline.cc)
  list(APPEND KENLM_BUILDER_SOURCE
       "${CMAKE_CURRENT_SOURCE_DIR}/${builder_source_name}")
endforeach()

# Compile the sources once as an object library; other targets pull the
# objects in through $<TARGET_OBJECTS:kenlm_builder>.
add_library(kenlm_builder OBJECT ${KENLM_BUILDER_SOURCE})

# The lmplz executable: its own main plus the object files it depends on.
add_executable(lmplz
  lmplz_main.cc
  $<TARGET_OBJECTS:kenlm>
  $<TARGET_OBJECTS:kenlm_common>
  $<TARGET_OBJECTS:kenlm_builder>
  $<TARGET_OBJECTS:kenlm_util>)

# Link the executable against Boost and pthread.
target_link_libraries(lmplz ${Boost_LIBRARIES} pthread)

# Show lmplz in the "executables" folder of IDE generators.
set_property(TARGET lmplz PROPERTY FOLDER executables)

if(BUILD_TESTING)
  # Boost unit tests of this subdirectory, listed explicitly.
  set(KENLM_BOOST_TESTS_LIST
    adjust_counts_test
    corpus_count_test)

  # AddTests is provided elsewhere in the build.
  AddTests(
    TESTS ${KENLM_BOOST_TESTS_LIST}
    DEPENDS
      $<TARGET_OBJECTS:kenlm>
      $<TARGET_OBJECTS:kenlm_common>
      $<TARGET_OBJECTS:kenlm_util>
      $<TARGET_OBJECTS:kenlm_builder>
    LIBRARIES ${Boost_LIBRARIES} pthread)
endif()
|
mosesdecoder/lm/builder/Jamfile
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Library target "builder": every .cc file in this directory except unit
# tests (*test.cc) and program entry points (*main.cc), together with the
# util, stream, double-conversion, kenlm and common targets it builds on.
# NOTE(review): fakelib is a rule defined elsewhere in the build.
fakelib builder
  : [ glob *.cc : *test.cc *main.cc ]
    ../../util//kenutil
    ../../util/stream//stream
    ../../util/double-conversion//double-conversion
    ..//kenlm
    ../common//common
  :
  :
  : <library>/top//boost_thread $(timer-link)
  ;

# Command-line programs built from this directory.
exe lmplz
  : lmplz_main.cc builder /top//boost_program_options
  ;

exe dump_counts
  : dump_counts_main.cc builder
  ;

alias programs : lmplz dump_counts ;

# Unit tests, each linked against builder and the Boost unit test framework.
import testing ;

unit-test corpus_count_test
  : corpus_count_test.cc builder /top//boost_unit_test_framework
  ;

unit-test adjust_counts_test
  : adjust_counts_test.cc builder /top//boost_unit_test_framework
  ;
|