Spaces:

suricodes
/

hindi-sindhi-translator

Paused

App Files Files Community

suricodes committed on Oct 18, 2024

Commit

0a39881

verified ·

1 Parent(s): ac1e89a

Upload 172 files

Browse files

This view is limited to 50 files because it contains too many changes. See raw diff

Files changed (50) hide show

.gitattributes +11 -0
mosesdecoder/lm/CMakeLists.txt +90 -0
mosesdecoder/lm/Jamfile +40 -0
mosesdecoder/lm/bhiksha.cc +94 -0
mosesdecoder/lm/bhiksha.hh +122 -0
mosesdecoder/lm/bin/gcc-9/release/link-static/threading-multi/bhiksha.o +0 -0
mosesdecoder/lm/bin/gcc-9/release/link-static/threading-multi/binary_format.o +0 -0
mosesdecoder/lm/bin/gcc-9/release/link-static/threading-multi/build_binary +3 -0
mosesdecoder/lm/bin/gcc-9/release/link-static/threading-multi/build_binary_main.o +0 -0
mosesdecoder/lm/bin/gcc-9/release/link-static/threading-multi/config.o +0 -0
mosesdecoder/lm/bin/gcc-9/release/link-static/threading-multi/fragment +3 -0
mosesdecoder/lm/bin/gcc-9/release/link-static/threading-multi/fragment_main.o +0 -0
mosesdecoder/lm/bin/gcc-9/release/link-static/threading-multi/kenlm_benchmark +3 -0
mosesdecoder/lm/bin/gcc-9/release/link-static/threading-multi/kenlm_benchmark_main.o +0 -0
mosesdecoder/lm/bin/gcc-9/release/link-static/threading-multi/lm_exception.o +0 -0
mosesdecoder/lm/bin/gcc-9/release/link-static/threading-multi/model.o +0 -0
mosesdecoder/lm/bin/gcc-9/release/link-static/threading-multi/quantize.o +0 -0
mosesdecoder/lm/bin/gcc-9/release/link-static/threading-multi/query +3 -0
mosesdecoder/lm/bin/gcc-9/release/link-static/threading-multi/query_main.o +0 -0
mosesdecoder/lm/bin/gcc-9/release/link-static/threading-multi/read_arpa.o +0 -0
mosesdecoder/lm/bin/gcc-9/release/link-static/threading-multi/search_hashed.o +0 -0
mosesdecoder/lm/bin/gcc-9/release/link-static/threading-multi/search_trie.o +0 -0
mosesdecoder/lm/bin/gcc-9/release/link-static/threading-multi/sizes.o +0 -0
mosesdecoder/lm/bin/gcc-9/release/link-static/threading-multi/trie.o +0 -0
mosesdecoder/lm/bin/gcc-9/release/link-static/threading-multi/trie_sort.o +0 -0
mosesdecoder/lm/bin/gcc-9/release/link-static/threading-multi/value_build.o +0 -0
mosesdecoder/lm/bin/gcc-9/release/link-static/threading-multi/virtual_interface.o +0 -0
mosesdecoder/lm/bin/gcc-9/release/link-static/threading-multi/vocab.o +0 -0
mosesdecoder/lm/bin/left_test.test/gcc-9/release/link-static/threading-multi/left_test +3 -0
mosesdecoder/lm/bin/left_test.test/gcc-9/release/link-static/threading-multi/left_test.o +0 -0
mosesdecoder/lm/bin/left_test.test/gcc-9/release/link-static/threading-multi/left_test.output +8 -0
mosesdecoder/lm/bin/left_test.test/gcc-9/release/link-static/threading-multi/left_test.run +8 -0
mosesdecoder/lm/bin/left_test.test/gcc-9/release/link-static/threading-multi/left_test.test +1 -0
mosesdecoder/lm/bin/model_test.test/gcc-9/release/link-static/threading-multi/model_test +3 -0
mosesdecoder/lm/bin/model_test.test/gcc-9/release/link-static/threading-multi/model_test.o +3 -0
mosesdecoder/lm/bin/model_test.test/gcc-9/release/link-static/threading-multi/model_test.output +11 -0
mosesdecoder/lm/bin/model_test.test/gcc-9/release/link-static/threading-multi/model_test.run +11 -0
mosesdecoder/lm/bin/model_test.test/gcc-9/release/link-static/threading-multi/model_test.test +1 -0
mosesdecoder/lm/bin/order.log +1 -0
mosesdecoder/lm/bin/partial_test.test/gcc-9/release/link-static/threading-multi/partial_test +3 -0
mosesdecoder/lm/bin/partial_test.test/gcc-9/release/link-static/threading-multi/partial_test.o +0 -0
mosesdecoder/lm/bin/partial_test.test/gcc-9/release/link-static/threading-multi/partial_test.output +8 -0
mosesdecoder/lm/bin/partial_test.test/gcc-9/release/link-static/threading-multi/partial_test.run +8 -0
mosesdecoder/lm/bin/partial_test.test/gcc-9/release/link-static/threading-multi/partial_test.test +1 -0
mosesdecoder/lm/binary_format.cc +302 -0
mosesdecoder/lm/binary_format.hh +106 -0
mosesdecoder/lm/blank.hh +42 -0
mosesdecoder/lm/build_binary_main.cc +234 -0
mosesdecoder/lm/builder/CMakeLists.txt +67 -0
mosesdecoder/lm/builder/Jamfile +13 -0

.gitattributes CHANGED Viewed

@@ -49,3 +49,14 @@ mosesdecoder/lib/libmoses2_lib.a filter=lfs diff=lfs merge=lfs -text
 mosesdecoder/lib/libmoses2decoder.a filter=lfs diff=lfs merge=lfs -text
 mosesdecoder/lib/libprobingpt.a filter=lfs diff=lfs merge=lfs -text
 mosesdecoder/lib/libsyntax_common.a filter=lfs diff=lfs merge=lfs -text

 mosesdecoder/lib/libmoses2decoder.a filter=lfs diff=lfs merge=lfs -text
 mosesdecoder/lib/libprobingpt.a filter=lfs diff=lfs merge=lfs -text
 mosesdecoder/lib/libsyntax_common.a filter=lfs diff=lfs merge=lfs -text
+mosesdecoder/lm/bin/gcc-9/release/link-static/threading-multi/build_binary filter=lfs diff=lfs merge=lfs -text
+mosesdecoder/lm/bin/gcc-9/release/link-static/threading-multi/fragment filter=lfs diff=lfs merge=lfs -text
+mosesdecoder/lm/bin/gcc-9/release/link-static/threading-multi/kenlm_benchmark filter=lfs diff=lfs merge=lfs -text
+mosesdecoder/lm/bin/gcc-9/release/link-static/threading-multi/query filter=lfs diff=lfs merge=lfs -text
+mosesdecoder/lm/bin/left_test.test/gcc-9/release/link-static/threading-multi/left_test filter=lfs diff=lfs merge=lfs -text
+mosesdecoder/lm/bin/model_test.test/gcc-9/release/link-static/threading-multi/model_test filter=lfs diff=lfs merge=lfs -text
+mosesdecoder/lm/bin/model_test.test/gcc-9/release/link-static/threading-multi/model_test.o filter=lfs diff=lfs merge=lfs -text
+mosesdecoder/lm/bin/partial_test.test/gcc-9/release/link-static/threading-multi/partial_test filter=lfs diff=lfs merge=lfs -text
+mosesdecoder/lm/builder/bin/gcc-9/release/link-static/threading-multi/dump_counts filter=lfs diff=lfs merge=lfs -text
+mosesdecoder/lm/builder/bin/gcc-9/release/link-static/threading-multi/lmplz filter=lfs diff=lfs merge=lfs -text
+mosesdecoder/lm/filter/bin/gcc-9/release/link-static/threading-multi/filter filter=lfs diff=lfs merge=lfs -text

mosesdecoder/lm/CMakeLists.txt ADDED Viewed

	@@ -0,0 +1,90 @@

+cmake_minimum_required(VERSION 2.8.8)
+#
+# The KenLM cmake files make use of add_library(... OBJECT ...)
+#
+# This syntax allows grouping of source files when compiling
+# (effectively creating "fake" libraries based on source subdirs).
+#
+# This syntax was only added in cmake version 2.8.8
+#
+# see http://www.cmake.org/Wiki/CMake/Tutorials/Object_Library
+# This CMake file was created by Lane Schwartz <dowobeha@gmail.com>
+set(KENLM_MAX_ORDER 6 CACHE STRING "Maximum supported ngram order")  # cache entry: override at configure time with -DKENLM_MAX_ORDER=<n>
+add_definitions(-DKENLM_MAX_ORDER=${KENLM_MAX_ORDER})  # exposes the chosen order to the C++ sources as a preprocessor macro
+# Explicitly list the source files for this subdirectory
+#
+# If you add any source files to this subdirectory
+#    that should be included in the kenlm library,
+#        (this excludes any unit test files)
+#    you should add them to the following list:
+set(KENLM_SOURCE
+	bhiksha.cc
+	binary_format.cc
+	config.cc
+	lm_exception.cc
+	model.cc
+	quantize.cc
+	read_arpa.cc
+	search_hashed.cc
+	search_trie.cc
+	sizes.cc
+	trie.cc
+	trie_sort.cc
+	value_build.cc
+	virtual_interface.cc
+	vocab.cc
+)
+# Group these objects together for later use: the sources are compiled once
+# and the resulting objects are reused by the executables and tests below.
+#
+# Given add_library(foo OBJECT ${my_foo_sources}),
+# refer to these objects as $<TARGET_OBJECTS:foo>
+#
+add_library(kenlm OBJECT ${KENLM_SOURCE})
+# This directory has children that need to be processed
+add_subdirectory(builder)
+add_subdirectory(common)
+add_subdirectory(filter)
+# Explicitly list the executable files to be compiled.
+# NOTE(review): AddExes and the kenlm_util object library are not defined in this file; presumably they come from a parent CMakeLists.txt - confirm.
+set(EXE_LIST
+  query
+  fragment
+  build_binary
+)
+AddExes(EXES ${EXE_LIST}
+        DEPENDS $<TARGET_OBJECTS:kenlm> $<TARGET_OBJECTS:kenlm_util>
+        LIBRARIES ${Boost_LIBRARIES} pthread)
+# Conditionally build the interpolation code (only when BUILD_INTERPOLATE is true)
+if(BUILD_INTERPOLATE)
+  add_subdirectory(interpolate)
+endif()
+if(BUILD_TESTING)  # tests are registered only when BUILD_TESTING is true
+  set(KENLM_BOOST_TESTS_LIST left_test partial_test)  # these two tests take a single argument: test.arpa
+  AddTests(TESTS ${KENLM_BOOST_TESTS_LIST}
+           DEPENDS $<TARGET_OBJECTS:kenlm> $<TARGET_OBJECTS:kenlm_util>
+           LIBRARIES ${Boost_LIBRARIES} pthread
+           TEST_ARGS ${CMAKE_CURRENT_SOURCE_DIR}/test.arpa)
+  # model_test requires an extra command line parameter (test_nounk.arpa), so it is registered separately
+  KenLMAddTest(TEST model_test
+               DEPENDS $<TARGET_OBJECTS:kenlm> $<TARGET_OBJECTS:kenlm_util>
+               LIBRARIES ${Boost_LIBRARIES} pthread
+               TEST_ARGS ${CMAKE_CURRENT_SOURCE_DIR}/test.arpa
+                         ${CMAKE_CURRENT_SOURCE_DIR}/test_nounk.arpa)
+endif()

mosesdecoder/lm/Jamfile ADDED Viewed

	@@ -0,0 +1,40 @@

+# If you need higher order, change this option
+# Having this limit means that State can be
+# (KENLM_MAX_ORDER - 1) * sizeof(float) bytes instead of
+# sizeof(float*) + (KENLM_MAX_ORDER - 1) * sizeof(float) + malloc overhead
+max-order = [ option.get "max-kenlm-order" : 6 : 6 ] ;  # value of the max-kenlm-order build option, 6 by default
+if ( $(max-order) != 6 ) {  # only announce non-default orders
+   echo "Setting KenLM maximum n-gram order to $(max-order)" ;
+}
+max-order = <define>KENLM_MAX_ORDER=$(max-order) ;  # from here on max-order holds the <define> property, not the bare number
+path-constant ORDER-LOG : bin/order.log ;
+update-if-changed $(ORDER-LOG) $(max-order) ;  # NOTE(review): update-if-changed is defined outside this file; presumably it rewrites the log only when the define changes - confirm
+max-order += <dependency>$(ORDER-LOG) ;  # anything built with $(max-order) also depends on the log file
+wrappers = ;
+local with-nplm = [ option.get "with-nplm" ] ;
+if $(with-nplm) {  # optional NPLM wrapper, built only when the with-nplm option is set
+  lib nplm : : <search>$(with-nplm)/src ;
+  obj nplm.o : wrappers/nplm.cc : <include>.. <include>$(with-nplm)/src <cxxflags>-fopenmp <include>$(with-nplm)/3rdparty/eigen <define>NPLM_DOUBLE_PRECISION=0 ;
+  alias nplm-all : nplm.o nplm ..//boost_thread : : : <cxxflags>-fopenmp <linkflags>-fopenmp <define>WITH_NPLM <library>..//boost_thread ;
+  wrappers += nplm-all ;
+}
+fakelib kenlm : $(wrappers) [ glob *.cc : *main.cc *test.cc ] ../util//kenutil : <include>.. $(max-order) : : <include>.. $(max-order) ;  # sources: every *.cc here except *main.cc and *test.cc
+import testing ;
+run left_test.cc kenlm /top//boost_unit_test_framework : : test.arpa ;
+run model_test.cc kenlm /top//boost_unit_test_framework : : test.arpa test_nounk.arpa ;
+run partial_test.cc kenlm /top//boost_unit_test_framework : : test.arpa ;
+exes = ;
+for local p in [ glob *_main.cc ] {  # one executable per *_main.cc, named after the part before _main.cc
+  local name = [ MATCH "(.*)\_main.cc" : $(p) ] ;
+  exe $(name) : $(p) kenlm ;
+  exes += $(name) ;
+}
+alias programs : $(exes) filter//filter filter//phrase_table_vocab builder//dump_counts : <threading>multi:<source>builder//lmplz ;  # lmplz is added as a source only for <threading>multi builds

mosesdecoder/lm/bhiksha.cc ADDED Viewed

	@@ -0,0 +1,94 @@

+#include "lm/bhiksha.hh"
+#include "lm/binary_format.hh"
+#include "lm/config.hh"
+#include "util/file.hh"
+#include "util/exception.hh"
+#include <limits>
+namespace lm {
+namespace ngram {
+namespace trie {
+DontBhiksha::DontBhiksha(const void * /*base*/, uint64_t /*max_offset*/, uint64_t max_next, const Config &/*config*/) :
+  next_(util::BitsMask::ByMax(max_next)) {}  // Only max_next is used; BitsMask::ByMax presumably derives the bit width and mask able to hold it (defined in util/bit_packing.hh - confirm).
+const uint8_t kArrayBhikshaVersion = 0;
+// TODO: put this in binary file header instead when I change the binary file format again.  For now: read the two-byte array header at offset, check its version, and copy the stored bit count into config.
+void ArrayBhiksha::UpdateConfigFromBinary(const BinaryFormat &file, uint64_t offset, Config &config) {
+  uint8_t buffer[2];  // buffer[0] = array compression version, buffer[1] = pointer_bhiksha_bits (same layout FinishedLoading writes)
+  file.ReadForConfig(buffer, 2, offset);
+  uint8_t version = buffer[0];
+  uint8_t configured_bits = buffer[1];
+  if (version != kArrayBhikshaVersion) UTIL_THROW(FormatLoadException, "This file has sorted array compression version " << (unsigned) version << " but the code expects version " << (unsigned)kArrayBhikshaVersion);
+  config.pointer_bhiksha_bits = configured_bits;  // config is only updated when the version check above did not throw
+}
+namespace {
+// Find argmin_{chopped \in [0, RequiredBits(max_next)]} ChoppedDelta(max_offset).  The search is additionally capped at config.pointer_bhiksha_bits (see the loop bound).
+uint8_t ChopBits(uint64_t max_offset, uint64_t max_next, const Config &config) {
+  uint8_t required = util::RequiredBits(max_next);
+  uint8_t best_chop = 0;
+  int64_t lowest_change = std::numeric_limits<int64_t>::max();
+  // There are probably faster ways but I don't care because this is only done once per order at construction time.
+  for (uint8_t chop = 0; chop <= std::min(required, config.pointer_bhiksha_bits); ++chop) {
+    int64_t change = (max_next >> (required - chop)) * 64 /* table cost in bits */
+      - max_offset * static_cast<int64_t>(chop); /* savings in bits*/
+    if (change < lowest_change) {  // strict <, so ties keep the smaller chop
+      lowest_change = change;
+      best_chop = chop;
+    }
+  }
+  return best_chop;
+}
+std::size_t ArrayCount(uint64_t max_offset, uint64_t max_next, const Config &config) {  // Number of uint64_t entries in the offset table for the chop chosen by ChopBits.
+  uint8_t required = util::RequiredBits(max_next);
+  uint8_t chopping = ChopBits(max_offset, max_next, config);
+  return (max_next >> (required - chopping)) + 1 /* we store 0 too */;
+}
+} // namespace
+uint64_t ArrayBhiksha::Size(uint64_t max_offset, uint64_t max_next, const Config &config) {  // Bytes to reserve: one uint64_t header plus the offset table, plus 7 bytes of slack so the start can be aligned.
+  return sizeof(uint64_t) * (1 /* header */ + ArrayCount(max_offset, max_next, config)) + 7 /* 8-byte alignment */;
+}
+uint8_t ArrayBhiksha::InlineBits(uint64_t max_offset, uint64_t max_next, const Config &config) {  // Bits still stored inline per pointer: required bits minus the bits chopped into the table.
+  return util::RequiredBits(max_next) - ChopBits(max_offset, max_next, config);
+}
+namespace {
+void *AlignTo8(void *from) {  // Rounds from up to the next address that is a multiple of 8.
+  uint8_t *val = reinterpret_cast<uint8_t*>(from);
+  std::size_t remainder = reinterpret_cast<std::size_t>(val) & 7;  // low three address bits = distance past the previous 8-byte boundary
+  if (!remainder) return val;  // already aligned: returned unchanged
+  return val + 8 - remainder;
+}
+} // namespace
+ArrayBhiksha::ArrayBhiksha(void *base, uint64_t max_offset, uint64_t max_next, const Config &config)
+  : next_inline_(util::BitsMask::ByBits(InlineBits(max_offset, max_next, config))),
+    offset_begin_(reinterpret_cast<const uint64_t*>(AlignTo8(base)) + 1 /* 8-byte header */),
+    offset_end_(offset_begin_ + ArrayCount(max_offset, max_next, config)),
+    write_to_(reinterpret_cast<uint64_t*>(AlignTo8(base)) + 1 /* 8-byte header */ + 1 /* first entry is 0 */),
+    original_base_(base) {}  // the unaligned base is kept because FinishedLoading writes the version and bit-count bytes there
+void ArrayBhiksha::FinishedLoading(const Config &config) {  // Finalizes the table: zeroes the first entry, checks every entry was written, then stores the two header bytes.
+  // *offset_begin_ = 0 but without a const_cast.
+  *(write_to_ - (write_to_ - offset_begin_)) = 0;
+  if (write_to_ != offset_end_) UTIL_THROW(util::Exception, "Did not get all the array entries that were expected.");
+  uint8_t *head_write = reinterpret_cast<uint8_t*>(original_base_);
+  *(head_write++) = kArrayBhikshaVersion;  // byte 0: version, read back by UpdateConfigFromBinary
+  *(head_write++) = config.pointer_bhiksha_bits;  // byte 1: configured pointer bits
+}
+} // namespace trie
+} // namespace ngram
+} // namespace lm

mosesdecoder/lm/bhiksha.hh ADDED Viewed

	@@ -0,0 +1,122 @@

+/* Simple implementation of
+ * @inproceedings{bhikshacompression,
+ *  author={Bhiksha Raj and Ed Whittaker},
+ *  year={2003},
+ *  title={Lossless Compression of Language Model Structure and Word Identifiers},
+ *  booktitle={Proceedings of IEEE International Conference on Acoustics, Speech and Signal Processing},
+ *  pages={388--391},
+ *  }
+ *
+ *  Currently only used for next pointers.
+ */
+#ifndef LM_BHIKSHA_H
+#define LM_BHIKSHA_H
+#include "lm/model_type.hh"
+#include "lm/trie.hh"
+#include "util/bit_packing.hh"
+#include "util/sorted_uniform.hh"
+#include <algorithm>
+#include <stdint.h>
+#include <cassert>
+namespace lm {
+namespace ngram {
+struct Config;
+class BinaryFormat;
+namespace trie {
+class DontBhiksha {  // Pass-through policy: next pointers are stored inline at full width and no side table is used (Size() returns 0).
+  public:
+    static const ModelType kModelTypeAdd = static_cast<ModelType>(0);
+    static void UpdateConfigFromBinary(const BinaryFormat &, uint64_t, Config &/*config*/) {}  // nothing is stored in the file for this policy
+    static uint64_t Size(uint64_t /*max_offset*/, uint64_t /*max_next*/, const Config &/*config*/) { return 0; }
+    static uint8_t InlineBits(uint64_t /*max_offset*/, uint64_t max_next, const Config &/*config*/) {
+      return util::RequiredBits(max_next);
+    }
+    DontBhiksha(const void *base, uint64_t max_offset, uint64_t max_next, const Config &config);
+    void ReadNext(const void *base, uint64_t bit_offset, uint64_t /*index*/, uint8_t total_bits, NodeRange &out) const {
+      out.begin = util::ReadInt57(base, bit_offset, next_.bits, next_.mask);
+      out.end = util::ReadInt57(base, bit_offset + total_bits, next_.bits, next_.mask);  // end is read from the same field one record (total_bits) later
+      //assert(out.end >= out.begin);
+    }
+    void WriteNext(void *base, uint64_t bit_offset, uint64_t /*index*/, uint64_t value) {
+      util::WriteInt57(base, bit_offset, next_.bits, value);
+    }
+    void FinishedLoading(const Config &/*config*/) {}
+    uint8_t InlineBits() const { return next_.bits; }
+  private:
+    util::BitsMask next_;
+};
+class ArrayBhiksha {  // Array-compressed policy: the low next_inline_.bits of each pointer are stored inline, the high bits are recovered from a sorted offset table.
+  public:
+    static const ModelType kModelTypeAdd = kArrayAdd;
+    static void UpdateConfigFromBinary(const BinaryFormat &file, uint64_t offset, Config &config);
+    static uint64_t Size(uint64_t max_offset, uint64_t max_next, const Config &config);
+    static uint8_t InlineBits(uint64_t max_offset, uint64_t max_next, const Config &config);
+    ArrayBhiksha(void *base, uint64_t max_offset, uint64_t max_value, const Config &config);
+    void ReadNext(const void *base, uint64_t bit_offset, uint64_t index, uint8_t total_bits, NodeRange &out) const {
+      // Some assertions are commented out because they are expensive.
+      // assert(*offset_begin_ == 0);
+      // std::upper_bound returns the first element that is greater.  Want the
+      // last element that is <= to the index.
+      const uint64_t *begin_it = std::upper_bound(offset_begin_, offset_end_, index) - 1;
+      // Since *offset_begin_ == 0, the position should be in range.
+      // assert(begin_it >= offset_begin_);
+      const uint64_t *end_it;
+      for (end_it = begin_it + 1; (end_it < offset_end_) && (*end_it <= index + 1); ++end_it) {}  // linear scan forward from begin_it instead of a second binary search
+      // assert(end_it == std::upper_bound(offset_begin_, offset_end_, index + 1));
+      --end_it;
+      // assert(end_it >= begin_it);
+      out.begin = ((begin_it - offset_begin_) << next_inline_.bits) |
+        util::ReadInt57(base, bit_offset, next_inline_.bits, next_inline_.mask);  // table position supplies the high bits, the inline field the low bits
+      out.end = ((end_it - offset_begin_) << next_inline_.bits) |
+        util::ReadInt57(base, bit_offset + total_bits, next_inline_.bits, next_inline_.mask);
+      // If this fails, consider rebuilding your model using KenLM after 1e333d786b748555e8f368d2bbba29a016c98052
+      assert(out.end >= out.begin);
+    }
+    void WriteNext(void *base, uint64_t bit_offset, uint64_t index, uint64_t value) {
+      uint64_t encode = value >> next_inline_.bits;  // high bits of value: the table slot this pointer belongs to
+      for (; write_to_ <= offset_begin_ + encode; ++write_to_) *write_to_ = index;  // fill every not-yet-written slot up to encode with the current index
+      util::WriteInt57(base, bit_offset, next_inline_.bits, value & next_inline_.mask);
+    }
+    void FinishedLoading(const Config &config);
+    uint8_t InlineBits() const { return next_inline_.bits; }
+  private:
+    const util::BitsMask next_inline_;
+    const uint64_t *const offset_begin_;
+    const uint64_t *const offset_end_;
+    uint64_t *write_to_;  // next table slot to fill; advanced by WriteNext and checked against offset_end_ in FinishedLoading
+    void *original_base_;  // unaligned base passed to the constructor
+};
+} // namespace trie
+} // namespace ngram
+} // namespace lm
+#endif // LM_BHIKSHA_H

mosesdecoder/lm/bin/gcc-9/release/link-static/threading-multi/bhiksha.o ADDED Viewed

Binary file (24.4 kB). View file

mosesdecoder/lm/bin/gcc-9/release/link-static/threading-multi/binary_format.o ADDED Viewed

Binary file (87 kB). View file

mosesdecoder/lm/bin/gcc-9/release/link-static/threading-multi/build_binary ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:89bb1a5052a26025dee0f23bf7492c60881ae5b02ceb378b78905a1e166926cc
+size 1367920

mosesdecoder/lm/bin/gcc-9/release/link-static/threading-multi/build_binary_main.o ADDED Viewed

Binary file (127 kB). View file

mosesdecoder/lm/bin/gcc-9/release/link-static/threading-multi/config.o ADDED Viewed

Binary file (2.63 kB). View file

mosesdecoder/lm/bin/gcc-9/release/link-static/threading-multi/fragment ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:2e3bb1750b2c843ff2ad2b81b01b70f7e52c34abd8ce296575a19e10f9769b31
+size 1367912

mosesdecoder/lm/bin/gcc-9/release/link-static/threading-multi/fragment_main.o ADDED Viewed

Binary file (55 kB). View file

mosesdecoder/lm/bin/gcc-9/release/link-static/threading-multi/kenlm_benchmark ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:b00c5cb3fc290d10f4dd59c4e3c3472199ebe32cb9dcd25963e07a5e3227af89
+size 1412248

mosesdecoder/lm/bin/gcc-9/release/link-static/threading-multi/kenlm_benchmark_main.o ADDED Viewed

Binary file (211 kB). View file

mosesdecoder/lm/bin/gcc-9/release/link-static/threading-multi/lm_exception.o ADDED Viewed

Binary file (11.7 kB). View file

mosesdecoder/lm/bin/gcc-9/release/link-static/threading-multi/model.o ADDED Viewed

Binary file (297 kB). View file

mosesdecoder/lm/bin/gcc-9/release/link-static/threading-multi/quantize.o ADDED Viewed

Binary file (42.2 kB). View file

mosesdecoder/lm/bin/gcc-9/release/link-static/threading-multi/query ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:dee549913c814cfa80d753c6dd7cb494099bd5c7da5cda664eedd73f7acc8f72
+size 1388928

mosesdecoder/lm/bin/gcc-9/release/link-static/threading-multi/query_main.o ADDED Viewed

Binary file (167 kB). View file

mosesdecoder/lm/bin/gcc-9/release/link-static/threading-multi/read_arpa.o ADDED Viewed

Binary file (105 kB). View file

mosesdecoder/lm/bin/gcc-9/release/link-static/threading-multi/search_hashed.o ADDED Viewed

Binary file (169 kB). View file

mosesdecoder/lm/bin/gcc-9/release/link-static/threading-multi/search_trie.o ADDED Viewed

Binary file (196 kB). View file

mosesdecoder/lm/bin/gcc-9/release/link-static/threading-multi/sizes.o ADDED Viewed

Binary file (10.8 kB). View file

mosesdecoder/lm/bin/gcc-9/release/link-static/threading-multi/trie.o ADDED Viewed

Binary file (35.5 kB). View file

mosesdecoder/lm/bin/gcc-9/release/link-static/threading-multi/trie_sort.o ADDED Viewed

Binary file (118 kB). View file

mosesdecoder/lm/bin/gcc-9/release/link-static/threading-multi/value_build.o ADDED Viewed

Binary file (81.7 kB). View file

mosesdecoder/lm/bin/gcc-9/release/link-static/threading-multi/virtual_interface.o ADDED Viewed

Binary file (5.9 kB). View file

mosesdecoder/lm/bin/gcc-9/release/link-static/threading-multi/vocab.o ADDED Viewed

Binary file (124 kB). View file

mosesdecoder/lm/bin/left_test.test/gcc-9/release/link-static/threading-multi/left_test ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:c226dcb9b7f76bb74bb4114ca69a955381ad9790c361f5ef642b772bb9b0b434
+size 2458688

mosesdecoder/lm/bin/left_test.test/gcc-9/release/link-static/threading-multi/left_test.o ADDED Viewed

Binary file (698 kB). View file

mosesdecoder/lm/bin/left_test.test/gcc-9/release/link-static/threading-multi/left_test.output ADDED Viewed

	@@ -0,0 +1,8 @@

+Boost.Test WARNING: token "lm/test.arpa" does not correspond to the Boost.Test argument
+                    and should be placed after all Boost.Test arguments and the -- separator.
+                    For example: left_test --random -- lm/test.arpa
+Running 6 test cases...
+*** No errors detected
+EXIT STATUS: 0

mosesdecoder/lm/bin/left_test.test/gcc-9/release/link-static/threading-multi/left_test.run ADDED Viewed

	@@ -0,0 +1,8 @@

+Boost.Test WARNING: token "lm/test.arpa" does not correspond to the Boost.Test argument
+                    and should be placed after all Boost.Test arguments and the -- separator.
+                    For example: left_test --random -- lm/test.arpa
+Running 6 test cases...
+*** No errors detected
+EXIT STATUS: 0

mosesdecoder/lm/bin/left_test.test/gcc-9/release/link-static/threading-multi/left_test.test ADDED Viewed

	@@ -0,0 +1 @@


1	+ passed

mosesdecoder/lm/bin/model_test.test/gcc-9/release/link-static/threading-multi/model_test ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:c5f8ac4d5e3d965ef934f3075a93f9c59093e1cb6ee44ef27d71542a43cbb50f
+size 2890976

mosesdecoder/lm/bin/model_test.test/gcc-9/release/link-static/threading-multi/model_test.o ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:db0276d839aa0274ab47aed2de1e3d1089c83eedfbb92ac58e9b17b6096940e1
+size 1513008

mosesdecoder/lm/bin/model_test.test/gcc-9/release/link-static/threading-multi/model_test.output ADDED Viewed

	@@ -0,0 +1,11 @@

+Boost.Test WARNING: token "lm/test.arpa" does not correspond to the Boost.Test argument
+                    and should be placed after all Boost.Test arguments and the -- separator.
+                    For example: model_test --random -- lm/test.arpa
+Boost.Test WARNING: token "lm/test_nounk.arpa" does not correspond to the Boost.Test argument
+                    and should be placed after all Boost.Test arguments and the -- separator.
+                    For example: model_test --random -- lm/test_nounk.arpa
+Running 12 test cases...
+*** No errors detected
+EXIT STATUS: 0

mosesdecoder/lm/bin/model_test.test/gcc-9/release/link-static/threading-multi/model_test.run ADDED Viewed

	@@ -0,0 +1,11 @@

+Boost.Test WARNING: token "lm/test.arpa" does not correspond to the Boost.Test argument
+                    and should be placed after all Boost.Test arguments and the -- separator.
+                    For example: model_test --random -- lm/test.arpa
+Boost.Test WARNING: token "lm/test_nounk.arpa" does not correspond to the Boost.Test argument
+                    and should be placed after all Boost.Test arguments and the -- separator.
+                    For example: model_test --random -- lm/test_nounk.arpa
+Running 12 test cases...
+*** No errors detected
+EXIT STATUS: 0

mosesdecoder/lm/bin/model_test.test/gcc-9/release/link-static/threading-multi/model_test.test ADDED Viewed

	@@ -0,0 +1 @@


1	+ passed

mosesdecoder/lm/bin/order.log ADDED Viewed

	@@ -0,0 +1 @@


1	+ <define>KENLM_MAX_ORDER=6

mosesdecoder/lm/bin/partial_test.test/gcc-9/release/link-static/threading-multi/partial_test ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:50e0176970c570eea64c3cf5ce6ac7f3432f91e605676d7cbbf8e055a1e307b4
+size 2254664

mosesdecoder/lm/bin/partial_test.test/gcc-9/release/link-static/threading-multi/partial_test.o ADDED Viewed

Binary file (211 kB). View file

mosesdecoder/lm/bin/partial_test.test/gcc-9/release/link-static/threading-multi/partial_test.output ADDED Viewed

	@@ -0,0 +1,8 @@

+Boost.Test WARNING: token "lm/test.arpa" does not correspond to the Boost.Test argument
+                    and should be placed after all Boost.Test arguments and the -- separator.
+                    For example: partial_test --random -- lm/test.arpa
+Running 4 test cases...
+*** No errors detected
+EXIT STATUS: 0

mosesdecoder/lm/bin/partial_test.test/gcc-9/release/link-static/threading-multi/partial_test.run ADDED Viewed

	@@ -0,0 +1,8 @@

+Boost.Test WARNING: token "lm/test.arpa" does not correspond to the Boost.Test argument
+                    and should be placed after all Boost.Test arguments and the -- separator.
+                    For example: partial_test --random -- lm/test.arpa
+Running 4 test cases...
+*** No errors detected
+EXIT STATUS: 0

mosesdecoder/lm/bin/partial_test.test/gcc-9/release/link-static/threading-multi/partial_test.test ADDED Viewed

	@@ -0,0 +1 @@


1	+ passed

mosesdecoder/lm/binary_format.cc ADDED Viewed

	@@ -0,0 +1,302 @@

+#include "lm/binary_format.hh"
+#include "lm/lm_exception.hh"
+#include "util/file.hh"
+#include "util/file_piece.hh"
+#include <cstddef>
+#include <cstring>
+#include <limits>
+#include <string>
+#include <cstdlib>
+#include <stdint.h>
+namespace lm {
+namespace ngram {
+const char *kModelNames[6] = {"probing hash tables", "probing hash tables with rest costs", "trie", "trie with quantization", "trie with array-compressed pointers", "trie with quantization and array-compressed pointers"};
+namespace {
+const char kMagicBeforeVersion[] = "mmap lm http://kheafield.com/code format version";
+const char kMagicBytes[] = "mmap lm http://kheafield.com/code format version 5\n\0";
+// This must be shorter than kMagicBytes and indicates an incomplete binary file (i.e. build failed).
+const char kMagicIncomplete[] = "mmap lm http://kheafield.com/code incomplete\n";
+const long int kMagicVersion = 5;
+// Old binary files built on 32-bit machines have this header.
+// TODO: eliminate with next binary release.
+struct OldSanity {  // Reference header without the 8-byte padding of Sanity; IsBinaryFormat uses it only to recognize old files and report them.
+  char magic[sizeof(kMagicBytes)];
+  float zero_f, one_f, minus_half_f;
+  WordIndex one_word_index, max_word_index;
+  uint64_t one_uint64;
+  void SetToReference() {
+    std::memset(this, 0, sizeof(OldSanity));  // zero the whole struct first so padding bytes are deterministic for memcmp
+    std::memcpy(magic, kMagicBytes, sizeof(magic));
+    zero_f = 0.0; one_f = 1.0; minus_half_f = -0.5;
+    one_word_index = 1;
+    max_word_index = std::numeric_limits<WordIndex>::max();
+    one_uint64 = 1;
+  }
+};
+// Test values aligned to 8 bytes.
+struct Sanity {  // Reference header written at the start of a binary file and compared byte-for-byte by IsBinaryFormat.
+  char magic[ALIGN8(sizeof(kMagicBytes))];
+  float zero_f, one_f, minus_half_f;
+  WordIndex one_word_index, max_word_index, padding_to_8;
+  uint64_t one_uint64;
+  void SetToReference() {
+    std::memset(this, 0, sizeof(Sanity));  // zero the whole struct first so unused magic bytes and padding are deterministic for memcmp
+    std::memcpy(magic, kMagicBytes, sizeof(kMagicBytes));
+    zero_f = 0.0; one_f = 1.0; minus_half_f = -0.5;
+    one_word_index = 1;
+    max_word_index = std::numeric_limits<WordIndex>::max();
+    padding_to_8 = 0;
+    one_uint64 = 1;
+  }
+};
+std::size_t TotalHeaderSize(unsigned char order) {  // Sanity block + fixed-width parameters + one uint64_t count per order, passed through ALIGN8.
+  return ALIGN8(sizeof(Sanity) + sizeof(FixedWidthParameters) + sizeof(uint64_t) * order);
+}
+void WriteHeader(void *to, const Parameters &params) {  // Writes, in order: the reference Sanity block, params.fixed, then params.counts as uint64_t values.
+  Sanity header = Sanity();
+  header.SetToReference();
+  std::memcpy(to, &header, sizeof(Sanity));
+  char *out = reinterpret_cast<char*>(to) + sizeof(Sanity);
+  *reinterpret_cast<FixedWidthParameters*>(out) = params.fixed;
+  out += sizeof(FixedWidthParameters);
+  uint64_t *counts = reinterpret_cast<uint64_t*>(out);
+  for (std::size_t i = 0; i < params.counts.size(); ++i) {
+    counts[i] = params.counts[i];
+  }
+}
+} // namespace
+bool IsBinaryFormat(int fd) {  // Returns true when fd starts with the reference Sanity header; throws FormatLoadException for incomplete, wrong-version, old 32-bit or mismatching binary files; otherwise returns false.
+  const uint64_t size = util::SizeFile(fd);
+  if (size == util::kBadSize || (size <= static_cast<uint64_t>(sizeof(Sanity)))) return false;  // too small to hold a header plus data, or size unknown
+  // Try reading the header.
+  util::scoped_memory memory;
+  try {
+    util::MapRead(util::LAZY, fd, 0, sizeof(Sanity), memory);
+  } catch (const util::Exception &e) {
+    return false;  // a header that cannot be read is treated as "not binary"
+  }
+  Sanity reference_header = Sanity();
+  reference_header.SetToReference();
+  if (!std::memcmp(memory.get(), &reference_header, sizeof(Sanity))) return true;
+  if (!std::memcmp(memory.get(), kMagicIncomplete, strlen(kMagicIncomplete))) {
+    UTIL_THROW(FormatLoadException, "This binary file did not finish building");
+  }
+  if (!std::memcmp(memory.get(), kMagicBeforeVersion, strlen(kMagicBeforeVersion))) {  // magic prefix matches but the full header does not: diagnose why
+    char *end_ptr;
+    const char *begin_version = static_cast<const char*>(memory.get()) + strlen(kMagicBeforeVersion);
+    long int version = std::strtol(begin_version, &end_ptr, 10);
+    if ((end_ptr != begin_version) && version != kMagicVersion) {  // a number was parsed and it differs from the expected version
+      UTIL_THROW(FormatLoadException, "Binary file has version " << version << " but this implementation expects version " << kMagicVersion << " so you'll have to use the ARPA to rebuild your binary");
+    }
+    OldSanity old_sanity = OldSanity();
+    old_sanity.SetToReference();
+    UTIL_THROW_IF(!std::memcmp(memory.get(), &old_sanity, sizeof(OldSanity)), FormatLoadException, "Looks like this is an old 32-bit format.  The old 32-bit format has been removed so that 64-bit and 32-bit files are exchangeable.");
+    UTIL_THROW(FormatLoadException, "File looks like it should be loaded with mmap, but the test values don't match.  Try rebuilding the binary format LM using the same code revision, compiler, and architecture");
+  }
+  return false;
+}
+void ReadHeader(int fd, Parameters &out) {  // Fills out.fixed and out.counts from the bytes that follow the Sanity block in fd.
+  util::SeekOrThrow(fd, sizeof(Sanity));
+  util::ReadOrThrow(fd, &out.fixed, sizeof(out.fixed));
+  if (out.fixed.probing_multiplier < 1.0)
+    UTIL_THROW(FormatLoadException, "Binary format claims to have a probing multiplier of " << out.fixed.probing_multiplier << " which is < 1.0.");
+  out.counts.resize(static_cast<std::size_t>(out.fixed.order));
+  if (out.fixed.order) util::ReadOrThrow(fd, &*out.counts.begin(), sizeof(uint64_t) * out.fixed.order);  // guarded so begin() is never dereferenced on an empty vector
+}
+void MatchCheck(ModelType model_type, unsigned int search_version, const Parameters &params) {  // Throws FormatLoadException unless the file's model type and search version equal the ones the caller expects.
+  if (params.fixed.model_type != model_type) {
+    if (static_cast<unsigned int>(params.fixed.model_type) >= (sizeof(kModelNames) / sizeof(const char *)))  // type from the file is outside kModelNames, so it cannot be named in the message
+      UTIL_THROW(FormatLoadException, "The binary file claims to be model type " << static_cast<unsigned int>(params.fixed.model_type) << " but this is not implemented for in this inference code.");
+    UTIL_THROW(FormatLoadException, "The binary file was built for " << kModelNames[params.fixed.model_type] << " but the inference code is trying to load " << kModelNames[model_type]);
+  }
+  UTIL_THROW_IF(search_version != params.fixed.search_version, FormatLoadException, "The binary file has " << kModelNames[params.fixed.model_type] << " version " << params.fixed.search_version << " but this code expects " << kModelNames[params.fixed.model_type] << " version " << search_version);
+}
+const std::size_t kInvalidSize = static_cast<std::size_t>(-1);
+// Copies the write/load settings out of config and marks every size and
+// offset as "not known yet".  vocab_pad_ starts at 0 (no padding) so that it
+// never holds an indeterminate value before GrowForSearch assigns it.
+BinaryFormat::BinaryFormat(const Config &config)
+  : write_method_(config.write_method), write_mmap_(config.write_mmap), load_method_(config.load_method),
+    header_size_(kInvalidSize), vocab_size_(kInvalidSize), vocab_pad_(0), vocab_string_offset_(kInvalidOffset) {}
+// Prepares to read an existing binary file: stores fd in file_, reads the
+// header into params, checks it against the requested model type and search
+// version, and records the total header size.
+void BinaryFormat::InitializeBinary(int fd, ModelType model_type, unsigned int search_version, Parameters &params) {
+  // Hand the descriptor to file_ before any call that may throw.
+  file_.reset(fd);
+  // Ignore write requests; this is already in binary format.
+  write_mmap_ = NULL;
+  ReadHeader(file_.get(), params);
+  MatchCheck(model_type, search_version, params);
+  const std::size_t order_count = params.counts.size();
+  header_size_ = TotalHeaderSize(order_count);
+}
+// Reads amount bytes into to, starting offset_excluding_header bytes past
+// the end of the header.  Requires header_size_ to have been set.
+void BinaryFormat::ReadForConfig(void *to, std::size_t amount, uint64_t offset_excluding_header) const {
+  assert(header_size_ != kInvalidSize);
+  const uint64_t absolute_offset = offset_excluding_header + header_size_;
+  util::ErsatzPRead(file_.get(), to, amount, absolute_offset);
+}
+// Maps the header plus size bytes of the file and returns a pointer to the
+// first byte after the header.  Throws FormatLoadException when the file is
+// known to be shorter than that.
+void *BinaryFormat::LoadBinary(std::size_t size) {
+  assert(header_size_ != kInvalidSize);
+  const uint64_t file_size = util::SizeFile(file_.get());
+  // The header is smaller than a page, so we have to map the whole header as well.
+  const uint64_t total_map = static_cast<uint64_t>(header_size_) + static_cast<uint64_t>(size);
+  UTIL_THROW_IF(file_size != util::kBadSize && file_size < total_map, FormatLoadException, "Binary file has size " << file_size << " but the headers say it should be at least " << total_map);
+  util::MapRead(load_method_, file_.get(), 0, util::CheckOverflow(total_map), mapping_);
+  // Whatever follows the mapped region is where the vocabulary strings are read from.
+  vocab_string_offset_ = total_map;
+  uint8_t *const mapped_begin = reinterpret_cast<uint8_t*>(mapping_.get());
+  return mapped_begin + header_size_;
+}
+// Allocates room for the vocabulary (plus the header when a binary file is
+// being written) and returns a pointer to where the vocabulary starts.
+//   memory_size: bytes needed by the vocabulary.
+//   order: model order, used to size the header.
+// Without an output file (write_mmap_ is NULL) only anonymous memory is
+// allocated and no header is reserved.
+void *BinaryFormat::SetupJustVocab(std::size_t memory_size, uint8_t order) {
+  vocab_size_ = memory_size;
+  if (!write_mmap_) {
+    header_size_ = 0;
+    util::HugeMalloc(memory_size, true, memory_vocab_);
+    return reinterpret_cast<uint8_t*>(memory_vocab_.get());
+  }
+  header_size_ = TotalHeaderSize(order);
+  std::size_t total = util::CheckOverflow(static_cast<uint64_t>(header_size_) + static_cast<uint64_t>(memory_size));
+  file_.reset(util::CreateOrThrow(write_mmap_));
+  // some gccs complain about uninitialized variables even though all enum values are covered.
+  void *vocab_base = NULL;
+  switch (write_method_) {
+    case Config::WRITE_MMAP:
+      mapping_.reset(util::MapZeroedWrite(file_.get(), total), total, util::scoped_memory::MMAP_ALLOCATED);
+      // Take the mapped address first: the huge-page advice has to be given
+      // for the real mapping, not for the NULL placeholder.
+      vocab_base = mapping_.get();
+      util::AdviseHugePages(vocab_base, total);
+      break;
+    case Config::WRITE_AFTER:
+      // Empty the output file now; FinishFile writes the contents later.
+      util::ResizeOrThrow(file_.get(), 0);
+      util::HugeMalloc(total, true, memory_vocab_);
+      vocab_base = memory_vocab_.get();
+      break;
+  }
+  // Mark the file as unfinished until FinishFile replaces this with the real header.
+  strncpy(reinterpret_cast<char*>(vocab_base), kMagicIncomplete, header_size_);
+  return reinterpret_cast<uint8_t*>(vocab_base) + header_size_;
+}
+// Makes room for the search structure once its size is known.
+//   memory_size: bytes needed by the search structure.
+//   vocab_pad: padding bytes placed between the vocabulary and the search area.
+//   vocab_base: output; set to the start of the vocabulary, which may have moved.
+// Returns a pointer to the start of the search area.
+// vocab_size_ must already be set (asserted below).
+void *BinaryFormat::GrowForSearch(std::size_t memory_size, std::size_t vocab_pad, void *&vocab_base) {
+  assert(vocab_size_ != kInvalidSize);
+  vocab_pad_ = vocab_pad;
+  // Also known as total size without vocab words; the vocabulary strings start here.
+  std::size_t new_size = header_size_ + vocab_size_ + vocab_pad_ + memory_size;
+  vocab_string_offset_ = new_size;
+  if (!write_mmap_ || write_method_ == Config::WRITE_AFTER) {
+    // Not building inside a mapped file: search gets its own allocation,
+    // separate from the vocabulary allocation.
+    util::HugeMalloc(memory_size, true, memory_search_);
+    assert(header_size_ == 0 || write_mmap_);
+    vocab_base = reinterpret_cast<uint8_t*>(memory_vocab_.get()) + header_size_;
+    util::AdviseHugePages(memory_search_.get(), memory_size);
+    return reinterpret_cast<uint8_t*>(memory_search_.get());
+  }
+  assert(write_method_ == Config::WRITE_MMAP);
+  // Grow the file to accommodate the search, using zeros.
+  // According to man mmap, behavior is undefined when the file is resized
+  // underneath a mmap that is not a multiple of the page size.  So to be
+  // safe, we'll unmap it and map it again.
+  mapping_.reset();
+  util::ResizeOrThrow(file_.get(), new_size);
+  void *ret;
+  // MapFile maps the first vocab_string_offset_ (== new_size) bytes and
+  // fills in both the vocabulary base and the search base.
+  MapFile(vocab_base, ret);
+  util::AdviseHugePages(ret, new_size);
+  return ret;
+}
+// Writes the vocabulary strings in buffer to the file at
+// VocabStringReadingOffset(), then reports the current vocabulary and search
+// base addresses through vocab_base and search_base.
+// When no file is being written, nothing is written and only the base
+// pointers are reported.
+void BinaryFormat::WriteVocabWords(const std::string &buffer, void *&vocab_base, void *&search_base) {
+  // Checking Config's include_vocab is the responsibility of the caller.
+  assert(header_size_ != kInvalidSize && vocab_size_ != kInvalidSize);
+  if (!write_mmap_) {
+    // Unchanged base.
+    vocab_base = reinterpret_cast<uint8_t*>(memory_vocab_.get());
+    search_base = reinterpret_cast<uint8_t*>(memory_search_.get());
+    return;
+  }
+  if (write_method_ == Config::WRITE_MMAP) {
+    // Drop the mapping before the file is written through the descriptor; it is re-created below.
+    mapping_.reset();
+  }
+  util::SeekOrThrow(file_.get(), VocabStringReadingOffset());
+  util::WriteOrThrow(file_.get(), &buffer[0], buffer.size());
+  if (write_method_ == Config::WRITE_MMAP) {
+    // Map the file again; the base addresses may differ from the earlier mapping.
+    MapFile(vocab_base, search_base);
+  } else {
+    // WRITE_AFTER keeps vocab (after its header space) and search in separate allocations.
+    vocab_base = reinterpret_cast<uint8_t*>(memory_vocab_.get()) + header_size_;
+    search_base = reinterpret_cast<uint8_t*>(memory_search_.get());
+  }
+}
+// Completes the binary file: flushes the model data, then overwrites the
+// placeholder at the start of the file with the real header.
+//   config: supplies probing_multiplier and include_vocab for the header.
+//   model_type, search_version: recorded in the header.
+//   counts: n-gram counts per order; counts.size() is the model order.
+// Does nothing when no binary file is being written.
+void BinaryFormat::FinishFile(const Config &config, ModelType model_type, unsigned int search_version, const std::vector<uint64_t> &counts) {
+  if (!write_mmap_) return;
+  // Data goes out before the header, so the header replaces the incomplete
+  // marker only after the data has been synced.
+  switch (write_method_) {
+    case Config::WRITE_MMAP:
+      util::SyncOrThrow(mapping_.get(), mapping_.size());
+      break;
+    case Config::WRITE_AFTER:
+      util::SeekOrThrow(file_.get(), 0);
+      util::WriteOrThrow(file_.get(), memory_vocab_.get(), memory_vocab_.size());
+      util::SeekOrThrow(file_.get(), header_size_ + vocab_size_ + vocab_pad_);
+      util::WriteOrThrow(file_.get(), memory_search_.get(), memory_search_.size());
+      util::FSyncOrThrow(file_.get());
+      break;
+  }
+  // header and vocab share the same mmap.
+  Parameters params;
+  // Zero only the plain-data part, padding bytes included.  Parameters also
+  // holds a std::vector, which must not be overwritten with memset.
+  memset(&params.fixed, 0, sizeof(params.fixed));
+  params.counts = counts;
+  params.fixed.order = counts.size();
+  params.fixed.probing_multiplier = config.probing_multiplier;
+  params.fixed.model_type = model_type;
+  params.fixed.has_vocabulary = config.include_vocab;
+  params.fixed.search_version = search_version;
+  switch (write_method_) {
+    case Config::WRITE_MMAP:
+      WriteHeader(mapping_.get(), params);
+      util::SyncOrThrow(mapping_.get(), mapping_.size());
+      break;
+    case Config::WRITE_AFTER:
+      {
+        // Build the header in a scratch buffer and write it over the start of the file.
+        std::vector<uint8_t> buffer(TotalHeaderSize(counts.size()));
+        WriteHeader(&buffer[0], params);
+        util::SeekOrThrow(file_.get(), 0);
+        util::WriteOrThrow(file_.get(), &buffer[0], buffer.size());
+      }
+      break;
+  }
+}
+// Maps the first vocab_string_offset_ bytes of the file and reports where the
+// vocabulary and the search area begin inside that mapping.
+void BinaryFormat::MapFile(void *&vocab_base, void *&search_base) {
+  void *const mapped = util::MapOrThrow(vocab_string_offset_, true, util::kFileFlags, false, file_.get());
+  mapping_.reset(mapped, vocab_string_offset_, util::scoped_memory::MMAP_ALLOCATED);
+  uint8_t *const begin = reinterpret_cast<uint8_t*>(mapping_.get());
+  // Layout inside the mapping: header, vocabulary, padding, search.
+  vocab_base = begin + header_size_;
+  search_base = begin + header_size_ + vocab_size_ + vocab_pad_;
+}
+// Opens file and reports whether it is a binary language model.  When it is,
+// recognized receives the model type stored in its header; otherwise
+// recognized is left untouched.
+bool RecognizeBinary(const char *file, ModelType &recognized) {
+  util::scoped_fd fd(util::OpenReadOrThrow(file));
+  const bool is_binary = IsBinaryFormat(fd.get());
+  if (is_binary) {
+    Parameters header;
+    ReadHeader(fd.get(), header);
+    recognized = header.fixed.model_type;
+  }
+  return is_binary;
+}
+} // namespace ngram
+} // namespace lm

mosesdecoder/lm/binary_format.hh ADDED Viewed

	@@ -0,0 +1,106 @@

+#ifndef LM_BINARY_FORMAT_H
+#define LM_BINARY_FORMAT_H
+#include "lm/config.hh"
+#include "lm/model_type.hh"
+#include "lm/read_arpa.hh"
+#include "util/file_piece.hh"
+#include "util/mmap.hh"
+#include "util/scoped.hh"
+#include <cstddef>
+#include <vector>
+#include <stdint.h>
+namespace lm {
+namespace ngram {
+extern const char *kModelNames[6];
+/*Inspect a file to determine if it is a binary lm.  If not, return false.
+ * If so, return true and set recognized to the type.  This is the only API in
+ * this header designed for use by decoder authors.
+ */
+bool RecognizeBinary(const char *file, ModelType &recognized);
+// Fixed-size part of the binary file header.
+// NOTE(review): ReadHeader in binary_format.cc reads this struct from the
+// file as raw bytes, so field types and order look like part of the file
+// format -- confirm before changing them.
+struct FixedWidthParameters {
+  // Model order (number of n-gram lengths); also the number of entries in Parameters::counts.
+  unsigned char order;
+  // Space multiplier for probing; a loaded value below 1.0 is rejected by ReadHeader.
+  float probing_multiplier;
+  // What type of model is this?
+  ModelType model_type;
+  // Does the end of the file have the actual strings in the vocabulary?
+  bool has_vocabulary;
+  // Version of the search data structure; must match what the loading code expects.
+  unsigned int search_version;
+};
+// Rounds a up to a multiple of 8; a value that is already a multiple of 8 is
+// returned unchanged (e.g. 8 -> 8, 9 -> 16).  Written for a >= 1.
+// This is a macro instead of an inline function so constants can be assigned using it.
+#define ALIGN8(a) ((std::ptrdiff_t(((a)-1)/8)+1)*8)
+// Parameters stored in the header of a binary file.
+struct Parameters {
+  // Fixed-width fields (order, model type, versions, ...).
+  FixedWidthParameters fixed;
+  // One n-gram count per order; its size equals fixed.order when read from a file.
+  std::vector<uint64_t> counts;
+};
+// Manages the memory and/or file behind a model: either loads an existing
+// binary file, or lays out header, vocabulary and search areas while a model
+// is built (optionally writing a binary file).
+class BinaryFormat {
+  public:
+    explicit BinaryFormat(const Config &config);
+    // Reading a binary file:
+    // Takes ownership of fd
+    void InitializeBinary(int fd, ModelType model_type, unsigned int search_version, Parameters &params);
+    // Used to read parts of the file to update the config object before figuring out full size.
+    void ReadForConfig(void *to, std::size_t amount, uint64_t offset_excluding_header) const;
+    // Actually load the binary file and return a pointer to the beginning of the search area.
+    void *LoadBinary(std::size_t size);
+    // File offset at which the vocabulary strings begin.  Only valid once the
+    // offset has been set (asserted).
+    uint64_t VocabStringReadingOffset() const {
+      assert(vocab_string_offset_ != kInvalidOffset);
+      return vocab_string_offset_;
+    }
+    // Writing a binary file or initializing in RAM from ARPA:
+    // Size for vocabulary.
+    void *SetupJustVocab(std::size_t memory_size, uint8_t order);
+    // Warning: can change the vocabulary base pointer.
+    void *GrowForSearch(std::size_t memory_size, std::size_t vocab_pad, void *&vocab_base);
+    // Warning: can change vocabulary and search base addresses.
+    void WriteVocabWords(const std::string &buffer, void *&vocab_base, void *&search_base);
+    // Write the header at the beginning of the file.
+    void FinishFile(const Config &config, ModelType model_type, unsigned int search_version, const std::vector<uint64_t> &counts);
+  private:
+    // (Re)maps the file and reports the vocabulary and search base addresses.
+    void MapFile(void *&vocab_base, void *&search_base);
+    // Copied from configuration.
+    const Config::WriteMethod write_method_;
+    // Output file name, or NULL when no binary file is written.
+    const char *write_mmap_;
+    util::LoadMethod load_method_;
+    // File behind memory, if any.
+    util::scoped_fd file_;
+    // If there is a file involved, a single mapping.
+    util::scoped_memory mapping_;
+    // If the data is only in memory, separately allocate each because the trie
+    // knows vocab's size before it knows search's size (because SRILM might
+    // have pruned).
+    util::scoped_memory memory_vocab_, memory_search_;
+    // Memory ranges.  Note that these may not be contiguous and may not all
+    // exist.
+    std::size_t header_size_, vocab_size_, vocab_pad_;
+    // aka end of search.
+    uint64_t vocab_string_offset_;
+    // Sentinel meaning vocab_string_offset_ has not been set yet.
+    static const uint64_t kInvalidOffset = (uint64_t)-1;
+};
+bool IsBinaryFormat(int fd);
+} // namespace ngram
+} // namespace lm
+#endif // LM_BINARY_FORMAT_H

mosesdecoder/lm/blank.hh ADDED Viewed

	@@ -0,0 +1,42 @@

+#ifndef LM_BLANK_H
+#define LM_BLANK_H
+#include <limits>
+#include <stdint.h>
+#include <cmath>
+namespace lm {
+namespace ngram {
+/* Suppose "foo bar" appears with zero backoff but there is no trigram
+ * beginning with these words.  Then, when scoring "foo bar", the model could
+ * return out_state containing "bar" or even null context if "bar" also has no
+ * backoff and is never followed by another word.  Then the backoff is set to
+ * kNoExtensionBackoff.  If the n-gram might be extended, then out_state must
+ * contain the full n-gram, in which case kExtensionBackoff is set.  In any
+ * case, if an n-gram has non-zero backoff, the full state is returned so
+ * backoff can be properly charged.
+ * These differ only in sign bit because the backoff is in fact zero in either
+ * case.
+ */
+const float kNoExtensionBackoff = -0.0;
+const float kExtensionBackoff = 0.0;
+const uint64_t kNoExtensionQuant = 0;
+const uint64_t kExtensionQuant = 1;
+// Marks a zero backoff as extendable.  The float comparison treats -0.0 and
+// 0.0 as equal, so any zero backoff ends up as kExtensionBackoff; non-zero
+// backoffs are left alone.
+inline void SetExtension(float &backoff) {
+  if (backoff != kNoExtensionBackoff) return;
+  backoff = kExtensionBackoff;
+}
+// True unless backoff is bit-for-bit equal to kNoExtensionBackoff.  The bit
+// patterns are compared because a float comparison cannot tell -0.0 from 0.0.
+// This compiles down nicely.
+inline bool HasExtension(const float &backoff) {
+  typedef union { float f; uint32_t i; } FloatBits;
+  FloatBits given, no_extension;
+  given.f = backoff;
+  no_extension.f = kNoExtensionBackoff;
+  return given.i != no_extension.i;
+}
+} // namespace ngram
+} // namespace lm
+#endif // LM_BLANK_H

mosesdecoder/lm/build_binary_main.cc ADDED Viewed

	@@ -0,0 +1,234 @@

+#include "lm/model.hh"
+#include "lm/sizes.hh"
+#include "util/file_piece.hh"
+#include "util/usage.hh"
+#include <algorithm>
+#include <cstdlib>
+#include <exception>
+#include <iostream>
+#include <iomanip>
+#include <limits>
+#include <cmath>
+#include <cstdlib>
+#ifdef WIN32
+#include "util/getopt.hh"
+#else
+#include <unistd.h>
+#endif
+namespace lm {
+namespace ngram {
+namespace {
+// Prints the command-line help to stderr and terminates the process with
+// exit status 1.
+//   name: program name shown in the usage line (argv[0]).
+//   default_mem: default value of -S shown in the help text.
+void Usage(const char *name, const char *default_mem) {
+  std::cerr << "Usage: " << name << " [-u log10_unknown_probability] [-s] [-i] [-w mmap|after] [-p probing_multiplier] [-T trie_temporary] [-S trie_building_mem] [-q bits] [-b bits] [-a bits] [type] input.arpa [output.mmap]\n\n"
+"-u sets the log10 probability for <unk> if the ARPA file does not have one.\n"
+"   Default is -100.  The ARPA file will always take precedence.\n"
+"-s allows models to be built even if they do not have <s> and </s>.\n"
+"-i allows buggy models from IRSTLM by mapping positive log probability to 0.\n"
+"-w mmap|after determines how writing is done.\n"
+"   mmap maps the binary file and writes to it.  Default for trie.\n"
+"   after allocates anonymous memory, builds, and writes.  Default for probing.\n"
+"-r \"order1.arpa order2 order3 order4\" adds lower-order rest costs from these\n"
+"   model files.  order1.arpa must be an ARPA file.  All others may be ARPA or\n"
+"   the same data structure as being built.  All files must have the same\n"
+"   vocabulary.  For probing, the unigrams must be in the same order.\n\n"
+"type is either probing or trie.  Default is probing.\n\n"
+"probing uses a probing hash table.  It is the fastest but uses the most memory.\n"
+"-p sets the space multiplier and must be >1.0.  The default is 1.5.\n\n"
+"trie is a straightforward trie with bit-level packing.  It uses the least\n"
+"memory and is still faster than SRI or IRST.  Building the trie format uses an\n"
+"on-disk sort to save memory.\n"
+"-T is the temporary directory prefix.  Default is the output file name.\n"
+"-S determines memory use for sorting.  Default is " << default_mem << ".  This is compatible\n"
+// A plain % is used here: "\%" is not a valid escape sequence in C++.
+"   with GNU sort.  The number is followed by a unit: % for percent of physical\n"
+"   memory, b for bytes, K for Kilobytes, M for megabytes, then G,T,P,E,Z,Y.  \n"
+"   Default unit is K for Kilobytes.\n"
+"-q turns quantization on and sets the number of bits (e.g. -q 8).\n"
+"-b sets backoff quantization bits.  Requires -q and defaults to that value.\n"
+"-a compresses pointers using an array of offsets.  The parameter is the\n"
+"   maximum number of bits encoded by the array.  Memory is minimized subject\n"
+"   to the maximum, so pick 255 to minimize memory.\n\n"
+"-h print this help message.\n\n"
+"Get a memory estimate by passing an ARPA file without an output file name.\n";
+  exit(1);
+}
+// I could really use boost::lexical_cast right about now.
+// Parses the whole of from as a floating-point number.
+// Throws util::ParseNumberException if from is empty, does not start with a
+// number, or has characters left over after the number.
+float ParseFloat(const char *from) {
+  char *end;
+  float ret = strtod(from, &end);
+  // end == from means nothing was converted; this catches the empty string,
+  // which would otherwise be accepted as 0.
+  if (end == from || *end) throw util::ParseNumberException(from);
+  return ret;
+}
+// Parses the whole of from as a base-10 unsigned integer.
+// Throws util::ParseNumberException if from is empty, does not start with a
+// number, or has characters left over after the number.
+unsigned long int ParseUInt(const char *from) {
+  char *end;
+  unsigned long int ret = strtoul(from, &end, 10);
+  // end == from means nothing was converted; this catches the empty string,
+  // which would otherwise be accepted as 0.
+  if (end == from || *end) throw util::ParseNumberException(from);
+  return ret;
+}
+// Parses a quantization bit count from from.
+// Throws util::ParseNumberException if the text is not an unsigned integer
+// or if the value is greater than 25.
+uint8_t ParseBitCount(const char *from) {
+  unsigned long val = ParseUInt(from);
+  if (val > 25) {
+    util::ParseNumberException e(from);
+    e << " bit counts are limited to 25.";
+    // The exception must actually be thrown; otherwise an oversized value
+    // is silently truncated to uint8_t and accepted.
+    throw e;
+  }
+  return static_cast<uint8_t>(val);
+}
+// Replaces the contents of to with the pieces of from split at single space
+// characters.  Empty pieces are kept, so an empty input yields one empty
+// string and two adjacent spaces yield an empty entry.
+void ParseFileList(const char *from, std::vector<std::string> &to) {
+  to.clear();
+  const char *piece_begin = from;
+  for (;;) {
+    const char *piece_end = piece_begin;
+    while (*piece_end && *piece_end != ' ') ++piece_end;
+    to.push_back(std::string(piece_begin, piece_end - piece_begin));
+    if (!*piece_end) return;
+    // Step over the separating space.
+    piece_begin = piece_end + 1;
+  }
+}
+// Reports that quantization was requested for a probing model and terminates
+// the process with exit status 1.
+void ProbingQuantizationUnsupported() {
+  const char *const message = "Quantization is only implemented in the trie data structure.";
+  std::cerr << message << std::endl;
+  exit(1);
+}
+} // namespace
+} // namespace ngram
+} // namespace lm
+// Entry point of build_binary: parses the command line, then either prints a
+// memory estimate (only an ARPA file given) or builds a binary model of the
+// requested type.  Returns 0 on success and 1 when an exception is caught.
+int main(int argc, char *argv[]) {
+  using namespace lm::ngram;
+  // Default sort memory: a percentage when physical memory can be guessed, otherwise a fixed size.
+  const char *default_mem = util::GuessPhysicalMemory() ? "80%" : "1G";
+  if (argc == 2 && !strcmp(argv[1], "--help"))
+    Usage(argv[0], default_mem);
+  try {
+    // Track which options were given explicitly so defaults can be applied afterwards.
+    bool quantize = false, set_backoff_bits = false, bhiksha = false, set_write_method = false, rest = false;
+    lm::ngram::Config config;
+    config.building_memory = util::ParseSize(default_mem);
+    int opt;
+    while ((opt = getopt(argc, argv, "q:b:a:u:p:t:T:m:S:w:sir:h")) != -1) {
+      switch(opt) {
+        case 'q':
+          config.prob_bits = ParseBitCount(optarg);
+          // Backoff bits follow the probability bits unless -b was already given.
+          if (!set_backoff_bits) config.backoff_bits = config.prob_bits;
+          quantize = true;
+          break;
+        case 'b':
+          config.backoff_bits = ParseBitCount(optarg);
+          set_backoff_bits = true;
+          break;
+        case 'a':
+          config.pointer_bhiksha_bits = ParseBitCount(optarg);
+          bhiksha = true;
+          break;
+        case 'u':
+          config.unknown_missing_logprob = ParseFloat(optarg);
+          break;
+        case 'p':
+          config.probing_multiplier = ParseFloat(optarg);
+          break;
+        case 't': // legacy
+        case 'T':
+          config.temporary_directory_prefix = optarg;
+          util::NormalizeTempPrefix(config.temporary_directory_prefix);
+          break;
+        case 'm': // legacy
+          // The argument is multiplied by 1048576, i.e. it is given in megabytes.
+          config.building_memory = ParseUInt(optarg) * 1048576;
+          break;
+        case 'S':
+          // Clamp to what std::size_t can hold.
+          config.building_memory = std::min(static_cast<uint64_t>(std::numeric_limits<std::size_t>::max()), util::ParseSize(optarg));
+          break;
+        case 'w':
+          set_write_method = true;
+          if (!strcmp(optarg, "mmap")) {
+            config.write_method = Config::WRITE_MMAP;
+          } else if (!strcmp(optarg, "after")) {
+            config.write_method = Config::WRITE_AFTER;
+          } else {
+            Usage(argv[0], default_mem);
+          }
+          break;
+        case 's':
+          config.sentence_marker_missing = lm::SILENT;
+          break;
+        case 'i':
+          config.positive_log_probability = lm::SILENT;
+          break;
+        case 'r':
+          rest = true;
+          ParseFileList(optarg, config.rest_lower_files);
+          config.rest_function = Config::REST_LOWER;
+          break;
+        case 'h': // help
+        default:
+          // Usage() does not return; it exits the process.
+          Usage(argv[0], default_mem);
+      }
+    }
+    if (!quantize && set_backoff_bits) {
+      std::cerr << "You specified backoff quantization (-b) but not probability quantization (-q)" << std::endl;
+      abort();
+    }
+    // Exactly one positional argument: only report sizes for that ARPA file.
+    if (optind + 1 == argc) {
+      ShowSizes(argv[optind], config);
+      return 0;
+    }
+    const char *model_type;
+    const char *from_file;
+    if (optind + 2 == argc) {
+      // Two positional arguments: input and output, with the default model type.
+      model_type = "probing";
+      from_file = argv[optind];
+      config.write_mmap = argv[optind + 1];
+    } else if (optind + 3 == argc) {
+      // Three positional arguments: type, input and output.
+      model_type = argv[optind];
+      from_file = argv[optind + 1];
+      config.write_mmap = argv[optind + 2];
+    } else {
+      Usage(argv[0], default_mem);
+      return 1;
+    }
+    // Constructing the model object below is what builds the model from from_file.
+    // NOTE(review): presumably the constructor also writes config.write_mmap -- confirm in lm/model.hh.
+    if (!strcmp(model_type, "probing")) {
+      if (!set_write_method) config.write_method = Config::WRITE_AFTER;
+      if (quantize || set_backoff_bits) ProbingQuantizationUnsupported();
+      if (rest) {
+        RestProbingModel(from_file, config);
+      } else {
+        ProbingModel(from_file, config);
+      }
+    } else if (!strcmp(model_type, "trie")) {
+      if (rest) {
+        std::cerr << "Rest + trie is not supported yet." << std::endl;
+        return 1;
+      }
+      if (!set_write_method) config.write_method = Config::WRITE_MMAP;
+      // Pick the trie variant from the quantization (-q) and pointer compression (-a) options.
+      if (quantize) {
+        if (bhiksha) {
+          QuantArrayTrieModel(from_file, config);
+        } else {
+          QuantTrieModel(from_file, config);
+        }
+      } else {
+        if (bhiksha) {
+          ArrayTrieModel(from_file, config);
+        } else {
+          TrieModel(from_file, config);
+        }
+      }
+    } else {
+      Usage(argv[0], default_mem);
+    }
+  }
+  catch (const std::exception &e) {
+    std::cerr << e.what() << std::endl;
+    std::cerr << "ERROR" << std::endl;
+    return 1;
+  }
+  std::cerr << "SUCCESS" << std::endl;
+  return 0;
+}

mosesdecoder/lm/builder/CMakeLists.txt ADDED Viewed

	@@ -0,0 +1,67 @@

+# Build rules for the KenLM builder sources and the lmplz executable.
+# Originally written by Lane Schwartz <dowobeha@gmail.com>.
+#
+# These files rely on add_library(<name> OBJECT ...), which compiles a group
+# of sources into an "object library" without archiving or linking them.
+# That signature first appeared in CMake 2.8.8, hence the minimum version.
+# See http://www.cmake.org/Wiki/CMake/Tutorials/Object_Library
+cmake_minimum_required(VERSION 2.8.8)
+# Sources of this subdirectory that belong to the kenlm library.
+# Unit tests are deliberately not listed; add new library sources here.
+# Every entry is prefixed with ${CMAKE_CURRENT_SOURCE_DIR} so the variable
+# still resolves correctly if a parent CMake file refers to it.
+set(KENLM_BUILDER_SOURCE
+  ${CMAKE_CURRENT_SOURCE_DIR}/adjust_counts.cc
+  ${CMAKE_CURRENT_SOURCE_DIR}/corpus_count.cc
+  ${CMAKE_CURRENT_SOURCE_DIR}/initial_probabilities.cc
+  ${CMAKE_CURRENT_SOURCE_DIR}/interpolate.cc
+  ${CMAKE_CURRENT_SOURCE_DIR}/output.cc
+  ${CMAKE_CURRENT_SOURCE_DIR}/pipeline.cc
+)
+# Object library for the builder; other targets consume it as
+# $<TARGET_OBJECTS:kenlm_builder>.
+add_library(kenlm_builder OBJECT ${KENLM_BUILDER_SOURCE})
+# lmplz: its own main file plus the objects of every KenLM object library.
+add_executable(lmplz
+  lmplz_main.cc
+  $<TARGET_OBJECTS:kenlm>
+  $<TARGET_OBJECTS:kenlm_common>
+  $<TARGET_OBJECTS:kenlm_builder>
+  $<TARGET_OBJECTS:kenlm_util>
+)
+# Boost and the thread library are linked in.
+target_link_libraries(lmplz ${Boost_LIBRARIES} pthread)
+# Show lmplz under the "executables" folder in IDE generators.
+set_target_properties(lmplz PROPERTIES FOLDER executables)
+if(BUILD_TESTING)
+  # Boost unit tests of this directory, one test per listed name.
+  set(KENLM_BOOST_TESTS_LIST
+    adjust_counts_test
+    corpus_count_test
+  )
+  AddTests(
+    TESTS ${KENLM_BOOST_TESTS_LIST}
+    DEPENDS
+      $<TARGET_OBJECTS:kenlm>
+      $<TARGET_OBJECTS:kenlm_common>
+      $<TARGET_OBJECTS:kenlm_util>
+      $<TARGET_OBJECTS:kenlm_builder>
+    LIBRARIES ${Boost_LIBRARIES} pthread
+  )
+endif()

mosesdecoder/lm/builder/Jamfile ADDED Viewed

	@@ -0,0 +1,13 @@

+# Library "builder": every .cc file in this directory except the unit tests
+# (*test.cc) and the program entry points (*main.cc), together with the
+# util, stream, double-conversion, kenlm and common targets it depends on.
+# NOTE(review): fakelib and $(timer-link) are defined elsewhere in the build -- confirm their exact meaning there.
+fakelib builder : [ glob *.cc : *test.cc *main.cc ]
+  ../../util//kenutil ../../util/stream//stream ../../util/double-conversion//double-conversion ..//kenlm ../common//common
+  : : : <library>/top//boost_thread $(timer-link) ;
+# Command-line programs built on top of the builder library.
+exe lmplz : lmplz_main.cc builder /top//boost_program_options ;
+exe dump_counts : dump_counts_main.cc builder ;
+# "programs" builds both executables.
+alias programs : lmplz dump_counts ;
+import testing ;
+# Unit tests, each linked against builder and the Boost unit test framework.
+unit-test corpus_count_test : corpus_count_test.cc builder /top//boost_unit_test_framework ;
+unit-test adjust_counts_test : adjust_counts_test.cc builder /top//boost_unit_test_framework ;