| // Forward declarations of the util types named by the declarations below. | |
| // In the visible code they are used only through references | |
| // (util::FilePiece & and const util::stream::ChainPosition &), so incomplete types suffice here. | |
| namespace util { | |
| class FilePiece; | |
| namespace stream { | |
| class ChainPosition; | |
| } // namespace stream | |
| } // namespace util | |
| namespace lm { | |
| namespace builder { | |
| // CorpusCount: constructed over an input util::FilePiece plus several caller-owned | |
| // references, then driven through Run() with a stream chain position. | |
| // Only the declaration is visible here; the member functions are defined elsewhere. | |
| // NOTE(review): five data members are references (from_, token_count_, type_count_, | |
| // prune_words_, prune_vocab_filename_). Presumably they are bound to the matching | |
| // constructor arguments, in which case those arguments must outlive this object; in | |
| // particular a temporary std::string passed as prune_vocab_filename would dangle. | |
| // Confirm against the constructor definition. | |
| class CorpusCount { | |
| public: | |
| // Memory usage will be DedupeMultiplier(order) * block_size + total_chain_size + unknown vocab_hash_size | |
| static float DedupeMultiplier(std::size_t order); | |
| // How much memory vocabulary will use based on estimated size of the vocab. | |
| static std::size_t VocabUsage(std::size_t vocab_estimate); | |
| // Constructor parameters: | |
| // from: input source, taken by non-const reference. | |
| // vocab_write: plain int; presumably a file descriptor that receives the vocabulary -- TODO confirm. | |
| // token_count: out. | |
| // type_count aka vocabulary size. Initialize to an estimate. It is set to the exact value. | |
| // prune_words: taken by non-const reference, so it may be modified; exact contents are not visible here -- TODO confirm. | |
| // prune_vocab_filename: taken by const reference; presumably names a vocabulary file used for pruning -- TODO confirm. | |
| // entries_per_block: no member has this name; presumably it is used to size the dedupe buffer (see dedupe_mem_size_) -- TODO confirm. | |
| // disallowed_symbol: a WarningAction; presumably the policy applied when a disallowed symbol is seen in the input -- TODO confirm. | |
| CorpusCount(util::FilePiece &from, int vocab_write, uint64_t &token_count, WordIndex &type_count, std::vector<bool> &prune_words, const std::string& prune_vocab_filename, std::size_t entries_per_block, WarningAction disallowed_symbol); | |
| // Entry point taking the chain position to operate on. Not declared const, so it may modify this object's state. | |
| // NOTE(review): presumably this performs the actual read of from_ and fills in the out-parameters -- confirm in the definition. | |
| void Run(const util::stream::ChainPosition &position); | |
| private: | |
| // Reference members: this class does not own the objects they refer to. | |
| util::FilePiece &from_; | |
| int vocab_write_; | |
| uint64_t &token_count_; | |
| WordIndex &type_count_; | |
| std::vector<bool>& prune_words_; | |
| const std::string& prune_vocab_filename_; | |
| // NOTE(review): presumably the byte size of the buffer held by dedupe_mem_ -- confirm. | |
| std::size_t dedupe_mem_size_; | |
| util::scoped_malloc dedupe_mem_; | |
| WarningAction disallowed_symbol_action_; | |
| }; | |
| } // namespace builder | |
| } // namespace lm | |