| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| |
|
| | |
| | |
| | |
| |
|
| | #ifndef FLANN_LSH_TABLE_H_ |
| | #define FLANN_LSH_TABLE_H_ |
| |
|
| | #include <algorithm> |
| | #include <iostream> |
| | #include <iomanip> |
| | #include <limits.h> |
| | #include <random> |
| | |
| | #if USE_UNORDERED_MAP |
| | #include <unordered_map> |
| | #else |
| | #include <map> |
| | #endif |
| | #include <math.h> |
| | #include <stddef.h> |
| |
|
| | #include "FLANN/util/dynamic_bitset.h" |
| | #include "FLANN/util/matrix.h" |
| |
|
| | namespace flann |
| | { |
| |
|
| | namespace lsh |
| | { |
| |
|
| | |
| |
|
| | |
| | |
| | typedef uint32_t FeatureIndex; |
| | |
| | |
| | typedef unsigned int BucketKey; |
| |
|
| | |
| | |
| | typedef std::vector<FeatureIndex> Bucket; |
| |
|
| | |
| |
|
| | |
| | |
| | struct LshStats |
| | { |
| | std::vector<unsigned int> bucket_sizes_; |
| | size_t n_buckets_; |
| | size_t bucket_size_mean_; |
| | size_t bucket_size_median_; |
| | size_t bucket_size_min_; |
| | size_t bucket_size_max_; |
| | size_t bucket_size_std_dev; |
| | |
| | |
| | std::vector<std::vector<unsigned int> > size_histogram_; |
| | }; |
| |
|
| | |
| | |
| | |
| | |
| | |
| | inline std::ostream& operator <<(std::ostream& out, const LshStats& stats) |
| | { |
| | size_t w = 20; |
| | out << "Lsh Table Stats:\n" << std::setw(w) << std::setiosflags(std::ios::right) << "N buckets : " |
| | << stats.n_buckets_ << "\n" << std::setw(w) << std::setiosflags(std::ios::right) << "mean size : " |
| | << std::setiosflags(std::ios::left) << stats.bucket_size_mean_ << "\n" << std::setw(w) |
| | << std::setiosflags(std::ios::right) << "median size : " << stats.bucket_size_median_ << "\n" << std::setw(w) |
| | << std::setiosflags(std::ios::right) << "min size : " << std::setiosflags(std::ios::left) |
| | << stats.bucket_size_min_ << "\n" << std::setw(w) << std::setiosflags(std::ios::right) << "max size : " |
| | << std::setiosflags(std::ios::left) << stats.bucket_size_max_; |
| |
|
| | |
| | out << std::endl << std::setw(w) << std::setiosflags(std::ios::right) << "histogram : " |
| | << std::setiosflags(std::ios::left); |
| | for (std::vector<std::vector<unsigned int> >::const_iterator iterator = stats.size_histogram_.begin(), end = |
| | stats.size_histogram_.end(); iterator != end; ++iterator) out << (*iterator)[0] << "-" << (*iterator)[1] << ": " << (*iterator)[2] << ", "; |
| |
|
| | return out; |
| | } |
| |
|
| |
|
| | |
| |
|
| | |
| | |
| | |
| | |
| | |
| | template<typename ElementType> |
| | class LshTable |
| | { |
| | public: |
| | |
| | |
| | #if USE_UNORDERED_MAP |
| | typedef std::unordered_map<BucketKey, Bucket> BucketsSpace; |
| | #else |
| | typedef std::map<BucketKey, Bucket> BucketsSpace; |
| | #endif |
| |
|
| | |
| | |
| | typedef std::vector<Bucket> BucketsSpeed; |
| |
|
| | |
| | |
| | LshTable() |
| | { |
| | } |
| |
|
| | |
| | |
| | |
| | |
| | |
| | LshTable(unsigned int , unsigned int ) |
| | { |
| | std::cerr << "LSH is not implemented for that type" << std::endl; |
| | throw; |
| | } |
| |
|
| | |
| | |
| | |
| | |
| | void add(unsigned int value, const ElementType* feature) |
| | { |
| | |
| | BucketKey key = getKey(feature); |
| |
|
| | switch (speed_level_) { |
| | case kArray: |
| | |
| | buckets_speed_[key].push_back(value); |
| | break; |
| | case kBitsetHash: |
| | |
| | key_bitset_.set(key); |
| | buckets_space_[key].push_back(value); |
| | break; |
| | case kHash: |
| | { |
| | |
| | buckets_space_[key].push_back(value); |
| | break; |
| | } |
| | } |
| | } |
| |
|
| | |
| | |
| | |
| | void add(const std::vector< std::pair<size_t, ElementType*> >& features) |
| | { |
| | #if USE_UNORDERED_MAP |
| | buckets_space_.rehash((buckets_space_.size() + features.size()) * 1.2); |
| | #endif |
| | |
| | for (size_t i = 0; i < features.size(); ++i) { |
| | add(features[i].first, features[i].second); |
| | } |
| | |
| | optimize(); |
| | } |
| |
|
| | |
| | |
| | |
| | |
| | inline const Bucket* getBucketFromKey(BucketKey key) const |
| | { |
| | |
| | switch (speed_level_) { |
| | case kArray: |
| | |
| | return &buckets_speed_[key]; |
| | break; |
| | case kBitsetHash: |
| | |
| | if (key_bitset_.test(key)) return &buckets_space_.find(key)->second; |
| | else return 0; |
| | break; |
| | case kHash: |
| | { |
| | |
| | BucketsSpace::const_iterator bucket_it, bucket_end = buckets_space_.end(); |
| | bucket_it = buckets_space_.find(key); |
| | |
| | if (bucket_it == bucket_end) return 0; |
| | else return &bucket_it->second; |
| | break; |
| | } |
| | } |
| | return 0; |
| | } |
| |
|
| | |
| | |
| | size_t getKey(const ElementType* ) const |
| | { |
| | std::cerr << "LSH is not implemented for that type" << std::endl; |
| | throw; |
| | return 1; |
| | } |
| |
|
| | |
| | |
| | |
| | LshStats getStats() const; |
| |
|
| | private: |
| | |
| | |
| | |
| | |
| | |
| | enum SpeedLevel |
| | { |
| | kArray, kBitsetHash, kHash |
| | }; |
| |
|
| | |
| | |
| | void initialize(size_t key_size) |
| | { |
| | speed_level_ = kHash; |
| | key_size_ = key_size; |
| | } |
| |
|
| | |
| | |
| | void optimize() |
| | { |
| | |
| | if (speed_level_ == kArray) return; |
| |
|
| | |
| | if (buckets_space_.size() > ((size_t(1) << key_size_) / 2)) { |
| | speed_level_ = kArray; |
| | |
| | buckets_speed_.resize(size_t(1) << key_size_); |
| | for (BucketsSpace::const_iterator key_bucket = buckets_space_.begin(); key_bucket != buckets_space_.end(); ++key_bucket) buckets_speed_[key_bucket->first] = key_bucket->second; |
| |
|
| | |
| | buckets_space_.clear(); |
| | return; |
| | } |
| |
|
| | |
| | |
| | if (((std::max(buckets_space_.size(), buckets_speed_.size()) * CHAR_BIT * 3 * sizeof(BucketKey)) / 10 |
| | >= size_t(size_t(1) << key_size_)) || (key_size_ <= 32)) { |
| | speed_level_ = kBitsetHash; |
| | key_bitset_.resize(size_t(1) << key_size_); |
| | key_bitset_.reset(); |
| | |
| | for (BucketsSpace::const_iterator key_bucket = buckets_space_.begin(); key_bucket != buckets_space_.end(); ++key_bucket) key_bitset_.set(key_bucket->first); |
| | } |
| | else { |
| | speed_level_ = kHash; |
| | key_bitset_.clear(); |
| | } |
| | } |
| |
|
| | template<typename Archive> |
| | void serialize(Archive& ar) |
| | { |
| | int val; |
| | if (Archive::is_saving::value) { |
| | val = (int)speed_level_; |
| | } |
| | ar & val; |
| | if (Archive::is_loading::value) { |
| | speed_level_ = (SpeedLevel) val; |
| | } |
| |
|
| | ar & key_size_; |
| | ar & mask_; |
| |
|
| | if (speed_level_==kArray) { |
| | ar & buckets_speed_; |
| | } |
| | if (speed_level_==kBitsetHash || speed_level_==kHash) { |
| | ar & buckets_space_; |
| | } |
| | if (speed_level_==kBitsetHash) { |
| | ar & key_bitset_; |
| | } |
| | } |
| | friend struct serialization::access; |
| |
|
| | |
| | |
| | BucketsSpeed buckets_speed_; |
| |
|
| | |
| | |
| | BucketsSpace buckets_space_; |
| |
|
| | |
| | SpeedLevel speed_level_; |
| |
|
| | |
| | |
| | |
| | DynamicBitset key_bitset_; |
| |
|
| | |
| | |
| | unsigned int key_size_; |
| |
|
| | |
| | |
| | |
| | |
| | std::vector<size_t> mask_; |
| | }; |
| |
|
| | |
| | |
| |
|
| | template<> |
| | inline LshTable<unsigned char>::LshTable(unsigned int feature_size, unsigned int subsignature_size) |
| | { |
| | initialize(subsignature_size); |
| | |
| | mask_ = std::vector<size_t>((size_t)ceil((float)(feature_size * sizeof(char)) / (float)sizeof(size_t)), 0); |
| |
|
| | |
| | std::vector<size_t> indices(feature_size * CHAR_BIT); |
| | for (size_t i = 0; i < feature_size * CHAR_BIT; ++i) indices[i] = i; |
| | std::shuffle(indices.begin(), indices.end(), std::default_random_engine()); |
| |
|
| | |
| | for (unsigned int i = 0; i < key_size_; ++i) { |
| | size_t index = indices[i]; |
| |
|
| | |
| | size_t divisor = CHAR_BIT * sizeof(size_t); |
| | size_t idx = index / divisor; |
| | mask_[idx] |= size_t(1) << (index % divisor); |
| | } |
| |
|
| | |
| | #if 0 |
| | { |
| | size_t bcount = 0; |
| | BOOST_FOREACH(size_t mask_block, mask_){ |
| | out << std::setw(sizeof(size_t) * CHAR_BIT / 4) << std::setfill('0') << std::hex << mask_block |
| | << std::endl; |
| | bcount += __builtin_popcountll(mask_block); |
| | } |
| | out << "bit count : " << std::dec << bcount << std::endl; |
| | out << "mask size : " << mask_.size() << std::endl; |
| | return out; |
| | } |
| | #endif |
| | } |
| |
|
| | |
| | |
| | |
| | template<> |
| | inline size_t LshTable<unsigned char>::getKey(const unsigned char* feature) const |
| | { |
| | |
| | |
| | const size_t* feature_block_ptr = reinterpret_cast<const size_t*> (feature); |
| |
|
| | |
| | |
| | |
| | size_t subsignature = 0; |
| | size_t bit_index = 1; |
| |
|
| | for (std::vector<size_t>::const_iterator pmask_block = mask_.begin(); pmask_block != mask_.end(); ++pmask_block) { |
| | |
| | size_t feature_block = *feature_block_ptr; |
| | size_t mask_block = *pmask_block; |
| | while (mask_block) { |
| | |
| | size_t lowest_bit = mask_block & (-(ptrdiff_t)mask_block); |
| | |
| | subsignature += (feature_block & lowest_bit) ? bit_index : 0; |
| | |
| | mask_block ^= lowest_bit; |
| | |
| | bit_index <<= 1; |
| | } |
| | |
| | ++feature_block_ptr; |
| | } |
| | return subsignature; |
| | } |
| |
|
| | template<> |
| | inline LshStats LshTable<unsigned char>::getStats() const |
| | { |
| | LshStats stats; |
| | stats.bucket_size_mean_ = 0; |
| | if ((buckets_speed_.empty()) && (buckets_space_.empty())) { |
| | stats.n_buckets_ = 0; |
| | stats.bucket_size_median_ = 0; |
| | stats.bucket_size_min_ = 0; |
| | stats.bucket_size_max_ = 0; |
| | return stats; |
| | } |
| |
|
| | if (!buckets_speed_.empty()) { |
| | for (BucketsSpeed::const_iterator pbucket = buckets_speed_.begin(); pbucket != buckets_speed_.end(); ++pbucket) { |
| | stats.bucket_sizes_.push_back(pbucket->size()); |
| | stats.bucket_size_mean_ += pbucket->size(); |
| | } |
| | stats.bucket_size_mean_ /= buckets_speed_.size(); |
| | stats.n_buckets_ = buckets_speed_.size(); |
| | } |
| | else { |
| | for (BucketsSpace::const_iterator x = buckets_space_.begin(); x != buckets_space_.end(); ++x) { |
| | stats.bucket_sizes_.push_back(x->second.size()); |
| | stats.bucket_size_mean_ += x->second.size(); |
| | } |
| | stats.bucket_size_mean_ /= buckets_space_.size(); |
| | stats.n_buckets_ = buckets_space_.size(); |
| | } |
| |
|
| | std::sort(stats.bucket_sizes_.begin(), stats.bucket_sizes_.end()); |
| |
|
| | |
| | |
| | |
| | stats.bucket_size_median_ = stats.bucket_sizes_[stats.bucket_sizes_.size() / 2]; |
| | stats.bucket_size_min_ = stats.bucket_sizes_.front(); |
| | stats.bucket_size_max_ = stats.bucket_sizes_.back(); |
| |
|
| | |
| | |
| | |
| | |
| |
|
| | |
| | unsigned int bin_start = 0; |
| | unsigned int bin_end = 20; |
| | bool is_new_bin = true; |
| | for (std::vector<unsigned int>::iterator iterator = stats.bucket_sizes_.begin(), end = stats.bucket_sizes_.end(); iterator |
| | != end; ) |
| | if (*iterator < bin_end) { |
| | if (is_new_bin) { |
| | stats.size_histogram_.push_back(std::vector<unsigned int>(3, 0)); |
| | stats.size_histogram_.back()[0] = bin_start; |
| | stats.size_histogram_.back()[1] = bin_end - 1; |
| | is_new_bin = false; |
| | } |
| | ++stats.size_histogram_.back()[2]; |
| | ++iterator; |
| | } |
| | else { |
| | bin_start += 20; |
| | bin_end += 20; |
| | is_new_bin = true; |
| | } |
| |
|
| | return stats; |
| | } |
| |
|
| | |
| | } |
| | } |
| |
|
| | |
| |
|
| | #endif |
| |
|