| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
|
|
| #ifndef FLANN_HIERARCHICAL_CLUSTERING_INDEX_H_ |
| #define FLANN_HIERARCHICAL_CLUSTERING_INDEX_H_ |
|
|
| #include <algorithm> |
| #include <string> |
| #include <map> |
| #include <cassert> |
| #include <limits> |
| #include <cmath> |
|
|
| #ifndef SIZE_MAX |
| #define SIZE_MAX ((size_t) -1) |
| #endif |
|
|
| #include "FLANN/general.h" |
| #include "FLANN/algorithms/nn_index.h" |
| #include "FLANN/algorithms/dist.h" |
| #include "FLANN/util/matrix.h" |
| #include "FLANN/util/result_set.h" |
| #include "FLANN/util/heap.h" |
| #include "FLANN/util/allocator.h" |
| #include "FLANN/util/random.h" |
| #include "FLANN/util/saving.h" |
| #include "FLANN/util/serialization.h" |
|
|
| namespace flann |
| { |
|
|
| struct HierarchicalClusteringIndexParams : public IndexParams |
| { |
| HierarchicalClusteringIndexParams(int branching = 32, |
| flann_centers_init_t centers_init = FLANN_CENTERS_RANDOM, |
| int trees = 4, int leaf_max_size = 100) |
| { |
| (*this)["algorithm"] = FLANN_INDEX_HIERARCHICAL; |
| |
| (*this)["branching"] = branching; |
| |
| (*this)["centers_init"] = centers_init; |
| |
| (*this)["trees"] = trees; |
| |
| (*this)["leaf_max_size"] = leaf_max_size; |
| } |
| }; |
|
|
|
|
|
|
| |
| |
| |
| |
| |
| |
| template <typename Distance> |
| class HierarchicalClusteringIndex : public NNIndex<Distance> |
| { |
| public: |
| typedef typename Distance::ElementType ElementType; |
| typedef typename Distance::ResultType DistanceType; |
|
|
| typedef NNIndex<Distance> BaseClass; |
|
|
| |
| |
| |
| |
| |
| |
| HierarchicalClusteringIndex(const IndexParams& index_params = HierarchicalClusteringIndexParams(), Distance d = Distance()) |
| : BaseClass(index_params, d) |
| { |
| memoryCounter_ = 0; |
|
|
| branching_ = get_param(index_params_,"branching",32); |
| centers_init_ = get_param(index_params_,"centers_init", FLANN_CENTERS_RANDOM); |
| trees_ = get_param(index_params_,"trees",4); |
| leaf_max_size_ = get_param(index_params_,"leaf_max_size",100); |
|
|
| initCenterChooser(); |
| } |
|
|
|
|
| |
| |
| |
| |
| |
| |
| |
| HierarchicalClusteringIndex(const Matrix<ElementType>& inputData, const IndexParams& index_params = HierarchicalClusteringIndexParams(), |
| Distance d = Distance()) |
| : BaseClass(index_params, d) |
| { |
| memoryCounter_ = 0; |
|
|
| branching_ = get_param(index_params_,"branching",32); |
| centers_init_ = get_param(index_params_,"centers_init", FLANN_CENTERS_RANDOM); |
| trees_ = get_param(index_params_,"trees",4); |
| leaf_max_size_ = get_param(index_params_,"leaf_max_size",100); |
|
|
| initCenterChooser(); |
|
|
| setDataset(inputData); |
|
|
| chooseCenters_->setDataSize(veclen_); |
| } |
|
|
|
|
| HierarchicalClusteringIndex(const HierarchicalClusteringIndex& other) : BaseClass(other), |
| memoryCounter_(other.memoryCounter_), |
| branching_(other.branching_), |
| trees_(other.trees_), |
| centers_init_(other.centers_init_), |
| leaf_max_size_(other.leaf_max_size_) |
|
|
| { |
| initCenterChooser(); |
| tree_roots_.resize(other.tree_roots_.size()); |
| for (size_t i=0;i<tree_roots_.size();++i) { |
| copyTree(tree_roots_[i], other.tree_roots_[i]); |
| } |
| } |
|
|
| HierarchicalClusteringIndex& operator=(HierarchicalClusteringIndex other) |
| { |
| this->swap(other); |
| return *this; |
| } |
|
|
|
|
| void initCenterChooser() |
| { |
| switch(centers_init_) { |
| case FLANN_CENTERS_RANDOM: |
| chooseCenters_ = new RandomCenterChooser<Distance>(distance_, points_); |
| break; |
| case FLANN_CENTERS_GONZALES: |
| chooseCenters_ = new GonzalesCenterChooser<Distance>(distance_, points_); |
| break; |
| case FLANN_CENTERS_KMEANSPP: |
| chooseCenters_ = new KMeansppCenterChooser<Distance>(distance_, points_); |
| break; |
| case FLANN_CENTERS_GROUPWISE: |
| chooseCenters_ = new GroupWiseCenterChooser<Distance>(distance_, points_); |
| break; |
| default: |
| throw FLANNException("Unknown algorithm for choosing initial centers."); |
| } |
| } |
|
|
| |
| |
| |
| |
| |
| virtual ~HierarchicalClusteringIndex() |
| { |
| delete chooseCenters_; |
| freeIndex(); |
| } |
|
|
| BaseClass* clone() const |
| { |
| return new HierarchicalClusteringIndex(*this); |
| } |
|
|
| |
| |
| |
| |
| int usedMemory() const |
| { |
| return pool_.usedMemory+pool_.wastedMemory+memoryCounter_; |
| } |
|
|
| using BaseClass::buildIndex; |
|
|
| void addPoints(const Matrix<ElementType>& points, float rebuild_threshold = 2) |
| { |
| assert(points.cols==veclen_); |
| size_t old_size = size_; |
|
|
| extendDataset(points); |
|
|
| if (rebuild_threshold>1 && size_at_build_*rebuild_threshold<size_) { |
| buildIndex(); |
| } |
| else { |
| for (size_t i=0;i<points.rows;++i) { |
| for (int j = 0; j < trees_; j++) { |
| addPointToTree(tree_roots_[j], old_size + i); |
| } |
| } |
| } |
| } |
|
|
|
|
| flann_algorithm_t getType() const |
| { |
| return FLANN_INDEX_HIERARCHICAL; |
| } |
|
|
|
|
| template<typename Archive> |
| void serialize(Archive& ar) |
| { |
| ar.setObject(this); |
|
|
| ar & *static_cast<NNIndex<Distance>*>(this); |
|
|
| ar & branching_; |
| ar & trees_; |
| ar & centers_init_; |
| ar & leaf_max_size_; |
|
|
| if (Archive::is_loading::value) { |
| tree_roots_.resize(trees_); |
| } |
| for (size_t i=0;i<tree_roots_.size();++i) { |
| if (Archive::is_loading::value) { |
| tree_roots_[i] = new(pool_) Node(); |
| } |
| ar & *tree_roots_[i]; |
| } |
|
|
| if (Archive::is_loading::value) { |
| index_params_["algorithm"] = getType(); |
| index_params_["branching"] = branching_; |
| index_params_["trees"] = trees_; |
| index_params_["centers_init"] = centers_init_; |
| index_params_["leaf_size"] = leaf_max_size_; |
| } |
| } |
|
|
| void saveIndex(FILE* stream) |
| { |
| serialization::SaveArchive sa(stream); |
| sa & *this; |
| } |
|
|
|
|
| void loadIndex(FILE* stream) |
| { |
| serialization::LoadArchive la(stream); |
| la & *this; |
| } |
|
|
|
|
| |
| |
| |
| |
| |
| |
| |
| |
| |
|
|
| void findNeighbors(ResultSet<DistanceType>& result, const ElementType* vec, const SearchParams& searchParams) const |
| { |
| if (removed_) { |
| findNeighborsWithRemoved<true>(result, vec, searchParams); |
| } |
| else { |
| findNeighborsWithRemoved<false>(result, vec, searchParams); |
| } |
| } |
|
|
| protected: |
|
|
| |
| |
| |
| void buildIndexImpl() |
| { |
| chooseCenters_->setDataSize(veclen_); |
|
|
| if (branching_<2) { |
| throw FLANNException("Branching factor must be at least 2"); |
| } |
| tree_roots_.resize(trees_); |
| std::vector<int> indices(size_); |
| for (int i=0; i<trees_; ++i) { |
| for (size_t j=0; j<size_; ++j) { |
| indices[j] = j; |
| } |
| tree_roots_[i] = new(pool_) Node(); |
| computeClustering(tree_roots_[i], &indices[0], size_); |
| } |
| } |
|
|
| private: |
|
|
| struct PointInfo |
| { |
| |
| size_t index; |
| |
| ElementType* point; |
|
|
| private: |
| template<typename Archive> |
| void serialize(Archive& ar) |
| { |
| typedef HierarchicalClusteringIndex<Distance> Index; |
| Index* obj = static_cast<Index*>(ar.getObject()); |
|
|
| ar & index; |
| |
|
|
| if (Archive::is_loading::value) { |
| point = obj->points_[index]; |
| } |
| } |
| friend struct serialization::access; |
| }; |
|
|
| |
| |
| |
| struct Node |
| { |
| |
| |
| |
| ElementType* pivot; |
| size_t pivot_index; |
| |
| |
| |
| std::vector<Node*> childs; |
| |
| |
| |
| std::vector<PointInfo> points; |
|
|
| Node(){ |
| pivot = NULL; |
| pivot_index = SIZE_MAX; |
| } |
| |
| |
| |
| |
| ~Node() |
| { |
| for(size_t i=0; i<childs.size(); i++){ |
| childs[i]->~Node(); |
| pivot = NULL; |
| pivot_index = -1; |
| } |
| }; |
|
|
| private: |
| template<typename Archive> |
| void serialize(Archive& ar) |
| { |
| typedef HierarchicalClusteringIndex<Distance> Index; |
| Index* obj = static_cast<Index*>(ar.getObject()); |
| ar & pivot_index; |
| if (Archive::is_loading::value) { |
| if (pivot_index != SIZE_MAX) |
| pivot = obj->points_[pivot_index]; |
| else |
| pivot = NULL; |
| } |
| size_t childs_size; |
| if (Archive::is_saving::value) { |
| childs_size = childs.size(); |
| } |
| ar & childs_size; |
|
|
| if (childs_size==0) { |
| ar & points; |
| } |
| else { |
| if (Archive::is_loading::value) { |
| childs.resize(childs_size); |
| } |
| for (size_t i=0;i<childs_size;++i) { |
| if (Archive::is_loading::value) { |
| childs[i] = new(obj->pool_) Node(); |
| } |
| ar & *childs[i]; |
| } |
| } |
|
|
| } |
| friend struct serialization::access; |
| }; |
| typedef Node* NodePtr; |
|
|
|
|
|
|
| |
| |
| |
| typedef BranchStruct<NodePtr, DistanceType> BranchSt; |
|
|
|
|
| |
| |
| |
| |
| void freeIndex(){ |
| for (size_t i=0; i<tree_roots_.size(); ++i) { |
| tree_roots_[i]->~Node(); |
| } |
| pool_.free(); |
| } |
|
|
| void copyTree(NodePtr& dst, const NodePtr& src) |
| { |
| dst = new(pool_) Node(); |
| dst->pivot_index = src->pivot_index; |
| dst->pivot = points_[dst->pivot_index]; |
|
|
| if (src->childs.size()==0) { |
| dst->points = src->points; |
| } |
| else { |
| dst->childs.resize(src->childs.size()); |
| for (size_t i=0;i<src->childs.size();++i) { |
| copyTree(dst->childs[i], src->childs[i]); |
| } |
| } |
| } |
|
|
|
|
|
|
| void computeLabels(int* indices, int indices_length, int* centers, int centers_length, int* labels, DistanceType& cost) |
| { |
| cost = 0; |
| for (int i=0; i<indices_length; ++i) { |
| ElementType* point = points_[indices[i]]; |
| DistanceType dist = distance_(point, points_[centers[0]], veclen_); |
| labels[i] = 0; |
| for (int j=1; j<centers_length; ++j) { |
| DistanceType new_dist = distance_(point, points_[centers[j]], veclen_); |
| if (dist>new_dist) { |
| labels[i] = j; |
| dist = new_dist; |
| } |
| } |
| cost += dist; |
| } |
| } |
|
|
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| void computeClustering(NodePtr node, int* indices, int indices_length) |
| { |
| if (indices_length < leaf_max_size_) { |
| node->points.resize(indices_length); |
| for (int i=0;i<indices_length;++i) { |
| node->points[i].index = indices[i]; |
| node->points[i].point = points_[indices[i]]; |
| } |
| node->childs.clear(); |
| return; |
| } |
|
|
| std::vector<int> centers(branching_); |
| std::vector<int> labels(indices_length); |
|
|
| int centers_length; |
| (*chooseCenters_)(branching_, indices, indices_length, ¢ers[0], centers_length); |
|
|
| if (centers_length<branching_) { |
| node->points.resize(indices_length); |
| for (int i=0;i<indices_length;++i) { |
| node->points[i].index = indices[i]; |
| node->points[i].point = points_[indices[i]]; |
| } |
| node->childs.clear(); |
| return; |
| } |
|
|
|
|
| |
| DistanceType cost; |
| computeLabels(indices, indices_length, ¢ers[0], centers_length, &labels[0], cost); |
|
|
| node->childs.resize(branching_); |
| int start = 0; |
| int end = start; |
| for (int i=0; i<branching_; ++i) { |
| for (int j=0; j<indices_length; ++j) { |
| if (labels[j]==i) { |
| std::swap(indices[j],indices[end]); |
| std::swap(labels[j],labels[end]); |
| end++; |
| } |
| } |
|
|
| node->childs[i] = new(pool_) Node(); |
| node->childs[i]->pivot_index = centers[i]; |
| node->childs[i]->pivot = points_[centers[i]]; |
| node->childs[i]->points.clear(); |
| computeClustering(node->childs[i],indices+start, end-start); |
| start=end; |
| } |
| } |
|
|
|
|
| template<bool with_removed> |
| void findNeighborsWithRemoved(ResultSet<DistanceType>& result, const ElementType* vec, const SearchParams& searchParams) const |
| { |
| int maxChecks = searchParams.checks; |
|
|
| |
| Heap<BranchSt>* heap = new Heap<BranchSt>(size_); |
|
|
| DynamicBitset checked(size_); |
| int checks = 0; |
| for (int i=0; i<trees_; ++i) { |
| findNN<with_removed>(tree_roots_[i], result, vec, checks, maxChecks, heap, checked); |
| } |
|
|
| BranchSt branch; |
| while (heap->popMin(branch) && (checks<maxChecks || !result.full())) { |
| NodePtr node = branch.node; |
| findNN<with_removed>(node, result, vec, checks, maxChecks, heap, checked); |
| } |
|
|
| delete heap; |
| } |
|
|
|
|
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
|
|
| template<bool with_removed> |
| void findNN(NodePtr node, ResultSet<DistanceType>& result, const ElementType* vec, int& checks, int maxChecks, |
| Heap<BranchSt>* heap, DynamicBitset& checked) const |
| { |
| if (node->childs.empty()) { |
| if (checks>=maxChecks) { |
| if (result.full()) return; |
| } |
|
|
| for (size_t i=0; i<node->points.size(); ++i) { |
| PointInfo& pointInfo = node->points[i]; |
| if (with_removed) { |
| if (removed_points_.test(pointInfo.index)) continue; |
| } |
| if (checked.test(pointInfo.index)) continue; |
| DistanceType dist = distance_(pointInfo.point, vec, veclen_); |
| result.addPoint(dist, pointInfo.index); |
| checked.set(pointInfo.index); |
| ++checks; |
| } |
| } |
| else { |
| DistanceType* domain_distances = new DistanceType[branching_]; |
| int best_index = 0; |
| domain_distances[best_index] = distance_(vec, node->childs[best_index]->pivot, veclen_); |
| for (int i=1; i<branching_; ++i) { |
| domain_distances[i] = distance_(vec, node->childs[i]->pivot, veclen_); |
| if (domain_distances[i]<domain_distances[best_index]) { |
| best_index = i; |
| } |
| } |
| for (int i=0; i<branching_; ++i) { |
| if (i!=best_index) { |
| heap->insert(BranchSt(node->childs[i],domain_distances[i])); |
| } |
| } |
| delete[] domain_distances; |
| findNN<with_removed>(node->childs[best_index],result,vec, checks, maxChecks, heap, checked); |
| } |
| } |
|
|
| void addPointToTree(NodePtr node, size_t index) |
| { |
| ElementType* point = points_[index]; |
|
|
| if (node->childs.empty()) { |
| PointInfo pointInfo; |
| pointInfo.point = point; |
| pointInfo.index = index; |
| node->points.push_back(pointInfo); |
|
|
| if (node->points.size()>=size_t(branching_)) { |
| std::vector<int> indices(node->points.size()); |
|
|
| for (size_t i=0;i<node->points.size();++i) { |
| indices[i] = node->points[i].index; |
| } |
| computeClustering(node, &indices[0], indices.size()); |
| } |
| } |
| else { |
| |
| int closest = 0; |
| ElementType* center = node->childs[closest]->pivot; |
| DistanceType dist = distance_(center, point, veclen_); |
| for (size_t i=1;i<size_t(branching_);++i) { |
| center = node->childs[i]->pivot; |
| DistanceType crt_dist = distance_(center, point, veclen_); |
| if (crt_dist<dist) { |
| dist = crt_dist; |
| closest = i; |
| } |
| } |
| addPointToTree(node->childs[closest], index); |
| } |
| } |
|
|
| void swap(HierarchicalClusteringIndex& other) |
| { |
| BaseClass::swap(other); |
|
|
| std::swap(tree_roots_, other.tree_roots_); |
| std::swap(pool_, other.pool_); |
| std::swap(memoryCounter_, other.memoryCounter_); |
| std::swap(branching_, other.branching_); |
| std::swap(trees_, other.trees_); |
| std::swap(centers_init_, other.centers_init_); |
| std::swap(leaf_max_size_, other.leaf_max_size_); |
| std::swap(chooseCenters_, other.chooseCenters_); |
| } |
|
|
| private: |
|
|
| |
| |
| |
| std::vector<Node*> tree_roots_; |
|
|
| |
| |
| |
| |
| |
| |
| |
| PooledAllocator pool_; |
|
|
| |
| |
| |
| int memoryCounter_; |
|
|
| |
| |
| |
| |
| int branching_; |
|
|
| |
| |
| |
| int trees_; |
|
|
| |
| |
| |
| flann_centers_init_t centers_init_; |
|
|
| |
| |
| |
| int leaf_max_size_; |
|
|
| |
| |
| |
| CenterChooser<Distance>* chooseCenters_; |
|
|
| USING_BASECLASS_SYMBOLS |
| }; |
|
|
| } |
|
|
| #endif |
|
|