#include #include #include "clustering.h" #include #include #include IndexPQ::IndexPQ(int d, int m):d(d), m(m){ k_sub = 256; d_sub = d/m; centroids.resize(m*d_sub*k_sub); }; void IndexPQ::train(int n, const float *x, bool subsampling, int seed){ if(trained) return; std::vector train_data(n * d_sub); for(int i = 0; i < m; i++){ for(int row = 0; row < n; row++){ const float* source_id = x + (row * d) + (i * d_sub); float* dest_id = train_data.data() + (row * d_sub); for(int j = 0; j < d_sub; j++){ dest_id[j] = source_id[j]; } } int maxtrain = 150000; if(n > maxtrain && subsampling){ std::mt19937 gen(seed + i); std::uniform_int_distribution dis(0, n - 1); std::vector sample_buffer(maxtrain * d_sub); for(int p = 0; p < maxtrain; p++){ int randval = dis(gen); std::memcpy(&sample_buffer[p * d_sub], &train_data[randval * d_sub], d_sub * sizeof(float)); } kmean_clustering(d_sub, maxtrain, k_sub, sample_buffer.data(), centroids.data() + (i * d_sub * k_sub),seed+i); } else { kmean_clustering(d_sub, n, k_sub, train_data.data() , centroids.data() + (i * d_sub * k_sub), seed+i); } } trained = true; } void IndexPQ::encode(const float *x, uint8_t* out){ if(!trained)return; for(int i =0; i