Spaces:
Running
Running
File size: 5,288 Bytes
e87a50a | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 | #include "IndexIVF.h"
#include "clustering.h"
#include <queue>
#include <iostream>
#include<immintrin.h>
IndexIVF::IndexIVF(int d, int nbucket): d(d), nbucket(nbucket), router(d){
memory.resize(nbucket);
memory_ids.resize(nbucket);
};
void IndexIVF::train(int n, const float *x){
if(trained) return;
std::vector<float>centroids(nbucket*d);
//remove seed
kmean_clustering(d, n, nbucket, x ,centroids.data(),1);
router.add(nbucket, centroids.data());
trained=true;
}
void IndexIVF::add(int n, const float *x, const uint64_t*xids){
if(!trained) return;
std::vector<int> assign(n);
std::vector<float> distances(n);
router.search(n,x,1,distances.data(), assign.data());
for(int i =0; i<n; i++){
int bucketid= assign[i];
memory[bucketid].insert(memory[bucketid].end(),x+(i*d), x+(i*d)+d);
//for metadata
memory_ids[bucketid].push_back(xids[i]);
}
ntotal+=n;
}
void IndexIVF::search(int n, const float* x, int k, int nprobe, const uint8_t *bitmask, float *distances, int *labels, const uint8_t *vecmini_L1_summary){
std::vector<int>assign(n*nprobe);
std::vector<float> centroids_distance(n*nprobe);
router.search(n,x,nprobe,centroids_distance.data(), assign.data());
for(int i = 0; i<n; i++){
//std::unordered_set <uint64_t> set;
// std::priority_queue<std::pair<float, int>, std::vector<std::pair<float, int>>, std::greater<std::pair<float, int>>> pq;
std::priority_queue<std::pair<float, int>> pq;
const float *specquer = x+(i*d);
for(int p= 0; p<nprobe; p++){
int bucketid = assign[i*nprobe+p];
int vectorinmemo = memory[bucketid].size()/d;
if(vectorinmemo==0)continue;
const float *bucketdata= memory[bucketid].data();
for(int j = 0; j<vectorinmemo; j++){
int prefetch_stride = 1;
if(j + prefetch_stride < vectorinmemo){
_mm_prefetch((const char*)&bucketdata[(j + prefetch_stride) * d], _MM_HINT_T0);
if (bitmask != nullptr) {
uint64_t future_id = memory_ids[bucketid][j + prefetch_stride];
// If you ever use L1 summary again, prefetch it here:
// if (vecmini_L1_summary != nullptr) _mm_prefetch((const char*)&vecmini_L1_summary[future_id / 8], _MM_HINT_T0);
// Prefetch the massive uint8_t mask byte
_mm_prefetch((const char*)&bitmask[future_id], _MM_HINT_T0);
}
}
uint64_t global_id = memory_ids[bucketid][j];
if (bitmask != nullptr && bitmask[global_id]==0 ) {
continue;
}
//removed this for simd
//benchmark for standard cosine calc->
//nullptr: 6.32857
//bitmask: 4.60353
//after simd
//nullptr: 1.3298
//bitmask: 0.918149
//added simd
float dist = 0;
int m = 0;
__m256 sumvec = _mm256_setzero_ps();
/*for(int m = 0; m<d; m++){
currcosine+=(bucketdata[j*d+m]*specquer[m]);
}*/
for(; m+7<d; m+=8){
//load 8float from the db chunk
__m256 dbvec= _mm256_loadu_ps(&bucketdata[j*d+m]);
__m256 qvec= _mm256_loadu_ps(&specquer[m]);
__m256 diff = _mm256_sub_ps(dbvec, qvec); //-> only add for un normalized vectors
sumvec = _mm256_fmadd_ps(diff, diff, sumvec);
}
__m128 upper = _mm256_extractf128_ps(sumvec, 1);
__m128 lower = _mm256_extractf128_ps(sumvec, 0);
__m128 sumbound = _mm_add_ps(upper, lower);
__m128 shifted = _mm_movehl_ps(sumbound, sumbound);
__m128 current = _mm_add_ps(sumbound, shifted);
__m128 shuffled = _mm_shuffle_ps(current, current, 1);
__m128 finalsum = _mm_add_ps(current, shuffled);
dist = _mm_cvtss_f32(finalsum);
/*
float sumarr[8];
_mm256_storeu_ps(sumarr,sumvec);
currcosine= sumarr[0]+sumarr[1]+
sumarr[2]+sumarr[3]+
sumarr[4]+sumarr[5]+
sumarr[6]+sumarr[7];
//cleanup
*/
if(pq.size()<k){
pq.push({dist, global_id});
}else{
if(dist<pq.top().first){
pq.pop();
pq.push({dist, global_id});
}
}
}
}
float *speldist = distances+(i*k);
int *spelbs = labels+(i*k);
int count = pq.size();
for(int c = count-1; c>=0; c--){
speldist[c]= pq.top().first;
spelbs[c]= pq.top().second;
pq.pop();
}
for(int step = count; step<k; step++){
speldist[step]=-1.0;
spelbs[step]= -1;
}
}
}
void IndexIVF::inject_centroids(const float* external_centroids) {
if(trained) return;
router.add(nbucket, external_centroids);
trained = true;
} |