File size: 5,288 Bytes
e87a50a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
#include "IndexIVF.h"
#include "clustering.h"
#include <queue>
#include <iostream>
#include<immintrin.h>
IndexIVF::IndexIVF(int d, int nbucket): d(d), nbucket(nbucket), router(d){
    memory.resize(nbucket);
    memory_ids.resize(nbucket);
};

void IndexIVF::train(int n, const float *x){
     if(trained) return;
     std::vector<float>centroids(nbucket*d);

     //remove seed
     kmean_clustering(d, n, nbucket, x ,centroids.data(),1);
     router.add(nbucket, centroids.data());
     trained=true;
}

void IndexIVF::add(int n, const float *x, const uint64_t*xids){
     if(!trained) return;
     std::vector<int> assign(n);
     std::vector<float> distances(n);
     router.search(n,x,1,distances.data(), assign.data());
     for(int i =0; i<n; i++){
        int bucketid= assign[i];
        memory[bucketid].insert(memory[bucketid].end(),x+(i*d), x+(i*d)+d);
        //for metadata
        memory_ids[bucketid].push_back(xids[i]);
    }
    ntotal+=n;
}
void IndexIVF::search(int n, const float* x, int k, int nprobe, const uint8_t *bitmask, float *distances, int *labels, const uint8_t *vecmini_L1_summary){
    std::vector<int>assign(n*nprobe);
    std::vector<float> centroids_distance(n*nprobe);

    router.search(n,x,nprobe,centroids_distance.data(), assign.data());
    for(int i = 0; i<n; i++){
        //std::unordered_set <uint64_t> set;
//        std::priority_queue<std::pair<float, int>, std::vector<std::pair<float, int>>, std::greater<std::pair<float, int>>> pq;
        std::priority_queue<std::pair<float, int>> pq;
        const float *specquer = x+(i*d);
        for(int p= 0; p<nprobe; p++){
            int bucketid = assign[i*nprobe+p];
            int vectorinmemo = memory[bucketid].size()/d;
            if(vectorinmemo==0)continue;
            const float *bucketdata=  memory[bucketid].data();

        for(int j = 0; j<vectorinmemo; j++){
            int prefetch_stride = 1; 
            
            if(j + prefetch_stride < vectorinmemo){
                _mm_prefetch((const char*)&bucketdata[(j + prefetch_stride) * d], _MM_HINT_T0);
                
                if (bitmask != nullptr) {
                    uint64_t future_id = memory_ids[bucketid][j + prefetch_stride];
                    
                    // If you ever use L1 summary again, prefetch it here:
                    // if (vecmini_L1_summary != nullptr) _mm_prefetch((const char*)&vecmini_L1_summary[future_id / 8], _MM_HINT_T0);
                    
                    // Prefetch the massive uint8_t mask byte
                    _mm_prefetch((const char*)&bitmask[future_id], _MM_HINT_T0);
                }
            }

            uint64_t global_id = memory_ids[bucketid][j];

                if (bitmask != nullptr && bitmask[global_id]==0 ) {
                    continue;
                }
            
            //removed this for simd

            //benchmark for standard cosine calc->
            //nullptr: 6.32857
            //bitmask: 4.60353

            //after simd
            //nullptr: 1.3298
            //bitmask: 0.918149

            //added simd
            float dist = 0;
            int m = 0;

            __m256 sumvec = _mm256_setzero_ps();

            /*for(int m = 0; m<d; m++){
                currcosine+=(bucketdata[j*d+m]*specquer[m]);
            }*/
            for(; m+7<d; m+=8){
                //load 8float from the db chunk
                __m256 dbvec=  _mm256_loadu_ps(&bucketdata[j*d+m]);
                __m256 qvec=  _mm256_loadu_ps(&specquer[m]);
                __m256 diff = _mm256_sub_ps(dbvec, qvec); //-> only add for un normalized vectors 
                sumvec = _mm256_fmadd_ps(diff, diff, sumvec);
            }
            
            __m128 upper = _mm256_extractf128_ps(sumvec, 1);
            __m128 lower = _mm256_extractf128_ps(sumvec, 0);

            __m128 sumbound = _mm_add_ps(upper, lower);
            __m128 shifted = _mm_movehl_ps(sumbound, sumbound);
            __m128 current = _mm_add_ps(sumbound, shifted);
            __m128 shuffled = _mm_shuffle_ps(current, current, 1);
            __m128 finalsum = _mm_add_ps(current, shuffled);
            dist = _mm_cvtss_f32(finalsum);

/*
            float sumarr[8];
            _mm256_storeu_ps(sumarr,sumvec);
            currcosine= sumarr[0]+sumarr[1]+
                        sumarr[2]+sumarr[3]+
                        sumarr[4]+sumarr[5]+
                        sumarr[6]+sumarr[7];
            //cleanup       
  */
            if(pq.size()<k){
                pq.push({dist, global_id});
            }else{
                if(dist<pq.top().first){
                    pq.pop();
                    pq.push({dist, global_id});
                }
            }
        }
    }
        float *speldist = distances+(i*k);
        int *spelbs = labels+(i*k);
        int count = pq.size();
        for(int c = count-1; c>=0; c--){
            speldist[c]= pq.top().first;
            spelbs[c]= pq.top().second;
            pq.pop(); 
        }
        for(int step = count; step<k; step++){
            speldist[step]=-1.0; 
            spelbs[step]= -1;
        }
    }
}

void IndexIVF::inject_centroids(const float* external_centroids) {
    if(trained) return;
    router.add(nbucket, external_centroids);
    trained = true;
}