levanel commited on
Commit
e87a50a
·
0 Parent(s):
.gitignore ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ *.so
2
+ build/
CMakeLists.txt ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
cmake_minimum_required(VERSION 3.22)
project(vecmini VERSION 1.0 LANGUAGES CXX)

# The static engine library is linked into a shared Python module, so all
# objects must be position independent.
set(CMAKE_POSITION_INDEPENDENT_CODE ON)

# C++17 without compiler-specific extensions.
set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CXX_STANDARD_REQUIRED ON)
set(CMAKE_CXX_EXTENSIONS OFF)

# Generates the compile_commands.json file for tools like clangd.
set(CMAKE_EXPORT_COMPILE_COMMANDS ON)

# The sources call AVX2/FMA intrinsics directly, so these flags are required
# for the code to compile at all. They are attached to the targets below
# instead of being set globally for the whole directory.
set(VECMINI_COMPILE_OPTIONS -mavx2 -mfma -O3 -march=x86-64-v3)

# ==========================================
# DEPENDENCIES (OpenMP & math backends)
# ==========================================
find_package(OpenMP REQUIRED)
find_package(BLAS REQUIRED)
find_package(LAPACK REQUIRED)

# ==========================================
# PHASE 1: Core vector database library
# ==========================================
add_library(vectordb STATIC
    src/IndexFlat.cpp
    src/clustering.cpp
    src/IndexIVF.cpp
    src/IndexPQ.cpp
    src/IndexIVFPQ.cpp
)

# PUBLIC so that anything linking vectordb (the Python module) also sees the
# headers in include/.
target_include_directories(vectordb PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/include)
target_compile_options(vectordb PRIVATE ${VECMINI_COMPILE_OPTIONS})

# ==========================================
# PHASE 2: The Python bridge (pybind11)
# ==========================================
find_package(Python3 COMPONENTS Interpreter Development REQUIRED)

# A pip-installed pybind11 keeps its CMake package inside site-packages, which
# is not on CMake's default search path. Ask the interpreter where it lives
# unless the caller already supplied pybind11_DIR.
if(NOT pybind11_DIR)
    execute_process(
        COMMAND "${Python3_EXECUTABLE}" -m pybind11 --cmakedir
        OUTPUT_VARIABLE _vecmini_pybind11_cmakedir
        OUTPUT_STRIP_TRAILING_WHITESPACE
        ERROR_QUIET
        RESULT_VARIABLE _vecmini_pybind11_rc
    )
    if(_vecmini_pybind11_rc EQUAL 0 AND _vecmini_pybind11_cmakedir)
        set(pybind11_DIR "${_vecmini_pybind11_cmakedir}")
    endif()
endif()
find_package(pybind11 CONFIG REQUIRED)

pybind11_add_module(vecmini src/bindings.cpp)
target_compile_options(vecmini PRIVATE ${VECMINI_COMPILE_OPTIONS})

# Only link the engine and the required math/threading backends.
target_link_libraries(vecmini PRIVATE
    vectordb
    OpenMP::OpenMP_CXX
    ${BLAS_LIBRARIES}
    ${LAPACK_LIBRARIES}
)
Dockerfile ADDED
@@ -0,0 +1,45 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Stage 1: build the vecmini extension module
FROM python:3.10-slim AS builder

RUN apt-get update && apt-get install -y --no-install-recommends \
        build-essential \
        cmake \
        libopenblas-dev \
        liblapack-dev \
    && rm -rf /var/lib/apt/lists/*

WORKDIR /app
RUN pip install --no-cache-dir pybind11

# Copy only what the engine needs
COPY CMakeLists.txt ./
COPY src/ ./src
COPY include/ ./include

# pybind11 installed through pip ships its CMake package inside site-packages,
# so CMake has to be told where to find it.
RUN mkdir build && cd build && \
    cmake -DCMAKE_BUILD_TYPE=Release \
          -Dpybind11_DIR="$(python -m pybind11 --cmakedir)" .. && \
    make vecmini

# Stage 2: final runtime environment
FROM python:3.10-slim

# Runtime shared libraries: OpenBLAS plus GCC's OpenMP runtime (the module is
# built with g++ from build-essential, which links libgomp, not LLVM's libomp).
RUN apt-get update && apt-get install -y --no-install-recommends \
        libopenblas0 \
        libgomp1 \
    && rm -rf /var/lib/apt/lists/*

# Everything app.py imports at start-up.
RUN pip install --no-cache-dir gradio numpy torch transformers pypdf

RUN useradd -m -u 1000 user
ENV HOME=/home/user
WORKDIR /home/user/app

# Copy the frontend code (app.py), then the freshly compiled module so that a
# stale .so from the build context can never shadow it.
COPY --chown=user . /home/user/app
COPY --chown=user --from=builder /app/build/vecmini*.so /home/user/app/

# Gradio binds to 127.0.0.1 unless told otherwise, which is unreachable from
# outside the container; it reads GRADIO_SERVER_NAME / GRADIO_SERVER_PORT.
ENV PORT=7860 \
    GRADIO_SERVER_NAME=0.0.0.0 \
    GRADIO_SERVER_PORT=7860
EXPOSE 7860

# Drop root privileges for the running app.
USER user

CMD ["python", "app.py"]
app.py ADDED
@@ -0,0 +1,114 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import torch
3
+ import numpy as np
4
+ import vecmini
5
+ import pypdf
6
+ from transformers import AutoTokenizer, AutoModel
7
+
8
device = "cuda" if torch.cuda.is_available() else "cpu"
# Half precision is only used on GPU. On CPU the model stays in float32
# (NOTE(review): float16 CPU kernels are typically slow and, depending on the
# torch version, some ops are not implemented for it).
torch_dtype = torch.float16 if device == "cuda" else torch.float32

print("Loading Sentence Encoder (Bi-Encoder Only)...")
embed_id = "sentence-transformers/all-MiniLM-L6-v2"
embed_tokenizer = AutoTokenizer.from_pretrained(embed_id)
embed_model = AutoModel.from_pretrained(embed_id).to(device).to(torch_dtype)

# Shared state for the demo, replaced by process_pdf():
global_chunks = []   # text chunks; a chunk's list position is its vector id
db = None            # vecmini.IndexIVF built over the chunk embeddings
global_nlist = 1     # number of IVF buckets used when `db` was built
19
+
20
def mean_pooling(model_output, attention_mask):
    """Average the token embeddings of each sequence, ignoring masked tokens.

    model_output[0] holds the per-token embeddings; attention_mask marks the
    tokens that count. The denominator is clamped to 1e-9 so that a fully
    masked sequence does not divide by zero.
    """
    hidden = model_output[0]
    weights = attention_mask.unsqueeze(-1).expand(hidden.size()).float()
    summed = torch.sum(hidden * weights, 1)
    counts = torch.clamp(weights.sum(1), min=1e-9)
    return summed / counts
24
+
25
def encode_texts(texts, batch_size=32):
    """Embed a list of strings into L2-normalised float32 vectors.

    The texts are pushed through the encoder in slices of `batch_size` so
    that peak memory does not grow with the size of the document.

    Args:
        texts: list of strings to embed.
        batch_size: number of texts sent through the model at once.

    Returns:
        np.ndarray of shape (len(texts), hidden_size), dtype float32.
    """
    if not texts:
        return np.zeros((0, embed_model.config.hidden_size), dtype=np.float32)

    parts = []
    for start in range(0, len(texts), batch_size):
        batch = texts[start:start + batch_size]
        encoded_input = embed_tokenizer(batch, padding=True, truncation=True, return_tensors="pt").to(device)
        with torch.no_grad():
            model_output = embed_model(**encoded_input)
        embeddings = mean_pooling(model_output, encoded_input["attention_mask"])
        embeddings = torch.nn.functional.normalize(embeddings, p=2, dim=1)
        parts.append(embeddings.cpu().numpy().astype(np.float32))
    return np.concatenate(parts, axis=0)
31
+
32
def process_pdf(file_obj):
    """Extract text from an uploaded PDF, chunk it and build a vecmini index.

    The new chunks and index are built in local variables and published to
    the module globals only after every step has succeeded, so a failure
    cannot leave `global_chunks` and `db` describing different documents.

    Args:
        file_obj: value delivered by the gr.File component, either a path
            string or an object exposing the path as `.name`.

    Returns:
        A status message for the UI.
    """
    global global_chunks, db, global_nlist

    if file_obj is None:
        return "Error: No file uploaded."

    pdf_path = file_obj if isinstance(file_obj, str) else file_obj.name

    try:
        reader = pypdf.PdfReader(pdf_path)
        text = ""
        for page in reader.pages:
            extracted = page.extract_text()
            if extracted:
                text += extracted + " "
    except Exception as e:
        return f"Failed to read PDF: {str(e)}"

    if not text.strip():
        return "Error: Could not extract any readable text from this PDF."

    # Fixed-size word windows; the position of a chunk in the list is its id.
    chunk_size = 200
    words = text.split()
    chunks = [" ".join(words[i:i + chunk_size]) for i in range(0, len(words), chunk_size)]

    embeddings = encode_texts(chunks)
    nb, d = embeddings.shape

    # Roughly four vectors per bucket, never fewer than one bucket.
    nlist = max(1, nb // 4)

    index = vecmini.IndexIVF(d, nlist)
    index.train(nb, embeddings)
    index.add(nb, embeddings, np.arange(nb, dtype=np.uint64))

    global_chunks, db, global_nlist = chunks, index, nlist

    return f"Success! Extracted {nb} chunks from the PDF and built vecmini index."
66
+
67
def retrieve_chunks(query, top_k):
    """Search the current index and format the matching chunks as Markdown.

    Args:
        query: free-text search query.
        top_k: number of chunks requested (slider value).

    Returns:
        Markdown text listing the hits, or a hint when nothing can be searched.
    """
    if db is None or not global_chunks:
        return "Please upload and process a PDF first."
    if not query or not query.strip():
        return "Please enter a search query."

    query_emb = encode_texts([query])

    fetch_k = max(1, min(int(top_k), len(global_chunks)))
    nprobe = max(1, int(global_nlist / 2))

    distances, labels = db.search(1, query_emb, k=fetch_k, nprobe=nprobe, bitmask=None)

    # The engine pads missing results with label -1 / distance -1.0. A bare
    # `idx < len(...)` check lets -1 through and would silently return the
    # last chunk, so both bounds are checked and each label stays paired
    # with its own distance.
    hits = [
        (int(idx), float(dist))
        for idx, dist in zip(labels[0], distances[0])
        if 0 <= idx < len(global_chunks)
    ]

    output_text = f"### Top {len(hits)} Results for: *'{query}'*\n\n"

    for rank, (idx, dist) in enumerate(hits, start=1):
        output_text += f"**Result {rank}** | Vector Distance: `{dist:.4f}` | Chunk ID: `{idx}`\n"
        output_text += f"> {global_chunks[idx]}\n\n---\n\n"

    return output_text
91
+
92
# Gradio front-end: the left column builds the index, the right column
# queries it, and the results are rendered as Markdown below both.
with gr.Blocks(title="Vecmini Visualizer") as demo:
    gr.Markdown("# Vecmini: PDF Raw Retrieval Tester")
    gr.Markdown("Upload a PDF, build the index, and see exactly what `vecmini` returns for your queries.")

    with gr.Row():
        with gr.Column():
            pdf_input = gr.File(label="Upload PDF Document", file_types=[".pdf"])
            process_btn = gr.Button("Build Vecmini Index", variant="primary")
            status_out = gr.Textbox(label="Index Status", interactive=False)

        with gr.Column():
            query_input = gr.Textbox(label="Search Query")
            k_slider = gr.Slider(minimum=1, maximum=20, value=5, step=1, label="Number of chunks to retrieve (K)")
            search_btn = gr.Button("Search Vecmini")

    results_out = gr.Markdown(label="Retrieved Chunks")

    process_btn.click(fn=process_pdf, inputs=pdf_input, outputs=status_out)

    # Clicking the button and pressing Enter in the query box run the same search.
    search_wiring = dict(fn=retrieve_chunks, inputs=[query_input, k_slider], outputs=results_out)
    search_btn.click(**search_wiring)
    query_input.submit(**search_wiring)

if __name__ == "__main__":
    demo.launch()
include/IndexFlat.h ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #pragma once
2
+ #include <vector>
3
+
4
/// Brute-force index: stores vectors contiguously and answers k-nearest
/// neighbour queries by comparing the query against every stored vector.
class IndexFlatL2 {
private:
    int d;                  // dimension of each vector
    int ntotal = 0;         // number of vectors stored so far
    std::vector<float> xb;  // stored vectors, row-major (ntotal * d floats)

public:
    IndexFlatL2(int d) : d(d) {}

    /// Appends `n` vectors read from the raw pointer `x` (n * d floats).
    void add(int n, const float *x);

    /// Finds the `k` nearest stored vectors for each of the `n` queries in `x`.
    /// Results are written to `distances` and `labels` (n * k entries each).
    void search(int n, const float *x, int k, float *distances, int* labels);
};
include/IndexIVF.h ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #pragma once
2
+ #include <vector>
3
+ #include "IndexFlat.h"
4
+ #include <cstdint>
5
+
6
+ class IndexIVF {
7
+ private:
8
+ int d;
9
+ int nbucket;
10
+ int ntotal = 0;
11
+ bool trained = false;
12
+
13
+ IndexFlatL2 router;
14
+ std::vector<std::vector<float>> memory;
15
+ std::vector<std::vector<uint64_t>> memory_ids;
16
+
17
+ public: // The interface (Your benchmark script is allowed to use these)
18
+ IndexIVF(int d, int nbucket);
19
+ void train(int n, const float *x);
20
+ void add(int n, const float *x, const uint64_t*xids);
21
+ void search(int n, const float* x, int k, int nprobe, const uint8_t *bitmask, float *distances, int *labels, const uint8_t *L1_summary = nullptr);
22
+ void inject_centroids(const float* external_centroids);
23
+ };
include/IndexIVFPQ.h ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #pragma once
2
+ #include<vector>
3
+ #include<cstdint>
4
+ #include "IndexPQ.h"
5
+ #include "IndexFlat.h"
6
+ #include <cstddef>
7
+ class IndexIVFPQ{
8
+ private:
9
+ int d;
10
+ int m;//bitquant
11
+ int nbucket; //no of centroid
12
+ int ntotal; //no of vector index
13
+ bool trained=false;
14
+ size_t nprobe;//how many voronoi i should look at
15
+ IndexFlatL2 router;
16
+ IndexPQ pq;
17
+ std::vector<float>coarse_centroids;
18
+ std::vector<std::vector<uint8_t>>codes;
19
+ std::vector<std::vector<int64_t>>ids;
20
+
21
+ public:
22
+ IndexIVFPQ(int d, int nbucket, int m);
23
+ void train(int n, const float *x, bool subsampling, int seed);
24
+ void add(int n, const float *x, const uint64_t* xids);
25
+ void search(int n, const float *query, int k, int nprobe, float* distances, int64_t* labels);
26
+ };
27
+
28
+
include/IndexPQ.h ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #pragma once
2
+ #include <vector>
3
+ #include <cstdint>
4
/// Product quantizer: splits a d-dimensional vector into m sub-vectors and
/// represents each one by the index of its nearest sub-centroid.
class IndexPQ {
public:
    IndexPQ(int d, int m);

    /// Learns one codebook per sub-space from `n` training vectors.
    void train(int n, const float* x, bool subsampling, int seed);

    /// Writes the m-byte code of vector `x` to `out`.
    void encode(const float *x, uint8_t* out);

    /// Fills `outable` with the squared distance from every sub-vector of
    /// `query` to every sub-centroid (m * k_sub floats).
    void compute_distance_table(const float *query, float *outable);

private:
    int d;      // full vector dimension
    int m;      // number of sub-quantizers
    int k_sub;  // centroids per sub-quantizer
    int d_sub;  // dimension of each sub-vector

    std::vector<float> centroids;  // all codebooks, m * k_sub * d_sub floats
    bool trained = false;
};
include/clustering.h ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #pragma once
2
+ #include<vector>
3
+
4
/// Runs k-means over `n` row-major vectors of dimension `d` stored in `x` and
/// writes the `k` resulting centroids (k * d floats) to `centroids`.
/// `seed` seeds the random choice of the initial centroids.
void kmean_clustering(int d, int n, int k, const float *x, float *centroids, int seed);
src/IndexFlat.cpp ADDED
@@ -0,0 +1,86 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #include "IndexFlat.h"
2
+ #include <queue>
3
+ #include <vector>
4
+ #include<immintrin.h>
5
+
6
+ void IndexFlatL2::add(int n, const float *x){
7
+ xb.insert(xb.end(), x, x+(n*d));
8
+ ntotal+=n;
9
+ }
10
+
11
+ void IndexFlatL2::search(int n, const float *x, int k, float *distances, int *labels){
12
+ for(int i = 0; i<n; i++){//iterate over the entire query
13
+ //old stuff
14
+ //float min_distance = 1e9;
15
+ //int bestid = -1;
16
+ std::priority_queue<std::pair<float, int>> pq;
17
+
18
+ for(int j= 0; j<ntotal; j++){//compare query against every vec in db
19
+ float curr_distance = 0;
20
+ int m = 0;
21
+
22
+ __m256 sumvec = _mm256_setzero_ps();
23
+
24
+ const float* current_db_vec = &xb[j * d];
25
+ const float* current_q_vec = &x[i * d];
26
+
27
+ for(; m + 7 < d; m += 8){
28
+ __m256 dbvec = _mm256_loadu_ps(&current_db_vec[m]);
29
+ __m256 qvec = _mm256_loadu_ps(&current_q_vec[m]);
30
+
31
+ __m256 diff = _mm256_sub_ps(dbvec, qvec);
32
+
33
+ sumvec = _mm256_fmadd_ps(diff, diff, sumvec);
34
+ }
35
+
36
+
37
+ __m128 upper = _mm256_extractf128_ps(sumvec,1);
38
+ __m128 lower = _mm256_castps256_ps128(sumvec);
39
+
40
+ __m128 sumbound = _mm_add_ps(upper, lower);
41
+ __m128 shifted = _mm_movehl_ps(sumbound,sumbound);
42
+ __m128 current = _mm_add_ps(sumbound, shifted);
43
+ __m128 shuffled = _mm_shuffle_ps(current, current, 1);
44
+ __m128 finalsum = _mm_add_ps(current, shuffled);
45
+ curr_distance = _mm_cvtss_f32(finalsum);
46
+
47
+ for(; m < d; m++){
48
+ float dist = current_db_vec[m] - current_q_vec[m];
49
+ curr_distance += (dist * dist);
50
+ }
51
+ /*
52
+ if (curr_distance<min_distance){
53
+ min_distance=curr_distance;
54
+ bestid = j;
55
+ } */
56
+
57
+
58
+
59
+ if(pq.size()<k){
60
+ pq.push({curr_distance,j});
61
+ }else{
62
+ if(curr_distance<pq.top().first){
63
+ pq.pop();
64
+ pq.push({curr_distance,j});
65
+ }
66
+ }
67
+ }
68
+ /*
69
+ distances[i] = min_distance;
70
+ labels[i] = bestid; */
71
+
72
+
73
+ //standard for loop cannot handle garbage values.
74
+ //for that we need 2 seperate for loop, one that handles the queue content properly
75
+ int count=pq.size();
76
+ for(int c = count-1; c>=0; c--){
77
+ distances[i*k+c] = pq.top().first;
78
+ labels[i*k+c] = pq.top().second;
79
+ pq.pop();
80
+ }
81
+ for(int step=count; step<k; step++){
82
+ distances[i*k+step] = -1.0;
83
+ labels[i*k+step] = -1;
84
+ }
85
+ }
86
+ }
src/IndexIVF.cpp ADDED
@@ -0,0 +1,148 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #include "IndexIVF.h"
2
+ #include "clustering.h"
3
+ #include <queue>
4
+ #include <iostream>
5
+ #include<immintrin.h>
6
+ IndexIVF::IndexIVF(int d, int nbucket): d(d), nbucket(nbucket), router(d){
7
+ memory.resize(nbucket);
8
+ memory_ids.resize(nbucket);
9
+ };
10
+
11
+ void IndexIVF::train(int n, const float *x){
12
+ if(trained) return;
13
+ std::vector<float>centroids(nbucket*d);
14
+
15
+ //remove seed
16
+ kmean_clustering(d, n, nbucket, x ,centroids.data(),1);
17
+ router.add(nbucket, centroids.data());
18
+ trained=true;
19
+ }
20
+
21
+ void IndexIVF::add(int n, const float *x, const uint64_t*xids){
22
+ if(!trained) return;
23
+ std::vector<int> assign(n);
24
+ std::vector<float> distances(n);
25
+ router.search(n,x,1,distances.data(), assign.data());
26
+ for(int i =0; i<n; i++){
27
+ int bucketid= assign[i];
28
+ memory[bucketid].insert(memory[bucketid].end(),x+(i*d), x+(i*d)+d);
29
+ //for metadata
30
+ memory_ids[bucketid].push_back(xids[i]);
31
+ }
32
+ ntotal+=n;
33
+ }
34
+ void IndexIVF::search(int n, const float* x, int k, int nprobe, const uint8_t *bitmask, float *distances, int *labels, const uint8_t *vecmini_L1_summary){
35
+ std::vector<int>assign(n*nprobe);
36
+ std::vector<float> centroids_distance(n*nprobe);
37
+
38
+ router.search(n,x,nprobe,centroids_distance.data(), assign.data());
39
+ for(int i = 0; i<n; i++){
40
+ //std::unordered_set <uint64_t> set;
41
+ // std::priority_queue<std::pair<float, int>, std::vector<std::pair<float, int>>, std::greater<std::pair<float, int>>> pq;
42
+ std::priority_queue<std::pair<float, int>> pq;
43
+ const float *specquer = x+(i*d);
44
+ for(int p= 0; p<nprobe; p++){
45
+ int bucketid = assign[i*nprobe+p];
46
+ int vectorinmemo = memory[bucketid].size()/d;
47
+ if(vectorinmemo==0)continue;
48
+ const float *bucketdata= memory[bucketid].data();
49
+
50
+ for(int j = 0; j<vectorinmemo; j++){
51
+ int prefetch_stride = 1;
52
+
53
+ if(j + prefetch_stride < vectorinmemo){
54
+ _mm_prefetch((const char*)&bucketdata[(j + prefetch_stride) * d], _MM_HINT_T0);
55
+
56
+ if (bitmask != nullptr) {
57
+ uint64_t future_id = memory_ids[bucketid][j + prefetch_stride];
58
+
59
+ // If you ever use L1 summary again, prefetch it here:
60
+ // if (vecmini_L1_summary != nullptr) _mm_prefetch((const char*)&vecmini_L1_summary[future_id / 8], _MM_HINT_T0);
61
+
62
+ // Prefetch the massive uint8_t mask byte
63
+ _mm_prefetch((const char*)&bitmask[future_id], _MM_HINT_T0);
64
+ }
65
+ }
66
+
67
+ uint64_t global_id = memory_ids[bucketid][j];
68
+
69
+ if (bitmask != nullptr && bitmask[global_id]==0 ) {
70
+ continue;
71
+ }
72
+
73
+ //removed this for simd
74
+
75
+ //benchmark for standard cosine calc->
76
+ //nullptr: 6.32857
77
+ //bitmask: 4.60353
78
+
79
+ //after simd
80
+ //nullptr: 1.3298
81
+ //bitmask: 0.918149
82
+
83
+ //added simd
84
+ float dist = 0;
85
+ int m = 0;
86
+
87
+ __m256 sumvec = _mm256_setzero_ps();
88
+
89
+ /*for(int m = 0; m<d; m++){
90
+ currcosine+=(bucketdata[j*d+m]*specquer[m]);
91
+ }*/
92
+ for(; m+7<d; m+=8){
93
+ //load 8float from the db chunk
94
+ __m256 dbvec= _mm256_loadu_ps(&bucketdata[j*d+m]);
95
+ __m256 qvec= _mm256_loadu_ps(&specquer[m]);
96
+ __m256 diff = _mm256_sub_ps(dbvec, qvec); //-> only add for un normalized vectors
97
+ sumvec = _mm256_fmadd_ps(diff, diff, sumvec);
98
+ }
99
+
100
+ __m128 upper = _mm256_extractf128_ps(sumvec, 1);
101
+ __m128 lower = _mm256_extractf128_ps(sumvec, 0);
102
+
103
+ __m128 sumbound = _mm_add_ps(upper, lower);
104
+ __m128 shifted = _mm_movehl_ps(sumbound, sumbound);
105
+ __m128 current = _mm_add_ps(sumbound, shifted);
106
+ __m128 shuffled = _mm_shuffle_ps(current, current, 1);
107
+ __m128 finalsum = _mm_add_ps(current, shuffled);
108
+ dist = _mm_cvtss_f32(finalsum);
109
+
110
+ /*
111
+ float sumarr[8];
112
+ _mm256_storeu_ps(sumarr,sumvec);
113
+ currcosine= sumarr[0]+sumarr[1]+
114
+ sumarr[2]+sumarr[3]+
115
+ sumarr[4]+sumarr[5]+
116
+ sumarr[6]+sumarr[7];
117
+ //cleanup
118
+ */
119
+ if(pq.size()<k){
120
+ pq.push({dist, global_id});
121
+ }else{
122
+ if(dist<pq.top().first){
123
+ pq.pop();
124
+ pq.push({dist, global_id});
125
+ }
126
+ }
127
+ }
128
+ }
129
+ float *speldist = distances+(i*k);
130
+ int *spelbs = labels+(i*k);
131
+ int count = pq.size();
132
+ for(int c = count-1; c>=0; c--){
133
+ speldist[c]= pq.top().first;
134
+ spelbs[c]= pq.top().second;
135
+ pq.pop();
136
+ }
137
+ for(int step = count; step<k; step++){
138
+ speldist[step]=-1.0;
139
+ spelbs[step]= -1;
140
+ }
141
+ }
142
+ }
143
+
144
+ void IndexIVF::inject_centroids(const float* external_centroids) {
145
+ if(trained) return;
146
+ router.add(nbucket, external_centroids);
147
+ trained = true;
148
+ }
src/IndexIVFPQ.cpp ADDED
@@ -0,0 +1,119 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #include "IndexIVFPQ.h"
2
+ #include "IndexIVF.h"
3
+ #include "clustering.h"
4
+ #include <queue>
5
+ #include <iostream>
6
+ #include <immintrin.h>
7
+ #include <random>
8
+ #include <cstring>
9
+ IndexIVFPQ::IndexIVFPQ(int d, int nbucket, int m): d(d), m(m), nbucket(nbucket), router(d), pq(d, m){
10
+ codes.resize(nbucket);
11
+ ids.resize(nbucket);
12
+ };
13
+
14
+ void IndexIVFPQ::train(int n, const float *x, bool subsampling, int seed){
15
+ if(trained)return;
16
+ coarse_centroids.resize(nbucket*d);
17
+
18
+ int maxtrain = 150000;
19
+ if(n>maxtrain && subsampling){
20
+ std::mt19937 gen(seed);
21
+ std::uniform_int_distribution<int>dis(0,n-1);
22
+ std::vector<float> sample_buffer(maxtrain * d);
23
+ for(int i=0; i<maxtrain; i++){
24
+ int randval = dis(gen);
25
+ std::memcpy(&sample_buffer[i*d],
26
+ &x[randval * d],
27
+ d * sizeof(float));
28
+ }
29
+ kmean_clustering(d, maxtrain, nbucket, sample_buffer.data(), coarse_centroids.data(), seed);
30
+ }else{kmean_clustering(d, n, nbucket, x, coarse_centroids.data(), seed);}
31
+
32
+ router.add(nbucket, coarse_centroids.data());
33
+ std::vector<float>residuals(n*d);
34
+ std::vector<float> distances(n);
35
+ std::vector<int> labels(n);
36
+ router.search(n,x,1,distances.data(), labels.data());
37
+ for(int i = 0;i<n; i++){
38
+ int drawerid = labels[i];
39
+ for(int j = 0; j<d; j++){
40
+ residuals[(i*d)+j] = x[(i*d)+j] - coarse_centroids[(drawerid*d)+j];
41
+ }
42
+ }
43
+ pq.train(n, residuals.data(), subsampling, seed);
44
+ trained = true;
45
+ }
46
+ void IndexIVFPQ::add(int n, const float *x, const uint64_t* xids){
47
+ if (!trained) return;
48
+ std::vector<float>residuals(n*d);
49
+ std::vector<float> distances(n);
50
+ std::vector<int> labels(n);
51
+ router.search(n,x,1,distances.data(), labels.data());
52
+ std::cout << "expected centroids size: " << nbucket * d << std::endl;
53
+ std::cout << "actual centroids size: " << coarse_centroids.size() << std::endl;
54
+ std::cout << "codes vector size: " << codes.size() << std::endl;
55
+ for(int i = 0;i<n; i++){
56
+ int drawerid = labels[i];
57
+ for(int j = 0; j<d; j++){
58
+ residuals[(i*d)+j] = x[(i*d)+j]-coarse_centroids[(drawerid*d)+j];
59
+ }
60
+ std::vector<uint8_t> zipvect(m);
61
+ pq.encode(residuals.data()+(i*d), zipvect.data());
62
+ codes[drawerid].insert(codes[drawerid].end(), zipvect.begin(), zipvect.end());
63
+ ids[drawerid].push_back(xids[i]);
64
+ }
65
+ }
66
+ void IndexIVFPQ::search(int n, const float *query, int k, int nprobe, float* distances, int64_t* labels){
67
+ std::vector<int> assign(n*nprobe);
68
+ std::vector<float> coarse_distances(n*nprobe);
69
+ router.search(n,query, nprobe, coarse_distances.data(),assign.data());
70
+ for(int i = 0; i<n; i++){
71
+ std::priority_queue<std::pair<float, int>> max_heap;
72
+ std::vector<float> query_residual(d);
73
+ for(int p=0; p<nprobe; p++){
74
+ int drawerid = assign[(i*nprobe)+p];
75
+ /*for(int j = 0; j<d; j++){
76
+ query_residual[j] = query[(i*d)+j] - coarse_centroids[(drawerid*d)+j];
77
+ }
78
+ */
79
+
80
+ for(int j=0; j<d; j+=8){
81
+ __m256 ccvec= _mm256_loadu_ps(&coarse_centroids[(drawerid*d)+j]);
82
+ __m256 qrvec= _mm256_loadu_ps(&query[(i*d)+j]);
83
+ __m256 diffvec = _mm256_sub_ps(qrvec,ccvec);
84
+ _mm256_storeu_ps(&query_residual[j], diffvec);
85
+ }
86
+
87
+
88
+ std::vector<float> distance_table(m*256);
89
+ pq.compute_distance_table(query_residual.data(), distance_table.data());
90
+ for(int v = 0; v<codes[drawerid].size()/m; v++){
91
+ float totaldistance =0.0;
92
+ for(int m_idx = 0; m_idx<m; m_idx++){
93
+ int centroid_id = codes[drawerid][(v*m)+m_idx];
94
+ totaldistance+=distance_table[centroid_id+(m_idx*256)];
95
+ }
96
+ if(max_heap.size()<k){
97
+ max_heap.push({totaldistance, ids[drawerid][v]});
98
+ }else{
99
+ if(totaldistance<max_heap.top().first){
100
+ max_heap.pop();
101
+ max_heap.push({totaldistance, ids[drawerid][v]});
102
+ }
103
+ }
104
+ }
105
+ }
106
+ float *subdist = distances+(i*k);
107
+ int64_t *sublbs = labels+(i*k);
108
+ int count = max_heap.size();
109
+ for(int c = count-1; c>=0; c--){
110
+ subdist[c] = max_heap.top().first;
111
+ sublbs[c] = max_heap.top().second;
112
+ max_heap.pop();
113
+ }
114
+ for(int fod = count; fod<k; fod++){
115
+ subdist[fod]=-1.0;
116
+ sublbs[fod]=-1;
117
+ }
118
+ }
119
+ }
src/IndexPQ.cpp ADDED
@@ -0,0 +1,93 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #include <IndexPQ.h>
2
+ #include <vector>
3
+ #include "clustering.h"
4
+ #include<immintrin.h>
5
+ #include <random>
6
+ #include <cstring>
7
+ IndexPQ::IndexPQ(int d, int m):d(d), m(m){
8
+ k_sub = 256;
9
+ d_sub = d/m;
10
+ centroids.resize(m*d_sub*k_sub);
11
+ };
12
+ void IndexPQ::train(int n, const float *x, bool subsampling, int seed){
13
+ if(trained) return;
14
+ std::vector<float> train_data(n * d_sub);
15
+
16
+ for(int i = 0; i < m; i++){
17
+ for(int row = 0; row < n; row++){
18
+ const float* source_id = x + (row * d) + (i * d_sub);
19
+ float* dest_id = train_data.data() + (row * d_sub);
20
+ for(int j = 0; j < d_sub; j++){
21
+ dest_id[j] = source_id[j];
22
+ }
23
+ }
24
+
25
+ int maxtrain = 150000;
26
+ if(n > maxtrain && subsampling){
27
+ std::mt19937 gen(seed + i);
28
+ std::uniform_int_distribution<int> dis(0, n - 1);
29
+
30
+ std::vector<float> sample_buffer(maxtrain * d_sub);
31
+ for(int p = 0; p < maxtrain; p++){
32
+ int randval = dis(gen);
33
+ std::memcpy(&sample_buffer[p * d_sub],
34
+ &train_data[randval * d_sub],
35
+ d_sub * sizeof(float));
36
+ }
37
+ kmean_clustering(d_sub, maxtrain, k_sub, sample_buffer.data(), centroids.data() + (i * d_sub * k_sub),seed+i);
38
+ } else {
39
+ kmean_clustering(d_sub, n, k_sub, train_data.data() , centroids.data() + (i * d_sub * k_sub), seed+i);
40
+ }
41
+ }
42
+ trained = true;
43
+ }
44
+ void IndexPQ::encode(const float *x, uint8_t* out){
45
+ if(!trained)return;
46
+ for(int i =0; i<m; i++){
47
+ const float *query_chunk = x + (i*d_sub);
48
+ float mindistance = 1e9;
49
+ int bestid = 0;
50
+ for(int id=0; id<k_sub; id++){
51
+ const float* centroid_chunk = centroids.data()+(i*k_sub*d_sub)+(id*d_sub);
52
+ float dist = 0;
53
+ for(int j =0; j<d_sub; j++){
54
+ float diff = query_chunk[j]- centroid_chunk[j];
55
+ dist += diff*diff;
56
+ }
57
+ if(dist<mindistance){
58
+ mindistance = dist;
59
+ bestid = id;
60
+ }
61
+ }
62
+ out[i] = bestid;
63
+ }
64
+ }
65
+ //precalc all distance for query and centroid
66
+ void IndexPQ::compute_distance_table(const float *query, float *outable){
67
+ for(int i =0; i<m; i++){
68
+ const float* query_slice = query+(i*d_sub);
69
+ for(int j = 0; j<k_sub; j++){
70
+ float dist = 0;
71
+ const float *offset= centroids.data()+(i*k_sub*d_sub) + (j*d_sub);
72
+ /*for(int k = 0;k<d_sub; k++){
73
+ float diff = offset[k]-query_slice[k];
74
+ dist+=diff*diff;
75
+ }*/
76
+ __m256 sumvec = _mm256_setzero_ps();
77
+ for(int k =0; k<d_sub; k+=8){
78
+ __m256 offvec= _mm256_loadu_ps(&offset[k]);
79
+ __m256 querslice= _mm256_loadu_ps(&query_slice[k]);
80
+ __m256 diffvec = _mm256_sub_ps(offvec,querslice);
81
+ sumvec = _mm256_fmadd_ps(diffvec, diffvec, sumvec);
82
+ }
83
+ float unpacked[8];
84
+ _mm256_storeu_ps(unpacked, sumvec);
85
+ dist=unpacked[0]+unpacked[1]+
86
+ unpacked[2]+unpacked[3]+
87
+ unpacked[4]+unpacked[5]+
88
+ unpacked[6]+unpacked[7];
89
+ outable[(i*k_sub)+j] = dist;
90
+ }
91
+ }
92
+ }
93
+
src/bindings.cpp ADDED
@@ -0,0 +1,118 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #include <pybind11/pybind11.h>
2
+ #include <pybind11/numpy.h>
3
+ #include "IndexIVF.h"
4
+ #include "IndexIVFPQ.h"
5
+ #include "iostream"
6
+ #include <pybind11/stl.h>
7
+ #include <vector>
8
+ namespace py = pybind11;
9
+
10
+ // "vecmini" is the name of the module you will type in python-> 'import vecmini'
11
+ PYBIND11_MODULE(vecmini, m) {
12
+ m.doc() = "Vecmini: A mini custom IVF Vector Database with Metadata Filtering";
13
+
14
+ py::class_<IndexIVF>(m, "IndexIVF")
15
+ .def(py::init<int, int>(), py::arg("d"), py::arg("nbucket"))
16
+
17
+ .def("train", [](IndexIVF &self, int n, py::array_t<float, py::array::c_style> x) {
18
+ py::buffer_info buf = x.request();
19
+ self.train(n, (const float *)buf.ptr);
20
+ }, py::arg("n"), py::arg("x").noconvert())
21
+
22
+
23
+ .def("add", [](IndexIVF &self, int n,
24
+ py::array_t<float, py::array::c_style | py::array::forcecast> x,
25
+ py::array_t<uint64_t, py::array::c_style | py::array::forcecast> xids) {
26
+
27
+ py::buffer_info buf_x = x.request();
28
+ py::buffer_info buf_xids = xids.request();
29
+
30
+ self.add(n, (const float *)buf_x.ptr, (const uint64_t *)buf_xids.ptr);
31
+ }, py::arg("n"), py::arg("x"), py::arg("xids"))
32
+
33
+ // Expose search() - UPDATED FOR NPROBE AND BITMASK
34
+ .def("search", [](IndexIVF &self, int n,
35
+ py::array_t<float, py::array::c_style | py::array::forcecast> x,
36
+ int k, int nprobe, py::object bitmask) {
37
+
38
+ py::buffer_info buf_x = x.request();
39
+
40
+ // Empty arrays to hold the answers for Python
41
+ py::array_t<float> distances({n, k});
42
+ py::array_t<int> labels({n, k});
43
+
44
+ const uint8_t* bitmask_ptr = nullptr;
45
+ py::array_t<uint8_t> bitmask_arr;
46
+
47
+ if (!bitmask.is_none()) {
48
+ bitmask_arr = bitmask.cast<py::array_t<uint8_t, py::array::c_style | py::array::forcecast>>();
49
+ bitmask_ptr = (const uint8_t*)bitmask_arr.request().ptr;
50
+ std::cout<<"recieved bitmask , *pointer address->" <<(void*)bitmask_ptr<<"\n";
51
+ } else {
52
+ std::cout<<"recieved NONE\n";
53
+ }
54
+
55
+
56
+ self.search(n, (const float *)buf_x.ptr, k, nprobe, bitmask_ptr,
57
+ distances.mutable_data(), labels.mutable_data());
58
+
59
+ return py::make_tuple(distances, labels);
60
+ }, py::arg("n"), py::arg("x"), py::arg("k"), py::arg("nprobe"), py::arg("bitmask"));
61
+
62
+ py::class_<IndexIVFPQ>(m, "IndexIVFPQ")
63
+ .def(py::init<int, int, int>(),
64
+ py::arg("d"),
65
+ py::arg("nbucket"),
66
+ py::arg("m"))
67
+
68
+ .def("train", [](IndexIVFPQ &self, int n, py::array_t<float, py::array::c_style> x, bool subsampling, bool seed) {
69
+ py::buffer_info buf = x.request();
70
+ self.train(n, static_cast<const float *>(buf.ptr), subsampling, seed);
71
+ }, py::arg("n"), py::arg("x").noconvert(), py::arg("subsampling"), py::arg("seed"))
72
+
73
+ .def("add", [](IndexIVFPQ &self,int n, py::array_t<float, py::array::c_style> x, py::array_t<uint64_t, py::array::c_style> xids){
74
+ py::buffer_info bufx = x.request();
75
+ py::buffer_info bufxids = xids.request();
76
+
77
+ self.add(n, static_cast<const float *>(bufx.ptr),static_cast<const uint64_t *>(bufxids.ptr));
78
+ }, py::arg("n"), py::arg("x").noconvert(), py::arg("xids").noconvert())
79
+
80
+ .def("search", [](IndexIVFPQ &self, int n,
81
+ py::array_t<float, py::array::c_style> query,
82
+ int k, int nprobe){
83
+ py::buffer_info buf_query = query.request();
84
+
85
+ py::array_t<float> distances({n,k});
86
+ py::array_t<int64_t> labels({n,k});
87
+
88
+ self.search(n, static_cast<const float *>(buf_query.ptr), k, nprobe, distances.mutable_data(), labels.mutable_data());
89
+
90
+ return py::make_tuple(distances, labels);
91
+ }, py::arg("n"), py::arg("query").noconvert(), py::arg("k"), py::arg("nprobe"));
92
+
93
+
94
+
95
+ py::class_<IndexFlatL2>(m, "IndexFlatL2")
96
+ .def(py::init<int>(),
97
+ py::arg("d"))
98
+
99
+ .def("add", [](IndexFlatL2 &self,int n, py::array_t<float, py::array::c_style> x){
100
+ py::buffer_info bufx = x.request();
101
+
102
+ self.add(n, static_cast<const float *>(bufx.ptr));
103
+ }, py::arg("n"), py::arg("x").noconvert())
104
+
105
+ .def("search", [](IndexFlatL2 &self, int n,
106
+ py::array_t<float, py::array::c_style> x,
107
+ int k){
108
+ py::buffer_info bufx = x.request();
109
+
110
+ py::array_t<float> distances({n,k});
111
+ py::array_t<int> labels({n,k});
112
+
113
+ self.search(n, static_cast<const float *>(bufx.ptr), k, distances.mutable_data(), labels.mutable_data());
114
+
115
+ return py::make_tuple(distances, labels);
116
+ }, py::arg("n"), py::arg("x").noconvert(), py::arg("k"));
117
+
118
+ }
src/clustering.cpp ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #include<vector>
2
+ #include "clustering.h"
3
+ #include "IndexFlat.h"
4
+ #include <random>
5
+ #include <cstring>
6
+ #include <cmath>
7
+ void kmean_clustering(int d, int n, int k, const float *x, float *centroids, int seed){
8
+ std::mt19937 gen(seed);
9
+ std::uniform_int_distribution<int> distr(0, n - 1);
10
+
11
+ for (int i = 0; i < k; i++) {
12
+ int rand_idx = distr(gen);
13
+ std::memcpy(centroids + (i * d), x + (rand_idx * d), d * sizeof(float));
14
+ }
15
+
16
+ int niter = 15;
17
+ std::vector<int> assign(n);
18
+ std::vector<float> distances(n);
19
+ for(int iter = 0; iter<niter; iter++){
20
+ IndexFlatL2 index(d);
21
+ index.add(k,centroids);
22
+ index.search(n,x,1,distances.data(), assign.data());
23
+ std::vector<float> newcentroid(k*d,0.0);
24
+ std::vector<int> counts(k,0);
25
+ for(int i = 0; i<n; i++){
26
+ int c = assign[i];
27
+ counts[c]+=1;
28
+ for(int m =0; m<d; m++){
29
+ newcentroid[c*d+m] += x[i*d+m];
30
+ }
31
+ }
32
+ for(int c = 0; c<k; c++){
33
+ if (counts[c]>0){
34
+ for(int m = 0; m<d; m++){
35
+ centroids[c*d+m] = newcentroid[c*d+m]/counts[c];
36
+ }
37
+ }
38
+ }
39
+ }
40
+ }
src/rand.json ADDED
The diff for this file is too large to render. See raw diff