Spaces:

levanell
/

vecmini-engine

Running

App Files Files Community

levanel committed 4 days ago

Commit

e87a50a

0 Parent(s):

vecmini1

Browse files

Files changed (16) hide show

.gitignore +2 -0
CMakeLists.txt +53 -0
Dockerfile +45 -0
app.py +114 -0
include/IndexFlat.h +16 -0
include/IndexIVF.h +23 -0
include/IndexIVFPQ.h +28 -0
include/IndexPQ.h +18 -0
include/clustering.h +11 -0
src/IndexFlat.cpp +86 -0
src/IndexIVF.cpp +148 -0
src/IndexIVFPQ.cpp +119 -0
src/IndexPQ.cpp +93 -0
src/bindings.cpp +118 -0
src/clustering.cpp +40 -0
src/rand.json +0 -0

.gitignore ADDED Viewed

	@@ -0,0 +1,2 @@


1	+ *.so
2	+ build/

CMakeLists.txt ADDED Viewed

	@@ -0,0 +1,53 @@

+cmake_minimum_required(VERSION 3.22)
+project(vecmini VERSION 1.0 LANGUAGES CXX)
+# Project-wide defaults: PIC is required because the static engine library is
+# linked into a shared Python extension module.
+set(CMAKE_POSITION_INDEPENDENT_CODE ON)
+# Use modern C++17 for cleaner syntax
+set(CMAKE_CXX_STANDARD 17)
+set(CMAKE_CXX_STANDARD_REQUIRED ON)
+set(CMAKE_CXX_EXTENSIONS OFF)
+# Generates the compile_commands.json file for tools like clangd
+set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
+# ==========================================
+# DEPENDENCIES (OpenMP, math backends, Python bridge)
+# ==========================================
+find_package(OpenMP REQUIRED)
+find_package(BLAS REQUIRED)
+find_package(LAPACK REQUIRED)
+find_package(Python3 COMPONENTS Interpreter Development REQUIRED)
+find_package(pybind11 CONFIG REQUIRED)
+# ==========================================
+# PHASE 1: Build your core vector database library
+# ==========================================
+add_library(vectordb STATIC
+    src/IndexFlat.cpp
+    src/clustering.cpp
+    src/IndexIVF.cpp
+    src/IndexPQ.cpp
+    src/IndexIVFPQ.cpp
+)
+add_library(vecmini::vectordb ALIAS vectordb)
+# Usage requirements live on the target (PUBLIC) so that everything linking
+# vectordb inherits the include path and the SIMD flags, instead of leaking
+# them to every target in the directory.
+target_include_directories(vectordb PUBLIC
+    "${CMAKE_CURRENT_SOURCE_DIR}/include"
+)
+# AVX2/FMA and optimization flags: the sources use AVX2 + FMA intrinsics, so
+# the resulting binary requires a CPU that supports them.
+target_compile_options(vectordb PUBLIC
+    -mavx2
+    -mfma
+    -O3
+    -march=x86-64-v3
+)
+# ==========================================
+# PHASE 2: The Python Bridge (Pybind11)
+# ==========================================
+pybind11_add_module(vecmini src/bindings.cpp)
+# Only link your own engine and the required math/threading backends.
+# Imported targets carry their own include/link requirements.
+target_link_libraries(vecmini PRIVATE
+    vecmini::vectordb
+    OpenMP::OpenMP_CXX
+    BLAS::BLAS
+    LAPACK::LAPACK
+)

Dockerfile ADDED Viewed

	@@ -0,0 +1,45 @@

+# Stage 1: Build your module
+FROM python:3.10-slim AS builder
+RUN apt-get update && apt-get install -y --no-install-recommends \
+    build-essential \
+    cmake \
+    libopenblas-dev \
+    liblapack-dev \
+    && rm -rf /var/lib/apt/lists/*
+WORKDIR /app
+RUN pip install --no-cache-dir pybind11
+# Copy only what your engine needs
+COPY CMakeLists.txt ./
+COPY src/ ./src
+COPY include/ ./include
+# A pip-installed pybind11 keeps its CMake package inside site-packages, which is
+# not on CMake's default search path, so point find_package() at it explicitly.
+RUN mkdir build && cd build && \
+    cmake -DCMAKE_BUILD_TYPE=Release \
+          -Dpybind11_DIR="$(python -m pybind11 --cmakedir)" .. && \
+    make vecmini
+# Stage 2: Final Runtime Environment
+FROM python:3.10-slim
+RUN useradd -m -u 1000 user
+WORKDIR /home/user/app
+RUN chown user:user /home/user/app
+# Install runtime math dependencies.
+# The module is built with GCC, whose OpenMP runtime is libgomp (not LLVM's libomp).
+RUN apt-get update && apt-get install -y --no-install-recommends \
+    libopenblas0 \
+    libgomp1 \
+    && rm -rf /var/lib/apt/lists/*
+# Install every package app.py imports (done before copying the app for layer caching)
+RUN pip install --no-cache-dir gradio numpy torch transformers pypdf
+# Copy the compiled .so module directly next to app.py so `import vecmini` resolves
+COPY --from=builder --chown=user /app/build/vecmini*.so /home/user/app/
+# Copy your frontend code (app.py)
+COPY --chown=user . /home/user/app
+# Gradio binds to 127.0.0.1 by default; inside a container it must listen on all interfaces
+ENV HOME=/home/user \
+    PORT=7860 \
+    GRADIO_SERVER_NAME=0.0.0.0 \
+    GRADIO_SERVER_PORT=7860
+EXPOSE 7860
+# Drop root privileges for the running app
+USER user
+CMD ["python", "app.py"]

app.py ADDED Viewed

	@@ -0,0 +1,114 @@

+import gradio as gr
+import torch
+import numpy as np
+import vecmini
+import pypdf
+from transformers import AutoTokenizer, AutoModel
+device = "cuda" if torch.cuda.is_available() else "cpu"
+torch_dtype = torch.float16
+print("Loading Sentence Encoder (Bi-Encoder Only)...")
+embed_id = "sentence-transformers/all-MiniLM-L6-v2"
+embed_tokenizer = AutoTokenizer.from_pretrained(embed_id)
+embed_model = AutoModel.from_pretrained(embed_id).to(device).to(torch_dtype)
+global_chunks = []
+db = None
+global_nlist = 1
+def mean_pooling(model_output, attention_mask):
+    token_embeddings = model_output[0]
+    input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
+    return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)
+def encode_texts(texts):
+    encoded_input = embed_tokenizer(texts, padding=True, truncation=True, return_tensors="pt").to(device)
+    with torch.no_grad():
+        model_output = embed_model(**encoded_input)
+    embeddings = mean_pooling(model_output, encoded_input["attention_mask"])
+    return torch.nn.functional.normalize(embeddings, p=2, dim=1).cpu().numpy().astype(np.float32)
+def process_pdf(file_obj):
+    global global_chunks, db, global_nlist
+    if file_obj is None:
+        return "Error: No file uploaded."
+    try:
+        reader = pypdf.PdfReader(file_obj.name)
+        text = ""
+        for page in reader.pages:
+            extracted = page.extract_text()
+            if extracted:
+                text += extracted + " "
+    except Exception as e:
+        return f"Failed to read PDF: {str(e)}"
+    if not text.strip():
+        return "Error: Could not extract any readable text from this PDF."
+    chunk_size = 200
+    words = text.split()
+    global_chunks = [" ".join(words[i:i + chunk_size]) for i in range(0, len(words), chunk_size)]
+    embeddings = encode_texts(global_chunks)
+    d = embeddings.shape[1]
+    nb = embeddings.shape[0]
+    global_nlist = max(1, int(nb / 4))
+    db = vecmini.IndexIVF(d, global_nlist)
+    db.train(nb, embeddings)
+    db.add(nb, embeddings, np.arange(nb, dtype=np.uint64))
+    return f"Success! Extracted {nb} chunks from the PDF and built vecmini index."
+def retrieve_chunks(query, top_k):
+    if db is None or not global_chunks:
+        return "Please upload and process a PDF first."
+    if not query.strip():
+        return "Please enter a search query."
+    query_emb = encode_texts([query])
+    fetch_k = min(int(top_k), len(global_chunks))
+    nprobe = max(1, int(global_nlist / 2))
+    distances, labels = db.search(1, query_emb, k=fetch_k, nprobe=nprobe, bitmask=None)
+    retrieved_indices = [idx for idx in labels[0] if idx < len(global_chunks)]
+    output_text = f"### Top {len(retrieved_indices)} Results for: *'{query}'*\n\n"
+    for i, idx in enumerate(retrieved_indices):
+        dist = distances[0][i]
+        chunk_text = global_chunks[idx]
+        output_text += f"**Result {i+1}** | Vector Distance: `{dist:.4f}` | Chunk ID: `{idx}`\n"
+        output_text += f"> {chunk_text}\n\n---\n\n"
+    return output_text
+with gr.Blocks(title="Vecmini Visualizer") as demo:
+    gr.Markdown("# Vecmini: PDF Raw Retrieval Tester")
+    gr.Markdown("Upload a PDF, build the index, and see exactly what `vecmini` returns for your queries.")
+    with gr.Row():
+        with gr.Column():
+            pdf_input = gr.File(label="Upload PDF Document", file_types=[".pdf"])
+            process_btn = gr.Button("Build Vecmini Index", variant="primary")
+            status_out = gr.Textbox(label="Index Status", interactive=False)
+        with gr.Column():
+            query_input = gr.Textbox(label="Search Query")
+            k_slider = gr.Slider(minimum=1, maximum=20, value=5, step=1, label="Number of chunks to retrieve (K)")
+            search_btn = gr.Button("Search Vecmini")
+    results_out = gr.Markdown(label="Retrieved Chunks")
+    process_btn.click(fn=process_pdf, inputs=pdf_input, outputs=status_out)
+    search_btn.click(fn=retrieve_chunks, inputs=[query_input, k_slider], outputs=results_out)
+    query_input.submit(fn=retrieve_chunks, inputs=[query_input, k_slider], outputs=results_out)
+if __name__ == "__main__":
+    demo.launch()

include/IndexFlat.h ADDED Viewed

	@@ -0,0 +1,16 @@

+#pragma once
+#include <vector>
+// Brute-force index: stores raw vectors and compares a query against all of them
+// using squared L2 distance.
+class IndexFlatL2 {
+private:
+    int d;                  // dimension of every vector
+    int ntotal = 0;         // number of vectors stored so far
+    std::vector<float> xb;  // stored vectors, laid out back to back (d floats each)
+public:
+    IndexFlatL2(int d) : d(d) {}
+    // Appends n vectors (n*d floats) read from the raw pointer x.
+    void add(int n, const float *x);
+    // For each of the n queries in x, finds the k nearest stored vectors.
+    // Results are written to distances and labels (k entries per query);
+    // unused slots are filled with -1.
+    void search(int n, const float *x, int k, float *distances, int* labels);
+};

include/IndexIVF.h ADDED Viewed

	@@ -0,0 +1,23 @@

+#pragma once
+#include <vector>
+#include "IndexFlat.h"
+#include <cstdint>
+// Inverted-file index: vectors are grouped into nbucket buckets by their nearest
+// centroid, and a search only scans the nprobe buckets closest to the query.
+class IndexIVF {
+private:
+    int d;                 // vector dimension
+    int nbucket;           // number of buckets (centroids)
+    int ntotal = 0;        // number of vectors added
+    bool trained = false;  // set once centroids have been learned or injected
+    IndexFlatL2 router;    // flat index over the centroids, used to pick buckets
+    std::vector<std::vector<float>> memory;         // per-bucket raw vector data
+    std::vector<std::vector<uint64_t>> memory_ids;  // per-bucket ids, parallel to memory
+public:
+    IndexIVF(int d, int nbucket);
+    // Learns the centroids with k-means; does nothing if already trained.
+    void train(int n, const float *x);
+    // Routes each of the n vectors to its nearest bucket and stores it with its id.
+    void add(int n, const float *x, const uint64_t*xids);
+    // Searches the nprobe nearest buckets per query; when bitmask is non-null,
+    // vectors whose bitmask[id] is 0 are skipped.
+    void search(int n, const float* x, int k, int nprobe, const uint8_t *bitmask, float *distances, int *labels, const uint8_t *L1_summary = nullptr);
+    // Uses externally computed centroids instead of training.
+    void inject_centroids(const float* external_centroids);
+};

include/IndexIVFPQ.h ADDED Viewed

	@@ -0,0 +1,28 @@

+#pragma once
+#include<vector>
+#include<cstdint>
+#include "IndexPQ.h"
+#include "IndexFlat.h"
+#include <cstddef>
+// Inverted-file index with product-quantized residuals: each vector is routed to
+// its nearest coarse centroid and the residual is stored as m one-byte PQ codes.
+class IndexIVFPQ{
+private:
+    int d;                // vector dimension
+    int m;                // number of PQ sub-quantizers (bytes per stored code)
+    int nbucket;          // number of coarse centroids
+    int ntotal = 0;       // number of indexed vectors (was left uninitialized)
+    bool trained=false;
+    size_t nprobe = 0;    // default probe count (was left uninitialized); search() takes its own nprobe
+    IndexFlatL2 router;   // flat index over the coarse centroids
+    IndexPQ pq;           // product quantizer trained on residuals
+    std::vector<float>coarse_centroids;        // nbucket * d floats
+    std::vector<std::vector<uint8_t>>codes;    // per-bucket PQ codes, m bytes per vector
+    std::vector<std::vector<int64_t>>ids;      // per-bucket ids, parallel to codes
+public:
+    IndexIVFPQ(int d, int nbucket, int m);
+    // Learns coarse centroids and the PQ codebooks; does nothing if already trained.
+    void train(int n, const float *x, bool subsampling, int seed);
+    // Encodes and stores n vectors together with their ids.
+    void add(int n, const float *x, const uint64_t* xids);
+    // Approximate k-NN search over the nprobe nearest buckets of each query.
+    void search(int n, const float *query, int k, int nprobe, float* distances, int64_t* labels);
+};

include/IndexPQ.h ADDED Viewed

	@@ -0,0 +1,18 @@

+#pragma once
+#include <vector>
+#include <cstdint>
+// Product quantizer: splits a d-dimensional vector into m sub-vectors and encodes
+// each one as the index of its nearest sub-centroid.
+class IndexPQ{
+private:
+    int d;       // full vector dimension
+    int m;       // number of sub-vectors
+    int k_sub;   // centroids per sub-quantizer
+    int d_sub;   // dimension of one sub-vector
+    std::vector<float> centroids;  // m codebooks of k_sub * d_sub floats each
+    bool trained  = false;
+public:
+    IndexPQ(int d, int m);
+    // Learns one codebook per sub-vector with k-means; no-op if already trained.
+    void train(int n, const float* x, bool subsampling, int seed);
+    // Writes m code bytes for the vector x into out.
+    void encode(const float *x, uint8_t* out);
+    // Fills outable with the squared distance from every query slice to every sub-centroid.
+    void compute_distance_table(const float *query, float *outable);
+};

include/clustering.h ADDED Viewed

	@@ -0,0 +1,11 @@

+#pragma once
+#include<vector>
+// Runs k-means over n vectors of dimension d stored contiguously in x and writes
+// the k resulting centroids (k*d floats) to centroids. seed drives the random
+// choice of the initial centroids.
+void kmean_clustering(int d, int n, int k, const float *x, float *centroids, int seed);

src/IndexFlat.cpp ADDED Viewed

	@@ -0,0 +1,86 @@

+#include "IndexFlat.h"
+#include <queue>
+#include <vector>
+#include<immintrin.h>
+// Appends n vectors (n*d floats starting at x) to the stored data.
+void IndexFlatL2::add(int n, const float *x){
+    const float *past_last = x+(n*d);
+    xb.insert(xb.end(), x, past_last);
+    ntotal = ntotal + n;
+}
+// Exhaustive k-nearest-neighbour search using squared L2 distance.
+//   n         number of query vectors in x (n*d floats)
+//   k         neighbours requested per query
+//   distances output, n*k floats, ascending per query; unused slots are -1
+//   labels    output, n*k ints (positions of the stored vectors); unused slots are -1
+// Nothing is written when n <= 0 or k <= 0.
+void IndexFlatL2::search(int n, const float *x, int k, float *distances, int *labels){
+    // With k <= 0 the heap would stay empty and top() would be undefined behaviour.
+    if(n <= 0 || k <= 0) return;
+    const std::size_t dim = static_cast<std::size_t>(d);
+    const std::size_t want = static_cast<std::size_t>(k);
+    for(int i = 0; i<n; i++){//iterate over every query
+        const float* current_q_vec = x + static_cast<std::size_t>(i) * dim;
+        float *out_dist = distances + static_cast<std::size_t>(i) * want;
+        int *out_lbl = labels + static_cast<std::size_t>(i) * want;
+        // Max-heap on distance: the worst of the current best k sits on top.
+        std::priority_queue<std::pair<float, int>> pq;
+        for(int j= 0; j<ntotal; j++){//compare query against every vec in db
+            // size_t offsets so large databases do not overflow int arithmetic
+            const float* current_db_vec = xb.data() + static_cast<std::size_t>(j) * dim;
+            int m = 0;
+            __m256 sumvec = _mm256_setzero_ps();
+            // 8 floats per step; accumulate (db - q)^2 with fused multiply-add
+            for(; m + 7 < d; m += 8){
+                __m256 dbvec = _mm256_loadu_ps(&current_db_vec[m]);
+                __m256 qvec = _mm256_loadu_ps(&current_q_vec[m]);
+                __m256 diff = _mm256_sub_ps(dbvec, qvec);
+                sumvec = _mm256_fmadd_ps(diff, diff, sumvec);
+            }
+            // horizontal sum of the 8 lanes
+            __m128 upper = _mm256_extractf128_ps(sumvec,1);
+            __m128 lower = _mm256_castps256_ps128(sumvec);
+            __m128 sumbound = _mm_add_ps(upper, lower);
+            __m128 shifted = _mm_movehl_ps(sumbound,sumbound);
+            __m128 current = _mm_add_ps(sumbound, shifted);
+            __m128 shuffled = _mm_shuffle_ps(current, current, 1);
+            __m128 finalsum = _mm_add_ps(current, shuffled);
+            float curr_distance = _mm_cvtss_f32(finalsum);
+            // scalar tail for dimensions that are not a multiple of 8
+            for(; m < d; m++){
+                float dist = current_db_vec[m] - current_q_vec[m];
+                curr_distance += (dist * dist);
+            }
+            if(pq.size()<want){
+                pq.push({curr_distance,j});
+            }else if(curr_distance<pq.top().first){
+                pq.pop();
+                pq.push({curr_distance,j});
+            }
+        }
+        // The heap pops worst-first, so fill the found results back to front...
+        const int count = static_cast<int>(pq.size());
+        for(int c = count-1; c>=0; c--){
+            out_dist[c] = pq.top().first;
+            out_lbl[c] = pq.top().second;
+            pq.pop();
+        }
+        // ...and mark the slots for which no vector was available.
+        for(int step=count; step<k; step++){
+            out_dist[step] = -1.0;
+            out_lbl[step] = -1;
+        }
+    }
+}

src/IndexIVF.cpp ADDED Viewed

	@@ -0,0 +1,148 @@

+#include "IndexIVF.h"
+#include "clustering.h"
+#include <queue>
+#include <iostream>
+#include<immintrin.h>
+// Creates an untrained index with nbucket empty buckets for d-dimensional vectors.
+IndexIVF::IndexIVF(int d, int nbucket)
+    : d(d), nbucket(nbucket), router(d), memory(nbucket), memory_ids(nbucket) {}
+// Learns nbucket centroids from the n training vectors in x and loads them into
+// the router. Training happens only once; later calls are ignored.
+void IndexIVF::train(int n, const float *x){
+     if(!trained){
+         std::vector<float> learned(nbucket*d);
+         // fixed seed of 1 keeps the clustering reproducible
+         kmean_clustering(d, n, nbucket, x, learned.data(), 1);
+         router.add(nbucket, learned.data());
+         trained = true;
+     }
+}
+// Stores n vectors: each one is appended to the bucket of its nearest centroid,
+// and its id from xids is recorded alongside. Ignored until the index is trained.
+void IndexIVF::add(int n, const float *x, const uint64_t*xids){
+     if(!trained) return;
+     std::vector<float> nearest_dist(n);
+     std::vector<int> nearest_bucket(n);
+     router.search(n, x, 1, nearest_dist.data(), nearest_bucket.data());
+     for(int row = 0; row<n; ++row){
+        const int target = nearest_bucket[row];
+        const float *vec = x+(row*d);
+        memory[target].insert(memory[target].end(), vec, vec+d);
+        // keep the caller-supplied id parallel to the stored vector
+        memory_ids[target].push_back(xids[row]);
+    }
+    ntotal = ntotal + n;
+}
+// Squared L2 distance between two d-dimensional vectors: 8 lanes at a time with
+// AVX2/FMA, then a scalar loop for the dimensions left over when d % 8 != 0.
+static inline float ivf_l2_sqr(const float *a, const float *b, int d){
+    __m256 sumvec = _mm256_setzero_ps();
+    int m = 0;
+    for(; m+7<d; m+=8){
+        __m256 diff = _mm256_sub_ps(_mm256_loadu_ps(a+m), _mm256_loadu_ps(b+m));
+        sumvec = _mm256_fmadd_ps(diff, diff, sumvec);
+    }
+    __m128 upper = _mm256_extractf128_ps(sumvec, 1);
+    __m128 lower = _mm256_castps256_ps128(sumvec);
+    __m128 sumbound = _mm_add_ps(upper, lower);
+    __m128 current = _mm_add_ps(sumbound, _mm_movehl_ps(sumbound, sumbound));
+    __m128 finalsum = _mm_add_ps(current, _mm_shuffle_ps(current, current, 1));
+    float dist = _mm_cvtss_f32(finalsum);
+    for(; m<d; m++){
+        float diff = a[m]-b[m];
+        dist += diff*diff;
+    }
+    return dist;
+}
+// Approximate k-NN search.
+//   n, x       number of queries and their data (n*d floats)
+//   k          neighbours requested per query
+//   nprobe     number of nearest buckets scanned per query
+//   bitmask    optional filter indexed by vector id; ids whose byte is 0 are skipped
+//   distances  output, n*k squared L2 distances, ascending; unused slots are -1
+//   labels     output, n*k ids (narrowed to int); unused slots are -1
+// The last parameter is currently unused.
+// Nothing is written when n <= 0 or k <= 0.
+void IndexIVF::search(int n, const float* x, int k, int nprobe, const uint8_t *bitmask, float *distances, int *labels, const uint8_t *vecmini_L1_summary){
+    (void)vecmini_L1_summary;
+    if(n<=0 || k<=0) return;
+    const int probes = nprobe>0 ? nprobe : 0;
+    const std::size_t want = static_cast<std::size_t>(k);
+    std::vector<int>assign(static_cast<std::size_t>(n)*probes);
+    std::vector<float> centroids_distance(assign.size());
+    if(probes>0){
+        router.search(n,x,probes,centroids_distance.data(), assign.data());
+    }
+    for(int i = 0; i<n; i++){
+        // Max-heap on distance: the worst of the current best k sits on top.
+        std::priority_queue<std::pair<float, int>> pq;
+        const float *specquer = x+(static_cast<std::size_t>(i)*d);
+        for(int p= 0; p<probes; p++){
+            int bucketid = assign[static_cast<std::size_t>(i)*probes+p];
+            // The router pads with -1 when fewer than `probes` centroids exist
+            // (nprobe > nbucket, or an untrained index); such entries must not be indexed.
+            if(bucketid<0 || bucketid>=nbucket) continue;
+            const std::vector<uint64_t> &bucket_ids = memory_ids[bucketid];
+            const float *bucketdata = memory[bucketid].data();
+            const int vectorinmemo = static_cast<int>(memory[bucketid].size()/d);
+            for(int j = 0; j<vectorinmemo; j++){
+                // Prefetch the next vector (and its mask byte) while this one is processed.
+                const int next = j+1;
+                if(next < vectorinmemo){
+                    _mm_prefetch((const char*)&bucketdata[static_cast<std::size_t>(next) * d], _MM_HINT_T0);
+                    if (bitmask != nullptr) {
+                        _mm_prefetch((const char*)&bitmask[bucket_ids[next]], _MM_HINT_T0);
+                    }
+                }
+                const uint64_t global_id = bucket_ids[j];
+                if (bitmask != nullptr && bitmask[global_id]==0) {
+                    continue; // filtered out by the metadata mask
+                }
+                const float dist = ivf_l2_sqr(bucketdata + static_cast<std::size_t>(j)*d, specquer, d);
+                if(pq.size()<want){
+                    pq.push({dist, static_cast<int>(global_id)});
+                }else if(dist<pq.top().first){
+                    pq.pop();
+                    pq.push({dist, static_cast<int>(global_id)});
+                }
+            }
+        }
+        float *speldist = distances+(static_cast<std::size_t>(i)*want);
+        int *spelbs = labels+(static_cast<std::size_t>(i)*want);
+        // The heap pops worst-first, so fill the found results back to front.
+        const int count = static_cast<int>(pq.size());
+        for(int c = count-1; c>=0; c--){
+            speldist[c]= pq.top().first;
+            spelbs[c]= pq.top().second;
+            pq.pop();
+        }
+        for(int step = count; step<k; step++){
+            speldist[step]=-1.0;
+            spelbs[step]= -1;
+        }
+    }
+}
+// Loads nbucket precomputed centroids into the router and marks the index as
+// trained. Ignored if the index is already trained.
+void IndexIVF::inject_centroids(const float* external_centroids) {
+    if(!trained){
+        router.add(nbucket, external_centroids);
+        trained = true;
+    }
+}

src/IndexIVFPQ.cpp ADDED Viewed

	@@ -0,0 +1,119 @@

+#include "IndexIVFPQ.h"
+#include "IndexIVF.h"
+#include "clustering.h"
+#include <queue>
+#include <iostream>
+#include <immintrin.h>
+#include <random>
+#include <cstring>
+// Creates an untrained index: nbucket empty code/id lists, a router and a product
+// quantizer with m sub-quantizers over d-dimensional vectors.
+IndexIVFPQ::IndexIVFPQ(int d, int nbucket, int m)
+    : d(d), m(m), nbucket(nbucket), router(d), pq(d, m), codes(nbucket), ids(nbucket) {}
+// Trains the index in two stages: k-means for the coarse centroids (optionally on
+// a random sample of at most 150000 vectors), then the product quantizer on the
+// residuals of all n vectors to their nearest centroid. No-op if already trained.
+void IndexIVFPQ::train(int n, const float *x, bool subsampling, int seed){
+    if(trained)return;
+    coarse_centroids.resize(nbucket*d);
+    const int maxtrain = 150000;
+    const bool use_sample = (n>maxtrain) && subsampling;
+    if(!use_sample){
+        kmean_clustering(d, n, nbucket, x, coarse_centroids.data(), seed);
+    }else{
+        // draw maxtrain rows (with replacement) into a contiguous buffer
+        std::mt19937 gen(seed);
+        std::uniform_int_distribution<int>dis(0,n-1);
+        std::vector<float> sample_buffer(maxtrain * d);
+        for(int row=0; row<maxtrain; ++row){
+            const int picked = dis(gen);
+            std::memcpy(&sample_buffer[row*d], &x[picked * d], d * sizeof(float));
+        }
+        kmean_clustering(d, maxtrain, nbucket, sample_buffer.data(), coarse_centroids.data(), seed);
+    }
+    router.add(nbucket, coarse_centroids.data());
+    // residual = vector minus its nearest coarse centroid
+    std::vector<float>residuals(n*d);
+    std::vector<float> nearest_dist(n);
+    std::vector<int> nearest_id(n);
+    router.search(n,x,1,nearest_dist.data(), nearest_id.data());
+    for(int row = 0; row<n; ++row){
+        const float *vec = x+(row*d);
+        const float *centroid = coarse_centroids.data()+(nearest_id[row]*d);
+        float *out = residuals.data()+(row*d);
+        for(int j = 0; j<d; ++j){
+            out[j] = vec[j] - centroid[j];
+        }
+    }
+    pq.train(n, residuals.data(), subsampling, seed);
+    trained = true;
+}
+// Adds n vectors: each is routed to its nearest coarse centroid, the residual to
+// that centroid is PQ-encoded into m bytes, and code and id are appended to the
+// bucket. Ignored until the index is trained.
+void IndexIVFPQ::add(int n, const float *x, const uint64_t* xids){
+    if (!trained || n<=0) return;
+    std::vector<float> distances(n);
+    std::vector<int> labels(n);
+    router.search(n,x,1,distances.data(), labels.data());
+    // One residual and one code buffer are reused for every vector instead of
+    // allocating n*d floats up front and a fresh code vector per iteration.
+    std::vector<float> residual(d);
+    std::vector<uint8_t> zipvect(m);
+    for(int i = 0;i<n; i++){
+        const int drawerid = labels[i];
+        if(drawerid<0 || drawerid>=nbucket) continue; // router found no centroid
+        const float *vec = x+(static_cast<std::size_t>(i)*d);
+        const float *centroid = coarse_centroids.data()+(static_cast<std::size_t>(drawerid)*d);
+        for(int j = 0; j<d; j++){
+            residual[j] = vec[j]-centroid[j];
+        }
+        pq.encode(residual.data(), zipvect.data());
+        codes[drawerid].insert(codes[drawerid].end(), zipvect.begin(), zipvect.end());
+        ids[drawerid].push_back(xids[i]);
+    }
+}
+// Approximate k-NN search with asymmetric distance computation.
+// For every probed bucket the query residual to the bucket centroid is computed,
+// a distance table to all sub-centroids is built, and each stored code is scored
+// by summing m table entries.
+//   distances  output, n*k approximate squared distances, ascending; unused slots -1
+//   labels     output, n*k ids; unused slots -1
+// Nothing is written when n <= 0 or k <= 0.
+void IndexIVFPQ::search(int n, const float *query, int k, int nprobe, float* distances, int64_t* labels){
+    if(n<=0 || k<=0) return;
+    const int probes = nprobe>0 ? nprobe : 0;
+    const std::size_t want = static_cast<std::size_t>(k);
+    std::vector<int> assign(static_cast<std::size_t>(n)*probes);
+    std::vector<float> coarse_distances(assign.size());
+    if(probes>0){
+        router.search(n,query, probes, coarse_distances.data(),assign.data());
+    }
+    // Scratch buffers are allocated once and reused for every query and probe.
+    std::vector<float> query_residual(d);
+    std::vector<float> distance_table(m*256);
+    for(int i = 0; i<n; i++){
+        // ids are 64-bit, so the heap must carry them without narrowing to int
+        std::priority_queue<std::pair<float, int64_t>> max_heap;
+        const float *q = query+(static_cast<std::size_t>(i)*d);
+        for(int p=0; p<probes; p++){
+            const int drawerid = assign[(static_cast<std::size_t>(i)*probes)+p];
+            // The router pads with -1 when fewer than `probes` centroids exist.
+            if(drawerid<0 || drawerid>=nbucket) continue;
+            const float *centroid = coarse_centroids.data()+(static_cast<std::size_t>(drawerid)*d);
+            int j = 0;
+            // 8 floats per step; stop before a partial block so nothing is read or
+            // written past the end when d is not a multiple of 8
+            for(; j+7<d; j+=8){
+                __m256 ccvec= _mm256_loadu_ps(&centroid[j]);
+                __m256 qrvec= _mm256_loadu_ps(&q[j]);
+                __m256 diffvec = _mm256_sub_ps(qrvec,ccvec);
+                _mm256_storeu_ps(&query_residual[j], diffvec);
+            }
+            for(; j<d; j++){
+                query_residual[j] = q[j] - centroid[j];
+            }
+            pq.compute_distance_table(query_residual.data(), distance_table.data());
+            const std::vector<uint8_t> &bucket_codes = codes[drawerid];
+            const std::vector<int64_t> &bucket_ids = ids[drawerid];
+            const std::size_t nvec = bucket_codes.size()/m;
+            for(std::size_t v = 0; v<nvec; v++){
+                float totaldistance =0.0;
+                for(int m_idx = 0; m_idx<m; m_idx++){
+                    int centroid_id = bucket_codes[(v*m)+m_idx];
+                    totaldistance+=distance_table[centroid_id+(m_idx*256)];
+                }
+                if(max_heap.size()<want){
+                    max_heap.push({totaldistance, bucket_ids[v]});
+                }else if(totaldistance<max_heap.top().first){
+                    max_heap.pop();
+                    max_heap.push({totaldistance, bucket_ids[v]});
+                }
+            }
+        }
+        float *subdist = distances+(static_cast<std::size_t>(i)*want);
+        int64_t *sublbs = labels+(static_cast<std::size_t>(i)*want);
+        // The heap pops worst-first, so fill the found results back to front.
+        const int count = static_cast<int>(max_heap.size());
+        for(int c = count-1; c>=0; c--){
+            subdist[c] = max_heap.top().first;
+            sublbs[c] = max_heap.top().second;
+            max_heap.pop();
+        }
+        for(int fod = count; fod<k; fod++){
+            subdist[fod]=-1.0;
+            sublbs[fod]=-1;
+        }
+    }
+}

src/IndexPQ.cpp ADDED Viewed

	@@ -0,0 +1,93 @@

+#include <IndexPQ.h>
+#include <vector>
+#include "clustering.h"
+#include<immintrin.h>
+#include <random>
+#include <cstring>
+// Sets up m sub-quantizers of 256 centroids each over sub-vectors of d/m
+// dimensions (integer division) and allocates the codebook storage.
+IndexPQ::IndexPQ(int d, int m):d(d), m(m), k_sub(256), d_sub(d/m){
+    centroids.resize(m*d_sub*k_sub);
+}
+// Learns one codebook per sub-vector position. For each of the m positions the
+// matching slice of every training vector is gathered and clustered with k-means
+// (on a random sample of at most 150000 slices when subsampling is requested).
+// Each position uses seed+position. No-op if already trained.
+void IndexPQ::train(int n, const float *x, bool subsampling, int seed){
+    if(trained) return;
+    const int maxtrain = 150000;
+    std::vector<float> slices(n * d_sub);
+    for(int sub = 0; sub < m; ++sub){
+        float *codebook = centroids.data() + (sub * d_sub * k_sub);
+        // gather slice `sub` of every row into a dense n x d_sub matrix
+        for(int row = 0; row < n; ++row){
+            const float* from = x + (row * d) + (sub * d_sub);
+            float* to = slices.data() + (row * d_sub);
+            for(int j = 0; j < d_sub; ++j){
+                to[j] = from[j];
+            }
+        }
+        const bool use_sample = (n > maxtrain) && subsampling;
+        if(!use_sample){
+            kmean_clustering(d_sub, n, k_sub, slices.data(), codebook, seed+sub);
+        } else {
+            std::mt19937 gen(seed + sub);
+            std::uniform_int_distribution<int> dis(0, n - 1);
+            std::vector<float> sample_buffer(maxtrain * d_sub);
+            for(int p = 0; p < maxtrain; ++p){
+                const int picked = dis(gen);
+                std::memcpy(&sample_buffer[p * d_sub], &slices[picked * d_sub], d_sub * sizeof(float));
+            }
+            kmean_clustering(d_sub, maxtrain, k_sub, sample_buffer.data(), codebook, seed+sub);
+        }
+    }
+    trained = true;
+}
+// Encodes one d-dimensional vector into m bytes: for every sub-vector, the index
+// of the closest centroid in that position's codebook. Writes nothing when the
+// quantizer is untrained.
+void IndexPQ::encode(const float *x, uint8_t* out){
+    if(!trained)return;
+    for(int sub = 0; sub<m; ++sub){
+        const float *slice = x + (sub*d_sub);
+        const float *codebook = centroids.data()+(sub*k_sub*d_sub);
+        float best_dist = 1e9;
+        int best_code = 0;
+        for(int code = 0; code<k_sub; ++code){
+            const float* candidate = codebook+(code*d_sub);
+            float acc = 0;
+            for(int j = 0; j<d_sub; ++j){
+                const float delta = slice[j]- candidate[j];
+                acc += delta*delta;
+            }
+            if(acc<best_dist){
+                best_dist = acc;
+                best_code = code;
+            }
+        }
+        out[sub] = best_code;
+    }
+}
+// Precomputes the squared distance between every slice of the query and every
+// centroid of the matching codebook.
+//   query    d floats (m slices of d_sub floats)
+//   outable  output, m*k_sub floats; entry (i*k_sub + j) is the distance between
+//            query slice i and centroid j of codebook i
+// The vector loop only handles complete blocks of 8 floats; the remaining
+// dimensions are summed in scalar code so that a d_sub that is not a multiple
+// of 8 neither reads past the slice nor mixes in neighbouring data.
+void IndexPQ::compute_distance_table(const float *query, float *outable){
+    for(int i =0; i<m; i++){
+        const float* query_slice = query+(i*d_sub);
+        for(int j = 0; j<k_sub; j++){
+            const float *offset= centroids.data()+(i*k_sub*d_sub) + (j*d_sub);
+            __m256 sumvec = _mm256_setzero_ps();
+            int k = 0;
+            for(; k+7<d_sub; k+=8){
+               __m256 offvec= _mm256_loadu_ps(&offset[k]);
+               __m256 querslice= _mm256_loadu_ps(&query_slice[k]);
+               __m256 diffvec =  _mm256_sub_ps(offvec,querslice);
+               sumvec = _mm256_fmadd_ps(diffvec, diffvec, sumvec);
+            }
+            float unpacked[8];
+            _mm256_storeu_ps(unpacked, sumvec);
+            float dist = unpacked[0]+unpacked[1]+
+                         unpacked[2]+unpacked[3]+
+                         unpacked[4]+unpacked[5]+
+                         unpacked[6]+unpacked[7];
+            for(; k<d_sub; k++){
+                float diff = offset[k]-query_slice[k];
+                dist+=diff*diff;
+            }
+            outable[(i*k_sub)+j] = dist;
+        }
+    }
+}

src/bindings.cpp ADDED Viewed

	@@ -0,0 +1,118 @@

+#include <pybind11/pybind11.h>
+#include <pybind11/numpy.h>
+#include "IndexIVF.h"
+#include "IndexIVFPQ.h"
+#include "iostream"
+#include <pybind11/stl.h>
+#include <vector>
+namespace py = pybind11;
+// "vecmini" is the name of the module you will type in python-> 'import vecmini'
+// Python bindings. Every method takes the element count `n` explicitly and reads
+// the numpy buffers as raw C-contiguous arrays; search methods return a
+// (distances, labels) tuple of arrays shaped (n, k).
+PYBIND11_MODULE(vecmini, m) {
+    m.doc() = "Vecmini: A mini custom IVF Vector Database with Metadata Filtering";
+    py::class_<IndexIVF>(m, "IndexIVF")
+        .def(py::init<int, int>(), py::arg("d"), py::arg("nbucket"))
+        .def("train", [](IndexIVF &self, int n, py::array_t<float, py::array::c_style> x) {
+            py::buffer_info buf = x.request();
+            self.train(n, static_cast<const float *>(buf.ptr));
+        }, py::arg("n"), py::arg("x").noconvert())
+        .def("add", [](IndexIVF &self, int n,
+                       py::array_t<float, py::array::c_style | py::array::forcecast> x,
+                       py::array_t<uint64_t, py::array::c_style | py::array::forcecast> xids) {
+            py::buffer_info buf_x = x.request();
+            py::buffer_info buf_xids = xids.request();
+            self.add(n, static_cast<const float *>(buf_x.ptr), static_cast<const uint64_t *>(buf_xids.ptr));
+        }, py::arg("n"), py::arg("x"), py::arg("xids"))
+        // search(): `bitmask` is either None (no filtering) or a uint8 array indexed by vector id
+        .def("search", [](IndexIVF &self, int n,
+                          py::array_t<float, py::array::c_style | py::array::forcecast> x,
+                          int k, int nprobe, py::object bitmask) {
+            py::buffer_info buf_x = x.request();
+            // Empty arrays to hold the answers for Python
+            py::array_t<float> distances({n, k});
+            py::array_t<int> labels({n, k});
+            const uint8_t* bitmask_ptr = nullptr;
+            // Declared outside the branch so the converted array outlives the search call.
+            py::array_t<uint8_t> bitmask_arr;
+            if (!bitmask.is_none()) {
+                bitmask_arr = bitmask.cast<py::array_t<uint8_t, py::array::c_style | py::array::forcecast>>();
+                bitmask_ptr = static_cast<const uint8_t*>(bitmask_arr.request().ptr);
+            }
+            self.search(n, static_cast<const float *>(buf_x.ptr), k, nprobe, bitmask_ptr,
+                        distances.mutable_data(), labels.mutable_data());
+            return py::make_tuple(distances, labels);
+        }, py::arg("n"), py::arg("x"), py::arg("k"), py::arg("nprobe"), py::arg("bitmask"));
+    py::class_<IndexIVFPQ>(m, "IndexIVFPQ")
+        .def(py::init<int, int, int>(),
+        py::arg("d"),
+        py::arg("nbucket"),
+        py::arg("m"))
+        // `seed` is an int: binding it as bool would collapse every seed to 0 or 1
+        .def("train", [](IndexIVFPQ &self, int n, py::array_t<float, py::array::c_style> x, bool subsampling, int seed) {
+            py::buffer_info buf = x.request();
+            self.train(n, static_cast<const float *>(buf.ptr), subsampling, seed);
+        }, py::arg("n"), py::arg("x").noconvert(), py::arg("subsampling"), py::arg("seed"))
+        .def("add", [](IndexIVFPQ &self,int n, py::array_t<float, py::array::c_style> x, py::array_t<uint64_t, py::array::c_style> xids){
+            py::buffer_info bufx = x.request();
+            py::buffer_info bufxids = xids.request();
+            self.add(n, static_cast<const float *>(bufx.ptr),static_cast<const uint64_t *>(bufxids.ptr));
+        }, py::arg("n"), py::arg("x").noconvert(), py::arg("xids").noconvert())
+        .def("search", [](IndexIVFPQ &self, int n,
+                        py::array_t<float, py::array::c_style> query,
+                        int k, int nprobe){
+            py::buffer_info buf_query = query.request();
+            py::array_t<float> distances({n,k});
+            py::array_t<int64_t> labels({n,k});
+            self.search(n, static_cast<const float *>(buf_query.ptr), k, nprobe, distances.mutable_data(), labels.mutable_data());
+            return py::make_tuple(distances, labels);
+        }, py::arg("n"), py::arg("query").noconvert(), py::arg("k"), py::arg("nprobe"));
+    py::class_<IndexFlatL2>(m, "IndexFlatL2")
+        .def(py::init<int>(),
+        py::arg("d"))
+        .def("add", [](IndexFlatL2 &self,int n, py::array_t<float, py::array::c_style> x){
+            py::buffer_info bufx = x.request();
+            self.add(n, static_cast<const float *>(bufx.ptr));
+        }, py::arg("n"), py::arg("x").noconvert())
+        .def("search", [](IndexFlatL2 &self, int n,
+                        py::array_t<float, py::array::c_style> x,
+                        int k){
+            py::buffer_info bufx = x.request();
+            py::array_t<float> distances({n,k});
+            py::array_t<int> labels({n,k});
+            self.search(n, static_cast<const float *>(bufx.ptr), k, distances.mutable_data(), labels.mutable_data());
+            return py::make_tuple(distances, labels);
+        }, py::arg("n"), py::arg("x").noconvert(), py::arg("k"));
+}

src/clustering.cpp ADDED Viewed

	@@ -0,0 +1,40 @@

+#include<vector>
+#include "clustering.h"
+#include "IndexFlat.h"
+#include <random>
+#include <cstring>
+#include <cmath>
+// Lloyd's k-means with a fixed number of iterations.
+//   d, n       dimension and number of the input vectors in x (n*d floats)
+//   k          number of centroids to produce
+//   centroids  output buffer of k*d floats
+//   seed       seed for the random choice of the initial centroids
+// Initial centroids are k distinct input rows when k <= n (drawing with
+// replacement could pick the same row twice, and a duplicate centroid never
+// receives any assignment); when k > n rows are drawn with replacement.
+// A centroid that ends an iteration with no assigned vectors keeps its position.
+// Returns without touching centroids when d, n or k is not positive.
+void kmean_clustering(int d, int n, int k, const float *x, float *centroids, int seed){
+    if(d <= 0 || n <= 0 || k <= 0) return;
+    const std::size_t dim = static_cast<std::size_t>(d);
+    std::mt19937 gen(seed);
+    if(k <= n){
+        // partial Fisher-Yates shuffle: the first k entries are a random sample without repeats
+        std::vector<int> order(n);
+        for(int i = 0; i < n; i++) order[i] = i;
+        for(int i = 0; i < k; i++){
+            std::uniform_int_distribution<int> pick(i, n - 1);
+            const int j = pick(gen);
+            const int chosen = order[j];
+            order[j] = order[i];
+            order[i] = chosen;
+            std::memcpy(centroids + (i * dim), x + (chosen * dim), dim * sizeof(float));
+        }
+    }else{
+        std::uniform_int_distribution<int> distr(0, n - 1);
+        for (int i = 0; i < k; i++) {
+            const int rand_idx = distr(gen);
+            std::memcpy(centroids + (i * dim), x + (rand_idx * dim), dim * sizeof(float));
+        }
+    }
+    const int niter = 15;
+    std::vector<int> assign(n);
+    std::vector<float> distances(n);
+    std::vector<float> newcentroid(static_cast<std::size_t>(k)*dim);
+    std::vector<int> counts(k);
+    for(int iter = 0; iter<niter; iter++){
+        // assignment step: nearest current centroid for every vector
+        IndexFlatL2 index(d);
+        index.add(k,centroids);
+        index.search(n,x,1,distances.data(), assign.data());
+        // update step: each centroid becomes the mean of its assigned vectors
+        std::fill(newcentroid.begin(), newcentroid.end(), 0.0f);
+        std::fill(counts.begin(), counts.end(), 0);
+        for(int i = 0; i<n; i++){
+            const int c = assign[i];
+            counts[c]+=1;
+            const float *vec = x + (i*dim);
+            float *sum = newcentroid.data() + (c*dim);
+            for(int m =0; m<d; m++){
+                sum[m] += vec[m];
+            }
+        }
+        for(int c = 0; c<k; c++){
+            if (counts[c]>0){
+                for(int m = 0; m<d; m++){
+                    centroids[c*dim+m] = newcentroid[c*dim+m]/counts[c];
+                }
+            }
+        }
+    }
+}

src/rand.json ADDED Viewed

The diff for this file is too large to render. See raw diff