Spaces:

CAPS-IDI
/

caps-chatbot-internal

Sleeping

atwine Devin committed 12 days ago

Commit

543fbbd

1 Parent(s): c2a8c7b

Replace FAISS index with portable numpy embeddings in pkl

FAISS C++ objects are not safely picklable across OS/build environments
(Windows pkl cannot be loaded on HF Space Linux). Fix: store raw
L2-normalised numpy embeddings in the pkl and perform retrieval via
a plain dot product. No FAISS dependency at runtime on the Space.

- build_index.py: build_embeddings() replaces build_faiss_index();
saves embeddings ndarray instead of faiss.IndexFlatIP object
- retrieval.py: pure numpy dot product, zero FAISS calls at runtime
- app.py: variable renamed _rag_index -> _rag_embeddings
- data/sanyu_knowledge_base.pkl: rebuilt with new format (167 chunks)

Generated with [Devin](https://cli.devin.ai/docs)

Co-Authored-By: Devin <158243242+devin-ai-integration[bot]@users.noreply.github.com>

Files changed (4) hide show

app.py +2 -2
build_index.py +20 -20
data/sanyu_knowledge_base.pkl +2 -2
retrieval.py +20 -27

app.py CHANGED Viewed

@@ -8,7 +8,7 @@ from retrieval import load_index, retrieve
 client = genai.Client(api_key=os.environ.get("GOOGLE_API_KEY"))
 # Load FAISS index and embedding model once at startup
-_rag_index, _rag_chunks, _rag_model_name = load_index()
 _rag_model = SentenceTransformer(_rag_model_name)
 META_PROMPT = """<system>
@@ -393,7 +393,7 @@ def respond(message, history):
     # Retrieve relevant chunks from the FAISS index for this query
     query_text = extract_text(message)
-    rag_results = retrieve(query_text, _rag_index, _rag_chunks, _rag_model, top_k=4)
     # Inject retrieved context as a separate Content block before conversation history
     if rag_results:

 client = genai.Client(api_key=os.environ.get("GOOGLE_API_KEY"))
 # Load pre-computed chunk embeddings and embedding model once at startup
+_rag_embeddings, _rag_chunks, _rag_model_name = load_index()
 _rag_model = SentenceTransformer(_rag_model_name)
 META_PROMPT = """<system>
     # Retrieve relevant chunks from the pre-computed embeddings for this query
     query_text = extract_text(message)
+    rag_results = retrieve(query_text, _rag_embeddings, _rag_chunks, _rag_model, top_k=4)
     # Inject retrieved context as a separate Content block before conversation history
     if rag_results:

build_index.py CHANGED Viewed

@@ -17,7 +17,6 @@ import pickle
 import pdfplumber
 import numpy as np
-import faiss
 from sentence_transformers import SentenceTransformer
@@ -215,12 +214,15 @@ def chunk_section(section: dict,
 # 4.5  Embedding and FAISS index
 # ---------------------------------------------------------------------------
-def build_faiss_index(all_chunks: list,
-                      model_name: str = EMBEDDING_MODEL
-                      ) -> tuple:
     """
-    Embeds all chunks and builds a FAISS IndexFlatIP (cosine similarity).
-    Returns (index, chunks) tuple.
     """
     model = SentenceTransformer(model_name)
@@ -229,25 +231,23 @@ def build_faiss_index(all_chunks: list,
     embeddings = model.encode(texts, show_progress_bar=False, batch_size=32)
     embeddings = np.array(embeddings, dtype='float32')
-    # Normalise for cosine similarity
-    faiss.normalize_L2(embeddings)
-    # Build flat index (exact search — fine for this document size)
-    dimension = embeddings.shape[1]
-    index = faiss.IndexFlatIP(dimension)  # Inner product = cosine after normalisation
-    index.add(embeddings)
-    print(f"Index built: {index.ntotal} vectors, dimension {dimension}")
-    return index, all_chunks
 # ---------------------------------------------------------------------------
 # 4.6  Serialisation
 # ---------------------------------------------------------------------------
-def save_index(index, chunks: list, output_path: str):
     """
-    Saves the FAISS index and chunk metadata to a single .pkl file.
     """
     # Ensure the output directory exists
     output_dir = os.path.dirname(output_path)
@@ -255,7 +255,7 @@ def save_index(index, chunks: list, output_path: str):
         os.makedirs(output_dir, exist_ok=True)
     payload = {
-        'index': index,
         'chunks': chunks,
         'embedding_model': EMBEDDING_MODEL,
         'chunk_count': len(chunks),
@@ -328,8 +328,8 @@ def main():
             print("  [... truncated for display ...]")
     # --- Build FAISS index and save ---
-    index, chunks = build_faiss_index(all_chunks)
-    save_index(index, chunks, OUTPUT_PATH)
     print(f"\nDone. Upload '{OUTPUT_PATH}' to your Hugging Face Space.")

 import pdfplumber
 import numpy as np
 from sentence_transformers import SentenceTransformer
 # 4.5  Embedding (L2-normalised numpy vectors; no FAISS index)
 # ---------------------------------------------------------------------------
+def build_embeddings(all_chunks: list,
+                     model_name: str = EMBEDDING_MODEL
+                     ) -> tuple:
     """
+    Embeds all chunks and returns L2-normalised numpy embeddings.
+    Returns raw numpy arrays instead of a FAISS index object so that the
+    .pkl written from them is portable across OS/FAISS versions (FAISS C++
+    objects are not safely picklable across different platform builds).
+    Returns (embeddings, chunks) tuple.
     """
     model = SentenceTransformer(model_name)
     embeddings = model.encode(texts, show_progress_bar=False, batch_size=32)
     embeddings = np.array(embeddings, dtype='float32')
+    # Normalise for cosine similarity (dot product == cosine on unit vectors)
+    norms = np.linalg.norm(embeddings, axis=1, keepdims=True)
+    embeddings = embeddings / np.maximum(norms, 1e-10)
+    print(f"Embeddings built: {embeddings.shape[0]} vectors, dimension {embeddings.shape[1]}")
+    return embeddings, all_chunks
 # ---------------------------------------------------------------------------
 # 4.6  Serialisation
 # ---------------------------------------------------------------------------
+def save_index(embeddings: np.ndarray, chunks: list, output_path: str):
     """
+    Saves L2-normalised numpy embeddings and chunk metadata to a .pkl file.
+    Raw numpy arrays are used instead of a FAISS index object to ensure
+    cross-platform portability (Windows build → Linux HF Space runtime).
     """
     # Ensure the output directory exists
     output_dir = os.path.dirname(output_path)
         os.makedirs(output_dir, exist_ok=True)
     payload = {
+        'embeddings': embeddings,   # np.ndarray float32, shape (n, d), L2-normalised
         'chunks': chunks,
         'embedding_model': EMBEDDING_MODEL,
         'chunk_count': len(chunks),
             print("  [... truncated for display ...]")
     # --- Build embeddings and save ---
+    embeddings, chunks = build_embeddings(all_chunks)
+    save_index(embeddings, chunks, OUTPUT_PATH)
     print(f"\nDone. Upload '{OUTPUT_PATH}' to your Hugging Face Space.")

data/sanyu_knowledge_base.pkl CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:eb28d0fb9ebba59356c14cd72be979c2e2c2310a50d76e19a68eca39ad70d0a1
-size 432034

 version https://git-lfs.github.com/spec/v1
+oid sha256:2fafd196b78a01cfe20f8c55d1f1c81fe919d5fc57049e48da0e64e3e2537310
+size 432077

retrieval.py CHANGED Viewed

@@ -2,10 +2,12 @@
 retrieval.py
 ------------
 Sanyu RAG — Retrieval Module
-Implements load_index() and retrieve() as described in devin-instructions.md (Section 4.7).
-This module is imported by the Sanyu app (app.py) at runtime to serve relevant
-context chunks in response to user queries.
 The .pkl file is located at: data/sanyu_knowledge_base.pkl (hardcoded as agreed).
 """
@@ -15,68 +17,59 @@ import numpy as np
 from sentence_transformers import SentenceTransformer
-# Path to the serialised FAISS index — hardcoded as confirmed with Atwine.
 DEFAULT_PKL_PATH = 'data/sanyu_knowledge_base.pkl'
 def load_index(pkl_path: str = DEFAULT_PKL_PATH) -> tuple:
     """
-    Loads the FAISS index and chunk metadata from the .pkl file.
     Returns:
-        (index, chunks, model_name)
-        - index: faiss.IndexFlatIP
         - chunks: list of chunk dicts (text + metadata)
         - model_name: str, the embedding model used to build the index
     """
     with open(pkl_path, 'rb') as f:
         payload = pickle.load(f)
-    return payload['index'], payload['chunks'], payload['embedding_model']
 def retrieve(query: str,
-             index,
              chunks: list,
              model: SentenceTransformer,
              top_k: int = 4) -> list:
     """
     Retrieves the top_k most relevant chunks for a given query.
-    Uses numpy dot product directly against stored vectors extracted from the
-    IndexFlatIP index — bypasses index.search() entirely to avoid FAISS
-    SWIG API version incompatibilities across environments.
     Args:
         query: The user's input string.
-        index: A loaded faiss.IndexFlatIP object.
-        chunks: The list of chunk dicts associated with the index.
         model: A loaded SentenceTransformer model instance.
-        top_k: Number of results to return (default 4, per spec).
     Returns:
-        List of chunk dicts, each with an added 'similarity_score' key,
         ordered from most to least relevant.
     """
     query_embedding = model.encode([query], normalize_embeddings=True)
     query_embedding = np.array(query_embedding, dtype='float32')  # shape (1, d)
-    # Extract all stored vectors from the FAISS flat index into a numpy array.
-    # reconstruct_n(start, n) is stable across all FAISS versions.
-    n_total = index.ntotal
-    d = index.d
-    all_vectors = np.zeros((n_total, d), dtype='float32')
-    for i in range(n_total):
-        all_vectors[i] = index.reconstruct(i)
-    # Cosine similarity via dot product (vectors are already L2-normalised)
-    scores = (query_embedding @ all_vectors.T).flatten()  # shape (n_total,)
     # Get top_k indices sorted by descending score
     top_indices = np.argsort(scores)[::-1][:top_k]
     results = []
     for idx in top_indices:
-        chunk = chunks[idx].copy()
         chunk['similarity_score'] = float(scores[idx])
         results.append(chunk)

 retrieval.py
 ------------
 Sanyu RAG — Retrieval Module
+Loads pre-computed L2-normalised numpy embeddings from the .pkl file and
+performs retrieval via a simple dot-product similarity search (pure numpy).
+No FAISS dependency at runtime — avoids FAISS SWIG binary incompatibilities
+between build environments (Windows vs Linux HF Space).
 The .pkl file is located at: data/sanyu_knowledge_base.pkl (hardcoded as agreed).
 """
 from sentence_transformers import SentenceTransformer
+# Path to the serialised knowledge base — hardcoded as confirmed with Atwine.
 DEFAULT_PKL_PATH = 'data/sanyu_knowledge_base.pkl'
 def load_index(pkl_path: str = DEFAULT_PKL_PATH) -> tuple:
     """
+    Loads the embeddings and chunk metadata from the .pkl file.
     Returns:
+        (embeddings, chunks, model_name)
+        - embeddings: np.ndarray float32, shape (n, d), L2-normalised
         - chunks: list of chunk dicts (text + metadata)
         - model_name: str, the embedding model used to build the index
     """
     with open(pkl_path, 'rb') as f:
         payload = pickle.load(f)
+    return payload['embeddings'], payload['chunks'], payload['embedding_model']
 def retrieve(query: str,
+             embeddings: np.ndarray,
              chunks: list,
              model: SentenceTransformer,
              top_k: int = 4) -> list:
     """
     Retrieves the top_k most relevant chunks for a given query.
+    Uses a numpy dot product against pre-computed L2-normalised embeddings
+    (equivalent to cosine similarity). No FAISS required at runtime.
     Args:
         query: The user's input string.
+        embeddings: np.ndarray of shape (n, d), L2-normalised chunk embeddings.
+        chunks: The list of chunk dicts corresponding to the embeddings.
         model: A loaded SentenceTransformer model instance.
+        top_k: Number of results to return (default 4).
     Returns:
+        List of chunk dicts with an added 'similarity_score' key,
         ordered from most to least relevant.
     """
     query_embedding = model.encode([query], normalize_embeddings=True)
     query_embedding = np.array(query_embedding, dtype='float32')  # shape (1, d)
+    # Cosine similarity via dot product (both sides are L2-normalised)
+    scores = (query_embedding @ embeddings.T).flatten()  # shape (n,)
     # Get top_k indices sorted by descending score
     top_indices = np.argsort(scores)[::-1][:top_k]
     results = []
     for idx in top_indices:
+        chunk = chunks[int(idx)].copy()
         chunk['similarity_score'] = float(scores[idx])
         results.append(chunk)