atwine Devin committed on
Commit
543fbbd
·
1 Parent(s): c2a8c7b

Replace FAISS index with portable numpy embeddings in pkl

Browse files

FAISS C++ objects are not safely picklable across OS/build environments
(Windows pkl cannot be loaded on HF Space Linux). Fix: store raw
L2-normalised numpy embeddings in the pkl and perform retrieval via
a plain dot product. No FAISS dependency at runtime on the Space.

- build_index.py: build_embeddings() replaces build_faiss_index();
saves embeddings ndarray instead of faiss.IndexFlatIP object
- retrieval.py: pure numpy dot product, zero FAISS calls at runtime
- app.py: variable renamed _rag_index -> _rag_embeddings
- data/sanyu_knowledge_base.pkl: rebuilt with new format (167 chunks)

Generated with [Devin](https://cli.devin.ai/docs)

Co-Authored-By: Devin <158243242+devin-ai-integration[bot]@users.noreply.github.com>

Files changed (4) hide show
  1. app.py +2 -2
  2. build_index.py +20 -20
  3. data/sanyu_knowledge_base.pkl +2 -2
  4. retrieval.py +20 -27
app.py CHANGED
@@ -8,7 +8,7 @@ from retrieval import load_index, retrieve
8
  client = genai.Client(api_key=os.environ.get("GOOGLE_API_KEY"))
9
 
10
  # Load FAISS index and embedding model once at startup
11
- _rag_index, _rag_chunks, _rag_model_name = load_index()
12
  _rag_model = SentenceTransformer(_rag_model_name)
13
 
14
  META_PROMPT = """<system>
@@ -393,7 +393,7 @@ def respond(message, history):
393
 
394
  # Retrieve relevant chunks from the FAISS index for this query
395
  query_text = extract_text(message)
396
- rag_results = retrieve(query_text, _rag_index, _rag_chunks, _rag_model, top_k=4)
397
 
398
  # Inject retrieved context as a separate Content block before conversation history
399
  if rag_results:
 
8
  client = genai.Client(api_key=os.environ.get("GOOGLE_API_KEY"))
9
 
10
  # Load FAISS index and embedding model once at startup
11
+ _rag_embeddings, _rag_chunks, _rag_model_name = load_index()
12
  _rag_model = SentenceTransformer(_rag_model_name)
13
 
14
  META_PROMPT = """<system>
 
393
 
394
  # Retrieve relevant chunks from the FAISS index for this query
395
  query_text = extract_text(message)
396
+ rag_results = retrieve(query_text, _rag_embeddings, _rag_chunks, _rag_model, top_k=4)
397
 
398
  # Inject retrieved context as a separate Content block before conversation history
399
  if rag_results:
build_index.py CHANGED
@@ -17,7 +17,6 @@ import pickle
17
 
18
  import pdfplumber
19
  import numpy as np
20
- import faiss
21
  from sentence_transformers import SentenceTransformer
22
 
23
 
@@ -215,12 +214,15 @@ def chunk_section(section: dict,
215
  # 4.5 Embedding and FAISS index
216
  # ---------------------------------------------------------------------------
217
 
218
- def build_faiss_index(all_chunks: list,
219
- model_name: str = EMBEDDING_MODEL
220
- ) -> tuple:
221
  """
222
- Embeds all chunks and builds a FAISS IndexFlatIP (cosine similarity).
223
- Returns (index, chunks) tuple.
 
 
 
224
  """
225
  model = SentenceTransformer(model_name)
226
 
@@ -229,25 +231,23 @@ def build_faiss_index(all_chunks: list,
229
  embeddings = model.encode(texts, show_progress_bar=False, batch_size=32)
230
  embeddings = np.array(embeddings, dtype='float32')
231
 
232
- # Normalise for cosine similarity
233
- faiss.normalize_L2(embeddings)
 
234
 
235
- # Build flat index (exact search — fine for this document size)
236
- dimension = embeddings.shape[1]
237
- index = faiss.IndexFlatIP(dimension) # Inner product = cosine after normalisation
238
- index.add(embeddings)
239
-
240
- print(f"Index built: {index.ntotal} vectors, dimension {dimension}")
241
- return index, all_chunks
242
 
243
 
244
  # ---------------------------------------------------------------------------
245
  # 4.6 Serialisation
246
  # ---------------------------------------------------------------------------
247
 
248
- def save_index(index, chunks: list, output_path: str):
249
  """
250
- Saves the FAISS index and chunk metadata to a single .pkl file.
 
 
251
  """
252
  # Ensure the output directory exists
253
  output_dir = os.path.dirname(output_path)
@@ -255,7 +255,7 @@ def save_index(index, chunks: list, output_path: str):
255
  os.makedirs(output_dir, exist_ok=True)
256
 
257
  payload = {
258
- 'index': index,
259
  'chunks': chunks,
260
  'embedding_model': EMBEDDING_MODEL,
261
  'chunk_count': len(chunks),
@@ -328,8 +328,8 @@ def main():
328
  print(" [... truncated for display ...]")
329
 
330
  # --- Build FAISS index and save ---
331
- index, chunks = build_faiss_index(all_chunks)
332
- save_index(index, chunks, OUTPUT_PATH)
333
 
334
  print(f"\nDone. Upload '{OUTPUT_PATH}' to your Hugging Face Space.")
335
 
 
17
 
18
  import pdfplumber
19
  import numpy as np
 
20
  from sentence_transformers import SentenceTransformer
21
 
22
 
 
214
  # 4.5 Embedding and FAISS index
215
  # ---------------------------------------------------------------------------
216
 
217
+ def build_embeddings(all_chunks: list,
218
+ model_name: str = EMBEDDING_MODEL
219
+ ) -> tuple:
220
  """
221
+ Embeds all chunks and returns L2-normalised numpy embeddings.
222
+ Stores raw numpy arrays instead of a FAISS index object so the .pkl
223
+ is portable across OS/FAISS versions (FAISS C++ objects are not
224
+ safely picklable across different platform builds).
225
+ Returns (embeddings, chunks) tuple.
226
  """
227
  model = SentenceTransformer(model_name)
228
 
 
231
  embeddings = model.encode(texts, show_progress_bar=False, batch_size=32)
232
  embeddings = np.array(embeddings, dtype='float32')
233
 
234
+ # Normalise for cosine similarity (dot product == cosine on unit vectors)
235
+ norms = np.linalg.norm(embeddings, axis=1, keepdims=True)
236
+ embeddings = embeddings / np.maximum(norms, 1e-10)
237
 
238
+ print(f"Embeddings built: {embeddings.shape[0]} vectors, dimension {embeddings.shape[1]}")
239
+ return embeddings, all_chunks
 
 
 
 
 
240
 
241
 
242
  # ---------------------------------------------------------------------------
243
  # 4.6 Serialisation
244
  # ---------------------------------------------------------------------------
245
 
246
+ def save_index(embeddings: np.ndarray, chunks: list, output_path: str):
247
  """
248
+ Saves L2-normalised numpy embeddings and chunk metadata to a .pkl file.
249
+ Raw numpy arrays are used instead of a FAISS index object to ensure
250
+ cross-platform portability (Windows build → Linux HF Space runtime).
251
  """
252
  # Ensure the output directory exists
253
  output_dir = os.path.dirname(output_path)
 
255
  os.makedirs(output_dir, exist_ok=True)
256
 
257
  payload = {
258
+ 'embeddings': embeddings, # np.ndarray float32, shape (n, d), L2-normalised
259
  'chunks': chunks,
260
  'embedding_model': EMBEDDING_MODEL,
261
  'chunk_count': len(chunks),
 
328
  print(" [... truncated for display ...]")
329
 
330
  # --- Build FAISS index and save ---
331
+ embeddings, chunks = build_embeddings(all_chunks)
332
+ save_index(embeddings, chunks, OUTPUT_PATH)
333
 
334
  print(f"\nDone. Upload '{OUTPUT_PATH}' to your Hugging Face Space.")
335
 
data/sanyu_knowledge_base.pkl CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:eb28d0fb9ebba59356c14cd72be979c2e2c2310a50d76e19a68eca39ad70d0a1
3
- size 432034
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2fafd196b78a01cfe20f8c55d1f1c81fe919d5fc57049e48da0e64e3e2537310
3
+ size 432077
retrieval.py CHANGED
@@ -2,10 +2,12 @@
2
  retrieval.py
3
  ------------
4
  Sanyu RAG — Retrieval Module
5
- Implements load_index() and retrieve() as described in devin-instructions.md (Section 4.7).
6
 
7
- This module is imported by the Sanyu app (app.py) at runtime to serve relevant
8
- context chunks in response to user queries.
 
 
 
9
 
10
  The .pkl file is located at: data/sanyu_knowledge_base.pkl (hardcoded as agreed).
11
  """
@@ -15,68 +17,59 @@ import numpy as np
15
  from sentence_transformers import SentenceTransformer
16
 
17
 
18
- # Path to the serialised FAISS index — hardcoded as confirmed with Atwine.
19
  DEFAULT_PKL_PATH = 'data/sanyu_knowledge_base.pkl'
20
 
21
 
22
  def load_index(pkl_path: str = DEFAULT_PKL_PATH) -> tuple:
23
  """
24
- Loads the FAISS index and chunk metadata from the .pkl file.
25
 
26
  Returns:
27
- (index, chunks, model_name)
28
- - index: faiss.IndexFlatIP
29
  - chunks: list of chunk dicts (text + metadata)
30
  - model_name: str, the embedding model used to build the index
31
  """
32
  with open(pkl_path, 'rb') as f:
33
  payload = pickle.load(f)
34
- return payload['index'], payload['chunks'], payload['embedding_model']
35
 
36
 
37
  def retrieve(query: str,
38
- index,
39
  chunks: list,
40
  model: SentenceTransformer,
41
  top_k: int = 4) -> list:
42
  """
43
  Retrieves the top_k most relevant chunks for a given query.
44
 
45
- Uses numpy dot product directly against stored vectors extracted from the
46
- IndexFlatIP index — bypasses index.search() entirely to avoid FAISS
47
- SWIG API version incompatibilities across environments.
48
 
49
  Args:
50
  query: The user's input string.
51
- index: A loaded faiss.IndexFlatIP object.
52
- chunks: The list of chunk dicts associated with the index.
53
  model: A loaded SentenceTransformer model instance.
54
- top_k: Number of results to return (default 4, per spec).
55
 
56
  Returns:
57
- List of chunk dicts, each with an added 'similarity_score' key,
58
  ordered from most to least relevant.
59
  """
60
  query_embedding = model.encode([query], normalize_embeddings=True)
61
  query_embedding = np.array(query_embedding, dtype='float32') # shape (1, d)
62
 
63
- # Extract all stored vectors from the FAISS flat index into a numpy array.
64
- # reconstruct_n(start, n) is stable across all FAISS versions.
65
- n_total = index.ntotal
66
- d = index.d
67
- all_vectors = np.zeros((n_total, d), dtype='float32')
68
- for i in range(n_total):
69
- all_vectors[i] = index.reconstruct(i)
70
-
71
- # Cosine similarity via dot product (vectors are already L2-normalised)
72
- scores = (query_embedding @ all_vectors.T).flatten() # shape (n_total,)
73
 
74
  # Get top_k indices sorted by descending score
75
  top_indices = np.argsort(scores)[::-1][:top_k]
76
 
77
  results = []
78
  for idx in top_indices:
79
- chunk = chunks[idx].copy()
80
  chunk['similarity_score'] = float(scores[idx])
81
  results.append(chunk)
82
 
 
2
  retrieval.py
3
  ------------
4
  Sanyu RAG — Retrieval Module
 
5
 
6
+ Loads pre-computed L2-normalised numpy embeddings from the .pkl file and
7
+ performs retrieval via a simple dot-product similarity search (pure numpy).
8
+
9
+ No FAISS dependency at runtime — avoids FAISS SWIG binary incompatibilities
10
+ between build environments (Windows vs Linux HF Space).
11
 
12
  The .pkl file is located at: data/sanyu_knowledge_base.pkl (hardcoded as agreed).
13
  """
 
17
  from sentence_transformers import SentenceTransformer
18
 
19
 
20
+ # Path to the serialised knowledge base — hardcoded as confirmed with Atwine.
21
  DEFAULT_PKL_PATH = 'data/sanyu_knowledge_base.pkl'
22
 
23
 
24
  def load_index(pkl_path: str = DEFAULT_PKL_PATH) -> tuple:
25
  """
26
+ Loads the embeddings and chunk metadata from the .pkl file.
27
 
28
  Returns:
29
+ (embeddings, chunks, model_name)
30
+ - embeddings: np.ndarray float32, shape (n, d), L2-normalised
31
  - chunks: list of chunk dicts (text + metadata)
32
  - model_name: str, the embedding model used to build the index
33
  """
34
  with open(pkl_path, 'rb') as f:
35
  payload = pickle.load(f)
36
+ return payload['embeddings'], payload['chunks'], payload['embedding_model']
37
 
38
 
39
  def retrieve(query: str,
40
+ embeddings: np.ndarray,
41
  chunks: list,
42
  model: SentenceTransformer,
43
  top_k: int = 4) -> list:
44
  """
45
  Retrieves the top_k most relevant chunks for a given query.
46
 
47
+ Uses a numpy dot product against pre-computed L2-normalised embeddings
48
+ (equivalent to cosine similarity). No FAISS required at runtime.
 
49
 
50
  Args:
51
  query: The user's input string.
52
+ embeddings: np.ndarray of shape (n, d), L2-normalised chunk embeddings.
53
+ chunks: The list of chunk dicts corresponding to the embeddings.
54
  model: A loaded SentenceTransformer model instance.
55
+ top_k: Number of results to return (default 4).
56
 
57
  Returns:
58
+ List of chunk dicts with an added 'similarity_score' key,
59
  ordered from most to least relevant.
60
  """
61
  query_embedding = model.encode([query], normalize_embeddings=True)
62
  query_embedding = np.array(query_embedding, dtype='float32') # shape (1, d)
63
 
64
+ # Cosine similarity via dot product (both sides are L2-normalised)
65
+ scores = (query_embedding @ embeddings.T).flatten() # shape (n,)
 
 
 
 
 
 
 
 
66
 
67
  # Get top_k indices sorted by descending score
68
  top_indices = np.argsort(scores)[::-1][:top_k]
69
 
70
  results = []
71
  for idx in top_indices:
72
+ chunk = chunks[int(idx)].copy()
73
  chunk['similarity_score'] = float(scores[idx])
74
  results.append(chunk)
75