Spaces:
Sleeping
Sleeping
Replace FAISS index with portable numpy embeddings in pkl
Browse files
FAISS C++ objects are not safely picklable across OS/build environments
(Windows pkl cannot be loaded on HF Space Linux). Fix: store raw
L2-normalised numpy embeddings in the pkl and perform retrieval via
a plain dot product. No FAISS dependency at runtime on the Space.
- build_index.py: build_embeddings() replaces build_faiss_index();
saves embeddings ndarray instead of faiss.IndexFlatIP object
- retrieval.py: pure numpy dot product, zero FAISS calls at runtime
- app.py: variable renamed _rag_index -> _rag_embeddings
- data/sanyu_knowledge_base.pkl: rebuilt with new format (167 chunks)
Generated with [Devin](https://cli.devin.ai/docs)
Co-Authored-By: Devin <158243242+devin-ai-integration[bot]@users.noreply.github.com>
- app.py +2 -2
- build_index.py +20 -20
- data/sanyu_knowledge_base.pkl +2 -2
- retrieval.py +20 -27
app.py
CHANGED
|
@@ -8,7 +8,7 @@ from retrieval import load_index, retrieve
|
|
| 8 |
client = genai.Client(api_key=os.environ.get("GOOGLE_API_KEY"))
|
| 9 |
|
| 10 |
# Load FAISS index and embedding model once at startup
|
| 11 |
-
|
| 12 |
_rag_model = SentenceTransformer(_rag_model_name)
|
| 13 |
|
| 14 |
META_PROMPT = """<system>
|
|
@@ -393,7 +393,7 @@ def respond(message, history):
|
|
| 393 |
|
| 394 |
# Retrieve relevant chunks from the FAISS index for this query
|
| 395 |
query_text = extract_text(message)
|
| 396 |
-
rag_results = retrieve(query_text,
|
| 397 |
|
| 398 |
# Inject retrieved context as a separate Content block before conversation history
|
| 399 |
if rag_results:
|
|
|
|
| 8 |
client = genai.Client(api_key=os.environ.get("GOOGLE_API_KEY"))
|
| 9 |
|
| 10 |
# Load FAISS index and embedding model once at startup
|
| 11 |
+
_rag_embeddings, _rag_chunks, _rag_model_name = load_index()
|
| 12 |
_rag_model = SentenceTransformer(_rag_model_name)
|
| 13 |
|
| 14 |
META_PROMPT = """<system>
|
|
|
|
| 393 |
|
| 394 |
# Retrieve relevant chunks from the FAISS index for this query
|
| 395 |
query_text = extract_text(message)
|
| 396 |
+
rag_results = retrieve(query_text, _rag_embeddings, _rag_chunks, _rag_model, top_k=4)
|
| 397 |
|
| 398 |
# Inject retrieved context as a separate Content block before conversation history
|
| 399 |
if rag_results:
|
build_index.py
CHANGED
|
@@ -17,7 +17,6 @@ import pickle
|
|
| 17 |
|
| 18 |
import pdfplumber
|
| 19 |
import numpy as np
|
| 20 |
-
import faiss
|
| 21 |
from sentence_transformers import SentenceTransformer
|
| 22 |
|
| 23 |
|
|
@@ -215,12 +214,15 @@ def chunk_section(section: dict,
|
|
| 215 |
# 4.5 Embedding and FAISS index
|
| 216 |
# ---------------------------------------------------------------------------
|
| 217 |
|
| 218 |
-
def
|
| 219 |
-
|
| 220 |
-
|
| 221 |
"""
|
| 222 |
-
Embeds all chunks and
|
| 223 |
-
|
|
|
|
|
|
|
|
|
|
| 224 |
"""
|
| 225 |
model = SentenceTransformer(model_name)
|
| 226 |
|
|
@@ -229,25 +231,23 @@ def build_faiss_index(all_chunks: list,
|
|
| 229 |
embeddings = model.encode(texts, show_progress_bar=False, batch_size=32)
|
| 230 |
embeddings = np.array(embeddings, dtype='float32')
|
| 231 |
|
| 232 |
-
# Normalise for cosine similarity
|
| 233 |
-
|
|
|
|
| 234 |
|
| 235 |
-
|
| 236 |
-
|
| 237 |
-
index = faiss.IndexFlatIP(dimension) # Inner product = cosine after normalisation
|
| 238 |
-
index.add(embeddings)
|
| 239 |
-
|
| 240 |
-
print(f"Index built: {index.ntotal} vectors, dimension {dimension}")
|
| 241 |
-
return index, all_chunks
|
| 242 |
|
| 243 |
|
| 244 |
# ---------------------------------------------------------------------------
|
| 245 |
# 4.6 Serialisation
|
| 246 |
# ---------------------------------------------------------------------------
|
| 247 |
|
| 248 |
-
def save_index(
|
| 249 |
"""
|
| 250 |
-
Saves
|
|
|
|
|
|
|
| 251 |
"""
|
| 252 |
# Ensure the output directory exists
|
| 253 |
output_dir = os.path.dirname(output_path)
|
|
@@ -255,7 +255,7 @@ def save_index(index, chunks: list, output_path: str):
|
|
| 255 |
os.makedirs(output_dir, exist_ok=True)
|
| 256 |
|
| 257 |
payload = {
|
| 258 |
-
'
|
| 259 |
'chunks': chunks,
|
| 260 |
'embedding_model': EMBEDDING_MODEL,
|
| 261 |
'chunk_count': len(chunks),
|
|
@@ -328,8 +328,8 @@ def main():
|
|
| 328 |
print(" [... truncated for display ...]")
|
| 329 |
|
| 330 |
# --- Build FAISS index and save ---
|
| 331 |
-
|
| 332 |
-
save_index(
|
| 333 |
|
| 334 |
print(f"\nDone. Upload '{OUTPUT_PATH}' to your Hugging Face Space.")
|
| 335 |
|
|
|
|
| 17 |
|
| 18 |
import pdfplumber
|
| 19 |
import numpy as np
|
|
|
|
| 20 |
from sentence_transformers import SentenceTransformer
|
| 21 |
|
| 22 |
|
|
|
|
| 214 |
# 4.5 Embedding and FAISS index
|
| 215 |
# ---------------------------------------------------------------------------
|
| 216 |
|
| 217 |
+
def build_embeddings(all_chunks: list,
|
| 218 |
+
model_name: str = EMBEDDING_MODEL
|
| 219 |
+
) -> tuple:
|
| 220 |
"""
|
| 221 |
+
Embeds all chunks and returns L2-normalised numpy embeddings.
|
| 222 |
+
Stores raw numpy arrays instead of a FAISS index object so the .pkl
|
| 223 |
+
is portable across OS/FAISS versions (FAISS C++ objects are not
|
| 224 |
+
safely picklable across different platform builds).
|
| 225 |
+
Returns (embeddings, chunks) tuple.
|
| 226 |
"""
|
| 227 |
model = SentenceTransformer(model_name)
|
| 228 |
|
|
|
|
| 231 |
embeddings = model.encode(texts, show_progress_bar=False, batch_size=32)
|
| 232 |
embeddings = np.array(embeddings, dtype='float32')
|
| 233 |
|
| 234 |
+
# Normalise for cosine similarity (dot product == cosine on unit vectors)
|
| 235 |
+
norms = np.linalg.norm(embeddings, axis=1, keepdims=True)
|
| 236 |
+
embeddings = embeddings / np.maximum(norms, 1e-10)
|
| 237 |
|
| 238 |
+
print(f"Embeddings built: {embeddings.shape[0]} vectors, dimension {embeddings.shape[1]}")
|
| 239 |
+
return embeddings, all_chunks
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 240 |
|
| 241 |
|
| 242 |
# ---------------------------------------------------------------------------
|
| 243 |
# 4.6 Serialisation
|
| 244 |
# ---------------------------------------------------------------------------
|
| 245 |
|
| 246 |
+
def save_index(embeddings: np.ndarray, chunks: list, output_path: str):
|
| 247 |
"""
|
| 248 |
+
Saves L2-normalised numpy embeddings and chunk metadata to a .pkl file.
|
| 249 |
+
Raw numpy arrays are used instead of a FAISS index object to ensure
|
| 250 |
+
cross-platform portability (Windows build → Linux HF Space runtime).
|
| 251 |
"""
|
| 252 |
# Ensure the output directory exists
|
| 253 |
output_dir = os.path.dirname(output_path)
|
|
|
|
| 255 |
os.makedirs(output_dir, exist_ok=True)
|
| 256 |
|
| 257 |
payload = {
|
| 258 |
+
'embeddings': embeddings, # np.ndarray float32, shape (n, d), L2-normalised
|
| 259 |
'chunks': chunks,
|
| 260 |
'embedding_model': EMBEDDING_MODEL,
|
| 261 |
'chunk_count': len(chunks),
|
|
|
|
| 328 |
print(" [... truncated for display ...]")
|
| 329 |
|
| 330 |
# --- Build FAISS index and save ---
|
| 331 |
+
embeddings, chunks = build_embeddings(all_chunks)
|
| 332 |
+
save_index(embeddings, chunks, OUTPUT_PATH)
|
| 333 |
|
| 334 |
print(f"\nDone. Upload '{OUTPUT_PATH}' to your Hugging Face Space.")
|
| 335 |
|
data/sanyu_knowledge_base.pkl
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
-
size
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:2fafd196b78a01cfe20f8c55d1f1c81fe919d5fc57049e48da0e64e3e2537310
|
| 3 |
+
size 432077
|
retrieval.py
CHANGED
|
@@ -2,10 +2,12 @@
|
|
| 2 |
retrieval.py
|
| 3 |
------------
|
| 4 |
Sanyu RAG — Retrieval Module
|
| 5 |
-
Implements load_index() and retrieve() as described in devin-instructions.md (Section 4.7).
|
| 6 |
|
| 7 |
-
|
| 8 |
-
|
|
|
|
|
|
|
|
|
|
| 9 |
|
| 10 |
The .pkl file is located at: data/sanyu_knowledge_base.pkl (hardcoded as agreed).
|
| 11 |
"""
|
|
@@ -15,68 +17,59 @@ import numpy as np
|
|
| 15 |
from sentence_transformers import SentenceTransformer
|
| 16 |
|
| 17 |
|
| 18 |
-
# Path to the serialised
|
| 19 |
DEFAULT_PKL_PATH = 'data/sanyu_knowledge_base.pkl'
|
| 20 |
|
| 21 |
|
| 22 |
def load_index(pkl_path: str = DEFAULT_PKL_PATH) -> tuple:
|
| 23 |
"""
|
| 24 |
-
Loads the
|
| 25 |
|
| 26 |
Returns:
|
| 27 |
-
(
|
| 28 |
-
-
|
| 29 |
- chunks: list of chunk dicts (text + metadata)
|
| 30 |
- model_name: str, the embedding model used to build the index
|
| 31 |
"""
|
| 32 |
with open(pkl_path, 'rb') as f:
|
| 33 |
payload = pickle.load(f)
|
| 34 |
-
return payload['
|
| 35 |
|
| 36 |
|
| 37 |
def retrieve(query: str,
|
| 38 |
-
|
| 39 |
chunks: list,
|
| 40 |
model: SentenceTransformer,
|
| 41 |
top_k: int = 4) -> list:
|
| 42 |
"""
|
| 43 |
Retrieves the top_k most relevant chunks for a given query.
|
| 44 |
|
| 45 |
-
Uses numpy dot product
|
| 46 |
-
|
| 47 |
-
SWIG API version incompatibilities across environments.
|
| 48 |
|
| 49 |
Args:
|
| 50 |
query: The user's input string.
|
| 51 |
-
|
| 52 |
-
chunks: The list of chunk dicts
|
| 53 |
model: A loaded SentenceTransformer model instance.
|
| 54 |
-
top_k: Number of results to return (default 4
|
| 55 |
|
| 56 |
Returns:
|
| 57 |
-
List of chunk dicts
|
| 58 |
ordered from most to least relevant.
|
| 59 |
"""
|
| 60 |
query_embedding = model.encode([query], normalize_embeddings=True)
|
| 61 |
query_embedding = np.array(query_embedding, dtype='float32') # shape (1, d)
|
| 62 |
|
| 63 |
-
#
|
| 64 |
-
|
| 65 |
-
n_total = index.ntotal
|
| 66 |
-
d = index.d
|
| 67 |
-
all_vectors = np.zeros((n_total, d), dtype='float32')
|
| 68 |
-
for i in range(n_total):
|
| 69 |
-
all_vectors[i] = index.reconstruct(i)
|
| 70 |
-
|
| 71 |
-
# Cosine similarity via dot product (vectors are already L2-normalised)
|
| 72 |
-
scores = (query_embedding @ all_vectors.T).flatten() # shape (n_total,)
|
| 73 |
|
| 74 |
# Get top_k indices sorted by descending score
|
| 75 |
top_indices = np.argsort(scores)[::-1][:top_k]
|
| 76 |
|
| 77 |
results = []
|
| 78 |
for idx in top_indices:
|
| 79 |
-
chunk = chunks[idx].copy()
|
| 80 |
chunk['similarity_score'] = float(scores[idx])
|
| 81 |
results.append(chunk)
|
| 82 |
|
|
|
|
| 2 |
retrieval.py
|
| 3 |
------------
|
| 4 |
Sanyu RAG — Retrieval Module
|
|
|
|
| 5 |
|
| 6 |
+
Loads pre-computed L2-normalised numpy embeddings from the .pkl file and
|
| 7 |
+
performs retrieval via a simple dot-product similarity search (pure numpy).
|
| 8 |
+
|
| 9 |
+
No FAISS dependency at runtime — avoids FAISS SWIG binary incompatibilities
|
| 10 |
+
between build environments (Windows vs Linux HF Space).
|
| 11 |
|
| 12 |
The .pkl file is located at: data/sanyu_knowledge_base.pkl (hardcoded as agreed).
|
| 13 |
"""
|
|
|
|
| 17 |
from sentence_transformers import SentenceTransformer
|
| 18 |
|
| 19 |
|
| 20 |
+
# Path to the serialised knowledge base — hardcoded as confirmed with Atwine.
|
| 21 |
DEFAULT_PKL_PATH = 'data/sanyu_knowledge_base.pkl'
|
| 22 |
|
| 23 |
|
| 24 |
def load_index(pkl_path: str = DEFAULT_PKL_PATH) -> tuple:
|
| 25 |
"""
|
| 26 |
+
Loads the embeddings and chunk metadata from the .pkl file.
|
| 27 |
|
| 28 |
Returns:
|
| 29 |
+
(embeddings, chunks, model_name)
|
| 30 |
+
- embeddings: np.ndarray float32, shape (n, d), L2-normalised
|
| 31 |
- chunks: list of chunk dicts (text + metadata)
|
| 32 |
- model_name: str, the embedding model used to build the index
|
| 33 |
"""
|
| 34 |
with open(pkl_path, 'rb') as f:
|
| 35 |
payload = pickle.load(f)
|
| 36 |
+
return payload['embeddings'], payload['chunks'], payload['embedding_model']
|
| 37 |
|
| 38 |
|
| 39 |
def retrieve(query: str,
|
| 40 |
+
embeddings: np.ndarray,
|
| 41 |
chunks: list,
|
| 42 |
model: SentenceTransformer,
|
| 43 |
top_k: int = 4) -> list:
|
| 44 |
"""
|
| 45 |
Retrieves the top_k most relevant chunks for a given query.
|
| 46 |
|
| 47 |
+
Uses a numpy dot product against pre-computed L2-normalised embeddings
|
| 48 |
+
(equivalent to cosine similarity). No FAISS required at runtime.
|
|
|
|
| 49 |
|
| 50 |
Args:
|
| 51 |
query: The user's input string.
|
| 52 |
+
embeddings: np.ndarray of shape (n, d), L2-normalised chunk embeddings.
|
| 53 |
+
chunks: The list of chunk dicts corresponding to the embeddings.
|
| 54 |
model: A loaded SentenceTransformer model instance.
|
| 55 |
+
top_k: Number of results to return (default 4).
|
| 56 |
|
| 57 |
Returns:
|
| 58 |
+
List of chunk dicts with an added 'similarity_score' key,
|
| 59 |
ordered from most to least relevant.
|
| 60 |
"""
|
| 61 |
query_embedding = model.encode([query], normalize_embeddings=True)
|
| 62 |
query_embedding = np.array(query_embedding, dtype='float32') # shape (1, d)
|
| 63 |
|
| 64 |
+
# Cosine similarity via dot product (both sides are L2-normalised)
|
| 65 |
+
scores = (query_embedding @ embeddings.T).flatten() # shape (n,)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 66 |
|
| 67 |
# Get top_k indices sorted by descending score
|
| 68 |
top_indices = np.argsort(scores)[::-1][:top_k]
|
| 69 |
|
| 70 |
results = []
|
| 71 |
for idx in top_indices:
|
| 72 |
+
chunk = chunks[int(idx)].copy()
|
| 73 |
chunk['similarity_score'] = float(scores[idx])
|
| 74 |
results.append(chunk)
|
| 75 |
|