File size: 1,332 Bytes
2cb02a9
 
 
 
 
 
 
8473d47
 
 
 
2cb02a9
 
e7a5d8d
1b71715
2cb02a9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
import json
import os

# Point the Hugging Face caches at /tmp *before* importing the libraries that
# consume them: transformers and huggingface_hub resolve these variables when
# they are first imported, so assigning them after the imports is too late.
os.environ["TRANSFORMERS_CACHE"] = "/tmp/transformers"
os.environ["HF_HOME"] = "/tmp/huggingface"

import faiss  # noqa: E402  (must follow the cache environment setup above)
from sentence_transformers import SentenceTransformer  # noqa: E402
from langchain.text_splitter import RecursiveCharacterTextSplitter  # noqa: E402

# Load the embedding model once at import time. cache_folder is passed
# explicitly so the model files are stored under /tmp/sentence_transformers.
embed_model = SentenceTransformer(
    "sentence-transformers/all-MiniLM-L6-v2",
    cache_folder="/tmp/sentence_transformers",
)

# Load the JSON knowledge base. The code further down iterates kb_docs and
# reads the "id", "title" and "content" keys of every entry.
# The encoding is given explicitly so decoding does not depend on the
# platform's locale default.
with open("./src/data/datamir_kb.json", encoding="utf-8") as f:
    kb_docs = json.load(f)

# Pull the text and the identifying fields (id, title) out of each KB entry.
# Both lists are built from kb_docs in the same order, so they stay
# index-aligned.
documents = [record["content"] for record in kb_docs]
metadatas = [
    {"id": record["id"], "title": record["title"]}
    for record in kb_docs
]

# Break every document into smaller pieces (chunk_size=300, chunk_overlap=50)
# and keep one metadata entry per piece, so that all_metadata[i] describes
# all_chunks[i].
splitter = RecursiveCharacterTextSplitter(chunk_size=300, chunk_overlap=50)
all_chunks = []
all_metadata = []

for doc, meta in zip(documents, metadatas):
    chunks = splitter.split_text(doc)
    all_chunks += chunks
    # Every piece of a document shares that document's metadata dict.
    all_metadata += [meta] * len(chunks)

# Encode every chunk into a vector, then cast the array to float32 before it
# is handed to FAISS.
# NOTE(review): if all_chunks is empty the encoded array may not be 2-D, in
# which case the shape[1] lookup below would fail - confirm the KB is never
# empty.
embeddings = embed_model.encode(all_chunks)
embeddings = embeddings.astype("float32")

# Create an IndexFlatL2 whose dimensionality is the embedding width (second
# axis of the array) and load all chunk vectors into it. Row i of the index
# corresponds to all_chunks[i].
index = faiss.IndexFlatL2(embeddings.shape[1])
index.add(embeddings)

# Helper: retrieve top chunks
def get_top_chunks(query: str, k: int = 3) -> list:
    """Return the text of the indexed chunks nearest to ``query``.

    The query is embedded with the module-level ``embed_model`` and looked up
    in the module-level FAISS ``index``; the resulting row ids are mapped back
    to their text through ``all_chunks``.

    Args:
        query: Text to search for.
        k: Maximum number of chunks to return.

    Returns:
        A list of chunk strings in the order FAISS returned them. The list
        holds fewer than ``k`` items when the index contains fewer than ``k``
        vectors.
    """
    query_vec = embed_model.encode([query]).astype("float32")
    _distances, neighbor_ids = index.search(query_vec, k)
    # FAISS pads the id row with -1 when it finds fewer than k neighbours.
    # Without this filter, all_chunks[-1] would silently return the last
    # chunk as a bogus hit.
    return [all_chunks[i] for i in neighbor_ids[0] if i >= 0]