"""Build an in-memory FAISS index over the JSON knowledge base.

At import time this module loads the sentence-embedding model, reads the
knowledge-base JSON file, splits every entry's content into overlapping
chunks, embeds the chunks and stores them in a flat L2 FAISS index.
``get_top_chunks`` then retrieves the chunks closest to a query string.
"""

import json
import os

# The cache locations must be exported BEFORE the Hugging Face libraries are
# imported: they resolve these environment variables at import time, so
# assigning them afterwards is too late to take effect.
os.environ["TRANSFORMERS_CACHE"] = "/tmp/transformers"
os.environ["HF_HOME"] = "/tmp/huggingface"

import faiss  # noqa: E402
from sentence_transformers import SentenceTransformer  # noqa: E402
from langchain.text_splitter import RecursiveCharacterTextSplitter  # noqa: E402

# Load embedding model
embed_model = SentenceTransformer(
    "sentence-transformers/all-MiniLM-L6-v2",
    cache_folder="/tmp/sentence_transformers",
)

# Load your JSON KB (explicit encoding so decoding does not depend on the
# platform's locale).
with open("./src/data/datamir_kb.json", encoding="utf-8") as f:
    kb_docs = json.load(f)

# Prepare chunks and metadata
documents = [entry["content"] for entry in kb_docs]
metadatas = [{"id": entry["id"], "title": entry["title"]} for entry in kb_docs]

# Split content into smaller chunks; all_metadata[i] describes all_chunks[i].
splitter = RecursiveCharacterTextSplitter(chunk_size=300, chunk_overlap=50)
all_chunks = []
all_metadata = []
for doc, meta in zip(documents, metadatas):
    chunks = splitter.split_text(doc)
    all_chunks.extend(chunks)
    all_metadata.extend([meta] * len(chunks))

if not all_chunks:
    # Without this guard the failure surfaces later as an opaque IndexError
    # on ``embeddings.shape[1]``.
    raise ValueError(
        "Knowledge base produced no text chunks; cannot build the FAISS index"
    )

# Embed chunks (FAISS requires float32 vectors)
embeddings = embed_model.encode(all_chunks).astype("float32")

# Build FAISS index
index = faiss.IndexFlatL2(embeddings.shape[1])
index.add(embeddings)


# Helper: retrieve top chunks
def get_top_chunks(query, k=3):
    """Return up to ``k`` chunks nearest to ``query``, closest first.

    Args:
        query: Text to embed and search for.
        k: Maximum number of chunks to return.

    Returns:
        A list of chunk strings. It holds fewer than ``k`` items when the
        index contains fewer vectors, and is empty when ``k`` is not
        positive.
    """
    # Never ask for more neighbours than the index holds: FAISS pads the
    # missing slots with label -1, which as a Python index would silently
    # return the *last* chunk as a bogus hit.
    k = min(k, index.ntotal)
    if k <= 0:
        return []

    query_vec = embed_model.encode([query]).astype("float32")
    _distances, labels = index.search(query_vec, k)
    # Defensive filter: drop any -1 padding labels that still come back.
    return [all_chunks[i] for i in labels[0] if i >= 0]