Spaces:
Runtime error
Runtime error
| import json | |
| import os | |
| import faiss | |
| from sentence_transformers import SentenceTransformer | |
| from langchain.text_splitter import RecursiveCharacterTextSplitter | |
# Load embedding model
# NOTE(review): these variables are assigned after `sentence_transformers`
# has already been imported at the top of the file. Hugging Face libraries
# appear to resolve their cache locations at import time, so confirm these
# assignments actually take effect; the explicit `cache_folder` passed below
# does not depend on them.
os.environ["TRANSFORMERS_CACHE"] = "/tmp/transformers"
os.environ["HF_HOME"] = "/tmp/huggingface"
embed_model = SentenceTransformer(
    "sentence-transformers/all-MiniLM-L6-v2",
    cache_folder="/tmp/sentence_transformers",
)

# Load your JSON KB
# Read as UTF-8 explicitly so the result does not depend on the platform's
# default locale encoding.
with open("./src/data/datamir_kb.json", encoding="utf-8") as f:
    kb_docs = json.load(f)

# Prepare chunks and metadata
# Each KB entry is read through its "content", "id" and "title" keys.
documents = [entry["content"] for entry in kb_docs]
metadatas = [{"id": entry["id"], "title": entry["title"]} for entry in kb_docs]

# Split content into smaller chunks
splitter = RecursiveCharacterTextSplitter(chunk_size=300, chunk_overlap=50)
all_chunks = []
all_metadata = []
for doc, meta in zip(documents, metadatas):
    chunks = splitter.split_text(doc)
    all_chunks.extend(chunks)
    # Keeps all_metadata aligned index-for-index with all_chunks; every chunk
    # of one entry refers to that entry's single metadata dict.
    all_metadata.extend([meta] * len(chunks))

# Embed chunks
# Fail early with a clear message: with no chunks there is nothing to embed,
# and the index construction below needs a 2-D embedding matrix to read the
# vector dimension from.
if not all_chunks:
    raise ValueError(
        "Knowledge base produced no text chunks; cannot build the FAISS index."
    )
embeddings = embed_model.encode(all_chunks).astype("float32")

# Build FAISS index
# Row i of the index corresponds to all_chunks[i] / all_metadata[i].
index = faiss.IndexFlatL2(embeddings.shape[1])
index.add(embeddings)
| # Helper: retrieve top chunks | |
def get_top_chunks(query, k=3):
    """Return the text of up to ``k`` indexed chunks closest to ``query``.

    The query is embedded with the module-level ``embed_model`` and searched
    against the module-level FAISS ``index``. Hits are returned in the order
    FAISS reports them.

    Args:
        query: Query string to embed.
        k: Maximum number of chunks to return.

    Returns:
        A list of at most ``k`` strings taken from ``all_chunks``. Fewer are
        returned when the index holds fewer than ``k`` vectors, and an empty
        list is returned when ``k`` is not positive or the index is empty.
    """
    # Never ask for more neighbours than the index holds, and skip the search
    # entirely when there is nothing meaningful to request.
    k = min(k, index.ntotal)
    if k <= 0:
        return []
    query_vec = embed_model.encode([query]).astype("float32")
    _distances, neighbor_ids = index.search(query_vec, k)
    # FAISS pads missing results with -1; unfiltered, a -1 would silently
    # select the *last* chunk via negative indexing instead of being skipped.
    return [all_chunks[i] for i in neighbor_ids[0] if i >= 0]