"""Build a FAISS index from per-condition chunk files.

Reads Conditions/*/chunks.json, embeds each chunk's "text" field with a
SentenceTransformer model, and writes a flat L2 FAISS index plus a
parallel metadata JSON file (one entry per vector, same order).
"""

import json
import os

import faiss
import numpy as np


# ---------- LOAD SETTINGS ----------
def load_settings():
    """Return the parsed settings dict from config/settings.json."""
    with open("config/settings.json", "r", encoding="utf-8") as f:
        return json.load(f)


SETTINGS = load_settings()

EMBEDDING_MODEL = SETTINGS["embedding_model"]
FAISS_INDEX_PATH = SETTINGS["faiss_index_path"]
METADATA_PATH = SETTINGS["metadata_path"]

CONDITIONS_DIR = "Conditions"
INDEX_DIR = os.path.dirname(FAISS_INDEX_PATH)


# ---------- LOAD CHUNKS ----------
def load_chunks():
    """Collect chunk texts and metadata from Conditions/*/chunks.json.

    Returns:
        (texts, metadatas): parallel lists; metadatas[i] describes texts[i]
        with the chunk's "condition", "section", and "source_id" fields
        (None when absent).
    """
    texts = []
    metadatas = []
    # sorted() makes the traversal order (and therefore the FAISS vector
    # ids) deterministic across filesystems and runs.
    for condition in sorted(os.listdir(CONDITIONS_DIR)):
        cond_path = os.path.join(CONDITIONS_DIR, condition)
        if not os.path.isdir(cond_path):
            continue
        chunks_path = os.path.join(cond_path, "chunks.json")
        if not os.path.exists(chunks_path):
            continue
        with open(chunks_path, "r", encoding="utf-8") as f:
            chunks = json.load(f)
        for chunk in chunks:
            texts.append(chunk["text"])
            metadatas.append({
                "condition": chunk.get("condition"),
                "section": chunk.get("section"),
                "source_id": chunk.get("source_id"),
            })
    return texts, metadatas


# ---------- BUILD INDEX ----------
def main():
    """Embed all chunks and persist the FAISS index plus metadata sidecar.

    Raises:
        RuntimeError: if no chunks are found under Conditions/.
    """
    print("🔨 Building FAISS index...")

    # Imported lazily: sentence_transformers (and its torch dependency)
    # is slow to import and only needed when actually building the index.
    from sentence_transformers import SentenceTransformer

    model = SentenceTransformer(EMBEDDING_MODEL)

    texts, metadatas = load_chunks()
    if not texts:
        raise RuntimeError("No chunks found. Ensure Conditions/*/chunks.json exists.")
    print(f"Loaded {len(texts)} chunks")

    embeddings = model.encode(texts, show_progress_bar=True)
    # FAISS requires float32 input.
    embeddings = np.asarray(embeddings, dtype="float32")

    index = faiss.IndexFlatL2(embeddings.shape[1])
    index.add(embeddings)

    # Create output directories for BOTH artifacts. Guard against an empty
    # dirname (a bare filename path): os.makedirs("") raises FileNotFoundError.
    for target_dir in {INDEX_DIR, os.path.dirname(METADATA_PATH)}:
        if target_dir:
            os.makedirs(target_dir, exist_ok=True)

    faiss.write_index(index, FAISS_INDEX_PATH)
    with open(METADATA_PATH, "w", encoding="utf-8") as f:
        # ensure_ascii=False keeps any non-ASCII text readable on disk
        # instead of \uXXXX escapes.
        json.dump(metadatas, f, indent=2, ensure_ascii=False)

    print("✅ FAISS index built successfully")


if __name__ == "__main__":
    main()