import json import sys import faiss import numpy as np from pathlib import Path project_root = Path(__file__).parent.parent sys.path.append(str(project_root)) # We are intentionally ignoring the E402 warning here because the sys.path # modification must happen before we can import from our local package. from src.fot_recommender.config import ( # noqa: E402 PROCESSED_DATA_DIR, RAW_KB_PATH, FINAL_KB_CHUNKS_PATH, FAISS_INDEX_PATH, EMBEDDING_MODEL_NAME, ) from src.fot_recommender.semantic_chunker import chunk_by_concept # noqa: E402 from src.fot_recommender.rag_pipeline import ( # noqa: E402 initialize_embedding_model, create_embeddings, ) def build(): """ Builds the entire knowledge base artifact set needed by the application: 1. The processed, semantically chunked JSON file. 2. The Facebook AI Similarity Search (FAISS) vector index file (`faiss_index.bin`). """ print("--- Building Final Knowledge Base and FAISS Index ---") # --- Create Final Chunks --- print(f"Loading raw knowledge base from: {RAW_KB_PATH}") with open(RAW_KB_PATH, "r", encoding="utf-8") as f: raw_kb = json.load(f) final_chunks = chunk_by_concept(raw_kb) PROCESSED_DATA_DIR.mkdir(parents=True, exist_ok=True) with open(FINAL_KB_CHUNKS_PATH, "w", encoding="utf-8") as f: json.dump(final_chunks, f, indent=4) print(f"āœ… Saved {len(final_chunks)} semantic chunks to {FINAL_KB_CHUNKS_PATH}") # --- Create and Save FAISS Index --- print("\n--- Creating FAISS Index ---") # Initialize model using the name from the config file model = initialize_embedding_model(model_name=EMBEDDING_MODEL_NAME) embeddings = create_embeddings(final_chunks, model) # Explicitly set dtype for FAISS embeddings = np.asarray(embeddings).astype("float32") dimension = embeddings.shape[1] index = faiss.IndexFlatIP(dimension) index.add(embeddings) # type: ignore faiss.write_index(index, str(FAISS_INDEX_PATH)) print(f"āœ… Saved FAISS index with {index.ntotal} vectors to {FAISS_INDEX_PATH}") print("\nšŸŽ‰ Success! All artifacts are built and ready for the application.") if __name__ == "__main__": build()