Spaces:
Running
Running
File size: 2,234 Bytes
c32c832 8bfb8e4 c32c832 280d562 23ef32a 280d562 8bfb8e4 929cde0 c32c832 8bfb8e4 c32c832 8bfb8e4 c32c832 8bfb8e4 c32c832 8bfb8e4 c32c832 23ef32a c32c832 23ef32a 8bfb8e4 23ef32a 8bfb8e4 23ef32a c32c832 8bfb8e4 c32c832 929cde0 c32c832 280d562 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 |
import json
import sys
import faiss
import numpy as np
from pathlib import Path
# Resolve to an absolute path before walking up two levels. With a relative
# `__file__` (the `__main__` module on Python versions before 3.9, when the
# script is launched from its own directory) `Path("x.py").parent.parent`
# collapses to "." and the wrong directory would be put on `sys.path`.
project_root = Path(__file__).resolve().parent.parent
sys.path.append(str(project_root))
# We are intentionally ignoring the E402 warning here because the sys.path
# modification must happen before we can import from our local package.
from src.fot_recommender.config import ( # noqa: E402
PROCESSED_DATA_DIR,
RAW_KB_PATH,
FINAL_KB_CHUNKS_PATH,
FAISS_INDEX_PATH,
EMBEDDING_MODEL_NAME,
)
from src.fot_recommender.semantic_chunker import chunk_by_concept # noqa: E402
from src.fot_recommender.rag_pipeline import ( # noqa: E402
initialize_embedding_model,
create_embeddings,
)
def build():
    """
    Builds the entire knowledge base artifact set needed by the application:
    1. The processed, semantically chunked JSON file.
    2. The Facebook AI Similarity Search (FAISS) vector index file (`faiss_index.bin`).

    The artifacts are written in that order: the chunk file is saved before
    the embeddings are computed, so a failure while embedding leaves the JSON
    file on disk without a matching index.

    Raises:
        ValueError: If embedding the chunks does not produce a non-empty 2-D
            array, because no index dimension can be derived from it.
    """
    print("--- Building Final Knowledge Base and FAISS Index ---")

    # --- Create Final Chunks ---
    print(f"Loading raw knowledge base from: {RAW_KB_PATH}")
    with open(RAW_KB_PATH, "r", encoding="utf-8") as f:
        raw_kb = json.load(f)
    final_chunks = chunk_by_concept(raw_kb)

    PROCESSED_DATA_DIR.mkdir(parents=True, exist_ok=True)
    with open(FINAL_KB_CHUNKS_PATH, "w", encoding="utf-8") as f:
        json.dump(final_chunks, f, indent=4)
    print(f"✅ Saved {len(final_chunks)} semantic chunks to {FINAL_KB_CHUNKS_PATH}")

    # --- Create and Save FAISS Index ---
    print("\n--- Creating FAISS Index ---")
    # Initialize model using the name from the config file
    model = initialize_embedding_model(model_name=EMBEDDING_MODEL_NAME)
    embeddings = create_embeddings(final_chunks, model)

    # FAISS consumes float32 data in C-contiguous (row-major) layout; convert
    # in one step, copying only when the input is not already in that form.
    embeddings = np.ascontiguousarray(embeddings, dtype=np.float32)

    # Fail with a clear message instead of an opaque IndexError on `shape[1]`
    # (or an index with no usable vectors) when nothing was embedded.
    if embeddings.ndim != 2 or 0 in embeddings.shape:
        raise ValueError(
            "Expected a non-empty 2-D embedding matrix (n_chunks, dimension), "
            f"got shape {embeddings.shape}; cannot build the FAISS index."
        )

    dimension = embeddings.shape[1]
    # NOTE(review): IndexFlatIP ranks by raw inner product; this equals cosine
    # similarity only if `create_embeddings` returns L2-normalized vectors --
    # confirm against that helper.
    index = faiss.IndexFlatIP(dimension)
    index.add(embeddings)  # type: ignore

    # Only PROCESSED_DATA_DIR is created above; make sure the index file's own
    # directory exists too in case it is configured to live elsewhere.
    Path(FAISS_INDEX_PATH).parent.mkdir(parents=True, exist_ok=True)
    faiss.write_index(index, str(FAISS_INDEX_PATH))
    print(f"✅ Saved FAISS index with {index.ntotal} vectors to {FAISS_INDEX_PATH}")
    print("\n🎉 Success! All artifacts are built and ready for the application.")
# Script entry point: run the full build only when this file is executed
# directly; merely importing the module does not trigger a build.
if __name__ == "__main__":
    build()
|