File size: 2,234 Bytes
c32c832
 
8bfb8e4
 
c32c832
 
 
 
 
280d562
 
 
23ef32a
 
 
 
 
 
280d562
 
8bfb8e4
 
 
929cde0
c32c832
 
 
8bfb8e4
 
 
c32c832
8bfb8e4
c32c832
8bfb8e4
c32c832
8bfb8e4
 
c32c832
 
 
23ef32a
c32c832
23ef32a
8bfb8e4
 
 
 
23ef32a
 
8bfb8e4
 
 
 
 
 
 
 
 
23ef32a
 
c32c832
8bfb8e4
c32c832
929cde0
c32c832
280d562
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
import json
import sys
import faiss
import numpy as np
from pathlib import Path

# Put the directory two levels above this file on the import path so the
# `src.`-prefixed imports that follow can be resolved when run as a script.
_script_dir = Path(__file__).parent
project_root = _script_dir.parent
sys.path.append(str(project_root))

# We are intentionally ignoring the E402 warning here because the sys.path
# modification must happen before we can import from our local package.
from src.fot_recommender.config import (  # noqa: E402
    PROCESSED_DATA_DIR,
    RAW_KB_PATH,
    FINAL_KB_CHUNKS_PATH,
    FAISS_INDEX_PATH,
    EMBEDDING_MODEL_NAME,
)
from src.fot_recommender.semantic_chunker import chunk_by_concept  # noqa: E402
from src.fot_recommender.rag_pipeline import (  # noqa: E402
    initialize_embedding_model,
    create_embeddings,
)


def build():
    """
    Builds the entire knowledge base artifact set needed by the application:
    1.  The processed, semantically chunked JSON file.
    2.  The Facebook AI Similarity Search (FAISS) vector index file (`faiss_index.bin`).

    The raw knowledge base is read from ``RAW_KB_PATH``, chunked with
    ``chunk_by_concept`` and written to ``FINAL_KB_CHUNKS_PATH``. The chunks
    are then embedded with the model named by ``EMBEDDING_MODEL_NAME`` and
    stored in an inner-product FAISS index written to ``FAISS_INDEX_PATH``.
    Progress is reported on stdout.

    Raises:
        ValueError: If the embedding step does not yield a 2-D matrix
            (one row per chunk), so no index dimension can be derived.
    """
    print("--- Building Final Knowledge Base and FAISS Index ---")

    # --- Create Final Chunks ---
    print(f"Loading raw knowledge base from: {RAW_KB_PATH}")
    with open(RAW_KB_PATH, "r", encoding="utf-8") as f:
        raw_kb = json.load(f)

    final_chunks = chunk_by_concept(raw_kb)
    PROCESSED_DATA_DIR.mkdir(parents=True, exist_ok=True)
    with open(FINAL_KB_CHUNKS_PATH, "w", encoding="utf-8") as f:
        json.dump(final_chunks, f, indent=4)
    print(f"✅ Saved {len(final_chunks)} semantic chunks to {FINAL_KB_CHUNKS_PATH}")

    # --- Create and Save FAISS Index ---
    print("\n--- Creating FAISS Index ---")

    # Initialize model using the name from the config file
    model = initialize_embedding_model(model_name=EMBEDDING_MODEL_NAME)
    embeddings = create_embeddings(final_chunks, model)

    # FAISS consumes float32, C-contiguous data; ascontiguousarray converts
    # only when needed instead of always copying like `.astype` does.
    embeddings = np.ascontiguousarray(embeddings, dtype=np.float32)

    # Without a second axis there is no vector dimension to build the index
    # from; fail with a descriptive message rather than an IndexError.
    if embeddings.ndim != 2:
        raise ValueError(
            "Expected a 2-D embedding matrix (n_chunks, dimension) but got "
            f"shape {embeddings.shape} for {len(final_chunks)} chunks."
        )

    dimension = embeddings.shape[1]
    index = faiss.IndexFlatIP(dimension)
    index.add(embeddings)  # type: ignore

    # Only PROCESSED_DATA_DIR is created above; make sure the index's own
    # target directory exists too before FAISS tries to write into it.
    Path(FAISS_INDEX_PATH).parent.mkdir(parents=True, exist_ok=True)
    faiss.write_index(index, str(FAISS_INDEX_PATH))
    print(f"✅ Saved FAISS index with {index.ntotal} vectors to {FAISS_INDEX_PATH}")

    print("\n🎉 Success! All artifacts are built and ready for the application.")


# Run the build only when this file is executed directly as a script;
# importing the module does not trigger it.
if __name__ == "__main__":
    build()