File size: 2,099 Bytes
b453b40
 
 
 
 
820c2fc
 
 
 
 
 
 
 
 
 
 
b453b40
 
820c2fc
b453b40
820c2fc
 
b453b40
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
820c2fc
 
 
b453b40
 
 
 
 
820c2fc
b453b40
820c2fc
 
 
b453b40
 
 
820c2fc
 
 
b453b40
 
 
820c2fc
b453b40
 
 
 
 
 
 
 
 
 
 
820c2fc
b453b40
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
import json
import os
import faiss
import numpy as np


# ---------- LOAD SETTINGS ----------
def load_settings(path="config/settings.json"):
    """Load the JSON settings file and return its contents as a dict.

    Args:
        path: Location of the settings file. Defaults to the project's
            standard ``config/settings.json`` so existing callers are
            unaffected.

    Returns:
        The parsed settings object (expected to be a dict with keys such
        as ``embedding_model``, ``faiss_index_path``, ``metadata_path``).

    Raises:
        FileNotFoundError: If the settings file does not exist.
        json.JSONDecodeError: If the file is not valid JSON.
    """
    with open(path, "r", encoding="utf-8") as f:
        return json.load(f)

# Settings are loaded once at import time; the script fails fast if
# config/settings.json is missing or malformed.
SETTINGS = load_settings()

EMBEDDING_MODEL = SETTINGS["embedding_model"]    # model name passed to SentenceTransformer()
FAISS_INDEX_PATH = SETTINGS["faiss_index_path"]  # destination file for faiss.write_index()
METADATA_PATH = SETTINGS["metadata_path"]        # destination JSON file for per-chunk metadata

CONDITIONS_DIR = "Conditions"                    # root folder scanned for */chunks.json
# Directory portion of the index path; created (if needed) before writing in main().
INDEX_DIR = os.path.dirname(FAISS_INDEX_PATH)


# ---------- LOAD CHUNKS ----------
def load_chunks(conditions_dir=None):
    """Collect chunk texts and their metadata from every condition folder.

    Scans ``conditions_dir`` for subdirectories containing a
    ``chunks.json`` file, and flattens all chunks into two parallel
    lists (``texts[i]`` corresponds to ``metadatas[i]``).

    Args:
        conditions_dir: Root directory to scan. Defaults to the
            module-level ``CONDITIONS_DIR`` (``"Conditions"``) so
            existing callers are unaffected.

    Returns:
        A ``(texts, metadatas)`` tuple: ``texts`` is a list of chunk
        strings, ``metadatas`` a list of dicts with ``condition``,
        ``section`` and ``source_id`` keys (``None`` when absent from
        the source chunk).

    Raises:
        KeyError: If a chunk is missing the required ``"text"`` key.
    """
    if conditions_dir is None:
        conditions_dir = CONDITIONS_DIR

    texts = []
    metadatas = []

    # sorted() makes the traversal order deterministic: os.listdir order is
    # filesystem-dependent, and a stable order is required for the FAISS row
    # ids to line up reproducibly with the saved metadata across rebuilds.
    for condition in sorted(os.listdir(conditions_dir)):
        cond_path = os.path.join(conditions_dir, condition)
        if not os.path.isdir(cond_path):
            continue  # skip stray files at the top level

        chunks_path = os.path.join(cond_path, "chunks.json")
        if not os.path.exists(chunks_path):
            continue  # condition folder without prepared chunks

        with open(chunks_path, "r", encoding="utf-8") as f:
            chunks = json.load(f)

        for chunk in chunks:
            texts.append(chunk["text"])
            metadatas.append({
                "condition": chunk.get("condition"),
                "section": chunk.get("section"),
                "source_id": chunk.get("source_id")
            })

    return texts, metadatas


# ---------- BUILD INDEX ----------
def main():
    """Build the FAISS index from all condition chunks and persist it.

    Pipeline: load chunk texts/metadata, embed the texts with the
    configured sentence-transformers model, add the embeddings to a flat
    L2 FAISS index, then write the index to ``FAISS_INDEX_PATH`` and the
    parallel metadata list to ``METADATA_PATH``.

    Raises:
        RuntimeError: If no chunks were found under ``CONDITIONS_DIR``.
    """
    print("🔨 Building FAISS index...")

    # Imported lazily: sentence_transformers is heavy to import and only
    # needed when actually building the index.
    from sentence_transformers import SentenceTransformer
    model = SentenceTransformer(EMBEDDING_MODEL)

    texts, metadatas = load_chunks()
    if not texts:
        raise RuntimeError("No chunks found. Ensure Conditions/*/chunks.json exists.")

    print(f"Loaded {len(texts)} chunks")

    embeddings = model.encode(texts, show_progress_bar=True)
    # FAISS requires float32 (contiguous); encode() may return another dtype.
    embeddings = np.asarray(embeddings, dtype="float32")

    # Exact (brute-force) L2 index; row i corresponds to metadatas[i].
    index = faiss.IndexFlatL2(embeddings.shape[1])
    index.add(embeddings)

    # os.makedirs("") raises FileNotFoundError, so only create the directory
    # when FAISS_INDEX_PATH actually has a directory component.
    if INDEX_DIR:
        os.makedirs(INDEX_DIR, exist_ok=True)

    faiss.write_index(index, FAISS_INDEX_PATH)

    with open(METADATA_PATH, "w", encoding="utf-8") as f:
        # ensure_ascii=False keeps non-ASCII metadata (names, symbols)
        # human-readable instead of \uXXXX escapes.
        json.dump(metadatas, f, indent=2, ensure_ascii=False)

    print("✅ FAISS index built successfully")


# Script entry point: build and persist the index when run directly.
if __name__ == "__main__":
    main()