EphAsad committed on
Commit
b453b40
·
verified ·
1 Parent(s): 7b95f2f

Update embed_index.py

Browse files
Files changed (1) hide show
  1. embed_index.py +63 -1
embed_index.py CHANGED
@@ -1 +1,63 @@
1
- #
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import os
3
+ import faiss
4
+ import numpy as np
5
+ from sentence_transformers import SentenceTransformer
6
+
7
+ from config.settings import *
8
+
9
# Directory scanned for per-condition sub-folders containing a chunks.json file.
CONDITIONS_DIR = "Conditions"
# Directory created before the index and metadata files are written.
INDEX_DIR = "index"
11
+
12
def load_chunks(conditions_dir=None):
    """Collect chunk texts and their metadata from every condition folder.

    Each immediate sub-directory of the conditions directory is checked for a
    ``chunks.json`` file. Plain files and sub-directories without that file
    are skipped. Every entry in a ``chunks.json`` is expected to be a mapping
    with the keys ``text``, ``condition``, ``section`` and ``source_id``;
    any other keys are ignored.

    Args:
        conditions_dir: Directory to scan. When ``None`` (the default) the
            module-level ``CONDITIONS_DIR`` is used.

    Returns:
        A ``(texts, metadatas)`` tuple of two parallel lists: the chunk texts
        and, at the same position, a dict with ``condition``, ``section`` and
        ``source_id``.

    Raises:
        FileNotFoundError: If the conditions directory does not exist.
        KeyError: If a chunk entry lacks one of the required keys.
    """
    root = CONDITIONS_DIR if conditions_dir is None else conditions_dir

    texts = []
    metadatas = []

    # os.listdir() yields entries in arbitrary order; sorting keeps the
    # position of every vector/metadata pair reproducible between runs.
    for condition in sorted(os.listdir(root)):
        cond_path = os.path.join(root, condition)
        if not os.path.isdir(cond_path):
            continue

        chunks_path = os.path.join(cond_path, "chunks.json")
        if not os.path.exists(chunks_path):
            continue

        with open(chunks_path, "r", encoding="utf-8") as f:
            chunks = json.load(f)

        for chunk in chunks:
            texts.append(chunk["text"])
            metadatas.append({
                "condition": chunk["condition"],
                "section": chunk["section"],
                "source_id": chunk["source_id"],
            })

    return texts, metadatas
37
+
38
+
39
def main():
    """Embed every loaded chunk and write a FAISS index plus its metadata.

    Steps: load the sentence-transformer named by ``EMBEDDING_MODEL``, gather
    chunks via ``load_chunks()``, encode the texts, add the float32 vectors to
    a flat L2 FAISS index, then write the index to ``FAISS_INDEX_PATH`` and
    the metadata list as JSON to ``METADATA_PATH``.

    NOTE(review): ``EMBEDDING_MODEL``, ``FAISS_INDEX_PATH`` and
    ``METADATA_PATH`` are presumably provided by the wildcard import from
    ``config.settings`` -- confirm.

    Raises:
        ValueError: If no chunks were found, since an index cannot be
            dimensioned without at least one embedding.
    """
    print("Loading embedding model...")
    model = SentenceTransformer(EMBEDDING_MODEL)

    texts, metadatas = load_chunks()
    print(f"Loaded {len(texts)} chunks")

    # Fail early with a clear message instead of an IndexError on
    # embeddings.shape[1] further down.
    if not texts:
        raise ValueError(
            f"No chunks found under {CONDITIONS_DIR!r}; nothing to index"
        )

    embeddings = model.encode(texts, show_progress_bar=True)
    # FAISS works on float32 matrices; force dtype and a contiguous layout.
    embeddings = np.ascontiguousarray(embeddings, dtype="float32")

    index = faiss.IndexFlatL2(embeddings.shape[1])
    index.add(embeddings)

    os.makedirs(INDEX_DIR, exist_ok=True)
    # The configured output paths are not guaranteed to live inside INDEX_DIR,
    # so make sure their own parent directories exist as well.
    for out_path in (FAISS_INDEX_PATH, METADATA_PATH):
        parent = os.path.dirname(out_path)
        if parent:
            os.makedirs(parent, exist_ok=True)

    # os.fspath lets the setting be either a str or a pathlib.Path.
    faiss.write_index(index, os.fspath(FAISS_INDEX_PATH))

    with open(METADATA_PATH, "w", encoding="utf-8") as f:
        json.dump(metadatas, f, indent=2)

    print("FAISS index built successfully")
60
+
61
+
62
# Build the index only when executed as a script, not when imported.
if __name__ == "__main__":
    main()