# FireSymptom / embed_index.py
# EphAsad's picture
# Update embed_index.py
# 820c2fc verified
import json
import os
import faiss
import numpy as np
# ---------- LOAD SETTINGS ----------
def load_settings(path="config/settings.json"):
    """Load the project settings from a JSON file.

    Args:
        path: Location of the settings file. Defaults to
            ``config/settings.json``, which is resolved relative to the
            current working directory.

    Returns:
        The decoded JSON document.

    Raises:
        FileNotFoundError: If ``path`` does not exist.
        json.JSONDecodeError: If the file is not valid JSON.
    """
    with open(path, "r", encoding="utf-8") as f:
        return json.load(f)
# Settings are read once, at import time. A missing key below raises KeyError
# immediately rather than later in the build.
SETTINGS = load_settings()

EMBEDDING_MODEL = SETTINGS["embedding_model"]    # model name given to SentenceTransformer
FAISS_INDEX_PATH = SETTINGS["faiss_index_path"]  # where the FAISS index is written
METADATA_PATH = SETTINGS["metadata_path"]        # where the per-chunk metadata JSON is written

# Folder scanned for <condition>/chunks.json files.
CONDITIONS_DIR = "Conditions"

# os.path.dirname() returns "" when the index path is a bare filename, and
# os.makedirs("") raises FileNotFoundError; fall back to the current directory.
INDEX_DIR = os.path.dirname(FAISS_INDEX_PATH) or "."
# ---------- LOAD CHUNKS ----------
def load_chunks(conditions_dir=None):
    """Collect chunk texts and their metadata from every condition folder.

    Every immediate subdirectory of the conditions directory that contains a
    ``chunks.json`` file is read. That file is iterated as a sequence of
    chunk objects; each must have a ``"text"`` key, while ``"condition"``,
    ``"section"`` and ``"source_id"`` are optional and default to ``None``.

    Args:
        conditions_dir: Directory to scan. Defaults to the module-level
            ``CONDITIONS_DIR`` when omitted.

    Returns:
        A ``(texts, metadatas)`` tuple of two parallel lists: the chunk texts
        and one metadata dict per text. Both lists are empty when the
        directory is missing or holds no chunks.

    Raises:
        KeyError: If a chunk has no ``"text"`` key.
        json.JSONDecodeError: If a ``chunks.json`` file is not valid JSON.
    """
    if conditions_dir is None:
        conditions_dir = CONDITIONS_DIR

    texts = []
    metadatas = []

    # A missing directory is reported as "no chunks" so the caller can raise
    # its own, more helpful error instead of a raw FileNotFoundError.
    if not os.path.isdir(conditions_dir):
        return texts, metadatas

    # os.listdir() returns entries in arbitrary order; sorting keeps the row
    # order of the index (and of the metadata list) reproducible across runs.
    for condition in sorted(os.listdir(conditions_dir)):
        cond_path = os.path.join(conditions_dir, condition)
        if not os.path.isdir(cond_path):
            continue  # stray file next to the condition folders

        chunks_path = os.path.join(cond_path, "chunks.json")
        if not os.path.exists(chunks_path):
            continue  # condition folder without chunks

        with open(chunks_path, "r", encoding="utf-8") as f:
            chunks = json.load(f)

        for chunk in chunks:
            texts.append(chunk["text"])
            metadatas.append({
                "condition": chunk.get("condition"),
                "section": chunk.get("section"),
                "source_id": chunk.get("source_id"),
            })

    return texts, metadatas
# ---------- BUILD INDEX ----------
def main():
    """Embed every chunk and write the FAISS index and its metadata file.

    Loads the chunk texts with ``load_chunks()``, encodes them with the
    SentenceTransformer model named by ``EMBEDDING_MODEL``, adds the vectors
    to a ``faiss.IndexFlatL2`` index, then writes the index to
    ``FAISS_INDEX_PATH`` and the per-chunk metadata list to ``METADATA_PATH``.

    Raises:
        RuntimeError: If ``load_chunks()`` returns no texts.
    """
    print("🔨 Building FAISS index...")

    # Check the input before loading the embedding model, so an empty corpus
    # fails immediately instead of after the model has been loaded.
    texts, metadatas = load_chunks()
    if not texts:
        raise RuntimeError("No chunks found. Ensure Conditions/*/chunks.json exists.")
    print(f"Loaded {len(texts)} chunks")

    # Imported inside the function, as in the original, so that importing this
    # module does not pull in sentence_transformers.
    from sentence_transformers import SentenceTransformer

    model = SentenceTransformer(EMBEDDING_MODEL)
    embeddings = model.encode(texts, show_progress_bar=True)
    # The index is fed a float32 array; its dimensionality is the second axis.
    embeddings = np.asarray(embeddings, dtype="float32")

    index = faiss.IndexFlatL2(embeddings.shape[1])
    index.add(embeddings)

    # `or "."` guards against an empty dirname (bare filename), for which
    # os.makedirs("") would raise FileNotFoundError.
    os.makedirs(INDEX_DIR or ".", exist_ok=True)
    # The metadata file may live in a different directory than the index.
    os.makedirs(os.path.dirname(METADATA_PATH) or ".", exist_ok=True)

    faiss.write_index(index, FAISS_INDEX_PATH)
    with open(METADATA_PATH, "w", encoding="utf-8") as f:
        json.dump(metadatas, f, indent=2)

    print("✅ FAISS index built successfully")


if __name__ == "__main__":
    main()