Email-Rag-Prototype / data /build_embeddings.py
Ravi Raj
Add AI-generated attachment chunks and rebuild embeddings
9ca244a
# data/build_embeddings.py
import json
from pathlib import Path
import sys
import numpy as np
from sentence_transformers import SentenceTransformer
# --- ensure project root is on sys.path so `email_rag` is importable ---
ROOT_DIR = Path(__file__).resolve().parents[1] # parent of `data/`
if str(ROOT_DIR) not in sys.path:
sys.path.insert(0, str(ROOT_DIR))
from email_rag.rag_config import CHUNKS_PATH, CHUNK_IDS_PATH, EMBEDDINGS_PATH
def load_jsonl(path: Path):
rows = []
with path.open("r", encoding="utf-8") as f:
for line in f:
line = line.strip()
if not line:
continue
rows.append(json.loads(line))
return rows
def main():
print(f"Loading chunks from {CHUNKS_PATH} ...")
chunks = load_jsonl(CHUNKS_PATH)
texts = [c["text"] for c in chunks]
chunk_ids = [c["chunk_id"] for c in chunks]
print(f"Total chunks: {len(chunks)}")
print("Loading sentence-transformers model ...")
model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
print("Encoding chunks ...")
embeddings = model.encode(
texts,
normalize_embeddings=True,
show_progress_bar=True,
)
embeddings = embeddings.astype("float32")
print(f"Saving embeddings to {EMBEDDINGS_PATH} ...")
np.save(EMBEDDINGS_PATH, embeddings)
print(f"Saving chunk IDs to {CHUNK_IDS_PATH} ...")
with CHUNK_IDS_PATH.open("w", encoding="utf-8") as f:
json.dump(chunk_ids, f)
print("Done.")
if __name__ == "__main__":
main()