Spaces:
Sleeping
Sleeping
| # data/build_embeddings.py | |
| import json | |
| from pathlib import Path | |
| import sys | |
| import numpy as np | |
| from sentence_transformers import SentenceTransformer | |
# Prepend the project root (the directory that contains `data/`) to sys.path
# so the `email_rag` package can be imported when this file is run directly.
ROOT_DIR = Path(__file__).resolve().parent.parent  # one level above `data/`
if str(ROOT_DIR) not in sys.path:
    sys.path.insert(0, str(ROOT_DIR))

from email_rag.rag_config import CHUNKS_PATH, CHUNK_IDS_PATH, EMBEDDINGS_PATH
def load_jsonl(path: "Path | str") -> list:
    """Read a JSON Lines file and return its records as a list.

    Every non-blank line is parsed as one JSON document; blank and
    whitespace-only lines are skipped.

    Args:
        path: Location of the JSONL file, as a ``Path`` or a plain string.

    Returns:
        The parsed values, in file order. An empty file yields ``[]``.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a line is not valid JSON. The message
            names the file and the 1-based line number of the bad record.
    """
    path = Path(path)  # tolerate plain-string paths as well as Path objects
    rows = []
    with path.open("r", encoding="utf-8") as f:
        for lineno, raw in enumerate(f, start=1):
            line = raw.strip()
            if not line:
                continue
            try:
                rows.append(json.loads(line))
            except json.JSONDecodeError as err:
                # Re-raise the same exception type so existing handlers keep
                # working, but say which record in which file is broken.
                raise json.JSONDecodeError(
                    f"{path}, line {lineno}: {err.msg}", err.doc, err.pos
                ) from err
    return rows
def main():
    """Embed every chunk and save the vectors together with their chunk IDs.

    Reads chunk records from ``CHUNKS_PATH`` (JSON Lines; each record must
    have ``"text"`` and ``"chunk_id"`` keys), encodes the texts with the
    ``sentence-transformers/all-MiniLM-L6-v2`` model using normalized
    embeddings, then writes:

    * the embeddings, cast to float32, to ``EMBEDDINGS_PATH`` via ``np.save``;
    * the chunk IDs, in the same order as the embeddings, to
      ``CHUNK_IDS_PATH`` as a JSON array.

    Raises:
        ValueError: If the chunk file contains no records.
        KeyError: If a record lacks ``"text"`` or ``"chunk_id"``.
    """
    print(f"Loading chunks from {CHUNKS_PATH} ...")
    chunks = load_jsonl(CHUNKS_PATH)
    texts = [c["text"] for c in chunks]
    chunk_ids = [c["chunk_id"] for c in chunks]
    print(f"Total chunks: {len(chunks)}")

    # Reject an empty corpus up front instead of loading the model and
    # writing an index that contains no vectors.
    if not chunks:
        raise ValueError(f"No chunks found in {CHUNKS_PATH}; nothing to embed.")

    # Create the output directories before the expensive encoding step so a
    # missing or unwritable location fails fast rather than after encoding.
    for out_path in (EMBEDDINGS_PATH, CHUNK_IDS_PATH):
        Path(out_path).parent.mkdir(parents=True, exist_ok=True)

    print("Loading sentence-transformers model ...")
    model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

    print("Encoding chunks ...")
    embeddings = model.encode(
        texts,
        normalize_embeddings=True,
        show_progress_bar=True,
    )
    embeddings = embeddings.astype("float32")

    print(f"Saving embeddings to {EMBEDDINGS_PATH} ...")
    np.save(EMBEDDINGS_PATH, embeddings)

    print(f"Saving chunk IDs to {CHUNK_IDS_PATH} ...")
    with CHUNK_IDS_PATH.open("w", encoding="utf-8") as f:
        json.dump(chunk_ids, f)

    print("Done.")
# Run the embedding build only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()