Email-Rag-Prototype / ingest.py
raviix46's picture
Create ingest.py
4fb49f6 verified
# ingest.py
import subprocess
import sys
from pathlib import Path

from rag_config import ROOT_DIR
def run(cmd):
    """Echo a command line, then execute it and fail on a non-zero exit.

    Args:
        cmd: Sequence of program arguments (program first). It is handed to
            ``subprocess.run`` as-is, without a shell. Items may be strings
            or path-like objects.

    Raises:
        subprocess.CalledProcessError: If the command exits with a non-zero
            status (the call uses ``check=True``).
    """
    # str() every item so path-like arguments, which subprocess accepts, do
    # not make the join raise TypeError.
    # flush=True: the child writes directly to the inherited stdout, so a
    # still-buffered banner would otherwise appear *after* the child's output
    # whenever stdout is a pipe or a file.
    print(f"Running: {' '.join(map(str, cmd))}", flush=True)
    subprocess.run(cmd, check=True)
def main():
    """Run the ingest steps in order, skipping any whose outputs already exist.

    Steps:
        1. ``data/preprocess_subset.py`` -- skipped when
           ``emails_subset_with_ids.csv`` exists.
        2. ``data/build_indices.py`` -- skipped when ``chunks.jsonl``,
           ``threads.json`` and ``messages.json`` all exist.
        3. ``data/build_embeddings.py`` -- skipped when ``embeddings.npy``
           exists.

    Output files are looked up under ``ROOT_DIR / "data"``. Any exception
    raised by ``run`` for a failing step propagates and stops the pipeline.

    NOTE(review): the script paths are relative to the current working
    directory while the existence checks use ``ROOT_DIR``; the two presumably
    agree only when this is launched from the project root -- confirm.

    NOTE(review): a step is skipped purely because its output exists. If an
    earlier step regenerates its output, later outputs are not invalidated;
    delete them manually to force a rebuild.
    """
    data_dir = ROOT_DIR / "data"

    # Start the step scripts with the interpreter that is running this file.
    # A bare "python" is resolved through PATH and may be missing or belong to
    # a different environment. sys.executable can be empty when Python cannot
    # determine its own path, so keep "python" as the fallback.
    python = sys.executable or "python"

    # 1) Build a small email subset (if not already present)
    subset_path = data_dir / "emails_subset_with_ids.csv"
    if not subset_path.exists():
        run([python, "data/preprocess_subset.py"])
    else:
        print("emails_subset_with_ids.csv already exists, skipping preprocess_subset.")

    # 2) Build chunks + threads/messages indices
    chunks_path = data_dir / "chunks.jsonl"
    threads_path = data_dir / "threads.json"
    messages_path = data_dir / "messages.json"
    # Rebuild if any one of the three index files is missing.
    if not (chunks_path.exists() and threads_path.exists() and messages_path.exists()):
        run([python, "data/build_indices.py"])
    else:
        print("Index files already exist, skipping build_indices.")

    # 3) Build embeddings for all chunks
    emb_path = data_dir / "embeddings.npy"
    if not emb_path.exists():
        run([python, "data/build_embeddings.py"])
    else:
        print("embeddings.npy already exists, skipping build_embeddings.")

    print("Ingest pipeline completed.")
# Run the pipeline only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()