File size: 1,260 Bytes
4fb49f6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
# ingest.py
import subprocess
import sys
from pathlib import Path

from rag_config import ROOT_DIR

def run(cmd):
    """Echo a command line, then execute it, failing on a non-zero exit.

    Args:
        cmd: Sequence of program arguments, program first. Elements may be
            strings or path-like objects; they are stringified only for the
            echoed banner and passed to ``subprocess.run`` unchanged.

    Raises:
        subprocess.CalledProcessError: If the command exits with a non-zero
            status (``check=True``).
    """
    # str() each argument so path-like entries do not break the join.
    display = " ".join(str(arg) for arg in cmd)
    # Flush so the banner is emitted before the child writes its own output,
    # even when stdout is block-buffered (e.g. redirected to a file or pipe).
    print(f"Running: {display}", flush=True)
    subprocess.run(cmd, check=True)

def main():
    """Run the ingest pipeline, skipping any step whose outputs already exist.

    Steps, in order:
      1. ``preprocess_subset.py`` -> ``emails_subset_with_ids.csv``
      2. ``build_indices.py``     -> ``chunks.jsonl``, ``threads.json``,
         ``messages.json``
      3. ``build_embeddings.py``  -> ``embeddings.npy``

    All outputs are looked up under ``ROOT_DIR / "data"``. Each step is run as
    a child process with the same interpreter that is running this script.

    Raises:
        subprocess.CalledProcessError: If any step exits with a non-zero
            status (propagated from ``run``).
    """
    data_dir = ROOT_DIR / "data"

    def script_cmd(script_name):
        # Use the current interpreter (keeps the active virtualenv) and anchor
        # the script path on data_dir so the pipeline does not depend on the
        # caller's working directory.
        # NOTE(review): assumes the step scripts live in ROOT_DIR / "data",
        # matching the original "data/<script>" paths when run from the root.
        return [sys.executable, str(data_dir / script_name)]

    # NOTE(review): steps are checked independently; rebuilding an earlier
    # output does not force later steps to rebuild. Confirm that is intended.

    # 1) Build a small email subset (if not already present)
    subset_path = data_dir / "emails_subset_with_ids.csv"
    if not subset_path.exists():
        run(script_cmd("preprocess_subset.py"))
    else:
        print("emails_subset_with_ids.csv already exists, skipping preprocess_subset.")

    # 2) Build chunks + threads/messages indices
    index_paths = [
        data_dir / "chunks.jsonl",
        data_dir / "threads.json",
        data_dir / "messages.json",
    ]
    # Rebuild if any one of the index files is missing.
    if not all(path.exists() for path in index_paths):
        run(script_cmd("build_indices.py"))
    else:
        print("Index files already exist, skipping build_indices.")

    # 3) Build embeddings for all chunks
    emb_path = data_dir / "embeddings.npy"
    if not emb_path.exists():
        run(script_cmd("build_embeddings.py"))
    else:
        print("embeddings.npy already exists, skipping build_embeddings.")

    print("Ingest pipeline completed.")

# Run the pipeline only when executed as a script, not when imported.
if __name__ == "__main__":
    main()