raviix46 committed on
Commit
4fb49f6
·
verified ·
1 Parent(s): aa81fbd

Create ingest.py

Browse files
Files changed (1) hide show
  1. ingest.py +40 -0
ingest.py ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # ingest.py
2
+ from pathlib import Path
3
+ import subprocess
4
+
5
+ from rag_config import ROOT_DIR
6
+
7
def run(cmd):
    """Echo a command line, then execute it, failing on a non-zero exit.

    Args:
        cmd: Sequence of strings forming the argument vector (program first).

    Raises:
        subprocess.CalledProcessError: If the command exits with a non-zero
            status (a consequence of ``check=True``).
    """
    # Flush the banner before starting the child: both write to the same
    # stdout, and when it is block-buffered (pipe, log file) the child's
    # output would otherwise show up ahead of the line announcing it.
    print(f"Running: {' '.join(cmd)}", flush=True)
    subprocess.run(cmd, check=True)
10
+
11
def main():
    """Run the ingest steps in order, skipping those whose artifacts exist.

    Each step launches a helper script unless the files it is guarded by are
    already present under ``ROOT_DIR / "data"``:

    1. ``preprocess_subset.py`` unless ``emails_subset_with_ids.csv`` exists.
    2. ``build_indices.py`` unless ``chunks.jsonl``, ``threads.json`` and
       ``messages.json`` all exist.
    3. ``build_embeddings.py`` unless ``embeddings.npy`` exists.

    Raises:
        subprocess.CalledProcessError: If any launched script exits non-zero.
    """
    # Local import: only needed here to locate the running interpreter.
    import sys

    data_dir = ROOT_DIR / "data"

    # Resolve the helper scripts relative to this file instead of the current
    # working directory, so the pipeline can be started from anywhere.
    # NOTE(review): assumes this file sits next to the "data" directory that
    # holds the step scripts, as the original "data/<script>.py" paths imply.
    scripts_dir = Path(__file__).resolve().parent / "data"

    def run_script(script_name):
        # sys.executable guarantees the child uses the same interpreter (and
        # therefore the same virtualenv) as this process; a bare "python" may
        # resolve to a different one or not exist at all.
        run([sys.executable, str(scripts_dir / script_name)])

    # NOTE: every step only looks at its own artifacts, so rebuilding an
    # earlier artifact does not trigger a rebuild of the later ones.

    # 1) Build a small email subset (if not already present)
    subset_path = data_dir / "emails_subset_with_ids.csv"
    if subset_path.exists():
        print("emails_subset_with_ids.csv already exists, skipping preprocess_subset.")
    else:
        run_script("preprocess_subset.py")

    # 2) Build chunks + threads/messages indices
    index_paths = [
        data_dir / "chunks.jsonl",
        data_dir / "threads.json",
        data_dir / "messages.json",
    ]
    if all(path.exists() for path in index_paths):
        print("Index files already exist, skipping build_indices.")
    else:
        run_script("build_indices.py")

    # 3) Build embeddings for all chunks
    emb_path = data_dir / "embeddings.npy"
    if emb_path.exists():
        print("embeddings.npy already exists, skipping build_embeddings.")
    else:
        run_script("build_embeddings.py")

    print("Ingest pipeline completed.")


# Start the pipeline only when executed as a script, not when imported.
if __name__ == "__main__":
    main()