Spaces:

raviix46
/

Email-Rag-Prototype

Sleeping

raviix46 committed on Nov 13, 2025

Commit

4fb49f6

verified ·

1 Parent(s): aa81fbd

Create ingest.py

Files changed (1) hide show

ingest.py ADDED Viewed

+# ingest.py
+from pathlib import Path
+import subprocess
+from rag_config import ROOT_DIR
+def run(cmd):
+    print(f"Running: {' '.join(cmd)}")
+    subprocess.run(cmd, check=True)
+def main():
+    data_dir = ROOT_DIR / "data"
+    # 1) Build a small email subset (if not already present)
+    subset_path = data_dir / "emails_subset_with_ids.csv"
+    if not subset_path.exists():
+        run(["python", "data/preprocess_subset.py"])
+    else:
+        print("emails_subset_with_ids.csv already exists, skipping preprocess_subset.")
+    # 2) Build chunks + threads/messages indices
+    chunks_path = data_dir / "chunks.jsonl"
+    threads_path = data_dir / "threads.json"
+    messages_path = data_dir / "messages.json"
+    if not (chunks_path.exists() and threads_path.exists() and messages_path.exists()):
+        run(["python", "data/build_indices.py"])
+    else:
+        print("Index files already exist, skipping build_indices.")
+    # 3) Build embeddings for all chunks
+    emb_path = data_dir / "embeddings.npy"
+    if not emb_path.exists():
+        run(["python", "data/build_embeddings.py"])
+    else:
+        print("embeddings.npy already exists, skipping build_embeddings.")
+    print("Ingest pipeline completed.")
+# Run the pipeline only when this file is executed as a script, not on import.
+if __name__ == "__main__":
+    main()