Zubaish committed on
Commit
d4bb434
·
1 Parent(s): 772c22e

Update ingest logic

Browse files
Files changed (1) hide show
  1. ingest.py +33 -13
ingest.py CHANGED
@@ -1,18 +1,38 @@
1
  import os
2
- from huggingface_hub import snapshot_download
3
- from config import HF_DATASET_ID, KB_DIR
 
 
 
 
4
 
5
- def download_kb():
6
- os.makedirs(KB_DIR, exist_ok=True)
7
 
8
- snapshot_download(
9
- repo_id=HF_DATASET_ID,
10
- repo_type="dataset",
11
- local_dir=KB_DIR,
12
- local_dir_use_symlinks=False
13
- )
14
 
15
- print("✅ Knowledge base downloaded")
 
 
 
 
 
16
 
17
- if __name__ == "__main__":
18
- download_kb()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import os
2
+ from datasets import load_dataset
3
+ from langchain_community.document_loaders import PyPDFLoader
4
+ from langchain_text_splitters import RecursiveCharacterTextSplitter
5
+ from langchain_community.embeddings import HuggingFaceEmbeddings
6
+ from langchain_chroma import Chroma
7
+ from config import KB_DIR, HF_DATASET_REPO, EMBEDDING_MODEL, CHROMA_DIR
8
 
9
+ os.makedirs(KB_DIR, exist_ok=True)
 
10
 
11
+ print("⬇️ Downloading PDFs from HF Dataset...")
12
+ dataset = load_dataset(HF_DATASET_REPO, split="train")
 
 
 
 
13
 
14
+ pdf_paths = []
15
+ for row in dataset:
16
+ path = os.path.join(KB_DIR, row["file_name"])
17
+ with open(path, "wb") as f:
18
+ f.write(row["file"])
19
+ pdf_paths.append(path)
20
 
21
+ print("📄 Loading documents...")
22
+ docs = []
23
+ for p in pdf_paths:
24
+ docs.extend(PyPDFLoader(p).load())
25
+
26
+ splitter = RecursiveCharacterTextSplitter(chunk_size=800, chunk_overlap=100)
27
+ splits = splitter.split_documents(docs)
28
+
29
+ print("🧠 Creating embeddings...")
30
+ embeddings = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL)
31
+
32
+ Chroma.from_documents(
33
+ splits,
34
+ embedding=embeddings,
35
+ persist_directory=CHROMA_DIR
36
+ )
37
+
38
+ print("✅ Ingestion complete")