Zubaish committed on
Commit
f09a853
·
1 Parent(s): ce847a1
Files changed (1) hide show
  1. ingest.py +18 -15
ingest.py CHANGED
@@ -8,50 +8,53 @@ from langchain_chroma import Chroma
8
  from config import KB_DIR, HF_DATASET_REPO, EMBEDDING_MODEL, CHROMA_DIR
9
 
10
def run_ingestion():
    """Build the Chroma knowledge base from the Hugging Face PDF dataset.

    Wipes KB_DIR and CHROMA_DIR, downloads the dataset split, materializes
    each row's PDF into KB_DIR, splits the pages into overlapping text
    chunks, and persists their embeddings into a Chroma store at CHROMA_DIR.

    Returns:
        None. Exits early (after a message) if no PDF could be loaded.
    """
    # Clean up previous runs so stale documents never leak into the index.
    if os.path.exists(KB_DIR):
        shutil.rmtree(KB_DIR)
    if os.path.exists(CHROMA_DIR):
        shutil.rmtree(CHROMA_DIR)
    os.makedirs(KB_DIR, exist_ok=True)

    print(f"⬇️ Loading dataset from {HF_DATASET_REPO}...")
    dataset = load_dataset(HF_DATASET_REPO, split="train")

    pdf_paths = []
    for i, row in enumerate(dataset):
        pdf_feature = row.get("pdf")
        dest_path = os.path.join(KB_DIR, f"doc_{i}.pdf")

        # HF PDF folders usually expose the decoded file as a dict with a
        # 'path' key; copying that cached file is the cheapest way to get it.
        if isinstance(pdf_feature, dict) and pdf_feature.get("path"):
            shutil.copy(pdf_feature["path"], dest_path)
            pdf_paths.append(dest_path)
        elif isinstance(pdf_feature, dict) and pdf_feature.get("bytes"):
            # Fix: the original comment promised to "skip or try bytes" but
            # the bytes fallback was never implemented — write them out here
            # so in-memory rows (no cached file path) are not dropped.
            with open(dest_path, "wb") as fh:
                fh.write(pdf_feature["bytes"])
            pdf_paths.append(dest_path)
        else:
            print(f"⚠️ Could not find file path for row {i}, skipping.")

    print(f"📄 Loading and splitting {len(pdf_paths)} documents...")
    docs = []
    for p in pdf_paths:
        try:
            loader = PyPDFLoader(p)
            docs.extend(loader.load())
        except Exception as e:
            # Best-effort: one corrupt PDF must not abort the whole run.
            print(f"⚠️ Error reading {p}: {e}")

    if not docs:
        print("❌ No documents were successfully loaded.")
        return

    # 1000-char chunks with 100-char overlap keep paragraph context intact.
    splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
    splits = splitter.split_documents(docs)

    print(f"🧠 Indexing {len(splits)} chunks into ChromaDB...")
    embeddings = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL)

    Chroma.from_documents(
        # NOTE(review): this argument line is hidden by the diff hunk
        # boundary in the source view — 'documents=splits' inferred; confirm.
        documents=splits,
        embedding=embeddings,
        persist_directory=CHROMA_DIR
    )
    print(f"✅ Ingestion complete. DB saved to {CHROMA_DIR}")


if __name__ == "__main__":
    run_ingestion()
 
8
  from config import KB_DIR, HF_DATASET_REPO, EMBEDDING_MODEL, CHROMA_DIR
9
 
10
def _resolve_pdf_source(pdf_feature):
    """Return the on-disk source path for a dataset 'pdf' feature, or None.

    HF PdfFolder rows usually carry a dict whose 'path' key points at the
    locally cached file; some decoded PDF objects expose a ``filename``
    attribute instead. Returns None when neither form is present.
    """
    if isinstance(pdf_feature, dict) and "path" in pdf_feature:
        return pdf_feature["path"]
    if hasattr(pdf_feature, "filename"):
        return pdf_feature.filename
    return None


def run_ingestion():
    """Build the Chroma knowledge base from the Hugging Face PDF dataset.

    Wipes KB_DIR and CHROMA_DIR, downloads the dataset split, caches each
    row's PDF into KB_DIR, splits the pages into overlapping chunks, and
    persists their embeddings into a Chroma store at CHROMA_DIR.

    Returns:
        None. Exits early (after a message) if no PDF could be loaded.
    """
    # 1. Clean Environment — stale documents must never leak into the index.
    if os.path.exists(KB_DIR):
        shutil.rmtree(KB_DIR)
    if os.path.exists(CHROMA_DIR):
        shutil.rmtree(CHROMA_DIR)
    os.makedirs(KB_DIR, exist_ok=True)

    print(f"⬇️ Loading dataset from {HF_DATASET_REPO}...")
    dataset = load_dataset(HF_DATASET_REPO, split="train")

    pdf_paths = []
    for i, row in enumerate(dataset):
        pdf_feature = row.get("pdf")
        src_path = _resolve_pdf_source(pdf_feature)
        dest_path = os.path.join(KB_DIR, f"doc_{i}.pdf")

        if src_path and os.path.exists(src_path):
            shutil.copy(src_path, dest_path)
            pdf_paths.append(dest_path)
            print(f"✅ Cached: doc_{i}.pdf")
        elif isinstance(pdf_feature, dict) and pdf_feature.get("bytes"):
            # Fix: rows that carry the raw PDF 'bytes' but no usable path
            # previously fell through to the skip branch — write them out.
            with open(dest_path, "wb") as fh:
                fh.write(pdf_feature["bytes"])
            pdf_paths.append(dest_path)
            print(f"✅ Cached: doc_{i}.pdf")
        else:
            print(f"⚠️ Could not resolve path for doc_{i}, skipping.")

    # 2. Process Documents
    print(f"📄 Processing {len(pdf_paths)} documents...")
    docs = []
    for p in pdf_paths:
        try:
            loader = PyPDFLoader(p)
            docs.extend(loader.load())
        except Exception as e:
            # Best-effort: one corrupt PDF must not abort the whole run.
            print(f" Error reading {p}: {e}")

    if not docs:
        print("❌ CRITICAL: No documents were successfully loaded.")
        return

    # 3. Chunk and Embed — 1000-char chunks, 100-char overlap.
    splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
    splits = splitter.split_documents(docs)

    print(f"🧠 Indexing {len(splits)} chunks...")
    embeddings = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL)

    Chroma.from_documents(
        # NOTE(review): this argument line is hidden by the diff hunk
        # boundary in the source view — 'documents=splits' inferred; confirm.
        documents=splits,
        embedding=embeddings,
        persist_directory=CHROMA_DIR
    )
    print(f"✅ Knowledge base initialized at {CHROMA_DIR}")


if __name__ == "__main__":
    run_ingestion()