Zubaish committed on
Commit
3ad751d
·
1 Parent(s): 06629cc
Files changed (1) hide show
  1. ingest.py +29 -15
ingest.py CHANGED
@@ -1,5 +1,5 @@
1
  import os
2
- from datasets import load_dataset
3
  from langchain_community.document_loaders import PyPDFLoader
4
  from langchain_text_splitters import RecursiveCharacterTextSplitter
5
  from langchain_huggingface import HuggingFaceEmbeddings
@@ -10,33 +10,47 @@ def run_ingestion():
10
  os.makedirs(KB_DIR, exist_ok=True)
11
 
12
  print(f"⬇️ Loading dataset from {HF_DATASET_REPO}...")
13
- # decode(False) prevents the library from turning bytes into pdfplumber objects
14
- dataset = load_dataset(HF_DATASET_REPO, split="train").with_format("binary").decode(False)
15
 
 
 
 
 
 
 
 
 
16
  pdf_paths = []
17
  for i, row in enumerate(dataset):
18
- # Determine filename and raw data column
19
- fname = row.get("filename") or row.get("file_name") or f"doc_{i}.pdf"
20
- # Access the raw 'bytes' from the 'pdf' column
21
- pdf_feature = row.get("pdf")
22
 
23
- if pdf_feature is None:
 
 
 
 
 
24
  continue
25
 
26
- path = os.path.join(KB_DIR, fname)
27
  with open(path, "wb") as f:
28
- if isinstance(pdf_feature, dict) and "bytes" in pdf_feature:
29
- f.write(pdf_feature["bytes"])
 
30
  else:
31
- f.write(pdf_feature)
 
32
  pdf_paths.append(path)
33
 
34
  print(f"📄 Processing {len(pdf_paths)} documents...")
35
  docs = []
36
  for p in pdf_paths:
37
- loader = PyPDFLoader(p)
38
- docs.extend(loader.load())
 
 
 
39
 
 
40
  splitter = RecursiveCharacterTextSplitter(chunk_size=800, chunk_overlap=100)
41
  splits = splitter.split_documents(docs)
42
 
@@ -48,7 +62,7 @@ def run_ingestion():
48
  embedding=embeddings,
49
  persist_directory=CHROMA_DIR
50
  )
51
- print(f"✅ Ingestion complete. Data saved to {CHROMA_DIR}")
52
 
53
  if __name__ == "__main__":
54
  run_ingestion()
 
1
  import os
2
+ from datasets import load_dataset, Features, Value
3
  from langchain_community.document_loaders import PyPDFLoader
4
  from langchain_text_splitters import RecursiveCharacterTextSplitter
5
  from langchain_huggingface import HuggingFaceEmbeddings
 
10
  os.makedirs(KB_DIR, exist_ok=True)
11
 
12
  print(f"⬇️ Loading dataset from {HF_DATASET_REPO}...")
 
 
13
 
14
+ # We cast the 'pdf' column to binary to prevent automatic decoding into a 'PDF' object
15
+ features = Features({"pdf": Value("binary"), "file_name": Value("string")})
16
+
17
+ # Use decode=False to get raw bytes
18
+ dataset = load_dataset(HF_DATASET_REPO, split="train", decode=False)
19
+
20
+ print(f"📊 Dataset columns: {dataset.column_names}")
21
+
22
  pdf_paths = []
23
  for i, row in enumerate(dataset):
24
+ # Your logs show the column is named 'pdf'
25
+ pdf_data = row.get("pdf")
 
 
26
 
27
+ # Determine filename
28
+ fname = row.get("file_name") or row.get("filename") or f"document_{i}.pdf"
29
+ path = os.path.join(KB_DIR, fname)
30
+
31
+ if pdf_data is None:
32
+ print(f"⚠️ Row {i} has no data, skipping.")
33
  continue
34
 
 
35
  with open(path, "wb") as f:
36
+ # When decode=False, pdf_data is usually a dict like {'bytes': b'...', 'path': None}
37
+ if isinstance(pdf_data, dict) and "bytes" in pdf_data:
38
+ f.write(pdf_data["bytes"])
39
  else:
40
+ f.write(pdf_data)
41
+
42
  pdf_paths.append(path)
43
 
44
  print(f"📄 Processing {len(pdf_paths)} documents...")
45
  docs = []
46
  for p in pdf_paths:
47
+ try:
48
+ loader = PyPDFLoader(p)
49
+ docs.extend(loader.load())
50
+ except Exception as e:
51
+ print(f"❌ Could not load {p}: {e}")
52
 
53
+ # Split text into chunks
54
  splitter = RecursiveCharacterTextSplitter(chunk_size=800, chunk_overlap=100)
55
  splits = splitter.split_documents(docs)
56
 
 
62
  embedding=embeddings,
63
  persist_directory=CHROMA_DIR
64
  )
65
+ print(f"✅ Ingestion complete. DB saved to {CHROMA_DIR}")
66
 
67
  if __name__ == "__main__":
68
  run_ingestion()