OnlyTheTruth03 committed on
Commit
11a6288
·
1 Parent(s): 4cf1681

Parquet dataset correction in ingest.py

Browse files
Files changed (1) hide show
  1. src/ingest.py +11 -9
src/ingest.py CHANGED
@@ -1,6 +1,5 @@
1
  # src/ingest.py
2
  import os
3
- import io
4
  import pickle
5
  import faiss
6
  from pypdf import PdfReader
@@ -20,18 +19,21 @@ def build_index():
20
  print("πŸ“₯ Loading dataset from Hugging Face...")
21
  dataset = load_dataset(DATASET_NAME, split="train")
22
 
 
 
23
  documents = []
24
 
25
  for row in dataset:
26
- # HF gives a PDF file-like object
27
- pdf_file = row[list(row.keys())[0]] # safest generic access
 
 
 
28
 
29
- if not pdf_file:
30
- continue
31
 
32
- # 🔑 READ BYTES CORRECTLY
33
- pdf_bytes = pdf_file.read()
34
- reader = PdfReader(io.BytesIO(pdf_bytes))
35
 
36
  for page_num, page in enumerate(reader.pages, start=1):
37
  text = page.extract_text()
@@ -44,7 +46,7 @@ def build_index():
44
  })
45
 
46
  if not documents:
47
- raise RuntimeError("❌ No text extracted from PDFs")
48
 
49
  print(f"βœ‚οΈ Extracted {len(documents)} text chunks")
50
 
 
1
  # src/ingest.py
2
  import os
 
3
  import pickle
4
  import faiss
5
  from pypdf import PdfReader
 
19
  print("πŸ“₯ Loading dataset from Hugging Face...")
20
  dataset = load_dataset(DATASET_NAME, split="train")
21
 
22
+ print("πŸ“„ Dataset columns:", dataset.column_names)
23
+
24
  documents = []
25
 
26
  for row in dataset:
27
+ # HF Pdf feature → object with `.path`
28
+ pdf_obj = row[dataset.column_names[0]]
29
+
30
+ if not pdf_obj or not hasattr(pdf_obj, "path"):
31
+ raise RuntimeError("❌ Invalid PDF object from HF dataset")
32
 
33
+ pdf_path = pdf_obj.path
34
+ print(f"πŸ“‚ Reading PDF from {pdf_path}")
35
 
36
+ reader = PdfReader(pdf_path)
 
 
37
 
38
  for page_num, page in enumerate(reader.pages, start=1):
39
  text = page.extract_text()
 
46
  })
47
 
48
  if not documents:
49
+ raise RuntimeError("❌ No text extracted from PDF")
50
 
51
  print(f"βœ‚οΈ Extracted {len(documents)} text chunks")
52