Spaces:
Sleeping
Sleeping
Commit
·
11a6288
1
Parent(s):
4cf1681
Parquet dataset correction in ingest.py
Browse files- src/ingest.py +11 -9
src/ingest.py
CHANGED
|
@@ -1,6 +1,5 @@
|
|
| 1 |
# src/ingest.py
|
| 2 |
import os
|
| 3 |
-
import io
|
| 4 |
import pickle
|
| 5 |
import faiss
|
| 6 |
from pypdf import PdfReader
|
|
@@ -20,18 +19,21 @@ def build_index():
|
|
| 20 |
print("📥 Loading dataset from Hugging Face...")
|
| 21 |
dataset = load_dataset(DATASET_NAME, split="train")
|
| 22 |
|
|
|
|
|
|
|
| 23 |
documents = []
|
| 24 |
|
| 25 |
for row in dataset:
|
| 26 |
-
# HF
|
| 27 |
-
|
|
|
|
|
|
|
|
|
|
| 28 |
|
| 29 |
-
|
| 30 |
-
|
| 31 |
|
| 32 |
-
|
| 33 |
-
pdf_bytes = pdf_file.read()
|
| 34 |
-
reader = PdfReader(io.BytesIO(pdf_bytes))
|
| 35 |
|
| 36 |
for page_num, page in enumerate(reader.pages, start=1):
|
| 37 |
text = page.extract_text()
|
|
@@ -44,7 +46,7 @@ def build_index():
|
|
| 44 |
})
|
| 45 |
|
| 46 |
if not documents:
|
| 47 |
-
raise RuntimeError("β No text extracted from
|
| 48 |
|
| 49 |
print(f"βοΈ Extracted {len(documents)} text chunks")
|
| 50 |
|
|
|
|
| 1 |
# src/ingest.py
|
| 2 |
import os
|
|
|
|
| 3 |
import pickle
|
| 4 |
import faiss
|
| 5 |
from pypdf import PdfReader
|
|
|
|
| 19 |
print("📥 Loading dataset from Hugging Face...")
|
| 20 |
dataset = load_dataset(DATASET_NAME, split="train")
|
| 21 |
|
| 22 |
+
print("π Dataset columns:", dataset.column_names)
|
| 23 |
+
|
| 24 |
documents = []
|
| 25 |
|
| 26 |
for row in dataset:
|
| 27 |
+
# HF Pdf feature → object with `.path`
|
| 28 |
+
pdf_obj = row[dataset.column_names[0]]
|
| 29 |
+
|
| 30 |
+
if not pdf_obj or not hasattr(pdf_obj, "path"):
|
| 31 |
+
raise RuntimeError("❌ Invalid PDF object from HF dataset")
|
| 32 |
|
| 33 |
+
pdf_path = pdf_obj.path
|
| 34 |
+
print(f"π Reading PDF from {pdf_path}")
|
| 35 |
|
| 36 |
+
reader = PdfReader(pdf_path)
|
|
|
|
|
|
|
| 37 |
|
| 38 |
for page_num, page in enumerate(reader.pages, start=1):
|
| 39 |
text = page.extract_text()
|
|
|
|
| 46 |
})
|
| 47 |
|
| 48 |
if not documents:
|
| 49 |
+
raise RuntimeError("❌ No text extracted from PDF")
|
| 50 |
|
| 51 |
print(f"βοΈ Extracted {len(documents)} text chunks")
|
| 52 |
|