Spaces:

OnlyTheTruth03
/

OTT_Bot

Sleeping

OnlyTheTruth03 committed on Dec 23, 2025

Commit

11a6288

1 Parent(s): 4cf1681

Parquet dataset correction in ingest.py

Files changed (1) hide show

src/ingest.py CHANGED Viewed

@@ -1,6 +1,5 @@
 # src/ingest.py
 import os
-import io
 import pickle
 import faiss
 from pypdf import PdfReader
@@ -20,18 +19,21 @@ def build_index():
     print("📥 Loading dataset from Hugging Face...")
     dataset = load_dataset(DATASET_NAME, split="train")
     documents = []
     for row in dataset:
-        # HF gives a PDF file-like object
-        pdf_file = row[list(row.keys())[0]]  # safest generic access
-        if not pdf_file:
-            continue
-        # 🔑 READ BYTES CORRECTLY
-        pdf_bytes = pdf_file.read()
-        reader = PdfReader(io.BytesIO(pdf_bytes))
         for page_num, page in enumerate(reader.pages, start=1):
             text = page.extract_text()
@@ -44,7 +46,7 @@ def build_index():
             })
     if not documents:
-        raise RuntimeError("❌ No text extracted from PDFs")
     print(f"✂️ Extracted {len(documents)} text chunks")

 # src/ingest.py
 import os
 import pickle
 import faiss
 from pypdf import PdfReader
     print("📥 Loading dataset from Hugging Face...")
     dataset = load_dataset(DATASET_NAME, split="train")
+    print("📄 Dataset columns:", dataset.column_names)
     documents = []
     for row in dataset:
+        # HF Pdf feature → object with `.path`
+        pdf_obj = row[dataset.column_names[0]]
+        if not pdf_obj or not hasattr(pdf_obj, "path"):
+            raise RuntimeError("❌ Invalid PDF object from HF dataset")
+        pdf_path = pdf_obj.path
+        print(f"📂 Reading PDF from {pdf_path}")
+        reader = PdfReader(pdf_path)
         for page_num, page in enumerate(reader.pages, start=1):
             text = page.extract_text()
             })
     if not documents:
+        raise RuntimeError("❌ No text extracted from PDF")
     print(f"✂️ Extracted {len(documents)} text chunks")