# OTT_Bot / src/ingest.py
# Commit f129d48 ("Prod ready") by OnlyTheTruth03
import os
import pickle
import faiss
from datasets import load_dataset
from pypdf import PdfReader
from sentence_transformers import SentenceTransformer
from config import (
DATASET_NAME,
INDEX_DIR,
FAISS_INDEX_PATH,
DOCS_PATH,
EMBEDDING_MODEL,
)
# Ensure the output directory for the index artifacts exists before any writes.
os.makedirs(INDEX_DIR, exist_ok=True)
# Module-level embedder: the SentenceTransformer model is loaded once at import
# time so build_index() (and any other caller in this module) can reuse it.
embedder = SentenceTransformer(EMBEDDING_MODEL)
def build_index():
    """Build a FAISS L2 index over text extracted from the PDFs in DATASET_NAME.

    Loads the HF dataset's "train" split, extracts text page-by-page from each
    PDF, embeds the page texts with the module-level SentenceTransformer, and
    writes two artifacts: the FAISS index to FAISS_INDEX_PATH and the pickled
    page records to DOCS_PATH.

    Raises:
        RuntimeError: if no text could be extracted from any PDF.
    """
    print("📥 Loading HF dataset...")
    dataset = load_dataset(DATASET_NAME, split="train")

    documents = []
    for row in dataset:
        # Assumes the dataset's first column holds the PDF — TODO confirm the
        # dataset schema really is single-column PDF objects.
        pdf_obj = row[dataset.column_names[0]]
        # .path gives the local cached file path (stable on HF Spaces).
        pdf_path = pdf_obj.path
        print(f"📄 Reading PDF: {pdf_path}")

        reader = PdfReader(pdf_path)
        for page_no, page in enumerate(reader.pages, start=1):
            text = page.extract_text()
            if not text:
                # Skip image-only or otherwise empty pages.
                continue
            documents.append({
                "text": text.strip(),
                "page": page_no,
                # Fix: record which PDF the page came from, so that pages with
                # the same number in different files stay distinguishable at
                # retrieval time. Backward-compatible (extra dict key).
                "source": pdf_path,
            })

    if not documents:
        raise RuntimeError("❌ No text extracted from PDFs")

    texts = [d["text"] for d in documents]
    # FAISS requires float32 input; convert explicitly since encode() may
    # return a different dtype depending on the model/version.
    embeddings = embedder.encode(texts).astype("float32")

    # Exact (brute-force) L2 index over the embedding dimension.
    index = faiss.IndexFlatL2(embeddings.shape[1])
    index.add(embeddings)

    faiss.write_index(index, FAISS_INDEX_PATH)
    with open(DOCS_PATH, "wb") as f:
        pickle.dump(documents, f)

    print("✅ FAISS index built successfully")