import os
import pickle

import faiss
from datasets import load_dataset
from pypdf import PdfReader
from sentence_transformers import SentenceTransformer

from config import (
    DATASET_NAME,
    INDEX_DIR,
    FAISS_INDEX_PATH,
    DOCS_PATH,
    EMBEDDING_MODEL,
)

# Ensure the output directory exists before any index files are written.
os.makedirs(INDEX_DIR, exist_ok=True)

# Loaded once at import time so repeated build_index() calls reuse the model.
embedder = SentenceTransformer(EMBEDDING_MODEL)


def build_index():
    """Build a FAISS L2 index over the per-page text of the dataset's PDFs.

    Loads the HF dataset named by DATASET_NAME (train split), extracts text
    from every page of every PDF, embeds the texts with the
    sentence-transformer model, and persists:

      - the FAISS index to FAISS_INDEX_PATH
      - the list of ``{"text": str, "page": int}`` dicts (pickled) to DOCS_PATH

    Raises:
        RuntimeError: if no text could be extracted from any PDF.
    """
    print("📥 Loading HF dataset...")
    dataset = load_dataset(DATASET_NAME, split="train")

    documents = []
    for row in dataset:
        # The first column is assumed to hold the PDF file object.
        # NOTE(review): `.path` relies on the dataset yielding file-like
        # objects with a local path attribute — confirm for this dataset.
        pdf_obj = row[dataset.column_names[0]]
        pdf_path = pdf_obj.path
        print(f"📄 Reading PDF: {pdf_path}")

        reader = PdfReader(pdf_path)
        for page_no, page in enumerate(reader.pages, start=1):
            text = page.extract_text()
            if not text:
                continue
            # Fix: strip BEFORE filtering — the original stripped after the
            # emptiness check, so whitespace-only pages produced empty-text
            # documents that were then embedded and indexed.
            text = text.strip()
            if not text:
                continue
            documents.append({
                "text": text,
                "page": page_no,
            })

    if not documents:
        raise RuntimeError("❌ No text extracted from PDFs")

    texts = [d["text"] for d in documents]
    # FAISS requires contiguous float32 vectors.
    embeddings = embedder.encode(texts).astype("float32")

    # Exact (brute-force) L2 index; dimensionality taken from the embeddings.
    index = faiss.IndexFlatL2(embeddings.shape[1])
    index.add(embeddings)

    faiss.write_index(index, FAISS_INDEX_PATH)
    with open(DOCS_PATH, "wb") as f:
        pickle.dump(documents, f)

    print("✅ FAISS index built successfully")