Spaces:
Sleeping
Sleeping
| import os | |
| import pickle | |
| import faiss | |
| from datasets import load_dataset | |
| from pypdf import PdfReader | |
| from sentence_transformers import SentenceTransformer | |
| from config import ( | |
| DATASET_NAME, | |
| INDEX_DIR, | |
| FAISS_INDEX_PATH, | |
| DOCS_PATH, | |
| EMBEDDING_MODEL, | |
| ) | |
# Ensure the output directory for the index artifacts exists before any writes.
os.makedirs(INDEX_DIR, exist_ok=True)

# Embedding model loaded once at module import so build_index() can reuse it.
# NOTE(review): this runs at import time (model download/load side effect) —
# confirm that is intended for the Spaces startup path.
embedder = SentenceTransformer(EMBEDDING_MODEL)
def build_index():
    """Build and persist a FAISS L2 index over every PDF page in the dataset.

    Loads the ``train`` split of ``DATASET_NAME``, reads each PDF referenced
    by the dataset's first column, extracts per-page text, embeds the pages
    with the module-level SentenceTransformer, and writes:

    * the FAISS ``IndexFlatL2`` to ``FAISS_INDEX_PATH``
    * the raw page records (list of ``{"text", "page"}`` dicts, pickled)
      to ``DOCS_PATH``

    Raises:
        RuntimeError: if no text could be extracted from any PDF page.
    """
    print("📥 Loading HF dataset...")
    dataset = load_dataset(DATASET_NAME, split="train")

    # Loop-invariant: the PDF column name does not change per row, so look
    # it up once instead of on every iteration.
    pdf_column = dataset.column_names[0]

    documents = []
    for row in dataset:
        pdf_obj = row[pdf_column]
        # .path is the stable local file location for HF Spaces datasets.
        pdf_path = pdf_obj.path
        print(f"📄 Reading PDF: {pdf_path}")

        reader = PdfReader(pdf_path)
        for page_no, page in enumerate(reader.pages, start=1):
            text = page.extract_text()
            if not text:
                # Image-only or empty pages yield no text — skip them.
                continue
            documents.append({
                "text": text.strip(),
                "page": page_no,
            })

    if not documents:
        raise RuntimeError("❌ No text extracted from PDFs")

    texts = [d["text"] for d in documents]
    # FAISS requires float32 input; encode() returns a NumPy array.
    embeddings = embedder.encode(texts).astype("float32")

    index = faiss.IndexFlatL2(embeddings.shape[1])
    index.add(embeddings)

    faiss.write_index(index, FAISS_INDEX_PATH)
    with open(DOCS_PATH, "wb") as f:
        pickle.dump(documents, f)

    print("✅ FAISS index built successfully")