Spaces:
Sleeping
Sleeping
File size: 1,432 Bytes
f129d48 3688256 f129d48 baad26a 0b4e744 f129d48 0b4e744 f129d48 0b4e744 3bbb203 f129d48 29204d1 f129d48 11a6288 f129d48 709c859 3688256 f129d48 0b4e744 f129d48 3688256 f129d48 29204d1 baad26a f129d48 709c859 29204d1 f129d48 709c859 f129d48 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 |
import os
import pickle
import faiss
from datasets import load_dataset
from pypdf import PdfReader
from sentence_transformers import SentenceTransformer
from config import (
DATASET_NAME,
INDEX_DIR,
FAISS_INDEX_PATH,
DOCS_PATH,
EMBEDDING_MODEL,
)
# Ensure the output directory for the index artifacts exists before any writes.
os.makedirs(INDEX_DIR, exist_ok=True)
# Embedding model loaded once at module import; shared by index building.
# NOTE(review): this may download model weights on first run — confirm that is
# acceptable at import time in this deployment.
embedder = SentenceTransformer(EMBEDDING_MODEL)
def build_index():
    """Build and persist a FAISS index over the text of every PDF page.

    Loads the configured Hugging Face dataset, extracts text page-by-page
    from each PDF with pypdf, embeds the page texts with the shared
    sentence-transformer model, and writes both the FAISS index and the
    pickled document records to disk.

    Raises:
        RuntimeError: if no text could be extracted from any PDF.
    """
    print("📥 Loading HF dataset...")
    dataset = load_dataset(DATASET_NAME, split="train")

    documents = []
    for row in dataset:
        # The first dataset column holds the PDF file object.
        pdf_obj = row[dataset.column_names[0]]
        # .path is the correct, stable way to get a local file path on HF Spaces.
        pdf_path = pdf_obj.path
        print(f"📄 Reading PDF: {pdf_path}")

        reader = PdfReader(pdf_path)
        for page_no, page in enumerate(reader.pages, start=1):
            text = page.extract_text()
            if not text:
                # Skip pages with no extractable text (e.g. scanned images).
                continue
            documents.append({
                "text": text.strip(),
                "page": page_no,
            })

    if not documents:
        raise RuntimeError("❌ No text extracted from PDFs")

    texts = [d["text"] for d in documents]
    embeddings = embedder.encode(texts).astype("float32")

    # Flat L2 (exact) index; dimension is taken from the embedding output.
    index = faiss.IndexFlatL2(embeddings.shape[1])
    index.add(embeddings)

    faiss.write_index(index, FAISS_INDEX_PATH)
    with open(DOCS_PATH, "wb") as f:
        pickle.dump(documents, f)

    print("✅ FAISS index built successfully")
|