File size: 1,432 Bytes
f129d48
3688256
f129d48
 
 
baad26a
0b4e744
f129d48
 
0b4e744
 
f129d48
0b4e744
3bbb203
f129d48
29204d1
f129d48
 
 
 
 
 
11a6288
f129d48
709c859
3688256
f129d48
0b4e744
f129d48
 
 
 
 
 
 
3688256
f129d48
 
 
 
 
 
 
 
 
 
 
 
 
 
 
29204d1
baad26a
f129d48
709c859
29204d1
f129d48
709c859
f129d48
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
import os
import pickle
import faiss
from datasets import load_dataset
from pypdf import PdfReader
from sentence_transformers import SentenceTransformer
from config import (
    DATASET_NAME,
    INDEX_DIR,
    FAISS_INDEX_PATH,
    DOCS_PATH,
    EMBEDDING_MODEL,
)

# Ensure the output directory for the index artifacts exists before writing.
os.makedirs(INDEX_DIR, exist_ok=True)

# Sentence-embedding model loaded once at import time; used by build_index().
embedder = SentenceTransformer(EMBEDDING_MODEL)


def build_index(
    dataset_name: str = DATASET_NAME,
    split: str = "train",
    index_path: str = FAISS_INDEX_PATH,
    docs_path: str = DOCS_PATH,
) -> None:
    """Build a FAISS L2 index over page-level text extracted from PDFs.

    Loads the Hugging Face dataset *dataset_name* (split *split*), extracts
    text page by page from each PDF, embeds the pages with the module-level
    SentenceTransformer, and persists the FAISS index to *index_path* and
    the page-document list (pickled) to *docs_path*.

    All parameters default to the values from ``config``, so existing
    ``build_index()`` callers are unaffected.

    Raises:
        RuntimeError: if no text could be extracted from any PDF page.
    """
    print("📥 Loading HF dataset...")
    dataset = load_dataset(dataset_name, split=split)

    documents = []

    for row in dataset:
        # Assumes the first column holds the PDF file object — TODO confirm
        # against the dataset schema.
        pdf_obj = row[dataset.column_names[0]]

        # ✅ Correct & stable for HF Spaces
        pdf_path = pdf_obj.path

        print(f"📄 Reading PDF: {pdf_path}")
        reader = PdfReader(pdf_path)

        for page_no, page in enumerate(reader.pages, start=1):
            text = page.extract_text()
            if not text:
                # Skip pages with no extractable text (e.g. scanned images).
                continue

            documents.append({
                "text": text.strip(),
                "page": page_no,
            })

    if not documents:
        raise RuntimeError("❌ No text extracted from PDFs")

    texts = [d["text"] for d in documents]
    # FAISS expects contiguous float32 vectors.
    embeddings = embedder.encode(texts).astype("float32")

    # Exact (non-approximate) L2 index sized to the embedding dimension.
    index = faiss.IndexFlatL2(embeddings.shape[1])
    index.add(embeddings)

    faiss.write_index(index, index_path)

    # Persist documents alongside the index so search results can be mapped
    # back to page text.
    with open(docs_path, "wb") as f:
        pickle.dump(documents, f)

    print("✅ FAISS index built successfully")