Rishitha3 committed on
Commit
b68369f
·
verified ·
1 Parent(s): 5fe5d84

Delete prebuilt_index

Browse files
Files changed (1) hide show
  1. prebuilt_index +0 -57
prebuilt_index DELETED
@@ -1,57 +0,0 @@
1
- # build_index.py
2
- import fitz, re, os, pickle
3
- import numpy as np
4
- import faiss
5
- from sentence_transformers import SentenceTransformer
6
-
7
def load_pdf_text(file_path):
    """Return the concatenated text of every page in a PDF.

    Args:
        file_path: Path of the PDF file, passed straight to ``fitz.open``.

    Returns:
        A single string made of each page's ``get_text()`` output, in page
        order, with nothing inserted between pages.

    Raises:
        ValueError: If the extracted text is empty or whitespace-only
            (for example a scanned PDF with no text layer).
    """
    # Use the document as a context manager so the underlying file handle is
    # released even when a page fails to extract.
    with fitz.open(file_path) as doc:
        # join() avoids the repeated re-allocation of `text += ...` in a loop.
        text = "".join(page.get_text() for page in doc)
    if not text.strip():
        raise ValueError("No text found in PDF.")
    return text
15
-
16
def chunk_text(text, max_tokens=200):
    """Split text into chunks of whole sentences, bounded by word count.

    Sentences are detected by splitting after ``.``, ``!`` or ``?`` followed
    by one or more spaces. Consecutive sentences are packed into a chunk until
    adding the next one would push the chunk's whitespace-separated word count
    above ``max_tokens``.

    Args:
        text: The text to split.
        max_tokens: Maximum number of words per chunk. A single sentence that
            is longer than this limit is kept whole in a chunk of its own.

    Returns:
        A list of non-empty chunk strings; sentences inside a chunk are joined
        with a single space. Returns ``[]`` when ``text`` contains no words.
    """
    sentences = re.split(r'(?<=[.!?]) +', text)
    chunks, current_chunk = [], []
    current_len = 0
    for sentence in sentences:
        word_count = len(sentence.split())
        if word_count == 0:
            # Empty fragments (empty input, trailing spaces after the last
            # sentence) carry no content and must not produce chunks.
            continue
        # Flush only when something has been collected; otherwise an
        # over-long first sentence would emit a spurious empty chunk.
        if current_chunk and current_len + word_count > max_tokens:
            chunks.append(" ".join(current_chunk))
            current_chunk = []
            current_len = 0
        current_chunk.append(sentence)
        current_len += word_count
    if current_chunk:
        chunks.append(" ".join(current_chunk))
    return chunks
32
-
33
def build_index(pdf_path, index_dir="prebuilt_index", model_name="all-MiniLM-L6-v2"):
    """Build a FAISS index over the text of a PDF and save it to disk.

    The PDF text is extracted with ``load_pdf_text``, split with
    ``chunk_text``, embedded with a ``SentenceTransformer`` model, and added
    to a flat L2 FAISS index.

    Args:
        pdf_path: Path of the PDF to index.
        index_dir: Directory that receives the output files; created if it
            does not exist.
        model_name: Name of the SentenceTransformer model used to embed the
            chunks. The same value is recorded in the saved metadata so the
            two cannot drift apart.

    Side effects:
        Writes ``faiss_index.bin`` (the FAISS index) and ``metadata.pkl``
        (a pickled dict with keys ``"chunks"`` and ``"model_name"``) into
        ``index_dir``, and prints a confirmation line.
    """
    os.makedirs(index_dir, exist_ok=True)

    # 1. Extract + chunk
    text = load_pdf_text(pdf_path)
    chunks = chunk_text(text)

    # 2. Embed
    embed_model = SentenceTransformer(model_name)
    vectors = embed_model.encode(chunks)

    # 3. FAISS index (convert once to float32 before handing it to FAISS)
    vectors = np.array(vectors, dtype=np.float32)
    dim = vectors.shape[1]
    index = faiss.IndexFlatL2(dim)
    index.add(vectors)

    # 4. Save index + metadata
    faiss.write_index(index, os.path.join(index_dir, "faiss_index.bin"))
    with open(os.path.join(index_dir, "metadata.pkl"), "wb") as f:
        pickle.dump({"chunks": chunks, "model_name": model_name}, f)

    print(f"✅ Index saved to {index_dir}")
55
-
56
if __name__ == "__main__":
    # Script entry point: index the placeholder PDF into the default directory.
    textbook_pdf = "your_textbook.pdf"  # Replace with your PDF path
    build_index(textbook_pdf)