Delete prebuilt_index
Browse files- prebuilt_index +0 -57
prebuilt_index
DELETED
|
@@ -1,57 +0,0 @@
|
|
| 1 |
-
# build_index.py
|
| 2 |
-
import fitz, re, os, pickle
|
| 3 |
-
import numpy as np
|
| 4 |
-
import faiss
|
| 5 |
-
from sentence_transformers import SentenceTransformer
|
| 6 |
-
|
| 7 |
-
def load_pdf_text(file_path):
    """Return the concatenated text of every page in a PDF.

    Args:
        file_path: Path of the PDF file, passed straight to ``fitz.open``.

    Returns:
        The text of all pages joined in page order.

    Raises:
        ValueError: If the extracted text is empty or whitespace only.
    """
    # The context manager closes the document even if extraction raises,
    # so the underlying file handle is never leaked.
    with fitz.open(file_path) as doc:
        text = "".join(page.get_text() for page in doc)
    if not text.strip():
        raise ValueError("No text found in PDF.")
    return text
|
| 15 |
-
|
| 16 |
-
def chunk_text(text, max_tokens=200):
    """Group the sentences of *text* into chunks of at most *max_tokens* words.

    Sentences are split after ``.``, ``!`` or ``?`` followed by one or more
    spaces. "Tokens" are approximated by whitespace-separated words.
    Consecutive sentences are packed into a chunk until adding the next one
    would exceed the budget. A single sentence longer than *max_tokens* is
    kept whole as its own chunk rather than being split.

    Args:
        text: The text to split.
        max_tokens: Maximum number of words per chunk.

    Returns:
        A list of non-empty chunk strings; an empty list when *text*
        contains no words.
    """
    sentences = re.split(r'(?<=[.!?]) +', text)
    chunks, current_chunk = [], []
    current_len = 0
    for sentence in sentences:
        word_count = len(sentence.split())
        if word_count == 0:
            # Blank fragments (empty input, trailing spaces after the last
            # sentence) carry no content and must not produce chunks.
            continue
        if current_len + word_count > max_tokens:
            # Only flush when something has been collected; otherwise an
            # oversized first sentence would emit a spurious "" chunk.
            if current_chunk:
                chunks.append(" ".join(current_chunk))
            current_chunk = [sentence]
            current_len = word_count
        else:
            current_chunk.append(sentence)
            current_len += word_count
    if current_chunk:
        chunks.append(" ".join(current_chunk))
    return chunks
|
| 32 |
-
|
| 33 |
-
def build_index(pdf_path, index_dir="prebuilt_index", model_name="all-MiniLM-L6-v2"):
    """Build a FAISS index over the text chunks of a PDF and save it to disk.

    The PDF text is extracted, split into chunks, embedded with a
    SentenceTransformer model and added to a flat L2 FAISS index. Two files
    are written into *index_dir*: ``faiss_index.bin`` (the index) and
    ``metadata.pkl`` (a pickled dict with the chunk texts and the model name).

    Args:
        pdf_path: Path of the PDF to index.
        index_dir: Directory receiving the output files; created if missing.
        model_name: SentenceTransformer model used for embedding. The same
            name is recorded in the metadata so the two cannot drift apart.

    Raises:
        ValueError: Propagated from ``load_pdf_text`` when the PDF has no text.
    """
    os.makedirs(index_dir, exist_ok=True)

    # 1. Extract + chunk
    text = load_pdf_text(pdf_path)
    chunks = chunk_text(text)

    # 2. Embed
    embed_model = SentenceTransformer(model_name)
    vectors = embed_model.encode(chunks)

    # 3. FAISS index
    dim = vectors.shape[1]
    index = faiss.IndexFlatL2(dim)
    # The vectors are converted to float32 explicitly before being added.
    index.add(np.array(vectors, dtype=np.float32))

    # 4. Save index + metadata
    faiss.write_index(index, os.path.join(index_dir, "faiss_index.bin"))
    with open(os.path.join(index_dir, "metadata.pkl"), "wb") as f:
        pickle.dump({"chunks": chunks, "model_name": model_name}, f)

    print(f"✅ Index saved to {index_dir}")
|
| 55 |
-
|
| 56 |
-
if __name__ == "__main__":
    import sys

    # Use the first command-line argument as the PDF path when given;
    # otherwise fall back to the placeholder path.
    build_index(sys.argv[1] if len(sys.argv) > 1 else "your_textbook.pdf")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|