Spaces:

Rishitha3
/

HyDE_RAG

Sleeping

App Files Files Community

Rishitha3 committed on Aug 30, 2025

Commit

b68369f

verified ·

1 Parent(s): 5fe5d84

Delete prebuilt_index

Browse files

Files changed (1) hide show

prebuilt_index +0 -57

prebuilt_index DELETED Viewed

@@ -1,57 +0,0 @@
-# build_index.py
-import fitz, re, os, pickle
-import numpy as np
-import faiss
-from sentence_transformers import SentenceTransformer
def load_pdf_text(file_path):
    """Return the concatenated text of every page in the PDF at *file_path*.

    Args:
        file_path: Path to the PDF, passed straight to ``fitz.open``.

    Returns:
        The text of all pages joined in page order, with no separator added
        between pages.

    Raises:
        ValueError: If the extracted text is empty or whitespace-only.
    """
    # Use the document as a context manager so the file handle is released
    # even if text extraction raises part-way through.
    with fitz.open(file_path) as doc:
        text = "".join(page.get_text() for page in doc)
    if not text.strip():
        raise ValueError("No text found in PDF.")
    return text
def chunk_text(text, max_tokens=200):
    """Group the sentences of *text* into chunks of at most ``max_tokens`` words.

    Sentences are split after ``.``, ``!`` or ``?`` followed by one or more
    spaces, and are never broken apart: a single sentence longer than
    ``max_tokens`` becomes a chunk of its own.

    Args:
        text: The text to split.
        max_tokens: Word budget per chunk. "Tokens" here are
            whitespace-separated words, not model tokens.

    Returns:
        A list of non-empty chunk strings, each made of whole sentences
        joined by a single space. Empty for empty or whitespace-only input.
    """
    sentences = re.split(r'(?<=[.!?]) +', text)
    chunks = []
    current_chunk = []
    current_len = 0
    for sentence in sentences:
        word_count = len(sentence.split())
        if word_count == 0:
            # Empty pieces (e.g. from trailing spaces) carry no content.
            continue
        # Flush only when something has been collected; otherwise an
        # over-long first sentence would emit an empty chunk.
        if current_chunk and current_len + word_count > max_tokens:
            chunks.append(" ".join(current_chunk))
            current_chunk = []
            current_len = 0
        current_chunk.append(sentence)
        current_len += word_count
    if current_chunk:
        chunks.append(" ".join(current_chunk))
    return chunks
def build_index(pdf_path, index_dir="prebuilt_index", model_name="all-MiniLM-L6-v2"):
    """Build a FAISS index over the text of a PDF and save it with its metadata.

    Extracts the PDF text, chunks it, embeds every chunk with a
    SentenceTransformer model, and writes two files into *index_dir*:
    ``faiss_index.bin`` (the index) and ``metadata.pkl`` (a pickled dict
    with the keys ``"chunks"`` and ``"model_name"``).

    Args:
        pdf_path: Path of the PDF to index.
        index_dir: Output directory; created if it does not exist.
        model_name: SentenceTransformer model used for embedding. The same
            name is recorded in the metadata so the two cannot drift apart.

    Raises:
        ValueError: If the PDF contains no text or chunking yields no chunks.
    """
    os.makedirs(index_dir, exist_ok=True)

    # 1. Extract + chunk
    text = load_pdf_text(pdf_path)
    chunks = chunk_text(text)
    if not chunks:
        # Fail with a clear message rather than an opaque shape error below.
        raise ValueError("No text chunks produced from PDF.")

    # 2. Embed (FAISS expects contiguous float32 vectors)
    embed_model = SentenceTransformer(model_name)
    vectors = np.ascontiguousarray(embed_model.encode(chunks), dtype=np.float32)

    # 3. FAISS index
    dim = vectors.shape[1]
    index = faiss.IndexFlatL2(dim)
    index.add(vectors)

    # 4. Save index + metadata
    faiss.write_index(index, os.path.join(index_dir, "faiss_index.bin"))
    with open(os.path.join(index_dir, "metadata.pkl"), "wb") as f:
        pickle.dump({"chunks": chunks, "model_name": model_name}, f)
    print(f"✅ Index saved to {index_dir}")
if __name__ == "__main__":
    import sys

    # An optional first command-line argument overrides the placeholder
    # path, so the script can be used without editing the source.
    build_index(sys.argv[1] if len(sys.argv) > 1 else "your_textbook.pdf")