Spaces:
Running
Running
File size: 2,263 Bytes
"""
Scheme Vectorstore Module

Builds and loads the FAISS vectorstore for government schemes.
"""
import os
from langchain_community.vectorstores import FAISS
from langchain_community.document_loaders import PyPDFLoader
from rag.embeddings import get_embeddings
def build_scheme_vectorstore(
    folder: str = "data/schemes_pdfs",
    index_dir: str = "rag/scheme_index",
) -> None:
    """
    Build a FAISS index from the scheme PDFs in a folder and save it to disk.

    Run this once to initialize the vectorstore.

    Args:
        folder: Directory scanned (non-recursively) for PDF files. It is
            created when missing so the user has somewhere to drop PDFs.
        index_dir: Directory the FAISS index is written to.

    Returns:
        None. Progress is reported with print(). When the folder is missing,
        holds no PDF files, or the loader yields no documents, the function
        returns early and no index is written.
    """
    if not os.path.exists(folder):
        os.makedirs(folder)
        print(f"Created {folder}. Please add scheme PDFs to this folder.")
        return

    # Match the extension case-insensitively so "SCHEME.PDF" is not silently
    # skipped, ignore directories that merely have a ".pdf" name, and sort
    # because os.listdir() order is arbitrary and would otherwise make the
    # order of documents in the index differ between runs.
    pdf_files = sorted(
        name
        for name in os.listdir(folder)
        if name.lower().endswith(".pdf")
        and os.path.isfile(os.path.join(folder, name))
    )
    if not pdf_files:
        print(f"No PDF files found in {folder}. Please add scheme PDFs.")
        return

    documents = []
    for file in pdf_files:
        print(f"Processing {file}...")
        loader = PyPDFLoader(os.path.join(folder, file))
        documents.extend(loader.load())

    # PDFs without extractable content can leave the list empty.
    if not documents:
        print("No documents extracted. Check PDF files.")
        return

    print(f"Loaded {len(documents)} document chunks. Building vectorstore...")
    embeddings = get_embeddings()
    vectorstore = FAISS.from_documents(documents, embeddings)

    os.makedirs(index_dir, exist_ok=True)
    vectorstore.save_local(index_dir)
    print("Scheme vectorstore built successfully!")
def load_scheme_vectorstore(index_dir: str = "rag/scheme_index"):
    """
    Load the pre-built scheme vectorstore from disk.

    Args:
        index_dir: Directory holding the saved index files
            (index.faiss and index.pkl).

    Returns:
        The FAISS vectorstore instance returned by FAISS.load_local.

    Raises:
        FileNotFoundError: If index.faiss or index.pkl is missing from
            index_dir.

    Errors raised by get_embeddings() or FAISS.load_local are not caught
    here and propagate to the caller unchanged.
    """
    # Check every file load_local needs, not just index.faiss, so a partially
    # written or partially copied index fails with an actionable message.
    required_files = ("index.faiss", "index.pkl")
    missing = [
        f"{index_dir}/{name}"
        for name in required_files
        if not os.path.exists(os.path.join(index_dir, name))
    ]
    if missing:
        raise FileNotFoundError(
            f"Scheme vectorstore not found at {', '.join(missing)}. "
            "Run 'python init_embeddings.py' or build_scheme_vectorstore() first."
        )

    print("📂 Loading scheme vectorstore...")
    embeddings = get_embeddings()
    # SECURITY: allow_dangerous_deserialization=True makes load_local unpickle
    # index.pkl, which can execute arbitrary code. Only point this at an index
    # directory this application built itself, never at untrusted files.
    vectorstore = FAISS.load_local(
        index_dir,
        embeddings,
        allow_dangerous_deserialization=True,
    )
    print("✅ Scheme vectorstore loaded successfully")
    return vectorstore
|