# GenAI_Loan_Advisor / rag / ingest_policies.py
# Uploaded by DevKX — initial deploy (commit cffeaa1)
# rag/ingestion/ingest_policies.py
import os
import sys
import platform
from dotenv import load_dotenv
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.vectorstores.faiss import FAISS
from langchain_mistralai import MistralAIEmbeddings # or Mistral embeddings later
# ---- Dynamic FAISS import ----
def import_faiss():
    """Import ``faiss``, installing a suitable build on the fly if missing.

    Prefers the GPU build when CUDA is usable (detected via torch); falls
    back to ``faiss-cpu`` on macOS, when no GPU is available, or when the
    GPU wheel cannot be installed (GPU wheels are not published for every
    platform/CUDA combination).

    Returns:
        The imported ``faiss`` module.
    """
    try:
        import faiss
        return faiss
    except ImportError:
        import subprocess
        system = platform.system()
        # Detect CUDA through torch; if torch itself is absent we cannot
        # tell, so play it safe and assume CPU-only.
        try:
            import torch
            gpu_available = torch.cuda.is_available()
        except ImportError:
            gpu_available = False
        if system == "Darwin" or not gpu_available:
            pkg_name = "faiss-cpu"
        else:
            pkg_name = "faiss-gpu"
        print(f"FAISS not found. Installing {pkg_name}...")
        try:
            subprocess.check_call([sys.executable, "-m", "pip", "install", pkg_name])
        except subprocess.CalledProcessError:
            # faiss-gpu wheels are unavailable on PyPI for recent
            # Python/CUDA versions; fall back to the CPU build rather
            # than crashing the whole ingestion run.
            if pkg_name != "faiss-cpu":
                print("faiss-gpu install failed; falling back to faiss-cpu...")
                subprocess.check_call([sys.executable, "-m", "pip", "install", "faiss-cpu"])
            else:
                raise
        import faiss
        return faiss
# Resolve the native faiss module (installing it if necessary) before the
# LangChain FAISS wrapper is used.
faiss = import_faiss()
from langchain_community.vectorstores.faiss import FAISS

# Load environment variables from .env (supplies MISTRAL_API_KEY).
load_dotenv()
MISTRAL_API_KEY = os.getenv("MISTRAL_API_KEY")
if not MISTRAL_API_KEY:
    # Fail fast with a specific exception type; callers catching the old
    # generic Exception still catch RuntimeError.
    raise RuntimeError("Missing MISTRAL_API_KEY in .env")

# Input PDFs and persisted FAISS index locations, relative to the repo root.
DOCUMENTS_DIR = "rag/policies"
VECTORSTORE_DIR = "rag/vectorstore"
def load_documents(documents_dir: str = DOCUMENTS_DIR):
    """Load every PDF in *documents_dir* into LangChain documents.

    Args:
        documents_dir: Folder scanned (non-recursively) for PDF files.
            Defaults to ``DOCUMENTS_DIR`` for backward compatibility.

    Returns:
        list: One LangChain Document per PDF page, across all files.
    """
    docs = []
    # sorted() makes the document (and therefore chunk) order
    # deterministic across platforms/runs — os.listdir order is arbitrary.
    for file in sorted(os.listdir(documents_dir)):
        # Case-insensitive match so e.g. "Policy.PDF" is not silently skipped.
        if file.lower().endswith(".pdf"):
            print(f"Loading document: {file}")
            loader = PyPDFLoader(os.path.join(documents_dir, file))
            docs.extend(loader.load())
    return docs
def build_vectorstore():
    """Build a FAISS vectorstore from the policy PDFs and persist it.

    Pipeline: load PDFs -> split into overlapping chunks -> embed with
    Mistral -> index with FAISS -> save under ``VECTORSTORE_DIR``.
    """
    docs = load_documents()
    # Overlap keeps context that straddles a chunk boundary retrievable.
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=1500,
        chunk_overlap=300,
    )
    chunks = splitter.split_documents(docs)
    # Pass the key explicitly (consistent with verify_vectorstore) instead
    # of relying on the library to discover it from the environment.
    embeddings = MistralAIEmbeddings(model="mistral-embed", api_key=MISTRAL_API_KEY)
    # Embed all chunks and build the FAISS index in one step.
    vectorstore = FAISS.from_documents(chunks, embeddings)
    # Persist the index so retrieval does not require re-embedding.
    vectorstore.save_local(VECTORSTORE_DIR)
    print(f"Vectorstore created at: {VECTORSTORE_DIR}")
def verify_vectorstore(vectorstore_dir=VECTORSTORE_DIR, test_query="Interest rate for high-risk customer"):
    """Reload the persisted FAISS index and run a sanity-check similarity search.

    Prints the top-3 matching chunks for *test_query* so a human can eyeball
    whether retrieval is returning relevant policy text.
    """
    from langchain_community.vectorstores.faiss import FAISS
    from langchain_mistralai import MistralAIEmbeddings

    embedder = MistralAIEmbeddings(model="mistral-embed", api_key=MISTRAL_API_KEY)
    # The index was written by this same pipeline, so deserializing it is safe.
    store = FAISS.load_local(vectorstore_dir, embedder, allow_dangerous_deserialization=True)

    hits = store.similarity_search(test_query, k=3)
    print("\n=== Verification Results ===")
    for rank, hit in enumerate(hits, start=1):
        print(f"\n--- Result {rank} ---\n{hit.page_content}")
if __name__ == "__main__":
    # Build the index once, then spot-check retrieval with two sample queries.
    build_vectorstore()
    for query in (
        "Overall risk for credit score 700 with delinquent account",
        "Interest rate for medium-risk customer",
    ):
        verify_vectorstore(test_query=query)