# NOTE(review): removed "Spaces: Sleeping" lines here — Hugging Face page
# chrome captured during copy/paste, not part of the Python source.
# rag/ingestion/ingest_policies.py
"""Ingest policy PDFs into a FAISS vectorstore using Mistral embeddings."""
import os
import platform
import sys

from dotenv import load_dotenv
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.vectorstores.faiss import FAISS
from langchain_mistralai import MistralAIEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter
# ---- Dynamic FAISS import ----
def import_faiss():
    """Return the faiss module, installing a suitable wheel on first use.

    Picks faiss-gpu only when torch reports a usable CUDA device and the
    platform is not macOS; every other case falls back to faiss-cpu.
    """
    try:
        import faiss
    except ImportError:
        import subprocess

        # torch may itself be missing; treat that as "no GPU available".
        try:
            import torch
            has_cuda = torch.cuda.is_available()
        except ImportError:
            has_cuda = False

        use_gpu = has_cuda and platform.system() != "Darwin"
        package = "faiss-gpu" if use_gpu else "faiss-cpu"
        print(f"FAISS not found. Installing {package}...")
        subprocess.check_call([sys.executable, "-m", "pip", "install", package])
        import faiss
    return faiss
faiss = import_faiss()
# Re-import now that the faiss backend is guaranteed to be importable.
from langchain_community.vectorstores.faiss import FAISS

# Load environment variables from .env
load_dotenv()
MISTRAL_API_KEY = os.getenv("MISTRAL_API_KEY")
if not MISTRAL_API_KEY:
    # Fail fast with a specific exception type: every embedding call below
    # requires this key, so there is no point continuing without it.
    raise RuntimeError("Missing MISTRAL_API_KEY in .env")

DOCUMENTS_DIR = "rag/policies"        # input: policy PDFs
VECTORSTORE_DIR = "rag/vectorstore"   # output: persisted FAISS index
def load_documents():
    """Load every PDF in DOCUMENTS_DIR into a flat list of Documents.

    Files are visited in sorted order so ingestion (and therefore chunk
    ordering in the index) is deterministic, and the extension check is
    case-insensitive so files like "POLICY.PDF" are not silently skipped.
    """
    docs = []
    for file in sorted(os.listdir(DOCUMENTS_DIR)):
        if file.lower().endswith(".pdf"):
            print(f"Loading document: {file}")  # progress: show which file is being parsed
            loader = PyPDFLoader(os.path.join(DOCUMENTS_DIR, file))
            docs.extend(loader.load())  # one Document per page
    return docs
def build_vectorstore():
    """Chunk the policy PDFs, embed them, and persist a FAISS index.

    Writes the index to VECTORSTORE_DIR (overwriting any previous index).
    Returns None; use verify_vectorstore() to inspect the result.
    """
    docs = load_documents()

    # Overlapping chunks so retrieved passages keep surrounding context.
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=1500,
        chunk_overlap=300,
    )
    chunks = splitter.split_documents(docs)

    # Pass the key explicitly (consistent with verify_vectorstore) rather
    # than relying on the environment variable being picked up implicitly.
    embeddings = MistralAIEmbeddings(model="mistral-embed", api_key=MISTRAL_API_KEY)

    # Create and persist the FAISS vectorstore.
    vectorstore = FAISS.from_documents(chunks, embeddings)
    vectorstore.save_local(VECTORSTORE_DIR)
    print(f"Vectorstore created at: {VECTORSTORE_DIR}")
def verify_vectorstore(vectorstore_dir=VECTORSTORE_DIR, test_query="Interest rate for high-risk customer", k=3):
    """Load the saved FAISS index and run a smoke-test similarity search.

    Prints the top-k matching chunks for test_query and returns them so
    callers can also inspect the hits programmatically.

    Args:
        vectorstore_dir: directory holding the persisted FAISS index.
        test_query: query string to search with.
        k: number of nearest chunks to retrieve (default 3).
    """
    from langchain_community.vectorstores.faiss import FAISS
    from langchain_mistralai import MistralAIEmbeddings

    embeddings = MistralAIEmbeddings(model="mistral-embed", api_key=MISTRAL_API_KEY)
    # allow_dangerous_deserialization: the index is pickle-backed; this is
    # acceptable here because we only load files this script itself wrote.
    vectorstore = FAISS.load_local(vectorstore_dir, embeddings, allow_dangerous_deserialization=True)
    results = vectorstore.similarity_search(test_query, k=k)
    print("\n=== Verification Results ===")
    for i, doc in enumerate(results):
        print(f"\n--- Result {i+1} ---\n{doc.page_content}")
    return results
if __name__ == "__main__":
    build_vectorstore()
    # Smoke-test retrieval with two representative policy questions.
    for query in (
        "Overall risk for credit score 700 with delinquent account",
        "Interest rate for medium-risk customer",
    ):
        verify_vectorstore(test_query=query)