# databricks-rag-assistant / prepare_data.py
# Author: felipelemes
# Initial commit: Core RAG project files and setup (commit 3975d30)
import os
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import SentenceTransformerEmbeddings
from langchain.vectorstores import FAISS
# --- Configuration ---
# NOTE(review): these imports come from the legacy `langchain.*` namespaces;
# on langchain >= 0.2 they live in `langchain_community.*` — confirm the
# pinned version before upgrading.
PDF_PATH = "data/azure-databricks.pdf"  # Path to PDF file
EMBEDDING_MODEL_NAME = "all-MiniLM-L6-v2"  # Embedding model name to be used
VECTOR_DB_PATH = "vector_db"  # Folder where the vector database will be saved

# --- 1. Load the PDF ---
# PyPDFLoader yields one Document per page of the PDF.
print(f"Loading PDF from: {PDF_PATH}...")
try:
    loader = PyPDFLoader(PDF_PATH)
    documents = loader.load()
    print(f"PDF loaded successfully! Total of {len(documents)} pages.")
except Exception as e:
    # Broad catch is intentional at this top-level script boundary:
    # any loader failure (missing file, corrupt PDF) is fatal here.
    print(f"Error loading PDF: {e}")
    print("Please ensure the PDF file exists and the path is correct.")
    # Fix: exit with a NON-ZERO status so shell/CI callers can detect the
    # failure (the original bare `exit()` reported success with status 0).
    raise SystemExit(1)

# --- 2. Split the text into chunks ---
print("Splitting text into chunks...")
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,       # Maximum size of each chunk (in characters)
    chunk_overlap=200,     # Overlap between adjacent chunks (to maintain context)
    length_function=len,   # Function to calculate chunk length
)
chunks = text_splitter.split_documents(documents)
print(f"Text split into {len(chunks)} chunks.")

# Guard: FAISS.from_documents raises an opaque error on an empty list
# (e.g. a scanned/image-only PDF with no extractable text).
if not chunks:
    print("No text chunks were produced; nothing to index.")
    raise SystemExit(1)

# --- 3. Create Embeddings and Store in FAISS ---
print(f"Loading embedding model: {EMBEDDING_MODEL_NAME}...")
embeddings = SentenceTransformerEmbeddings(model_name=EMBEDDING_MODEL_NAME)

print("Generating embeddings and creating the FAISS vector database...")
# Embeds every chunk and builds an in-memory FAISS index over the vectors.
vector_db = FAISS.from_documents(chunks, embeddings)

# --- 4. Save the Vector Database ---
# Persists the index to disk so query-time code can load it with
# FAISS.load_local(VECTOR_DB_PATH, ...) without re-embedding the PDF.
print(f"Saving the vector database to: {VECTOR_DB_PATH}...")
vector_db.save_local(VECTOR_DB_PATH)
print("Vector database created and saved successfully!")