import os

from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import SentenceTransformerEmbeddings
from langchain.vectorstores import FAISS

# --- Configuration ---
PDF_PATH = "data/azure-databricks.pdf"     # Path to the PDF file
EMBEDDING_MODEL_NAME = "all-MiniLM-L6-v2"  # Embedding model to use
VECTOR_DB_PATH = "vector_db"               # Folder where the vector database will be saved

# --- 1. Load the PDF ---
print(f"Loading PDF from: {PDF_PATH}...")
try:
    loader = PyPDFLoader(PDF_PATH)
    documents = loader.load()
    print(f"PDF loaded successfully! Total of {len(documents)} pages.")
except Exception as e:
    print(f"Error loading PDF: {e}")
    print("Please ensure the PDF file exists and the path is correct.")
    exit()  # Stop the script if an error occurs

# --- 2. Split the text into chunks ---
print("Splitting text into chunks...")
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,      # Maximum size of each chunk (in characters)
    chunk_overlap=200,    # Overlap between consecutive chunks (to preserve context)
    length_function=len,  # Function used to measure chunk length
)
chunks = text_splitter.split_documents(documents)
print(f"Text split into {len(chunks)} chunks.")

# --- 3. Create embeddings and store them in FAISS ---
print(f"Loading embedding model: {EMBEDDING_MODEL_NAME}...")
embeddings = SentenceTransformerEmbeddings(model_name=EMBEDDING_MODEL_NAME)

print("Generating embeddings and creating the FAISS vector database...")
# Build the vector database from the chunks and their embeddings
vector_db = FAISS.from_documents(chunks, embeddings)

# --- 4. Save the vector database ---
print(f"Saving the vector database to: {VECTOR_DB_PATH}...")
vector_db.save_local(VECTOR_DB_PATH)
print("Vector database created and saved successfully!")
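
# --- 5. (Optional follow-up, not part of the original script) Reload and query the saved index ---
# A minimal sketch of how the persisted FAISS index could be loaded back and searched.
# Assumptions: the same langchain version as the imports above (newer releases may also
# require allow_dangerous_deserialization=True in FAISS.load_local), and the query string
# below is only an illustrative placeholder.
embeddings = SentenceTransformerEmbeddings(model_name=EMBEDDING_MODEL_NAME)
vector_db = FAISS.load_local(VECTOR_DB_PATH, embeddings)
results = vector_db.similarity_search("What is Azure Databricks?", k=3)  # top-3 most similar chunks
for doc in results:
    print(doc.page_content[:200])  # Print a short preview of each retrieved chunk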