import os
from langchain_community.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS

# Input and output locations (relative paths, resolved against the working directory).
# Directory scanned for source PDFs.
DATA_PATH = "data/"
# Location the FAISS index is saved to and loaded from.
FAISS_PATH = "vectorstore/db_faiss"

# Step 1: Load raw PDFs
def load_pdf_files(data):
    """Load the PDF files found in a directory.

    Args:
        data: Directory path handed to ``DirectoryLoader``. Files are
            selected with the ``*.pdf`` glob and read with ``PyPDFLoader``.

    Returns:
        Whatever ``DirectoryLoader.load()`` returns for the matched files.
    """
    pdf_loader = DirectoryLoader(data, glob="*.pdf", loader_cls=PyPDFLoader)
    return pdf_loader.load()

# Executed at module level, before the check for an existing index further down.
documents = load_pdf_files(data=DATA_PATH)

# Step 2: Create Chunks
def create_chunks(extracted_data):
    """Split loaded documents into overlapping text chunks.

    Args:
        extracted_data: Documents to split, passed straight to
            ``RecursiveCharacterTextSplitter.split_documents``.

    Returns:
        The result of ``split_documents``, produced with a chunk size of 500
        and a chunk overlap of 50.
    """
    splitter_settings = {"chunk_size": 500, "chunk_overlap": 50}
    splitter = RecursiveCharacterTextSplitter(**splitter_settings)
    chunks = splitter.split_documents(extracted_data)
    return chunks

# Chunk the loaded documents; the result feeds the FAISS index build.
text_chunks = create_chunks(extracted_data=documents)

# Step 3: Embeddings
def get_embedding_model():
    """Build the HuggingFace embedding model used for the FAISS index.

    Returns:
        A ``HuggingFaceEmbeddings`` instance configured with the
        ``sentence-transformers/all-MiniLM-L6-v2`` model name.
    """
    model_name = "sentence-transformers/all-MiniLM-L6-v2"
    embeddings = HuggingFaceEmbeddings(model_name=model_name)
    return embeddings

# Instantiated at module level; used by both the build and the load branch below.
embedding_model = get_embedding_model()

# Step 4: Store or Load FAISS
# An existing directory is not proof of a usable index: an interrupted or
# failed earlier run can leave FAISS_PATH behind without the index files, and
# load_local would then fail on every later run instead of rebuilding. Check
# for the files save_local writes under its default index name ("index").
_INDEX_FILES = ("index.faiss", "index.pkl")
if all(os.path.isfile(os.path.join(FAISS_PATH, name)) for name in _INDEX_FILES):
    # SECURITY: load_local unpickles index.pkl, and unpickling can execute
    # arbitrary code. Keep allow_dangerous_deserialization=True only while
    # FAISS_PATH holds files written by this script, never an index obtained
    # from an untrusted source.
    db = FAISS.load_local(FAISS_PATH, embedding_model, allow_dangerous_deserialization=True)
else:
    # No chunks means nothing was loaded from DATA_PATH; fail with an explicit
    # message rather than handing an empty list to from_documents.
    if not text_chunks:
        raise ValueError(
            f"No text chunks were produced from the PDFs in {DATA_PATH!r}; "
            "cannot build a FAISS index."
        )
    db = FAISS.from_documents(text_chunks, embedding_model)
    db.save_local(FAISS_PATH)