import os
from langchain_community.document_loaders import PyPDFLoader, TextLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain_huggingface import HuggingFaceEmbeddings
from dotenv import load_dotenv

# Load environment variables from a .env file (python-dotenv). Nothing in the
# visible code reads them directly; presumably they configure the imported
# libraries -- TODO confirm.
load_dotenv()

# Directory scanned for input documents. This is a relative path, so it is
# resolved against the current working directory at run time.
# NOTE(review): unlike DB_FAISS_PATH below, this is not anchored to the
# script's location -- confirm that is intended.
DATA_PATH = "data"
# Absolute path of the directory that contains this file.
SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
# Where the FAISS index is saved: always under this script's directory,
# regardless of the current working directory.
DB_FAISS_PATH = os.path.join(SCRIPT_DIR, "vectorstore", "db_faiss")

def create_vector_db(*, data_path=None, db_path=None) -> None:
    """Build a FAISS vector store from the PDF and text files in a directory.

    Every ``.pdf`` and ``.txt`` file located directly inside the data
    directory (sub-directories are not descended into) is loaded, split into
    500-character chunks with a 50-character overlap, embedded on the CPU
    with ``sentence-transformers/all-MiniLM-L6-v2`` and saved as a FAISS
    index. Progress is reported with ``print``.

    Args:
        data_path: Directory to ingest (a path string). Defaults to the
            module-level ``DATA_PATH``.
        db_path: Destination handed to ``FAISS.save_local``. Defaults to the
            module-level ``DB_FAISS_PATH``.

    Returns:
        None. The function returns early -- before any embedding model is
        created and without writing anything -- when the data directory is
        missing or is not a directory, or when it holds no supported files.
    """
    if data_path is None:
        data_path = DATA_PATH
    if db_path is None:
        db_path = DB_FAISS_PATH

    # isdir() rather than exists(): a regular file at this path would pass an
    # existence check and then make os.listdir() raise NotADirectoryError.
    if not os.path.isdir(data_path):
        print(f"Directory {data_path} not found.")
        return

    documents = []
    # os.listdir() returns entries in arbitrary order; sorting makes the
    # chunk order in the resulting index reproducible between runs.
    for filename in sorted(os.listdir(data_path)):
        file_path = os.path.join(data_path, filename)
        # Skip directories that merely happen to be named like a document.
        if not os.path.isfile(file_path):
            continue

        # Compare case-insensitively so "REPORT.PDF" is not silently ignored.
        lowered = filename.lower()
        if lowered.endswith(".pdf"):
            loader = PyPDFLoader(file_path)
        elif lowered.endswith(".txt"):
            loader = TextLoader(file_path, encoding='utf-8')
        else:
            continue

        documents.extend(loader.load())
        print(f"Loaded {filename}")

    if not documents:
        print("No documents found to ingest.")
        return

    # Split text
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
    texts = text_splitter.split_documents(documents)
    print(f"Split documents into {len(texts)} chunks.")

    # Create embeddings (using HuggingFace - FREE!)
    print("Generating embeddings locally with sentence-transformers...")
    embeddings = HuggingFaceEmbeddings(
        model_name='sentence-transformers/all-MiniLM-L6-v2',
        model_kwargs={'device': 'cpu'}
    )

    # Create vector store
    db = FAISS.from_documents(texts, embeddings)
    db.save_local(db_path)
    print(f"Vector store saved to {db_path}")

# Run the ingestion only when this file is executed as a script; importing
# the module does not trigger it.
if __name__ == "__main__":
    create_vector_db()