"""Build a FAISS vector store for a Retrieval-Augmented Generation (RAG) pipeline.

Loads the markdown documents from the data directory, splits them into
overlapping chunks, embeds the chunks with a multilingual
sentence-transformer model, and persists the resulting FAISS index.
"""

import os

from langchain_community.document_loaders import DirectoryLoader, TextLoader
from langchain_community.vectorstores import FAISS
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Paths are relative to the directory the script is run from.
DATA_PATH = "../data/agencijaA"
DB_FAISS_PATH = "../vectorstore/db_faiss"


def create_vector_db():
    """Create and persist a FAISS vector store from the markdown documents.

    Reads every ``*.md`` file in ``DATA_PATH``, splits the text into
    500-character chunks with 50-character overlap, embeds the chunks with a
    multilingual MiniLM model (chosen because it handles Serbian text well),
    and saves the FAISS index to ``DB_FAISS_PATH``. Prints progress messages
    and returns early — without creating a store — when no documents are
    found.
    """
    print(f"Attempting to load documents from: {DATA_PATH}")

    # autodetect_encoding lets TextLoader fall back to charset detection
    # for files that are not valid UTF-8 (e.g. legacy encodings of
    # Serbian diacritics); plain 'encoding="utf-8"' was not always enough.
    loader = DirectoryLoader(
        DATA_PATH,
        glob='*.md',
        loader_cls=TextLoader,
        loader_kwargs={'autodetect_encoding': True},
    )
    documents = loader.load()
    if not documents:
        print("No documents found in the data directory. Please add your markdown files.")
        print(f"Checked path: {os.path.abspath(DATA_PATH)}")
        return

    # Small chunks with modest overlap keep retrieval granular while
    # preserving context across chunk boundaries.
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
    texts = text_splitter.split_documents(documents)
    print(f"Split into {len(texts)} chunks.")

    # 'paraphrase-multilingual-MiniLM-L12-v2' is a good model for
    # multilingual text, including Serbian; CPU inference is sufficient
    # for one-off indexing.
    embeddings = HuggingFaceEmbeddings(
        model_name='sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2',
        model_kwargs={'device': 'cpu'},
    )

    # Build the index from the chunks and save it locally.
    db = FAISS.from_documents(texts, embeddings)
    db.save_local(DB_FAISS_PATH)
    print("Vector store created successfully and saved locally.")
    print(f"Vector store saved to: {os.path.abspath(DB_FAISS_PATH)}")


if __name__ == '__main__':
    create_vector_db()