"""Build a FAISS vector store for a Retrieval-Augmented Generation (RAG) pipeline.

Loads the markdown documents from the data directory, splits them into
overlapping chunks, embeds the chunks with a multilingual
sentence-transformer model, and persists the resulting FAISS index.
"""

import os

from langchain_community.document_loaders import DirectoryLoader, TextLoader
from langchain_community.vectorstores import FAISS
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Paths are relative to the directory the script is run from.
DATA_PATH = "../data/agencijaA"
DB_FAISS_PATH = "../vectorstore/db_faiss"


def create_vector_db():
    """Create and persist a FAISS vector store from the markdown documents.

    Reads every ``*.md`` file in ``DATA_PATH``, splits the text into
    500-character chunks with 50-character overlap, embeds the chunks with a
    multilingual MiniLM model (chosen because it handles Serbian text well),
    and saves the FAISS index to ``DB_FAISS_PATH``. Prints progress messages
    and returns early — without creating a store — when no documents are
    found.
    """
    print(f"Attempting to load documents from: {DATA_PATH}")

    # autodetect_encoding lets TextLoader fall back to charset detection
    # for files that are not valid UTF-8 (e.g. legacy encodings of
    # Serbian diacritics); plain 'encoding="utf-8"' was not always enough.
    loader = DirectoryLoader(
        DATA_PATH,
        glob='*.md',
        loader_cls=TextLoader,
        loader_kwargs={'autodetect_encoding': True},
    )
    documents = loader.load()
    if not documents:
        print("No documents found in the data directory. Please add your markdown files.")
        print(f"Checked path: {os.path.abspath(DATA_PATH)}")
        return

    # Small chunks with modest overlap keep retrieval granular while
    # preserving context across chunk boundaries.
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
    texts = text_splitter.split_documents(documents)
    print(f"Split into {len(texts)} chunks.")

    # 'paraphrase-multilingual-MiniLM-L12-v2' is a good model for
    # multilingual text, including Serbian; CPU inference is sufficient
    # for one-off indexing.
    embeddings = HuggingFaceEmbeddings(
        model_name='sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2',
        model_kwargs={'device': 'cpu'},
    )

    # Build the index from the chunks and save it locally.
    db = FAISS.from_documents(texts, embeddings)
    db.save_local(DB_FAISS_PATH)
    print("Vector store created successfully and saved locally.")
    print(f"Vector store saved to: {os.path.abspath(DB_FAISS_PATH)}")


if __name__ == '__main__':
    create_vector_db()