# RAG indexing pipeline: load markdown documents, split them into chunks,
# embed them, and persist a FAISS vector store for later retrieval.
# NOTE(review): an earlier, fully commented-out draft of this module stood
# here. It duplicated the live implementation below except that it lacked
# the loader encoding handling, so it has been removed as dead code.
import os

from langchain_community.document_loaders import DirectoryLoader, TextLoader
from langchain_community.vectorstores import FAISS
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Paths are resolved relative to the process working directory
# (the script is expected to be run from its own directory).
DATA_PATH = "../data/agencijaA"            # source markdown documents
DB_FAISS_PATH = "../vectorstore/db_faiss"  # where the FAISS index is persisted
def create_vector_db():
    """Build a FAISS vector store from the markdown files in DATA_PATH.

    Loads every ``*.md`` file, splits the text into overlapping chunks,
    embeds the chunks with a multilingual sentence-transformer model and
    saves the resulting FAISS index to DB_FAISS_PATH.

    Returns:
        None. Progress is reported via ``print``; the function returns
        early (without creating an index) when no documents are found.
    """
    print(f"Attempting to load documents from: {DATA_PATH}")

    # autodetect_encoding lets TextLoader fall back to charset detection
    # when a file is not valid UTF-8 (the documents contain Serbian text,
    # which has been a source of decode errors with the default loader).
    loader = DirectoryLoader(
        DATA_PATH,
        glob='*.md',
        loader_cls=TextLoader,
        loader_kwargs={'autodetect_encoding': True},
    )
    documents = loader.load()
    if not documents:
        print("No documents found in the data directory. Please add your markdown files.")
        print(f"Checked path: {os.path.abspath(DATA_PATH)}")
        return

    # ~500-character chunks with 50 characters of overlap so context is
    # not lost at chunk boundaries.
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
    texts = text_splitter.split_documents(documents)
    print(f"Split into {len(texts)} chunks.")

    # Multilingual embedding model — handles Serbian as well as English.
    embeddings = HuggingFaceEmbeddings(
        model_name='sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2',
        model_kwargs={'device': 'cpu'},
    )

    # Embed the chunks and persist the FAISS index locally.
    db = FAISS.from_documents(texts, embeddings)
    db.save_local(DB_FAISS_PATH)
    print("Vector store created successfully and saved locally.")
    print(f"Vector store saved to: {os.path.abspath(DB_FAISS_PATH)}")
# Script entry point: build (or rebuild) the vector store when run directly.
if __name__ == '__main__':
    create_vector_db()