# Izazo / src/rag.py
# Author: NovakTJ — commit ca5d3ff ("clean commit vasinih promena")
# This file will handle the Retrieval-Augmented Generation (RAG) pipeline.
# It will be responsible for loading the markdown documents from the data directory,
# splitting them into chunks, creating embeddings, and storing them in a FAISS vector store.
#
#import os
#from langchain_community.document_loaders import DirectoryLoader, TextLoader
#from langchain_text_splitters import RecursiveCharacterTextSplitter
#from langchain_community.vectorstores import FAISS
#from langchain_huggingface import HuggingFaceEmbeddings
#
## Define the path for the data directory and the vector store
#DATA_PATH = "../data/agencijaA"
#DB_FAISS_PATH = "../vectorstore/db_faiss"
#
#def create_vector_db():
# """
# Creates a FAISS vector store from the markdown documents in the data directory.
# """
# # Load the documents
# # Using TextLoader for .md files
# loader = DirectoryLoader(DATA_PATH, glob='*.md', loader_cls=TextLoader)
# documents = loader.load()
# if not documents:
# print("No documents found in the data directory. Please add your markdown files.")
# return
#
# # Split the documents into chunks
# text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
# texts = text_splitter.split_documents(documents)
# print(f"Split into {len(texts)} chunks.")
#
# # Load the embedding model from Hugging Face
# # 'paraphrase-multilingual-MiniLM-L12-v2' is a good model for multilingual text, including Serbian.
# embeddings = HuggingFaceEmbeddings(model_name='sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2',
# model_kwargs={'device': 'cpu'})
#
# # Create the FAISS vector store from the text chunks and embeddings
# db = FAISS.from_documents(texts, embeddings)
#
# # Save the vector store locally
# db.save_local(DB_FAISS_PATH)
# print("Vector store created successfully and saved locally.")
#
#if __name__ == '__main__':
# create_vector_db()
import os
from langchain_community.document_loaders import DirectoryLoader, TextLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain_huggingface import HuggingFaceEmbeddings
# Define the path for the data directory and the vector store.
# NOTE(review): both paths are relative, so they resolve against the current
# working directory at run time — presumably the script is run from src/; verify.
DATA_PATH: str = "../data/agencijaA"  # source markdown documents
DB_FAISS_PATH: str = "../vectorstore/db_faiss"  # where the FAISS index is persisted
def create_vector_db():
    """
    Create a FAISS vector store from the markdown documents in DATA_PATH.

    Loads every ``*.md`` file from the data directory, splits the text into
    overlapping chunks, embeds the chunks with a multilingual
    sentence-transformer model, and saves the resulting FAISS index to
    DB_FAISS_PATH.

    Returns:
        None. Progress is reported via ``print``; the index is written to disk.
        Returns early (without creating an index) when no documents are found.
    """
    print(f"Attempting to load documents from: {DATA_PATH}")

    # TextLoader is used for the .md files. 'autodetect_encoding' lets the
    # loader fall back to encoding detection when a file is not plain UTF-8
    # (the corpus contains Serbian text with non-ASCII characters).
    loader = DirectoryLoader(
        DATA_PATH,
        glob='*.md',
        loader_cls=TextLoader,
        loader_kwargs={'autodetect_encoding': True},
    )
    documents = loader.load()
    if not documents:
        print("No documents found in the data directory. Please add your markdown files.")
        print(f"Checked path: {os.path.abspath(DATA_PATH)}")
        return

    # Split the documents into overlapping chunks so retrieval can return
    # focused passages instead of whole files.
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
    texts = text_splitter.split_documents(documents)
    print(f"Split into {len(texts)} chunks.")

    # 'paraphrase-multilingual-MiniLM-L12-v2' supports multilingual text,
    # including Serbian; CPU is sufficient for one-off index building.
    embeddings = HuggingFaceEmbeddings(
        model_name='sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2',
        model_kwargs={'device': 'cpu'},
    )

    # Build the FAISS index from the chunks and persist it locally.
    db = FAISS.from_documents(texts, embeddings)
    db.save_local(DB_FAISS_PATH)
    print("Vector store created successfully and saved locally.")
    print(f"Vector store saved to: {os.path.abspath(DB_FAISS_PATH)}")
# Build the vector store only when the module is executed as a script,
# not when it is imported by the rest of the application.
if __name__ == '__main__':
    create_vector_db()