# Izazo / src/rag.py
# Author: NovakTJ — commit ca5d3ff ("clean commit vasinih promena")
# This file will handle the Retrieval-Augmented Generation (RAG) pipeline.
# It will be responsible for loading the markdown documents from the data directory,
# splitting them into chunks, creating embeddings, and storing them in a FAISS vector store.
#
#import os
#from langchain_community.document_loaders import DirectoryLoader, TextLoader
#from langchain_text_splitters import RecursiveCharacterTextSplitter
#from langchain_community.vectorstores import FAISS
#from langchain_huggingface import HuggingFaceEmbeddings
#
## Define the path for the data directory and the vector store
#DATA_PATH = "../data/agencijaA"
#DB_FAISS_PATH = "../vectorstore/db_faiss"
#
#def create_vector_db():
# """
# Creates a FAISS vector store from the markdown documents in the data directory.
# """
# # Load the documents
# # Using TextLoader for .md files
# loader = DirectoryLoader(DATA_PATH, glob='*.md', loader_cls=TextLoader)
# documents = loader.load()
# if not documents:
# print("No documents found in the data directory. Please add your markdown files.")
# return
#
# # Split the documents into chunks
# text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
# texts = text_splitter.split_documents(documents)
# print(f"Split into {len(texts)} chunks.")
#
# # Load the embedding model from Hugging Face
# # 'paraphrase-multilingual-MiniLM-L12-v2' is a good model for multilingual text, including Serbian.
# embeddings = HuggingFaceEmbeddings(model_name='sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2',
# model_kwargs={'device': 'cpu'})
#
# # Create the FAISS vector store from the text chunks and embeddings
# db = FAISS.from_documents(texts, embeddings)
#
# # Save the vector store locally
# db.save_local(DB_FAISS_PATH)
# print("Vector store created successfully and saved locally.")
#
#if __name__ == '__main__':
# create_vector_db()
import os
from langchain_community.document_loaders import DirectoryLoader, TextLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain_huggingface import HuggingFaceEmbeddings
# Define the path for the data directory and the vector store.
# NOTE(review): both paths are relative, so they resolve against the current
# working directory at run time — presumably the script is run from src/; verify.
DATA_PATH: str = "../data/agencijaA"  # source markdown documents
DB_FAISS_PATH: str = "../vectorstore/db_faiss"  # where the FAISS index is persisted
def create_vector_db():
    """
    Create a FAISS vector store from the markdown documents in DATA_PATH.

    Loads every ``*.md`` file from the data directory, splits the text into
    overlapping chunks, embeds the chunks with a multilingual
    sentence-transformer model, and saves the resulting FAISS index to
    DB_FAISS_PATH.

    Returns:
        None. Progress is reported via ``print``; the index is written to disk.
        Returns early (without creating an index) when no documents are found.
    """
    print(f"Attempting to load documents from: {DATA_PATH}")

    # TextLoader is used for the .md files. 'autodetect_encoding' lets the
    # loader fall back to encoding detection when a file is not plain UTF-8
    # (the corpus contains Serbian text with non-ASCII characters).
    loader = DirectoryLoader(
        DATA_PATH,
        glob='*.md',
        loader_cls=TextLoader,
        loader_kwargs={'autodetect_encoding': True},
    )
    documents = loader.load()
    if not documents:
        print("No documents found in the data directory. Please add your markdown files.")
        print(f"Checked path: {os.path.abspath(DATA_PATH)}")
        return

    # Split the documents into overlapping chunks so retrieval can return
    # focused passages instead of whole files.
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
    texts = text_splitter.split_documents(documents)
    print(f"Split into {len(texts)} chunks.")

    # 'paraphrase-multilingual-MiniLM-L12-v2' supports multilingual text,
    # including Serbian; CPU is sufficient for one-off index building.
    embeddings = HuggingFaceEmbeddings(
        model_name='sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2',
        model_kwargs={'device': 'cpu'},
    )

    # Build the FAISS index from the chunks and persist it locally.
    db = FAISS.from_documents(texts, embeddings)
    db.save_local(DB_FAISS_PATH)
    print("Vector store created successfully and saved locally.")
    print(f"Vector store saved to: {os.path.abspath(DB_FAISS_PATH)}")
# Build the vector store only when the module is executed as a script,
# not when it is imported by the rest of the application.
if __name__ == '__main__':
    create_vector_db()