# Author: Ilyas KHIAT
# enhance graph (commit 0222cea)
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_experimental.text_splitter import SemanticChunker
def get_text_from_content_for_doc(content):
    """Concatenate the text of every page of a parsed document.

    Parameters
    ----------
    content : dict
        Mapping of page identifiers to per-page data; each page entry
        must carry a "texte" key holding that page's raw text.

    Returns
    -------
    str
        The pages joined by newlines, with hard line breaks inside a
        page removed and tabs collapsed to single spaces.
    """
    # NOTE(review): the original chained a no-op .replace(" ", " ")
    # (space -> space) and kept a commented-out loop; both removed.
    # If the intent was to collapse doubled spaces, that would be
    # .replace("  ", " ") — confirm before adding it.
    return "\n".join(
        content[page]["texte"].replace("\n", "").replace("\t", " ")
        for page in content
    )
def get_text_from_content_for_audio(content):
    """Return the transcription text of an audio payload.

    Raises
    ------
    KeyError
        If *content* has no "transcription" key.
    """
    transcription = content["transcription"]
    return transcription
def get_text_chunks(text):
    """Split *text* into overlapping fixed-size character chunks.

    Uses a recursive character splitter configured for 500-character
    chunks with a 100-character overlap between consecutive chunks;
    chunk length is measured with the built-in len().
    """
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=500,
        chunk_overlap=100,
        length_function=len,
    )
    return splitter.split_text(text)
def get_semantic_chunks(text):
    """Split *text* on semantic boundaries using embedding similarity.

    A breakpoint is inserted wherever the embedding distance between
    adjacent sentences exceeds 2.718 standard deviations. Returns the
    chunk texts as a list of strings.
    """
    chunker = SemanticChunker(
        OpenAIEmbeddings(),
        breakpoint_threshold_type="standard_deviation",
        breakpoint_threshold_amount=2.718,
    )
    documents = chunker.create_documents([text])
    return [doc.page_content for doc in documents]
def get_vectorstore(text_chunks):
    """Build an in-memory FAISS index over *text_chunks*.

    Embeds every chunk with OpenAI's "text-embedding-3-small" model
    and returns the resulting FAISS vector store.
    """
    return FAISS.from_texts(
        texts=text_chunks,
        embedding=OpenAIEmbeddings(model="text-embedding-3-small"),
    )
def setup_rag(file_type, content):
    """Build the RAG vector store for a parsed document or transcript.

    Parameters
    ----------
    file_type : str
        Either "pdf" (page dict handled by
        get_text_from_content_for_doc) or "audio" (transcription dict
        handled by get_text_from_content_for_audio).
    content : dict
        The parsed payload matching *file_type*.

    Returns
    -------
    tuple
        (vectorstore, chunks): the FAISS store and the list of
        semantic chunk strings it was built from.

    Raises
    ------
    ValueError
        If *file_type* is not one of the supported values. (The
        original left `text` unbound here and crashed later with an
        obscure NameError.)
    """
    if file_type == "pdf":
        text = get_text_from_content_for_doc(content)
    elif file_type == "audio":
        text = get_text_from_content_for_audio(content)
    else:
        # Fail fast with a clear message instead of a NameError below.
        raise ValueError(f"unsupported file_type: {file_type!r}")

    chunks = get_semantic_chunks(text)
    vectorstore = get_vectorstore(chunks)
    return vectorstore, chunks