# Author: Ilyas KHIAT
# enhance graph (commit 0222cea)
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_experimental.text_splitter import SemanticChunker
def get_text_from_content_for_doc(content):
    """Concatenate the text of every page of a parsed document.

    Parameters
    ----------
    content : dict
        Mapping of page identifiers to per-page data; each page entry
        must carry a "texte" key holding that page's raw text.

    Returns
    -------
    str
        The pages joined by newlines, with hard line breaks inside a
        page removed and tabs collapsed to single spaces.
    """
    # NOTE(review): the original chained a no-op .replace(" ", " ")
    # (space -> space) and kept a commented-out loop; both removed.
    # If the intent was to collapse doubled spaces, that would be
    # .replace("  ", " ") — confirm before adding it.
    return "\n".join(
        content[page]["texte"].replace("\n", "").replace("\t", " ")
        for page in content
    )
def get_text_from_content_for_audio(content):
    """Return the transcription text of an audio payload.

    Raises
    ------
    KeyError
        If *content* has no "transcription" key.
    """
    transcription = content["transcription"]
    return transcription
def get_text_chunks(text):
    """Split *text* into overlapping fixed-size character chunks.

    Uses a recursive character splitter configured for 500-character
    chunks with a 100-character overlap between consecutive chunks;
    chunk length is measured with the built-in len().
    """
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=500,
        chunk_overlap=100,
        length_function=len,
    )
    return splitter.split_text(text)
def get_semantic_chunks(text):
    """Split *text* on semantic boundaries using embedding similarity.

    A breakpoint is inserted wherever the embedding distance between
    adjacent sentences exceeds 2.718 standard deviations. Returns the
    chunk texts as a list of strings.
    """
    chunker = SemanticChunker(
        OpenAIEmbeddings(),
        breakpoint_threshold_type="standard_deviation",
        breakpoint_threshold_amount=2.718,
    )
    documents = chunker.create_documents([text])
    return [doc.page_content for doc in documents]
def get_vectorstore(text_chunks):
    """Build an in-memory FAISS index over *text_chunks*.

    Embeds every chunk with OpenAI's "text-embedding-3-small" model
    and returns the resulting FAISS vector store.
    """
    return FAISS.from_texts(
        texts=text_chunks,
        embedding=OpenAIEmbeddings(model="text-embedding-3-small"),
    )
def setup_rag(file_type, content):
    """Build the RAG vector store for a parsed document or transcript.

    Parameters
    ----------
    file_type : str
        Either "pdf" (page dict handled by
        get_text_from_content_for_doc) or "audio" (transcription dict
        handled by get_text_from_content_for_audio).
    content : dict
        The parsed payload matching *file_type*.

    Returns
    -------
    tuple
        (vectorstore, chunks): the FAISS store and the list of
        semantic chunk strings it was built from.

    Raises
    ------
    ValueError
        If *file_type* is not one of the supported values. (The
        original left `text` unbound here and crashed later with an
        obscure NameError.)
    """
    if file_type == "pdf":
        text = get_text_from_content_for_doc(content)
    elif file_type == "audio":
        text = get_text_from_content_for_audio(content)
    else:
        # Fail fast with a clear message instead of a NameError below.
        raise ValueError(f"unsupported file_type: {file_type!r}")

    chunks = get_semantic_chunks(text)
    vectorstore = get_vectorstore(chunks)
    return vectorstore, chunks