Spaces:
Sleeping
Sleeping
| from langchain.text_splitter import RecursiveCharacterTextSplitter | |
| from langchain_openai import OpenAIEmbeddings | |
| from langchain_community.vectorstores import FAISS | |
| from langchain_experimental.text_splitter import SemanticChunker | |
def get_text_from_content_for_doc(content):
    """Flatten the per-page text of a parsed document into one string.

    Iterates over ``content``; each key indexes an entry whose ``"texte"``
    value is that page's text. Every page is cleaned, then the pages are
    joined with a newline.
    """
    cleaned_pages = []
    for page in content:
        page_text = content[page]["texte"]
        # Drop in-page line breaks so the only newlines left separate pages.
        page_text = page_text.replace("\n", "")
        # NOTE(review): as visible here this swaps a space for a space;
        # it may originally have targeted a double or non-breaking space.
        # Confirm against the upstream file.
        page_text = page_text.replace(" ", " ")
        page_text = page_text.replace("\t", " ")
        cleaned_pages.append(page_text)
    return "\n".join(cleaned_pages)
def get_text_from_content_for_audio(content):
    """Return the value stored under the ``"transcription"`` key of ``content``.

    Raises whatever ``content["transcription"]`` raises when the key is
    missing (``KeyError`` for a plain dict).
    """
    transcription = content["transcription"]
    return transcription
def get_text_chunks(text, *, chunk_size=500, chunk_overlap=100):
    """Split ``text`` into overlapping, character-measured chunks.

    Args:
        text: The string to split.
        chunk_size: Target character length of each chunk (default 500,
            the value that used to be hard-coded).
        chunk_overlap: Character length of the overlap between consecutive
            chunks (default 100, the value that used to be hard-coded).

    Returns:
        The chunks produced by ``RecursiveCharacterTextSplitter.split_text``.
    """
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,  # measure chunk length in characters
    )
    return splitter.split_text(text)
def get_semantic_chunks(text):
    """Split ``text`` with a ``SemanticChunker`` and return the chunk texts.

    The chunker is built on a default ``OpenAIEmbeddings()`` instance with
    the ``"standard_deviation"`` breakpoint type and a threshold amount of
    2.718.
    """
    chunker = SemanticChunker(
        OpenAIEmbeddings(),
        breakpoint_threshold_type="standard_deviation",
        breakpoint_threshold_amount=2.718,
    )
    documents = chunker.create_documents([text])
    # Keep only the text payload of each produced document.
    semantic_chunks = []
    for document in documents:
        semantic_chunks.append(document.page_content)
    return semantic_chunks
def get_vectorstore(text_chunks):
    """Build a FAISS vector store from ``text_chunks``.

    The texts are embedded with ``OpenAIEmbeddings`` configured for the
    ``"text-embedding-3-small"`` model.
    """
    return FAISS.from_texts(
        texts=text_chunks,
        embedding=OpenAIEmbeddings(model="text-embedding-3-small"),
    )
def setup_rag(file_type, content):
    """Extract text from ``content``, chunk it, and index the chunks.

    Args:
        file_type: ``"pdf"`` or ``"audio"``; selects which extractor is
            applied to ``content``.
        content: Parsed payload handed to the extractor for ``file_type``.

    Returns:
        A ``(vectorstore, chunks)`` tuple: the result of ``get_vectorstore``
        and the chunk list from ``get_semantic_chunks`` it was built from.

    Raises:
        ValueError: If ``file_type`` is neither ``"pdf"`` nor ``"audio"``.
    """
    if file_type == "pdf":
        text = get_text_from_content_for_doc(content)
    elif file_type == "audio":
        text = get_text_from_content_for_audio(content)
    else:
        # Without this branch ``text`` would be unbound and the function
        # would fail later with an opaque UnboundLocalError.
        raise ValueError(
            f"Unsupported file_type {file_type!r}; expected 'pdf' or 'audio'"
        )

    chunks = get_semantic_chunks(text)
    vectorstore = get_vectorstore(chunks)
    return vectorstore, chunks