Spaces:
Sleeping
Sleeping
| # vectorization functions | |
| #%% | |
| # General | |
| from langchain_text_splitters import RecursiveCharacterTextSplitter | |
| from langchain_community.vectorstores import FAISS | |
| import helper_functions.load_embed_model | |
def split_documents(documents, chunk_size=900, chunk_overlap=90):
    """
    Break the given documents into overlapping chunks.

    The work is delegated to a RecursiveCharacterTextSplitter configured
    with a custom separator list.

    Args:
        documents (list): Documents to split; forwarded unchanged to the
            splitter's ``split_documents`` method.
        chunk_size (int): Forwarded to the splitter as ``chunk_size``.
        chunk_overlap (int): Forwarded to the splitter as ``chunk_overlap``.

    Returns:
        list: The chunks produced by the splitter.
    """
    # TODO: verify that the default chunk size/overlap and this separator
    # list suit our documents.
    boundary_markers = ["###", "Step ", "\n\n", "\n", ".", " ", ""]
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        separators=boundary_markers,
    )
    return splitter.split_documents(documents=documents)
def create_embedding_vector_db(chunks):
    """
    Embed the chunks and store them in a FAISS vector database.

    The embedding model is taken from
    ``helper_functions.load_embed_model.embedding_model``. The original
    notes describe it as the open-source HuggingFaceEmbeddings model;
    that cannot be confirmed from this file alone.

    Args:
        chunks (list): Document chunks to embed and index.

    Returns:
        The FAISS vector database built from ``chunks``, usable for
        similarity search.
    """
    # Look up the shared embedding model before building the index.
    embedder = helper_functions.load_embed_model.embedding_model
    # from_documents is used so that document metadata is carried along
    # with the embeddings (per the original author's note).
    # TODO: optimize index construction.
    return FAISS.from_documents(documents=chunks, embedding=embedder)
# Query the vector database and assemble the retrieved context
def query_vector_db(query, vector_db, k):
    """
    Look up the chunks most similar to the query and merge them into one
    context string.

    Args:
        query (str): The user query.
        vector_db: The vector database to search; it must provide a
            ``similarity_search(query, k)`` method.
        k: Number of neighbouring chunks to request from the search.

    Returns:
        str: The ``page_content`` of each retrieved document, joined with
        newline characters in the order the search returned them.
    """
    # The k nearest neighbours are the chunks used as context.
    matches = vector_db.similarity_search(query, k)
    passages = (match.page_content for match in matches)
    return "\n".join(passages)