File size: 2,288 Bytes
c493259
 
 
 
 
6848c8b
c493259
6848c8b
c493259
ae3c993
 
 
 
 
 
 
 
 
c493259
 
 
6848c8b
 
c493259
 
 
 
ae3c993
c493259
ae3c993
c493259
 
ae3c993
 
 
 
 
 
c493259
 
6848c8b
 
c493259
6848c8b
c493259
6848c8b
c493259
ae3c993
6848c8b
ae3c993
 
 
 
 
 
 
 
 
 
 
c493259
6848c8b
c493259
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
# vectorization functions
#%%
# General
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
import helper_functions.load_embed_model

def split_documents(documents, chunk_size=900, chunk_overlap=90): # check chunk size and overlap for our purpose
    """
    Split documents into overlapping text chunks.

    Args:
        documents (list): Documents to split.
        chunk_size (int): Maximum number of characters per chunk.
        chunk_overlap (int): Number of characters shared between consecutive chunks.

    Returns:
        list: The resulting document chunks.
    """
    # Separator priority list tailored to step-style documents; revisit for other corpora.
    separators = ["###", "Step ", "\n\n", "\n", ".", " ", ""]
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        separators=separators,
    )
    return splitter.split_documents(documents=documents)

def create_embedding_vector_db(chunks):
    """
    Embed text chunks and store them in a FAISS vector database.

    Uses the open-source embedding model loaded by
    helper_functions.load_embed_model to create embeddings, then stores
    them in FAISS, which supports efficient similarity search.

    Args:
        chunks (list): Text chunks to embed.

    Returns:
        vector_db: The FAISS vector database containing the embedded chunks.
    """
    # Load the shared embedding model once per call.
    embedder = helper_functions.load_embed_model.embedding_model
    # from_documents stores the embeddings and keeps per-chunk metadata alongside them.
    return FAISS.from_documents(documents=chunks, embedding=embedder)

# Function to query the vector database and retrieve relevant context
def query_vector_db(query, vector_db, k=4):
    """
    Query the vector database with the user query and retrieve relevant documents.

    Args:
        query (str): The user query.
        vector_db: The vector database to search (e.g. a FAISS store).
        k (int): Number of nearest-neighbour chunks to retrieve.
            Defaults to 4, matching FAISS's similarity_search default.

    Returns:
        str: The page contents of the top-k matching chunks, joined by newlines.
    """
    # similarity_search (a FAISS method) returns the k chunks closest
    # to the query embedding; pass k by keyword for clarity.
    docs = vector_db.similarity_search(query, k=k)
    context = "\n".join(doc.page_content for doc in docs)
    return context