iprepbot

Sleeping

File size: 4,250 Bytes

5efc535

from dotenv import load_dotenv, find_dotenv
import pandas as pd
import os
import chromadb
from chromadb.utils import embedding_functions


def _to_snake_case(column_name: str) -> str:
    """Turn a dataset column header into an attribute-safe, lower-case name."""
    return column_name.replace(" ", "_").lower().replace("/", "_or_")


def generate_qa_vector_db(vdb_path: str, df: pd.DataFrame) -> None:
    """Build the question and answer collections in a persistent ChromaDB instance.

    Two collections are created, both using cosine distance and the
    ``sentence-transformers/all-MiniLM-L6-v2`` HuggingFace embedding function:

    1. ``question_collection`` - the de-duplicated questions, with
       ``position`` and ``interview_phase`` as metadata.
    2. ``answer_collection`` - every answer, with ``answer_quality`` as
       metadata.

    Ids are ``q_id<index>`` / ``a_id<index>`` where ``<index>`` is the row's
    index label in ``df``. The question frame is not re-indexed after
    de-duplication, so a question keeps the index label of the first row it
    appeared in.

    Args:
        vdb_path (str): Relative path of the location of the ChromaDB instance.
        df (pd.DataFrame): Question/answer dataset. Must contain the columns
            'Position/Role', 'Question', 'Interview Phase', 'Answer' and
            'Answer Quality'.

    Raises:
        KeyError: If a required column is missing from ``df`` or the
            ``HUGGINGFACEHUB_API_TOKEN`` environment variable is not set.
    """
    # Pick up the HuggingFace token from a .env file when it is not already
    # exported; without this the os.environ lookup below fails for .env users.
    load_dotenv(find_dotenv())

    # Shape the data before touching the database so that a malformed
    # dataframe fails fast instead of leaving a half-populated instance behind.
    df_questions = (
        df[["Position/Role", "Question", "Interview Phase"]]
        .drop_duplicates()
        .rename(columns=_to_snake_case)
    )
    # Materialise the rows once instead of re-iterating the frame per list.
    q_rows = list(df_questions.itertuples())
    q_documents = [row.question for row in q_rows]
    q_metadata = [
        {"position": row.position_or_role, "interview_phase": row.interview_phase}
        for row in q_rows
    ]
    q_ids = [f"q_id{row.Index}" for row in q_rows]

    df_answers = df[["Answer", "Answer Quality"]].rename(columns=_to_snake_case)
    a_rows = list(df_answers.itertuples())
    a_documents = [row.answer for row in a_rows]
    a_metadata = [{"answer_quality": row.answer_quality} for row in a_rows]
    a_ids = [f"a_id{row.Index}" for row in a_rows]

    chroma_client = chromadb.PersistentClient(path=vdb_path)

    huggingface_ef = embedding_functions.HuggingFaceEmbeddingFunction(
        api_key=os.environ["HUGGINGFACEHUB_API_TOKEN"],
        model_name="sentence-transformers/all-MiniLM-L6-v2",
    )

    # NOTE(review): create_collection presumably fails when the collection
    # already exists - delete it first when regenerating; confirm against the
    # installed chromadb version.
    print("q_collection will be added")
    q_collection = chroma_client.create_collection(
        name="question_collection",
        metadata={"hnsw:space": "cosine"},
        embedding_function=huggingface_ef,
    )
    q_collection.add(documents=q_documents, metadatas=q_metadata, ids=q_ids)
    print("q_collection added")

    print("a_collection will be added")
    a_collection = chroma_client.create_collection(
        name="answer_collection",
        metadata={"hnsw:space": "cosine"},
        embedding_function=huggingface_ef,
    )
    a_collection.add(documents=a_documents, ids=a_ids, metadatas=a_metadata)
    print("a_collection added")


def delete_collection_from_vector_db(vdb_path: str, collection_name: str) -> None:
    """Remove one named collection from a persistent ChromaDB instance.

    Args:
        vdb_path (str): Location of the persistent ChromaDB instance on disk.
        collection_name (str): Name of the collection that should be removed.
    """
    # Open the on-disk instance and drop the collection in a single step.
    chromadb.PersistentClient(path=vdb_path).delete_collection(collection_name)


def list_collections_from_vector_db(vdb_path: str) -> None:
    """Print the collections stored in a persistent ChromaDB instance.

    Args:
        vdb_path (str): Location of the persistent ChromaDB instance on disk.
    """
    client = chromadb.PersistentClient(path=vdb_path)
    available_collections = client.list_collections()
    print(available_collections)


def get_collection_from_vector_db(
    vdb_path: str, collection_name: str
) -> chromadb.Collection:
    """Open an existing collection from a persistent ChromaDB instance.

    The collection is bound to the HuggingFace
    ``sentence-transformers/all-MiniLM-L6-v2`` embedding function, whose API
    token is read from the ``HUGGINGFACEHUB_API_TOKEN`` environment variable
    after any discoverable ``.env`` file has been loaded.

    Args:
        vdb_path (str): Location of the persistent ChromaDB instance on disk.
        collection_name (str): Name of the collection to fetch.

    Returns:
        chromadb.Collection: The requested collection object.
    """
    # Load .env first so the token lookup below can succeed.
    load_dotenv(find_dotenv())

    client = chromadb.PersistentClient(path=vdb_path)
    api_token = os.environ["HUGGINGFACEHUB_API_TOKEN"]
    embedder = embedding_functions.HuggingFaceEmbeddingFunction(
        api_key=api_token,
        model_name="sentence-transformers/all-MiniLM-L6-v2",
    )
    return client.get_collection(name=collection_name, embedding_function=embedder)