|
|
from dotenv import load_dotenv, find_dotenv |
|
|
import pandas as pd |
|
|
import os |
|
|
import chromadb |
|
|
from chromadb.utils import embedding_functions |
|
|
|
|
|
|
|
|
def generate_qa_vector_db(vdb_path: str, df: pd.DataFrame) -> None: |
|
|
"""This function processes the dataframe into the required format, and then creates the following collections in a ChromaDB instance |
|
|
1. question_collection - Contains question embeddings, and the metadata as 'position' and 'interview_phase' |
|
|
2. answer_collection - Contains the answer embeddings. No metadata (yet). |
|
|
|
|
|
Args: |
|
|
vdb_path (str): Relative path of the location of the ChromaDB instance. |
|
|
df (pd.DataFrame): Question/answer dataset. |
|
|
""" |
|
|
chroma_client = chromadb.PersistentClient(path=vdb_path) |
|
|
|
|
|
huggingface_ef = embedding_functions.HuggingFaceEmbeddingFunction( |
|
|
api_key=os.environ["HUGGINGFACEHUB_API_TOKEN"], |
|
|
model_name="sentence-transformers/all-MiniLM-L6-v2", |
|
|
) |
|
|
|
|
|
print("q_collection will be added") |
|
|
q_collection = chroma_client.create_collection( |
|
|
name="question_collection", |
|
|
metadata={"hnsw:space": "cosine"}, |
|
|
embedding_function=huggingface_ef, |
|
|
) |
|
|
|
|
|
|
|
|
df_questions = df[ |
|
|
["Position/Role", "Question", "Interview Phase"] |
|
|
].drop_duplicates() |
|
|
|
|
|
|
|
|
df_questions.columns = [ |
|
|
x.replace(" ", "_").lower().replace("/", "_or_") for x in df_questions.columns |
|
|
] |
|
|
|
|
|
q_documents = [row.question for row in df_questions.itertuples()] |
|
|
q_metadata = [ |
|
|
{"position": row.position_or_role, "interview_phase": row.interview_phase} |
|
|
for row in df_questions.itertuples() |
|
|
] |
|
|
q_ids = ["q_id" + str(row.Index) for row in df_questions.itertuples()] |
|
|
|
|
|
q_collection.add(documents=q_documents, metadatas=q_metadata, ids=q_ids) |
|
|
print("q_collection added") |
|
|
|
|
|
print("a_collection will be added") |
|
|
a_collection = chroma_client.create_collection( |
|
|
name="answer_collection", |
|
|
metadata={"hnsw:space": "cosine"}, |
|
|
embedding_function=huggingface_ef, |
|
|
) |
|
|
|
|
|
df_answers = df[["Answer", "Answer Quality"]] |
|
|
df_answers.columns = [ |
|
|
x.replace(" ", "_").lower().replace("/", "_or_") for x in df_answers.columns |
|
|
] |
|
|
|
|
|
a_documents = [row.answer for row in df_answers.itertuples()] |
|
|
a_metadata = [ |
|
|
{"answer_quality": row.answer_quality} for row in df_answers.itertuples() |
|
|
] |
|
|
a_ids = ["a_id" + str(row.Index) for row in df_answers.itertuples()] |
|
|
|
|
|
a_collection.add(documents=a_documents, ids=a_ids, metadatas=a_metadata) |
|
|
print("a_collection added") |
|
|
return None |
|
|
|
|
|
|
|
|
def delete_collection_from_vector_db(vdb_path: str, collection_name: str) -> None:
    """Remove a named collection from the persistent ChromaDB instance.

    Args:
        vdb_path (str): Path of the persistent ChromaDB instance.
        collection_name (str): Name of the collection to be deleted.
    """
    client = chromadb.PersistentClient(path=vdb_path)
    client.delete_collection(collection_name)
|
|
|
|
|
|
|
|
def list_collections_from_vector_db(vdb_path: str) -> None:
    """Print every collection available in the persistent ChromaDB instance.

    Args:
        vdb_path (str): Path of the persistent ChromaDB instance.
    """
    client = chromadb.PersistentClient(path=vdb_path)
    available = client.list_collections()
    print(available)
|
|
|
|
|
|
|
|
def get_collection_from_vector_db(
    vdb_path: str, collection_name: str
) -> chromadb.Collection:
    """Fetch a particular ChromaDB collection object from the persistent ChromaDB instance.

    Environment variables are loaded first so the HuggingFace API token is
    available when the embedding function is constructed.

    Args:
        vdb_path (str): Path of the persistent ChromaDB instance.
        collection_name (str): Name of the collection which needs to be retrieved.

    Returns:
        chromadb.Collection: The requested collection, wired to the
        HuggingFace all-MiniLM-L6-v2 embedding function.
    """
    load_dotenv(find_dotenv())
    client = chromadb.PersistentClient(path=vdb_path)

    embedding_fn = embedding_functions.HuggingFaceEmbeddingFunction(
        api_key=os.environ["HUGGINGFACEHUB_API_TOKEN"],
        model_name="sentence-transformers/all-MiniLM-L6-v2",
    )

    return client.get_collection(
        name=collection_name, embedding_function=embedding_fn
    )
|
|
|