# ai-kit / embedding_tools / create_embedding.py
# Author: Kim Adams — "adding light docs" (commit 82df044)
import os
import pandas as pd
from openai.embeddings_utils import get_embedding, cosine_similarity
benefit_info = None
embedding_info = None
def CreateEmbeddingsQA(input_path, output_path, engine='text-embedding-ada-002'):
    """Load or build Q&A answer embeddings into the global `benefit_info`.

    If `output_path` already exists it is loaded as a pickled DataFrame
    (skipping the slow, paid embedding calls). Otherwise the CSV at
    `input_path` — which must contain an 'answer' column — is read, each
    answer is embedded, and the result is pickled to `output_path`.

    Args:
        input_path: path to a CSV with at least an 'answer' column.
        output_path: pickle cache path for the embedded DataFrame.
        engine: embedding model name; default preserves the original
            hard-coded behavior.
    """
    global benefit_info
    if os.path.exists(output_path):
        benefit_info = pd.read_pickle(output_path)
    else:
        benefit_info = pd.read_csv(input_path)
        # One embedding API call per row; str() guards against NaN/non-string answers.
        benefit_info['embedding'] = benefit_info['answer'].apply(
            lambda row: get_embedding(str(row), engine=engine))
        benefit_info.to_pickle(output_path)
#---------- Updated for policy pdfs
def CreateEmbeddingsFlatPolicy(input_path, output_path, engine='text-embedding-ada-002'):
    """Build flat-text embeddings into the global `embedding_info`.

    Reads the CSV at `input_path`, renames its (single) column to 'data',
    embeds every row, and pickles the result to `output_path`.

    NOTE(review): unlike CreateEmbeddingsFlat, this variant never reads an
    existing pickle — it always rebuilds (presumably intentional for
    regenerating policy-PDF embeddings; confirm with callers).

    Args:
        input_path: path to a single-column CSV of text chunks.
        output_path: pickle path the embedded DataFrame is written to.
        engine: embedding model name; default preserves the original
            hard-coded behavior.
    """
    global embedding_info
    print(" CreateEmbeddingsFlat input_path: "+input_path + " output_path: "+output_path)
    embedding_info = pd.read_csv(input_path)
    # Normalize to a single 'data' column regardless of the CSV header.
    embedding_info.columns = ['data']
    # One embedding API call per row; str() guards against NaN/non-string cells.
    embedding_info['embedding'] = embedding_info['data'].apply(
        lambda row: get_embedding(str(row), engine=engine))
    embedding_info.to_pickle(output_path)
    print("Loaded from CSV file.")
    print(f"embedding_info type: {type(embedding_info)}")
    print(f"embedding_info is None: {embedding_info is None}")
def CreateEmbeddingsFlat(input_path, output_path, engine='text-embedding-ada-002'):
    """Load or build flat-text embeddings into the global `embedding_info`.

    If `output_path` exists it is loaded as a pickled DataFrame (skipping
    the embedding calls). Otherwise the CSV at `input_path` is read, its
    (single) column renamed to 'data', every row embedded, and the result
    pickled to `output_path` for reuse.

    Args:
        input_path: path to a single-column CSV of text chunks.
        output_path: pickle cache path for the embedded DataFrame.
        engine: embedding model name; default preserves the original
            hard-coded behavior.
    """
    global embedding_info
    print(" CreateEmbeddingsFlat input_path: "+input_path + " output_path: "+output_path)
    if os.path.exists(output_path):
        embedding_info = pd.read_pickle(output_path)
        print("Loaded from PKL file.")
    else:
        embedding_info = pd.read_csv(input_path)
        # Normalize to a single 'data' column regardless of the CSV header.
        embedding_info.columns = ['data']
        # One embedding API call per row; str() guards against NaN/non-string cells.
        embedding_info['embedding'] = embedding_info['data'].apply(
            lambda row: get_embedding(str(row), engine=engine))
        embedding_info.to_pickle(output_path)
        print("Loaded from CSV file.")
    print(f"embedding_info type: {type(embedding_info)}")
    print(f"embedding_info is None: {embedding_info is None}")
#------------- #2: fetch embeddings context using text-embedding-ada-002 engine/cosine_similarity for lookup ----------------
def QueryEmbeddings(question):
    """Return the stored answer whose embedding best matches *question*.

    Embeds the question, scores every row of the global `benefit_info` by
    cosine similarity (writing a 'similarities' column onto the DataFrame
    as a side effect), and returns the 'answer' of the top-ranked row.

    Fix: previously crashed with a TypeError when `benefit_info` was never
    populated; now guards for None and returns "No embeddings.", matching
    the behavior of QueryEmbeddingsFlat.
    """
    global benefit_info
    question_vector = get_embedding(question, engine='text-embedding-ada-002')
    best_answer = "No embeddings."
    if benefit_info is not None:
        # Compute the cosine similarity against every stored answer.
        benefit_info["similarities"] = benefit_info['embedding'].apply(
            lambda x: cosine_similarity(x, question_vector))
        sorted_benefits = benefit_info.sort_values("similarities", ascending=False).head(5)
        best_answer = sorted_benefits.iloc[0]['answer']
        print("Question: "+question)
        print("Top question: "+ sorted_benefits.iloc[0]['question'])
    else:
        print("benefit_info is None")
    print("Best answer: ", best_answer)
    return best_answer
def QueryEmbeddingsFlat(query):
    """Return the 'data' value of the `embedding_info` row closest to *query*.

    Embeds the query, ranks every row of the global `embedding_info` by
    cosine similarity (a 'similarities' column is written onto the
    DataFrame as a side effect), and returns the best match. Falls back
    to "No embeddings." when the DataFrame has not been loaded yet.
    """
    global embedding_info
    question_vector = get_embedding(query, engine='text-embedding-ada-002')
    best_answer = "No embeddings."
    print(type(embedding_info))  # Should be <class 'pandas.DataFrame'> if it's a DataFrame
    print(embedding_info is None)  # Should be False
    if embedding_info is None:
        print("embedding_info is None")
    else:
        score = lambda vec: cosine_similarity(vec, question_vector)
        embedding_info["similarities"] = embedding_info['embedding'].apply(score)
        top_matches = embedding_info.sort_values("similarities", ascending=False).head(5)
        best_answer = top_matches.iloc[0]['data']
        print("query: "+query)
    print("Best answer: ", best_answer)
    return best_answer