File size: 3,392 Bytes
e7d050a 3a5289f ece60bb 3a5289f 82df044 2cd1c76 ece60bb 46b3402 ece60bb | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 | import os
import pandas as pd
from openai.embeddings_utils import get_embedding, cosine_similarity
# Module-level caches: populated by the CreateEmbeddings* loaders below and
# read by the Query* functions. Each holds a pandas DataFrame (or None before
# any loader has run).
benefit_info = None
embedding_info = None
def CreateEmbeddingsQA(input_path, output_path):
    """Load or build the QA embeddings table and cache it in `benefit_info`.

    If a pickle already exists at `output_path`, it is loaded directly.
    Otherwise the CSV at `input_path` is read, each 'answer' is embedded with
    text-embedding-ada-002, and the result is pickled for the next run.
    """
    global benefit_info
    if not os.path.exists(output_path):
        # Cold start: build embeddings from the raw CSV and persist them.
        frame = pd.read_csv(input_path)
        frame['embedding'] = frame['answer'].apply(
            lambda value: get_embedding(str(value), engine='text-embedding-ada-002'))
        frame.to_pickle(output_path)
        benefit_info = frame
    else:
        benefit_info = pd.read_pickle(output_path)
#---------- Updated for policy pdfs
def CreateEmbeddingsFlatPolicy(input_path, output_path):
    """Rebuild flat-text embeddings from `input_path` and cache them.

    Unlike CreateEmbeddingsFlat, this always re-reads the CSV (no pickle
    short-circuit) — used when policy PDFs have been re-extracted and the
    cached embeddings must be regenerated. Stores the DataFrame in the
    module-level `embedding_info` and pickles it to `output_path`.
    """
    global embedding_info
    print(" CreateEmbeddingsFlat input_path: "+input_path + " output_path: "+output_path)
    frame = pd.read_csv(input_path)
    # Single-column source file: name the lone column 'data'.
    frame.columns = ['data']
    frame['embedding'] = frame['data'].apply(
        lambda text: get_embedding(str(text), engine='text-embedding-ada-002'))
    frame.to_pickle(output_path)
    embedding_info = frame
    print("Loaded from CSV file.")
    print(f"embedding_info type: {type(embedding_info)}")
    print(f"embedding_info is None: {embedding_info is None}")
def CreateEmbeddingsFlat(input_path, output_path):
    """Load or build flat-text embeddings and cache them in `embedding_info`.

    Prefers the pickle at `output_path` when it exists; otherwise embeds the
    single-column CSV at `input_path` with text-embedding-ada-002 and writes
    the pickle cache for subsequent runs.
    """
    global embedding_info
    print(" CreateEmbeddingsFlat input_path: "+input_path + " output_path: "+output_path)
    if not os.path.exists(output_path):
        # No cache yet: embed the raw text column and persist the result.
        frame = pd.read_csv(input_path)
        frame.columns = ['data']
        frame['embedding'] = frame['data'].apply(
            lambda text: get_embedding(str(text), engine='text-embedding-ada-002'))
        frame.to_pickle(output_path)
        embedding_info = frame
        print("Loaded from CSV file.")
    else:
        embedding_info = pd.read_pickle(output_path)
        print("Loaded from PKL file.")
    print(f"embedding_info type: {type(embedding_info)}")
    print(f"embedding_info is None: {embedding_info is None}")
#------------- #2: fetch embeddings context using text-embedding-ada-002 engine/cosine_similarity for lookup ----------------
def QueryEmbeddings(question):
    """Return the stored answer whose embedding best matches `question`.

    Embeds `question` with text-embedding-ada-002, ranks the rows of the
    module-level `benefit_info` DataFrame by cosine similarity, and returns
    the 'answer' of the top match.

    Fix: guard against `benefit_info` being None (i.e. QueryEmbeddings called
    before CreateEmbeddingsQA). The original crashed with an attribute/subscript
    error; this now falls back to the same "No embeddings." sentinel that the
    sibling QueryEmbeddingsFlat already returns in that situation.
    """
    global benefit_info
    question_vector = get_embedding(question, engine='text-embedding-ada-002')
    best_answer = "No embeddings."
    if benefit_info is not None:
        # Compute the cosine similarity of every stored answer embedding
        # against the question vector, then rank descending.
        benefit_info["similarities"] = benefit_info['embedding'].apply(
            lambda x: cosine_similarity(x, question_vector))
        sorted_benefits = benefit_info.sort_values("similarities", ascending=False).head(5)
        best_answer = sorted_benefits.iloc[0]['answer']
        print("Question: "+question)
        print("Top question: "+ sorted_benefits.iloc[0]['question'])
    else:
        print("benefit_info is None")
    print("Best answer: ", best_answer)
    return best_answer
def QueryEmbeddingsFlat(query):
    """Return the 'data' row most similar to `query`, or "No embeddings.".

    Embeds `query` with text-embedding-ada-002 and ranks the module-level
    `embedding_info` DataFrame by cosine similarity. Falls back to the
    sentinel string when no embeddings have been loaded yet.
    """
    global embedding_info
    question_vector = get_embedding(query, engine='text-embedding-ada-002')
    best_answer = "No embeddings."
    print(type(embedding_info))
    print(embedding_info is None)
    if embedding_info is None:
        print("embedding_info is None")
    else:
        # Score every stored row against the query vector and take the best.
        embedding_info["similarities"] = embedding_info['embedding'].apply(
            lambda vec: cosine_similarity(vec, question_vector))
        top_rows = embedding_info.sort_values("similarities", ascending=False).head(5)
        best_answer = top_rows.iloc[0]['data']
        print("query: "+query)
    print("Best answer: ", best_answer)
    return best_answer