"""Create and query text embeddings for Q&A and flat policy documents.

Embeddings are computed once via the OpenAI API, cached to pickle files,
and queried with cosine similarity against an embedded question.

NOTE(review): ``openai.embeddings_utils`` was removed in openai>=1.0;
this module requires openai<1.0 — confirm the pinned version.
"""
import os

import pandas as pd
from openai.embeddings_utils import get_embedding, cosine_similarity

# Single place for the embedding engine name used by every call below.
EMBEDDING_ENGINE = 'text-embedding-ada-002'

# Module-level caches populated by the Create* functions and read by the
# Query* functions.
benefit_info = None    # Q&A DataFrame: 'question', 'answer', 'embedding' columns
embedding_info = None  # flat DataFrame: 'data', 'embedding' columns


def _embed(text):
    """Return the embedding vector for *text* (coerced to str first)."""
    return get_embedding(str(text), engine=EMBEDDING_ENGINE)


def CreateEmbeddingsQA(input_path, output_path):
    """Populate ``benefit_info`` with Q&A embeddings.

    Loads the cached pickle at *output_path* if it exists; otherwise reads
    the CSV at *input_path* (must contain an 'answer' column), embeds each
    answer, and writes the pickle cache.
    """
    global benefit_info
    if os.path.exists(output_path):
        benefit_info = pd.read_pickle(output_path)
    else:
        benefit_info = pd.read_csv(input_path)
        benefit_info['embedding'] = benefit_info['answer'].apply(_embed)
        benefit_info.to_pickle(output_path)


# ---------- Updated for policy pdfs
def CreateEmbeddingsFlatPolicy(input_path, output_path):
    """Populate ``embedding_info`` from the CSV at *input_path*.

    Unlike :func:`CreateEmbeddingsFlat`, this ALWAYS rebuilds from the CSV
    (no pickle-cache check) and overwrites the pickle at *output_path*.
    """
    global embedding_info
    print(" CreateEmbeddingsFlat input_path: "+input_path + " output_path: "+output_path)
    embedding_info = pd.read_csv(input_path)
    # The CSV is a single column of text chunks; normalize its name.
    embedding_info.columns = ['data']
    embedding_info['embedding'] = embedding_info['data'].apply(_embed)
    embedding_info.to_pickle(output_path)
    print("Loaded from CSV file.")
    print(f"embedding_info type: {type(embedding_info)}")
    print(f"embedding_info is None: {embedding_info is None}")


def CreateEmbeddingsFlat(input_path, output_path):
    """Populate ``embedding_info``, preferring the pickle cache.

    Loads the pickle at *output_path* if present; otherwise embeds the
    single-column CSV at *input_path* and writes the cache.
    """
    global embedding_info
    print(" CreateEmbeddingsFlat input_path: "+input_path + " output_path: "+output_path)
    if os.path.exists(output_path):
        embedding_info = pd.read_pickle(output_path)
        print("Loaded from PKL file.")
    else:
        embedding_info = pd.read_csv(input_path)
        embedding_info.columns = ['data']
        embedding_info['embedding'] = embedding_info['data'].apply(_embed)
        embedding_info.to_pickle(output_path)
        print("Loaded from CSV file.")
    print(f"embedding_info type: {type(embedding_info)}")
    print(f"embedding_info is None: {embedding_info is None}")


# ------------- #2: fetch embeddings context using text-embedding-ada-002
# engine / cosine_similarity for lookup ----------------
def QueryEmbeddings(question):
    """Return the best-matching answer for *question* from ``benefit_info``.

    Raises:
        RuntimeError: if no embeddings have been loaded yet (previously
        this crashed with an opaque TypeError on ``None``).
    """
    global benefit_info
    if benefit_info is None:
        raise RuntimeError(
            "benefit_info is not loaded; call CreateEmbeddingsQA first")
    question_vector = get_embedding(question, engine=EMBEDDING_ENGINE)
    # Compute the cosine similarity of every stored answer to the question.
    benefit_info["similarities"] = benefit_info['embedding'].apply(
        lambda x: cosine_similarity(x, question_vector))
    sorted_benefits = benefit_info.sort_values(
        "similarities", ascending=False).head(5)
    best_answer = sorted_benefits.iloc[0]['answer']
    print("Question: "+question)
    print("Top question: "+ sorted_benefits.iloc[0]['question'])
    print("Best answer: ", best_answer)
    return best_answer


def QueryEmbeddingsFlat(query):
    """Return the best-matching text chunk for *query* from ``embedding_info``.

    Returns the sentinel string "No embeddings." when ``embedding_info``
    has not been populated (preserves the original best-effort behavior).
    """
    global embedding_info
    question_vector = get_embedding(query, engine=EMBEDDING_ENGINE)
    best_answer = "No embeddings."
    print(type(embedding_info))
    print(embedding_info is None)  # Should be False once loaded
    if embedding_info is not None:
        embedding_info["similarities"] = embedding_info['embedding'].apply(
            lambda x: cosine_similarity(x, question_vector))
        sorted_matches = embedding_info.sort_values(
            "similarities", ascending=False).head(5)
        best_answer = sorted_matches.iloc[0]['data']
        print("query: "+query)
    else:
        print("embedding_info is None")
    print("Best answer: ", best_answer)
    return best_answer