| import os |
| import pandas as pd |
| from openai.embeddings_utils import get_embedding, cosine_similarity |
|
|
# Module-level caches: None until the corresponding Create* builder has run.
# benefit_info is written by CreateEmbeddingsQA and read by QueryEmbeddings
# (expects 'question', 'answer', and 'embedding' columns); embedding_info is
# written by CreateEmbeddingsFlat / CreateEmbeddingsFlatPolicy and read by
# QueryEmbeddingsFlat (expects 'data' and 'embedding' columns).
benefit_info = None
embedding_info = None
|
|
def CreateEmbeddingsQA(input_path, output_path):
    """Populate the module-level ``benefit_info`` Q/A DataFrame.

    If a pickle cache already exists at ``output_path`` it is loaded as-is.
    Otherwise the Q/A pairs are read from the CSV at ``input_path``, each
    'answer' is embedded with the text-embedding-ada-002 engine (one API
    call per row), and the resulting DataFrame is pickled to ``output_path``.
    """
    global benefit_info
    if not os.path.exists(output_path):
        # No cache yet: build the embedding column, persist, then publish.
        frame = pd.read_csv(input_path)
        frame['embedding'] = frame['answer'].apply(
            lambda row: get_embedding(str(row), engine='text-embedding-ada-002'))
        frame.to_pickle(output_path)
        benefit_info = frame
    else:
        benefit_info = pd.read_pickle(output_path)
|
|
| |
def CreateEmbeddingsFlatPolicy(input_path, output_path):
    """Build flat policy embeddings, always recomputing from the CSV.

    Unlike CreateEmbeddingsFlat, this never reuses an existing pickle: every
    call re-reads ``input_path``, re-embeds each row (one API call per row),
    and overwrites ``output_path``. The result is stored in the module-level
    ``embedding_info`` DataFrame with columns 'data' and 'embedding'.
    """
    global embedding_info
    # Bug fix: the log line previously said "CreateEmbeddingsFlat", copied
    # from the sibling function, which made the two indistinguishable in logs.
    print(" CreateEmbeddingsFlatPolicy input_path: " + input_path + " output_path: " + output_path)
    embedding_info = pd.read_csv(input_path)
    # Normalize the (single-column) CSV header so downstream code can rely on 'data'.
    embedding_info.columns = ['data']
    embedding_info['embedding'] = embedding_info['data'].apply(
        lambda row: get_embedding(str(row), engine='text-embedding-ada-002'))
    embedding_info.to_pickle(output_path)
    print("Loaded from CSV file.")
    print(f"embedding_info type: {type(embedding_info)}")
    print(f"embedding_info is None: {embedding_info is None}")
|
|
def CreateEmbeddingsFlat(input_path, output_path):
    """Populate the module-level ``embedding_info`` DataFrame, using a cache.

    Loads the pickle at ``output_path`` when it exists; otherwise reads the
    single-column CSV at ``input_path``, embeds each row with the
    text-embedding-ada-002 engine, and pickles the result for next time.
    """
    global embedding_info
    print(" CreateEmbeddingsFlat input_path: " + input_path + " output_path: " + output_path)
    if not os.path.exists(output_path):
        # Cache miss: build embeddings from scratch and persist them.
        frame = pd.read_csv(input_path)
        frame.columns = ['data']
        frame['embedding'] = frame['data'].apply(
            lambda row: get_embedding(str(row), engine='text-embedding-ada-002'))
        frame.to_pickle(output_path)
        embedding_info = frame
        print("Loaded from CSV file.")
    else:
        embedding_info = pd.read_pickle(output_path)
        print("Loaded from PKL file.")
    print(f"embedding_info type: {type(embedding_info)}")
    print(f"embedding_info is None: {embedding_info is None}")
|
|
| |
def QueryEmbeddings(question):
    """Return the stored answer most similar to *question*.

    Embeds the question, scores every cached answer embedding in
    ``benefit_info`` by cosine similarity, and returns the answer of the
    best-scoring row. If CreateEmbeddingsQA has not populated
    ``benefit_info`` yet, returns the fallback message "No embeddings."
    instead of crashing (previously this raised AttributeError on None) —
    the same guard pattern QueryEmbeddingsFlat already uses.
    """
    global benefit_info
    question_vector = get_embedding(question, engine='text-embedding-ada-002')
    best_answer = "No embeddings."
    if benefit_info is not None:
        # Score all rows; keep the top 5 (only the best is returned, the
        # head(5) mirrors QueryEmbeddingsFlat and aids debugging).
        benefit_info["similarities"] = benefit_info['embedding'].apply(
            lambda x: cosine_similarity(x, question_vector))
        sorted_benefits = benefit_info.sort_values("similarities", ascending=False).head(5)
        best_answer = sorted_benefits.iloc[0]['answer']
        print("Question: " + question)
        print("Top question: " + sorted_benefits.iloc[0]['question'])
    else:
        print("benefit_info is None")
    print("Best answer: ", best_answer)
    return best_answer
|
|
|
|
|
|
|
|
def QueryEmbeddingsFlat(query):
    """Return the cached 'data' row most similar to *query*.

    Embeds the query, ranks every row of the module-level ``embedding_info``
    by cosine similarity, and returns the best match's 'data' value. Falls
    back to the message "No embeddings." when ``embedding_info`` has not
    been populated yet.
    """
    global embedding_info
    question_vector = get_embedding(query, engine='text-embedding-ada-002')
    best_answer = "No embeddings."
    # Debug output retained from the original implementation.
    print(type(embedding_info))
    print(embedding_info is None)
    if embedding_info is None:
        print("embedding_info is None")
    else:
        scores = embedding_info['embedding'].apply(
            lambda vec: cosine_similarity(vec, question_vector))
        embedding_info["similarities"] = scores
        top_matches = embedding_info.sort_values("similarities", ascending=False).head(5)
        best_answer = top_matches.iloc[0]['data']
        print("query: " + query)
    print("Best answer: ", best_answer)
    return best_answer