Spaces:
Sleeping
Sleeping
| import openai | |
| import pandas as pd | |
| import numpy as np | |
| import faiss | |
| import os | |
| from dotenv import load_dotenv | |
| from sklearn.metrics.pairwise import cosine_similarity | |
# Merge the variables declared in a local .env file into the process
# environment before anything below reads from it.
load_dotenv()

# Kept as a module-level name; note that it is not handed to the client below.
api_key = os.environ.get("OPENAI_API_KEY")

from openai import OpenAI

# Shared client used by every embedding request in this module. It is
# constructed without explicit arguments.
client = OpenAI()
def get_openai_embedding(text, model="text-embedding-3-small"):
    """Return the embedding of ``text`` produced by the given model.

    Newlines in ``text`` are replaced by single spaces before the request is
    sent through the module-level ``client``.

    Args:
        text: The string to embed.
        model: Name of the embedding model to request.

    Returns:
        The ``embedding`` attribute of the first item in the response data.
    """
    # Flatten the text onto one line: every "\n" becomes one space.
    single_line = " ".join(text.split("\n"))
    response = client.embeddings.create(input=[single_line], model=model)
    first_item = response.data[0]
    return first_item.embedding
def save_openai_embeddings(csv_file, model='text-embedding-3-small'):
    """Embed every utterance of a CSV file and write the result to disk.

    Reads ``csv_file`` with pandas, calls ``get_openai_embedding`` once for
    each value of its ``utterance`` column, stores the results in a new
    ``ada_embeddings`` column and writes the whole frame to
    ``embeddings/openai_embeddings.csv``.

    Args:
        csv_file: Path (or anything ``pandas.read_csv`` accepts) of a CSV
            file that contains an ``utterance`` column.
        model: Embedding model name forwarded to ``get_openai_embedding``.

    Returns:
        None. The embeddings are only written to the output CSV.
    """
    output_path = 'embeddings/openai_embeddings.csv'

    df = pd.read_csv(csv_file)

    # Forward the caller's model instead of a hard-coded name, so the
    # ``model`` argument actually takes effect.
    df['ada_embeddings'] = df['utterance'].apply(
        lambda text: get_openai_embedding(text, model=model)
    )

    # to_csv does not create missing directories; make sure the target exists.
    os.makedirs(os.path.dirname(output_path), exist_ok=True)
    df.to_csv(output_path, index=False)
    print(f"Embeddings saved to {output_path}.")
| # Get and save the embeddings for the cleaned intent data | |
| #save_openai_embeddings(r'C:\Users\ZZ029K826\Documents\GitHub\LLM_Intent_Recognition\data\Pager_Intents_cleaned.csv') | |
def load_openai_embeddings(csv_file):
    """Read a CSV file and return its ``ada_embeddings`` column as a list.

    Args:
        csv_file: Path (or anything ``pandas.read_csv`` accepts) of a CSV
            file that contains an ``ada_embeddings`` column.

    Returns:
        A list holding the column's values exactly as pandas parsed them;
        no further decoding of the cells is performed here.
    """
    # Read the file and pull the single column of interest in one go.
    return pd.read_csv(csv_file)['ada_embeddings'].tolist()
| # Function to calculate similarity between user input and precomputed embeddings | |
def calculate_openai_similarity(user_text, df, top_n=5):
    """Rank the rows of ``df`` by cosine similarity to ``user_text``.

    The user text is embedded with ``get_openai_embedding`` (model
    ``text-embedding-3-small``) and compared against the precomputed
    embedding stored in each row.

    Args:
        user_text: The query string to embed and compare.
        df: DataFrame with a ``combined`` column and an embedding column
            named ``ada_embedding`` (``ada_embeddings`` is accepted as a
            fallback). Each embedding may be a sequence of floats or the
            text form of one, as found after a round trip through CSV.
            A ``similarity`` column is added to (or overwritten in) this
            frame in place.
        top_n: Maximum number of rows to return.

    Returns:
        A DataFrame with the ``combined`` and ``similarity`` columns of the
        ``top_n`` most similar rows, highest similarity first.

    Raises:
        KeyError: If ``df`` has neither embedding column.
    """
    import ast  # local import: only needed to decode text-encoded vectors

    # Accept both spellings of the embedding column so frames produced with
    # the plural name work as well as ones using the singular name.
    if 'ada_embedding' in df.columns:
        embedding_column = 'ada_embedding'
    elif 'ada_embeddings' in df.columns:
        embedding_column = 'ada_embeddings'
    else:
        raise KeyError("df needs an 'ada_embedding' (or 'ada_embeddings') column")

    if len(df) == 0:
        # Nothing to compare against: skip the embedding request entirely.
        scores = np.empty(0, dtype=float)
    else:
        user_embedding = np.asarray(
            get_openai_embedding(user_text, model='text-embedding-3-small'),
            dtype=float,
        ).reshape(1, -1)

        # Embeddings read back from CSV arrive as strings such as
        # "[0.1, 0.2]"; decode those and pass real sequences through.
        vectors = [
            ast.literal_eval(value) if isinstance(value, str) else value
            for value in df[embedding_column]
        ]

        # One (n_rows, dim) x (1, dim) call instead of one call per row.
        scores = cosine_similarity(np.asarray(vectors, dtype=float), user_embedding).ravel()

    df['similarity'] = scores

    # Sort by similarity score (descending) and keep the top_n best rows.
    top_matches = df.sort_values(by='similarity', ascending=False).head(top_n)
    return top_matches[['combined', 'similarity']]
def get_openai_similarity(user_text, df, top_n=5):
    """Rank the rows of ``df`` by cosine similarity to ``user_text``.

    Alias of ``calculate_openai_similarity``: both names take the same
    arguments and give the same result, so this one simply forwards the call
    rather than carrying a second copy of the logic.

    Args:
        user_text: The query string to embed and compare.
        df: DataFrame holding the precomputed embeddings and a ``combined``
            column; a ``similarity`` column is written into it.
        top_n: Maximum number of rows to return.

    Returns:
        A DataFrame with the ``combined`` and ``similarity`` columns of the
        ``top_n`` most similar rows, highest similarity first.
    """
    return calculate_openai_similarity(user_text, df, top_n=top_n)