# NOTE: "Spaces: Sleeping" page-header residue from the hosting site; not part of the program.
import ast
import io
import json
import os
import pickle
import time
from datetime import datetime

import dataframe_image as dfi
import gradio as gr
import nltk
import numpy as np
import pandas as pd
import requests
from nltk.tokenize import sent_tokenize
from numpy import dot
from numpy.linalg import norm
from sentence_transformers import SentenceTransformer
from tqdm import tqdm

# Module-level start-up work (runs once at import time).
model = SentenceTransformer('sentence-transformers/all-mpnet-base-v2')
tqdm.pandas()  # registers Series.progress_apply / DataFrame.progress_apply
nltk.download('punkt')
print("Packages loaded!")
# Helper functions used by the Gradio interface defined at the bottom of the file.
def load_pickle():
    """Load the pickled sentence-level table from disk.

    Reads ``./Data/master_exploded_current.pkl`` (resolved against the current
    working directory), prints the shape of the loaded object and returns it.

    Returns:
        The unpickled object. The rest of this file treats it as a pandas
        DataFrame with ``id``, ``name``, ``tokenized_sentences`` and
        ``raw_embedding`` columns.

    Raises:
        FileNotFoundError: if the pickle file does not exist.
    """
    # SECURITY NOTE: unpickling executes arbitrary code embedded in the file.
    # Only ever point this at a pickle produced by a trusted pipeline.
    # The context manager guarantees the file handle is closed, which the
    # previous `pickle.load(open(...))` form did not.
    with open("./Data/master_exploded_current.pkl", 'rb') as pickle_file:
        master_exploded = pickle.load(pickle_file)
    print("Exploded DF Shape:", master_exploded.shape)
    print("Successfully Loaded!")
    return master_exploded
def sentence_embedding_generator(query):
    """Encode a free-text query with the shared sentence-transformer model.

    Args:
        query: The search text entered by the user.

    Returns:
        A ``(embeddings, query)`` tuple: whatever ``model.encode(query)``
        returns (presumably a 1-D numpy vector -- confirm against the
        sentence-transformers version in use) and the query passed in.
    """
    print(f'You entered {query}')
    # Reuse the module-level `model` that is created once at import time.
    # Previously the same 'all-mpnet-base-v2' checkpoint was re-instantiated
    # on every request, which made each query pay the full model-load cost.
    embeddings = model.encode(query)
    return embeddings, query
def cosine_similarity_generator(master_exploded, embeddings, query, filename=None):
    """Score every stored sentence embedding against the query embedding.

    Side effect: adds (or overwrites) the ``query`` and ``cos_sim`` columns on
    the ``master_exploded`` frame that is passed in.

    Args:
        master_exploded: DataFrame with at least the columns ``id``, ``name``,
            ``tokenized_sentences`` and ``raw_embedding``.
        embeddings: Embedding vector of the query.
        query: The query text; stored verbatim in the ``query`` column.
        filename: Currently unused. It was the suffix for CSV exports that are
            now disabled and is kept so existing callers keep working. The
            default is ``None`` because the former
            ``time.strftime(...)`` default was evaluated only once, when the
            function was defined, and therefore went stale.

    Returns:
        A 3-tuple ``(master_exploded_top, top_k, cosine_sum_by_name)``:

        * rows whose cosine similarity is strictly greater than 0.6;
        * the 10 rows with the highest cosine similarity (no threshold);
        * the above-threshold rows grouped by id / name / sentence with the
          summed similarity, columns renamed to ``MentorID``, ``Name``,
          ``Sentences`` and ``cos_sim_sum``. This frame is in group-key order,
          NOT sorted by ``cos_sim_sum``.
    """
    print("Current datetime: ", time.strftime("%Y%m%d-%H%M%S"))
    master_exploded['query'] = query

    # The query norm is the same for every row, so compute it once.
    query_norm = norm(embeddings)

    def _cosine(sentence_embedding):
        return np.dot(embeddings, sentence_embedding) / (
            query_norm * norm(sentence_embedding))

    raw_embeddings = master_exploded['raw_embedding']
    # `progress_apply` only exists after `tqdm.pandas()` has been called; fall
    # back to the plain `apply` so the function also works without tqdm.
    apply_fn = getattr(raw_embeddings, 'progress_apply', raw_embeddings.apply)
    master_exploded['cos_sim'] = apply_fn(_cosine)

    master_exploded_top = master_exploded[master_exploded['cos_sim'] > 0.6]
    print("The number of results with cosine similarity > 0.6 are: ",
          len(master_exploded_top))

    top_k = master_exploded.sort_values(by=['cos_sim'], ascending=False).head(10)
    print(" The top k=10 results have a min cosine similarity of: ",
          top_k['cos_sim'].min())

    cosine_sum_by_name = (
        master_exploded_top
        .groupby(["id", "name", "tokenized_sentences"])
        .agg({"cos_sim": ["sum"]})
        .reset_index()
    )
    print("Taking sum of cosine similarities above 0.6 threshold...")
    # Flatten the two-level column index produced by `agg`: the group keys
    # become 'id_', 'name_', 'tokenized_sentences_' and the aggregate becomes
    # 'cos_sim_sum'.
    cosine_sum_by_name.columns = cosine_sum_by_name.columns.map('_'.join)
    cosine_sum_by_name = cosine_sum_by_name.rename(
        columns={
            "id_": "MentorID",
            "name_": "Name",
            "tokenized_sentences_": "Sentences",
        },
        errors="raise",
    )
    return master_exploded_top, top_k, cosine_sum_by_name
def dataframe_output(cosine_sum_by_name):
    """Serialise a DataFrame to a column-oriented JSON string.

    Args:
        cosine_sum_by_name: Any object exposing pandas' ``to_json`` method;
            callers in this file pass DataFrames.

    Returns:
        The result of ``to_json(orient="columns")``, i.e. a JSON string shaped
        like ``{"column": {"index_label": value, ...}, ...}``.
    """
    json_df = cosine_sum_by_name.to_json(orient="columns")
    return json_df
def generate_results(input):
    """Run the full search pipeline for one query (Gradio callback).

    Loads the pickled sentence table, embeds the query, scores every sentence
    and builds the two tables shown in the UI.

    Args:
        input: The query from the Gradio textbox; it is converted with
            ``str()`` before use. (The name shadows the builtin but is kept so
            keyword callers do not break.)

    Returns:
        A ``(subset, sentence_output)`` tuple of DataFrames:

        * ``subset``: the 10 grouped rows with the highest ``cos_sim_sum``;
        * ``sentence_output``: the ``name``, ``id``, ``tokenized_sentences``
          and ``cos_sim`` columns of the 10 best-matching sentences.
    """
    master_exploded = load_pickle()
    embeddings, query = sentence_embedding_generator(str(input))
    # The first element (all rows above the similarity threshold) is not
    # needed here.
    _, top_10, cosine_sum_by_name = cosine_similarity_generator(
        master_exploded,
        embeddings,
        query,
        time.strftime("%Y%m%d-%H%M%S"),
    )
    print(cosine_sum_by_name.columns)

    # Wrap the JSON text in StringIO: passing a literal JSON string straight
    # to pd.read_json is deprecated in recent pandas releases.
    df_output = pd.read_json(io.StringIO(dataframe_output(cosine_sum_by_name)))
    print(df_output)

    top_10 = top_10[['name', 'id', 'tokenized_sentences', 'cos_sim']]
    sentence_output = pd.read_json(io.StringIO(dataframe_output(top_10)))
    print("JSON created...")

    # The grouped frame arrives in group-key order, so rank by the summed
    # similarity BEFORE truncating. Previously `head(10)` returned the first
    # ten groups regardless of their score.
    subset = df_output.sort_values(by="cos_sim_sum", ascending=False).head(10)
    return subset, sentence_output
# Gradio UI: one text input, two DataFrame outputs (ranked mentor sums and the
# best-matching individual sentences), both produced by `generate_results`.
# NOTE(review): `gr.inputs.*` / `gr.outputs.*` look like the legacy Gradio
# component namespaces -- confirm against the pinned Gradio version before
# upgrading.
iface = gr.Interface(
    fn=generate_results,
    inputs=gr.inputs.Textbox(label="What kind of mentor are you looking for?"),
    outputs=[gr.outputs.Dataframe(type="pandas"), gr.outputs.Dataframe(type="pandas")],
    title="SharpestMinds Mentor Recommender Semantic Search App",
    description="Converts a string query into an embedding, and then compares the aggregate cosine similarity by mentor.",
)

# SECURITY: login credentials should not live in source control. They are now
# read from the environment (e.g. deployment secrets). The previous literals
# remain only as a fallback so existing deployments keep working.
# TODO: set GRADIO_AUTH_USER / GRADIO_AUTH_PASSWORD, rotate the password, and
# delete the fallback values.
_auth_user = os.environ.get("GRADIO_AUTH_USER", "admin")
_auth_password = os.environ.get("GRADIO_AUTH_PASSWORD", "russell2023")
iface.launch(auth=(_auth_user, _auth_password))