import streamlit as st
import pickle
from sentence_transformers import SentenceTransformer
import pandas as pd
from io import StringIO
from sklearn.cluster import AgglomerativeClustering
import numpy as np
import plotly.express as px
from statistics import mode

st.title("Extract job function, department and role for a given job title")


@st.cache_resource
def get_artifacts():
    """Load and cache the sentence encoder, the six KNN classifiers and the
    per-level clustering thresholds.

    Returns:
        tuple: (model, knn1, knn2, knn3, knn4, knn5, knn6, thresholds) where
        knn1-3 predict per-title labels, knn4-6 predict per-cluster-centroid
        labels, and `thresholds` maps 'function'/'department'/'role' to a
        cosine distance threshold for agglomerative clustering.
    """
    def _load(path):
        # Context manager so the pickle file handle is always closed.
        with open(path, 'rb') as f:
            return pickle.load(f)

    model = SentenceTransformer('paraphrase-MiniLM-L6-v2')
    knn1 = _load("model_function.pkl")
    knn2 = _load("model_department.pkl")
    knn3 = _load("model_role.pkl")
    knn4 = _load("model_function_taxonomy.pkl")
    knn5 = _load("model_department_taxonomy.pkl")
    knn6 = _load("model_role_taxonomy.pkl")
    thresholds = _load("thresholds.pkl")
    return model, knn1, knn2, knn3, knn4, knn5, knn6, thresholds


def get_all_labels(job_title):
    """Predict (function, department, role) for a single job title.

    Each level's prediction is appended to the text before encoding for the
    next level, so later predictions are conditioned on earlier ones.
    Relies on the module-level `model` and `knn1`/`knn2`/`knn3` artifacts.
    """
    x = model.encode([job_title])
    predicted_function = knn1.predict(x)[0]

    x = model.encode([job_title + ' ' + predicted_function])
    predicted_department = knn2.predict(x)[0]

    x = model.encode([job_title + ' ' + predicted_function + ' ' + predicted_department])
    predicted_role = knn3.predict(x)[0]

    return predicted_function, predicted_department, predicted_role


def _cluster_mean_predict(X, threshold, knn):
    """Cluster embeddings, predict one label per cluster centroid, and
    broadcast that label back to every row of the cluster.

    Args:
        X: 2-D array of row embeddings.
        threshold: cosine distance threshold for agglomerative clustering.
        knn: classifier applied to the cluster centroids.

    Returns:
        list[str]: one predicted label per row of X.
    """
    clust = AgglomerativeClustering(n_clusters=None,
                                    distance_threshold=threshold,
                                    metric='cosine', linkage='average')
    clust.fit(X)
    labels = clust.labels_
    uniq = np.unique(labels)
    centroids = [X[np.where(labels == u)[0], :].mean(0) for u in uniq]
    cluster_preds = knn.predict(centroids)
    # Build the cluster-id -> label map once instead of calling
    # list.index() per row (was accidentally O(rows * clusters)).
    lookup = {u: str(p) for u, p in zip(uniq, cluster_preds)}
    return [lookup[l] for l in labels]


def _cluster_mode_smooth(X, row_preds, threshold=0.22):
    """Smooth per-row predictions by clustering the embeddings and assigning
    every row the modal prediction of its cluster.

    Args:
        X: 2-D array of row embeddings.
        row_preds: per-row predicted labels, aligned with X.
        threshold: cosine distance threshold for agglomerative clustering.

    Returns:
        list[str]: one smoothed label per row of X.
    """
    clust = AgglomerativeClustering(n_clusters=None,
                                    distance_threshold=threshold,
                                    metric='cosine', linkage='average')
    clust.fit(X)
    labels = clust.labels_
    lookup = {u: str(mode([row_preds[i] for i in np.where(labels == u)[0]]))
              for u in np.unique(labels)}
    return [lookup[l] for l in labels]


def get_taxonomy_V1(df):
    """Annotate `df` (must have a 'Job Title' column) with 'Function',
    'Department' and 'Role' columns.

    Each level clusters the encoded text (title plus previously predicted
    levels), predicts a label per cluster centroid with the taxonomy models
    knn4-6, and propagates it to the cluster members. Uses the per-level
    thresholds loaded by get_artifacts(). Mutates and returns `df`.
    """
    ################## Predict Function #######################
    X = model.encode(df['Job Title'])
    df['Function'] = _cluster_mean_predict(X, thresholds['function'], knn4)

    ################## Predict Department #######################
    X = model.encode(df.apply(
        lambda r: r['Job Title'] + ' ' + r['Function'], axis=1))
    df['Department'] = _cluster_mean_predict(X, thresholds['department'], knn5)

    ################## Predict Role #######################
    X = model.encode(df.apply(
        lambda r: r['Job Title'] + ' ' + r['Function'] + ' ' + r['Department'],
        axis=1))
    df['Role'] = _cluster_mean_predict(X, thresholds['role'], knn6)
    return df


def get_taxonomy_V2(df):
    """Annotate a single-column DataFrame of job titles with 'Function',
    'Department' and 'Role' columns.

    Unlike V1, each level first predicts a label per row (knn1-3), then
    clusters the rows (fixed 0.22 cosine threshold) and replaces each row's
    label with the modal label of its cluster. The intermediate 'Pred1' and
    'text' scratch columns are left on the returned DataFrame, as before.
    Mutates and returns `df`.
    """
    df.columns = ['Job Title']

    ################## Predict Function #######################
    X = model.encode(df['Job Title'])
    val_pred = knn1.predict(X)
    df['Pred1'] = val_pred
    df['text'] = df.apply(
        lambda r: r['Job Title'] + ' ' + r['Pred1'], axis=1)
    df['Function'] = _cluster_mode_smooth(model.encode(df['text']), val_pred)

    ################## Predict Department #######################
    X = model.encode(df.apply(
        lambda r: r['Job Title'] + ' ' + r['Function'], axis=1))
    val_pred = knn2.predict(X)
    df['Pred1'] = val_pred
    df['text'] = df.apply(
        lambda r: r['Job Title'] + ' ' + r['Function'] + ' ' + r['Pred1'],
        axis=1)
    df['Department'] = _cluster_mode_smooth(model.encode(df['text']), val_pred)

    ################## Predict Role #######################
    X = model.encode(df.apply(
        lambda r: r['Job Title'] + ' ' + r['Function'] + ' ' + r['Department'],
        axis=1))
    val_pred = knn3.predict(X)
    df['Pred1'] = val_pred
    df['text'] = df.apply(
        lambda r: (r['Job Title'] + ' ' + r['Function'] + ' '
                   + r['Department'] + ' ' + r['Pred1']),
        axis=1)
    df['Role'] = _cluster_mode_smooth(model.encode(df['text']), val_pred)
    return df


if __name__ == '__main__':
    model, knn1, knn2, knn3, knn4, knn5, knn6, thresholds = get_artifacts()

    # Single-title mode: type a title, see its three predicted levels.
    job_title = st.text_input('Put the job title here - ', value="")
    if job_title != "":
        predicted_function, predicted_department, predicted_role = get_all_labels(job_title)
        st.markdown("Function: " + predicted_function)
        st.markdown("Department: " + predicted_department)
        st.markdown("Role: " + predicted_role)

    # Batch mode: upload a headerless one-column CSV of job titles.
    uploaded_file = st.file_uploader("Or, choose a csv file to see taxonomy")
    if uploaded_file is not None:
        df = pd.read_csv(uploaded_file, header=None)
        df.columns = ['Job Title']
        df = get_taxonomy_V2(df)
        df = df[['Job Title', 'Function', 'Department', 'Role']]
        st.table(df)
        st.download_button(
            "Press to Download",
            df.to_csv(index=False).encode('utf-8'),
            "job_titles.csv",
            "text/csv",
            key='download-csv'
        )
        fig = px.sunburst(df, path=["Function", 'Department', 'Role', 'Job Title'])
        st.plotly_chart(fig, use_container_width=True)