Spaces:
Sleeping
Sleeping
| import streamlit as st | |
| import pickle | |
| from sentence_transformers import SentenceTransformer | |
| import pandas as pd | |
| from io import StringIO | |
| from sklearn.cluster import AgglomerativeClustering | |
| import numpy as np | |
| import plotly.express as px | |
| from statistics import mode | |
# Page heading shown at the top of the Streamlit app.
st.title("Extract job function, department and role for a given job title")
def get_artifacts():
    """Load the sentence encoder, the six KNN classifiers and the
    clustering thresholds from disk.

    Returns:
        tuple: ``(model, knn1, knn2, knn3, knn4, knn5, knn6, thresholds)``
        in exactly the order the caller unpacks them.
    """
    model = SentenceTransformer('paraphrase-MiniLM-L6-v2')

    def _load(path):
        # Context manager so the handle is closed even if unpickling fails
        # (the original left seven file handles open).
        # NOTE: pickle.load is only safe on trusted, locally produced files.
        with open(path, 'rb') as fh:
            return pickle.load(fh)

    knn1 = _load("model_function.pkl")
    knn2 = _load("model_department.pkl")
    knn3 = _load("model_role.pkl")
    knn4 = _load("model_function_taxonomy.pkl")
    knn5 = _load("model_department_taxonomy.pkl")
    knn6 = _load("model_role_taxonomy.pkl")
    thresholds = _load("thresholds.pkl")
    return model, knn1, knn2, knn3, knn4, knn5, knn6, thresholds
def get_all_labels(job_title):
    """Predict (function, department, role) for a single job title.

    Predictions cascade: each finer level is predicted from the title
    concatenated with the coarser labels already predicted. Relies on the
    module-level ``model`` encoder and ``knn1``/``knn2``/``knn3`` classifiers.
    """
    embedding = model.encode([job_title])
    function_label = knn1.predict(embedding)[0]

    embedding = model.encode([' '.join([job_title, function_label])])
    department_label = knn2.predict(embedding)[0]

    embedding = model.encode([' '.join([job_title, function_label, department_label])])
    role_label = knn3.predict(embedding)[0]

    return function_label, department_label, role_label
def _cluster_predict(texts, distance_threshold, knn):
    """Embed *texts*, agglomeratively cluster the embeddings, classify each
    cluster centroid with *knn*, and return one label string per input row.

    Uses the module-level ``model`` encoder. Extracted because the original
    function repeated this logic three times verbatim.
    """
    X = model.encode(texts)
    clust = AgglomerativeClustering(
        n_clusters=None,
        distance_threshold=distance_threshold,
        metric='cosine',
        linkage='average',
    )
    clust.fit(X)
    labels = clust.labels_
    unique_labels = np.unique(labels)
    # One mean embedding (centroid) per cluster, classified in a single call.
    centroids = [X[np.where(labels == lab)[0], :].mean(0) for lab in unique_labels]
    cluster_preds = knn.predict(centroids)
    # Dict lookup per row instead of list.index() in a loop (was O(n^2));
    # also avoids shadowing the builtin `id` as the original did.
    pred_by_label = {lab: str(pred) for lab, pred in zip(unique_labels, cluster_preds)}
    return [pred_by_label[lab] for lab in labels]


def get_taxonomy_V1(df):
    """Annotate *df* (must contain a 'Job Title' column) with 'Function',
    'Department' and 'Role' columns predicted by clustering + centroid KNN.

    Each level clusters on the title concatenated with the coarser labels
    already assigned, using the per-level thresholds loaded from disk.
    Mutates and returns *df*.
    """
    ################## Predict Function #######################
    df['Function'] = _cluster_predict(
        df['Job Title'], thresholds['function'], knn4)
    ################## Predict Department #######################
    df['Department'] = _cluster_predict(
        df.apply(lambda x: x['Job Title'] + ' ' + x['Function'], axis=1),
        thresholds['department'], knn5)
    ################## Predict Role #######################
    df['Role'] = _cluster_predict(
        df.apply(lambda x: x['Job Title'] + ' ' + x['Function'] + ' ' + x['Department'], axis=1),
        thresholds['role'], knn6)
    return df
def _cluster_consensus(base_texts, knn, distance_threshold=0.22):
    """Predict a label per row with *knn*, then smooth the predictions by
    clustering and taking the modal prediction within each cluster.

    Rows are clustered on the base text augmented with each row's own raw
    prediction. Uses the module-level ``model`` encoder. Returns one label
    string per input row. Extracted from three verbatim copies.
    """
    base_emb = model.encode(list(base_texts))
    row_preds = knn.predict(base_emb)
    augmented = [t + ' ' + p for t, p in zip(base_texts, row_preds)]
    clust = AgglomerativeClustering(
        n_clusters=None,
        distance_threshold=distance_threshold,
        metric='cosine',
        linkage='average',
    )
    clust.fit(model.encode(augmented))
    labels = clust.labels_
    # Modal raw prediction per cluster; dict lookup per row instead of the
    # original O(n^2) list.index() loop (which also shadowed builtin `id`).
    consensus = {
        lab: str(mode([row_preds[i] for i in np.where(labels == lab)[0]]))
        for lab in np.unique(labels)
    }
    return [consensus[lab] for lab in labels]


def get_taxonomy_V2(df):
    """Annotate a one-column frame of job titles with 'Function',
    'Department' and 'Role' columns (consensus-smoothed KNN predictions).

    Mutates and returns *df*. Unlike the original, the scratch columns
    'Pred1' and 'text' are no longer left behind on the frame, and the
    unused sklearn OPTICS import was removed.
    """
    df.columns = ['Job Title']  # expects exactly one column of titles
    ################## Predict Function #######################
    df['Function'] = _cluster_consensus(df['Job Title'].tolist(), knn1)
    ################## Predict Department #######################
    df['Department'] = _cluster_consensus(
        (df['Job Title'] + ' ' + df['Function']).tolist(), knn2)
    ################## Predict Role #######################
    df['Role'] = _cluster_consensus(
        (df['Job Title'] + ' ' + df['Function'] + ' ' + df['Department']).tolist(), knn3)
    return df
if __name__ == '__main__':
    model, knn1, knn2, knn3, knn4, knn5, knn6, thresholds = get_artifacts()

    # --- Single-title lookup ------------------------------------------------
    job_title = st.text_input('Put the job title here - ', value="")
    if job_title != "":
        predicted_function, predicted_department, predicted_role = get_all_labels(job_title)
        st.markdown("Function: " + predicted_function)
        st.markdown("Department: " + predicted_department)
        st.markdown("Role: " + predicted_role)

    # --- Batch mode: headerless one-column CSV of job titles ----------------
    uploaded_file = st.file_uploader("Or, choose a csv file to see taxonomy")
    if uploaded_file is not None:
        # Read the upload once; the original also decoded it to bytes/str
        # into variables that were never used (dead code removed).
        df = pd.read_csv(uploaded_file, header=None)
        df.columns = ['Job Title']
        df = get_taxonomy_V2(df)
        df = df[['Job Title', 'Function', 'Department', 'Role']]
        st.table(df)
        st.download_button(
            "Press to Download",
            df.to_csv(index=False).encode('utf-8'),
            "job_titles.csv",
            "text/csv",
            key='download-csv'
        )
        # Hierarchical sunburst: Function > Department > Role > Job Title.
        fig = px.sunburst(df, path=["Function", 'Department', 'Role', 'Job Title'])
        st.plotly_chart(fig, use_container_width=True)