taxonomy / app.py
victor7246's picture
Upload 10 files
7a03731 verified
import streamlit as st
import pickle
from sentence_transformers import SentenceTransformer
import pandas as pd
from io import StringIO
from sklearn.cluster import AgglomerativeClustering
import numpy as np
import plotly.express as px
from statistics import mode
# Page heading shown by Streamlit; this runs at module level on every script execution.
st.title("Extract job function, department and role for a given job title")
def _load_pickle(path):
    """Unpickle and return the object stored at ``path``, closing the file afterwards."""
    # NOTE(security): unpickling executes arbitrary code. These paths must only
    # ever point at trusted artifacts shipped with the app, never user uploads.
    with open(path, 'rb') as handle:
        return pickle.load(handle)


@st.cache_resource
def get_artifacts():
    """Load the sentence encoder and every pickled artifact the app uses.

    Decorated with ``st.cache_resource`` so Streamlit can reuse the loaded
    objects instead of reloading them on every script rerun.

    Returns:
        An 8-tuple ``(model, knn1, knn2, knn3, knn4, knn5, knn6, thresholds)``:
        the ``paraphrase-MiniLM-L6-v2`` sentence encoder, the function /
        department / role models, their ``*_taxonomy`` counterparts, and the
        object stored in ``thresholds.pkl``.
    """
    model = SentenceTransformer('paraphrase-MiniLM-L6-v2')
    knn1 = _load_pickle("model_function.pkl")
    knn2 = _load_pickle("model_department.pkl")
    knn3 = _load_pickle("model_role.pkl")
    knn4 = _load_pickle("model_function_taxonomy.pkl")
    knn5 = _load_pickle("model_department_taxonomy.pkl")
    knn6 = _load_pickle("model_role_taxonomy.pkl")
    thresholds = _load_pickle("thresholds.pkl")
    return model, knn1, knn2, knn3, knn4, knn5, knn6, thresholds
def get_all_labels(job_title):
    """Predict function, department and role for a single job title.

    The three module-level classifiers are applied in sequence. Each step
    encodes the job title followed by every label predicted so far (space
    separated), so later predictions are conditioned on earlier ones.

    Args:
        job_title: The job title text to classify.

    Returns:
        A tuple ``(predicted_function, predicted_department, predicted_role)``.
    """
    query = job_title
    predicted = []
    for classifier in (knn1, knn2, knn3):
        if predicted:
            # Extend the query with the label produced by the previous step.
            query = query + ' ' + predicted[-1]
        embedding = model.encode([query])
        predicted.append(classifier.predict(embedding)[0])
    return tuple(predicted)
def _label_clusters_by_centroid(texts, classifier, distance_threshold):
    """Cluster ``texts`` by embedding and label each cluster from its centroid.

    Args:
        texts: List of strings to embed with the module-level ``model``.
        classifier: Object whose ``predict`` maps centroid vectors to labels.
        distance_threshold: Cosine-distance cut-off for average-linkage
            agglomerative clustering.

    Returns:
        A list with one string label per input text, in input order.
    """
    if not texts:
        return []
    embeddings = np.asarray(model.encode(texts))
    if len(texts) < 2:
        # AgglomerativeClustering refuses fewer than two samples, so a lone
        # row simply forms its own cluster.
        cluster_ids = np.zeros(len(texts), dtype=int)
    else:
        clusterer = AgglomerativeClustering(
            n_clusters=None,
            distance_threshold=distance_threshold,
            metric='cosine',
            linkage='average',
        )
        cluster_ids = clusterer.fit(embeddings).labels_
    unique_ids = np.unique(cluster_ids)
    centroids = np.vstack(
        [embeddings[cluster_ids == cluster_id].mean(axis=0) for cluster_id in unique_ids]
    )
    centroid_labels = classifier.predict(centroids)
    # Build the cluster -> label lookup once instead of searching per row.
    label_of = {
        cluster_id: str(label)
        for cluster_id, label in zip(unique_ids.tolist(), centroid_labels)
    }
    return [label_of[cluster_id] for cluster_id in cluster_ids.tolist()]


def get_taxonomy_V1(df):
    """Add ``Function``, ``Department`` and ``Role`` columns using cluster centroids.

    For each level, the text built so far (job title plus the labels already
    assigned) is embedded and clustered with the level's threshold from the
    module-level ``thresholds`` mapping. Every row in a cluster receives the
    label that the level's taxonomy classifier (``knn4``/``knn5``/``knn6``)
    predicts for the cluster's mean embedding.

    Args:
        df: DataFrame with a ``'Job Title'`` column of strings. It is modified
            in place.

    Returns:
        The same DataFrame with the three label columns added.
    """
    stages = (
        ('Function', knn4, 'function'),
        ('Department', knn5, 'department'),
        ('Role', knn6, 'role'),
    )
    # A plain list keeps positional indexing inside the encoder independent of
    # whatever index the DataFrame carries.
    contexts = df['Job Title'].tolist()
    for column, classifier, threshold_key in stages:
        assigned = _label_clusters_by_centroid(
            contexts, classifier, thresholds[threshold_key]
        )
        df[column] = assigned
        # The next level sees the title plus every label assigned so far.
        contexts = [text + ' ' + label for text, label in zip(contexts, assigned)]
    return df
def _vote_labels_within_clusters(context_texts, classifier, distance_threshold):
    """Predict a label per text, then replace it with its cluster's majority label.

    Each text is first classified on its own. The text plus its raw prediction
    is then embedded and clustered (average linkage, cosine distance), and
    every member of a cluster receives the most common raw prediction within
    that cluster.

    Args:
        context_texts: List of strings to classify.
        classifier: Object whose ``predict`` maps embeddings to labels.
        distance_threshold: Cosine-distance cut-off for the clustering.

    Returns:
        A tuple ``(raw_predictions, enriched_texts, voted_labels)``, each with
        one entry per input text.
    """
    if not context_texts:
        return [], [], []
    raw_predictions = classifier.predict(model.encode(context_texts))
    enriched_texts = [
        text + ' ' + str(prediction)
        for text, prediction in zip(context_texts, raw_predictions)
    ]
    if len(enriched_texts) < 2:
        # AgglomerativeClustering refuses fewer than two samples, so a lone
        # row simply forms its own cluster.
        cluster_ids = [0] * len(enriched_texts)
    else:
        clusterer = AgglomerativeClustering(
            n_clusters=None,
            distance_threshold=distance_threshold,
            metric='cosine',
            linkage='average',
        )
        cluster_ids = clusterer.fit(model.encode(enriched_texts)).labels_.tolist()
    # Group raw predictions by cluster in row order so that ties in ``mode``
    # resolve to the first prediction seen in the cluster.
    members = {}
    for cluster_id, prediction in zip(cluster_ids, raw_predictions):
        members.setdefault(cluster_id, []).append(prediction)
    winners = {
        cluster_id: str(mode(predictions))
        for cluster_id, predictions in members.items()
    }
    voted_labels = [winners[cluster_id] for cluster_id in cluster_ids]
    return raw_predictions, enriched_texts, voted_labels


def get_taxonomy_V2(df, distance_threshold=0.22):
    """Add ``Function``, ``Department`` and ``Role`` columns using cluster voting.

    The levels are predicted in order with ``knn1``, ``knn2`` and ``knn3``.
    Each level classifies the job title joined with the labels already
    assigned, then smooths the predictions by majority vote inside clusters
    of similar rows.

    Args:
        df: Single-column DataFrame of job titles. The column is renamed to
            ``'Job Title'`` and the frame is modified in place.
        distance_threshold: Cosine-distance cut-off used when clustering at
            every level. Defaults to 0.22.

    Returns:
        The same DataFrame with ``Function``, ``Department`` and ``Role``
        columns. The working columns ``Pred1`` and ``text`` are also present
        and hold the values from the last level processed.
    """
    df.columns = ['Job Title']
    stages = (('Function', knn1), ('Department', knn2), ('Role', knn3))
    # A plain list keeps positional indexing inside the encoder independent of
    # whatever index the DataFrame carries.
    contexts = df['Job Title'].tolist()
    for column, classifier in stages:
        raw_predictions, enriched_texts, voted_labels = _vote_labels_within_clusters(
            contexts, classifier, distance_threshold
        )
        df['Pred1'] = raw_predictions
        df['text'] = enriched_texts
        df[column] = voted_labels
        # The next level sees the title plus every label assigned so far.
        contexts = [
            text + ' ' + label for text, label in zip(contexts, voted_labels)
        ]
    return df
if __name__ == '__main__':
    # These names are deliberately bound at module level: the prediction
    # functions defined above read model, knn1..knn6 and thresholds as globals.
    model, knn1, knn2, knn3, knn4, knn5, knn6, thresholds = get_artifacts()

    # --- Single job title lookup ---
    job_title = st.text_input('Put the job title here - ', value="")
    if job_title != "":
        predicted_function, predicted_department, predicted_role = get_all_labels(job_title)
        st.markdown("Function: " + predicted_function)
        st.markdown("Department: " + predicted_department)
        st.markdown("Role: " + predicted_role)

    # --- Batch taxonomy from an uploaded CSV ---
    uploaded_file = st.file_uploader("Or, choose a csv file to see taxonomy")
    if uploaded_file is not None:
        try:
            # dtype=str keeps every title as text so it can be concatenated
            # with predicted labels later on.
            df = pd.read_csv(uploaded_file, header=None, dtype=str)
        except pd.errors.EmptyDataError:
            df = pd.DataFrame()

        if df.shape[1] != 1:
            st.error("Please upload a CSV file with exactly one column of job titles and no header row.")
        else:
            df.columns = ['Job Title']
            # Drop missing titles and restore a contiguous 0..n-1 index.
            df = df.dropna().reset_index(drop=True)
            if df.empty:
                st.error("The uploaded file does not contain any job titles.")
            else:
                df = get_taxonomy_V2(df)
                df = df[['Job Title', 'Function', 'Department', 'Role']]
                st.table(df)
                st.download_button(
                    "Press to Download",
                    df.to_csv(index=False).encode('utf-8'),
                    "job_titles.csv",
                    "text/csv",
                    key='download-csv'
                )
                fig = px.sunburst(df, path=["Function", 'Department', 'Role', 'Job Title'])
                st.plotly_chart(fig, use_container_width=True)