Spaces:

victor7246
/

taxonomy

Sleeping

App Files Files Community

taxonomy / app.py

victor7246

Upload 10 files

7a03731 verified over 1 year ago

raw

history blame contribute delete

7.81 kB

	import streamlit as st
	import pickle
	from sentence_transformers import SentenceTransformer
	import pandas as pd
	from io import StringIO
	from sklearn.cluster import AgglomerativeClustering
	import numpy as np
	import plotly.express as px
	from statistics import mode

	# Render the page title of the Streamlit app.
	st.title("Extract job function, department and role for a given job title")

	@st.cache_resource
	def get_artifacts():
	    """Load the sentence encoder, the six pickled classifiers and the thresholds.

	    The function is wrapped in ``st.cache_resource``; all artifact paths are
	    relative to the current working directory.

	    Returns:
	        A tuple ``(model, knn1, knn2, knn3, knn4, knn5, knn6, thresholds)``:
	        the ``SentenceTransformer`` encoder, the objects unpickled from
	        ``model_function.pkl``, ``model_department.pkl``, ``model_role.pkl``,
	        ``model_function_taxonomy.pkl``, ``model_department_taxonomy.pkl``,
	        ``model_role_taxonomy.pkl``, and the object unpickled from
	        ``thresholds.pkl``.

	    Raises:
	        OSError: if one of the pickle files cannot be opened.
	    """
	    def _load_pickle(path):
	        # NOTE(security): unpickling executes arbitrary code, so these files
	        # must only ever be trusted artifacts shipped with the app.
	        # The context manager closes the handle deterministically; the
	        # previous ``pickle.load(open(...))`` form left it open.
	        with open(path, 'rb') as handle:
	            return pickle.load(handle)

	    model = SentenceTransformer('paraphrase-MiniLM-L6-v2')

	    # Per-title classifiers.
	    knn1 = _load_pickle("model_function.pkl")
	    knn2 = _load_pickle("model_department.pkl")
	    knn3 = _load_pickle("model_role.pkl")

	    # Classifiers applied to cluster centroids by get_taxonomy_V1.
	    knn4 = _load_pickle("model_function_taxonomy.pkl")
	    knn5 = _load_pickle("model_department_taxonomy.pkl")
	    knn6 = _load_pickle("model_role_taxonomy.pkl")

	    # Indexed by 'function' / 'department' / 'role' in get_taxonomy_V1.
	    thresholds = _load_pickle("thresholds.pkl")

	    return model, knn1, knn2, knn3, knn4, knn5, knn6, thresholds

	def get_all_labels(job_title):
	    """Predict function, department and role for a single job title.

	    Each stage encodes the title together with the labels predicted so far
	    and classifies that embedding with the module-level ``knn1`` / ``knn2`` /
	    ``knn3`` respectively, using the module-level ``model`` as encoder.

	    Args:
	        job_title: The job title text.

	    Returns:
	        A tuple ``(predicted_function, predicted_department, predicted_role)``.
	    """
	    # Stage 1: the bare title decides the function.
	    function_label = knn1.predict(model.encode([job_title]))[0]

	    # Stage 2: title + function decides the department.
	    department_query = job_title + ' ' + function_label
	    department_label = knn2.predict(model.encode([department_query]))[0]

	    # Stage 3: title + function + department decides the role.
	    role_query = department_query + ' ' + department_label
	    role_label = knn3.predict(model.encode([role_query]))[0]

	    return function_label, department_label, role_label

	def _v1_cluster_labels(X, distance_threshold):
	    """Return one agglomerative-cluster label per row of ``X``."""
	    # AgglomerativeClustering rejects inputs with fewer than two samples,
	    # so tiny inputs get one singleton cluster per row instead.
	    if len(X) < 2:
	        return np.arange(len(X))
	    clust = AgglomerativeClustering(n_clusters=None, distance_threshold=distance_threshold, metric='cosine', linkage='average')
	    clust.fit(X)
	    return clust.labels_


	def _v1_predict_level(texts, distance_threshold, classifier):
	    """Cluster the encoded ``texts`` and label every row with its cluster's prediction."""
	    texts = list(texts)
	    if not texts:
	        return []

	    X = np.asarray(model.encode(texts))
	    labels = _v1_cluster_labels(X, distance_threshold)

	    # ``inverse[i]`` is the position of row i's cluster in ``cluster_ids``;
	    # computing it once replaces a per-row ``np.unique(...).index(...)``.
	    cluster_ids, inverse = np.unique(labels, return_inverse=True)

	    # One mean embedding per cluster, classified in a single call.
	    centroids = [X[labels == cluster_id].mean(axis=0) for cluster_id in cluster_ids]
	    cluster_predictions = classifier.predict(centroids)

	    return [str(cluster_predictions[pos]) for pos in inverse]


	def get_taxonomy_V1(df):
	    """Add 'Function', 'Department' and 'Role' columns via cluster-centroid classification.

	    For each level the input texts are encoded with the module-level
	    ``model``, grouped by average-linkage agglomerative clustering on cosine
	    distance (cut at ``thresholds[<level>]``), and each cluster's mean
	    embedding is classified with ``knn4`` / ``knn5`` / ``knn6``. Every row
	    receives the prediction of its cluster, stored as ``str``.

	    Args:
	        df: DataFrame with a 'Job Title' column. It is modified in place.

	    Returns:
	        The same ``df`` with the three new columns added.
	    """
	    job_titles = df['Job Title']

	    ################## Predict Function #######################
	    df['Function'] = _v1_predict_level(job_titles, thresholds['function'], knn4)

	    ################## Predict Department #######################
	    df['Department'] = _v1_predict_level(
	        job_titles + ' ' + df['Function'], thresholds['department'], knn5
	    )

	    ################## Predict Role #######################
	    df['Role'] = _v1_predict_level(
	        job_titles + ' ' + df['Function'] + ' ' + df['Department'], thresholds['role'], knn6
	    )

	    return df

	def _v2_cluster_labels(X, distance_threshold):
	    """Return one agglomerative-cluster label per row of ``X``."""
	    # AgglomerativeClustering rejects inputs with fewer than two samples,
	    # so tiny inputs get one singleton cluster per row instead.
	    if len(X) < 2:
	        return np.arange(len(X))
	    clust = AgglomerativeClustering(n_clusters=None, distance_threshold=distance_threshold, metric='cosine', linkage='average')
	    clust.fit(X)
	    return clust.labels_


	def _v2_predict_level(df, context, classifier, distance_threshold):
	    """Classify each ``context`` text, then smooth the result by cluster majority vote.

	    Writes the scratch columns 'Pred1' (raw per-row prediction) and 'text'
	    (context plus raw prediction) into ``df`` and returns the smoothed
	    labels as a list of ``str``, one per row.
	    """
	    # First pass: independent prediction for every row.
	    raw_pred = np.asarray(classifier.predict(model.encode(context.tolist())))
	    df['Pred1'] = raw_pred

	    # Second pass: cluster on the text enriched with the raw prediction.
	    df['text'] = context + ' ' + df['Pred1']
	    X = np.asarray(model.encode(df['text'].tolist()))
	    labels = _v2_cluster_labels(X, distance_threshold)

	    # ``inverse[i]`` is the position of row i's cluster in ``cluster_ids``;
	    # computing it once replaces a per-row ``np.unique(...).index(...)``.
	    cluster_ids, inverse = np.unique(labels, return_inverse=True)

	    # Each cluster adopts the most common raw prediction among its members.
	    cluster_votes = [mode(raw_pred[labels == cluster_id]) for cluster_id in cluster_ids]

	    return [str(cluster_votes[pos]) for pos in inverse]


	def get_taxonomy_V2(df, distance_threshold=0.22):
	    """Add 'Function', 'Department' and 'Role' columns using predict-then-cluster voting.

	    For each level, every row is first classified individually (``knn1`` /
	    ``knn2`` / ``knn3`` on embeddings from the module-level ``model``). The
	    rows are then grouped by average-linkage agglomerative clustering on
	    cosine distance, and each row is assigned the most common prediction
	    within its cluster.

	    Args:
	        df: Single-column DataFrame of job titles. It is modified in place:
	            its column is renamed to 'Job Title', and the scratch columns
	            'Pred1' and 'text' are left holding the values of the last level.
	        distance_threshold: Linkage distance at which clusters stop merging.
	            Defaults to 0.22, the value previously hard-coded for all levels.

	    Returns:
	        The same ``df`` with 'Function', 'Department' and 'Role' added.

	    Raises:
	        ValueError: if ``df`` does not have exactly one column.
	    """
	    df.columns = ['Job Title']

	    if df.empty:
	        # Nothing to encode or cluster; still provide the expected columns.
	        for column in ('Pred1', 'text', 'Function', 'Department', 'Role'):
	            df[column] = []
	        return df

	    ################## Predict Function #######################
	    df['Function'] = _v2_predict_level(df, df['Job Title'], knn1, distance_threshold)

	    ################## Predict Department #######################
	    df['Department'] = _v2_predict_level(
	        df, df['Job Title'] + ' ' + df['Function'], knn2, distance_threshold
	    )

	    ################## Predict Role #######################
	    df['Role'] = _v2_predict_level(
	        df, df['Job Title'] + ' ' + df['Function'] + ' ' + df['Department'], knn3, distance_threshold
	    )

	    return df

	if __name__ == '__main__':
	    # These are module-level names: get_all_labels and the get_taxonomy_*
	    # functions read them as globals.
	    model, knn1, knn2, knn3, knn4, knn5, knn6, thresholds = get_artifacts()

	    # ---- Single job title ----
	    job_title = st.text_input('Put the job title here - ', value="")
	    if job_title:
	        predicted_function, predicted_department, predicted_role = get_all_labels(job_title)
	        st.markdown("Function: " + predicted_function)
	        st.markdown("Department: " + predicted_department)
	        st.markdown("Role: " + predicted_role)

	    # ---- Batch of job titles from a CSV upload ----
	    uploaded_file = st.file_uploader("Or, choose a csv file to see taxonomy")
	    if uploaded_file is not None:
	        # The upload is parsed exactly once; there is no header row.
	        try:
	            df = pd.read_csv(uploaded_file, header=None)
	        except pd.errors.EmptyDataError:
	            df = None
	            st.error("The uploaded file is empty.")

	        # get_taxonomy_V2 needs exactly one column of job titles.
	        if df is not None and df.shape[1] != 1:
	            st.error("The csv file must contain exactly one column of job titles.")
	            df = None

	        if df is not None:
	            df.columns = ['Job Title']

	            df = get_taxonomy_V2(df)
	            # Drop the scratch columns get_taxonomy_V2 leaves behind.
	            df = df[['Job Title','Function','Department','Role']]

	            st.table(df)

	            st.download_button(
	                "Press to Download",
	                df.to_csv(index=False).encode('utf-8'),
	                "job_titles.csv",
	                "text/csv",
	                key='download-csv'
	            )

	            # Hierarchy chart: Function > Department > Role > Job Title.
	            fig = px.sunburst(df, path=["Function", 'Department', 'Role', 'Job Title'])
	            st.plotly_chart(fig, use_container_width=True)