victor7246 committed on
Commit
7a03731
·
verified ·
1 Parent(s): 61ae30d

Upload 10 files

Browse files
README.md CHANGED
@@ -1,8 +1,8 @@
1
  ---
2
- title: Taxonomy
3
- emoji:
4
- colorFrom: yellow
5
- colorTo: blue
6
  sdk: streamlit
7
  sdk_version: 1.36.0
8
  app_file: app.py
 
1
  ---
2
+ title: TeamsClassification
3
+ emoji: 📉
4
+ colorFrom: green
5
+ colorTo: gray
6
  sdk: streamlit
7
  sdk_version: 1.36.0
8
  app_file: app.py
app.py ADDED
@@ -0,0 +1,219 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import pickle
3
+ from sentence_transformers import SentenceTransformer
4
+ import pandas as pd
5
+ from io import StringIO
6
+ from sklearn.cluster import AgglomerativeClustering
7
+ import numpy as np
8
+ import plotly.express as px
9
+ from statistics import mode
10
+
11
+ st.title("Extract job function, department and role for a given job title")
12
+
13
@st.cache_resource
def get_artifacts():
    """Load the sentence encoder, the six pickled predictors and the thresholds.

    The function is wrapped in ``st.cache_resource`` so the loaded objects are
    shared across Streamlit reruns rather than being reloaded each time.

    Returns:
        tuple: ``(model, knn1, knn2, knn3, knn4, knn5, knn6, thresholds)``.
        ``knn1``-``knn3`` are loaded from ``model_function.pkl``,
        ``model_department.pkl`` and ``model_role.pkl``; ``knn4``-``knn6``
        from the matching ``*_taxonomy.pkl`` files; ``thresholds`` from
        ``thresholds.pkl``.
    """
    def _load_pickle(path):
        # NOTE(review): unpickling runs arbitrary code, so these files must
        # only ever be the artifacts shipped alongside the app.
        # The context manager closes the handle even if unpickling fails.
        with open(path, 'rb') as handle:
            return pickle.load(handle)

    model = SentenceTransformer('paraphrase-MiniLM-L6-v2')

    knn1 = _load_pickle("model_function.pkl")
    knn2 = _load_pickle("model_department.pkl")
    knn3 = _load_pickle("model_role.pkl")

    knn4 = _load_pickle("model_function_taxonomy.pkl")
    knn5 = _load_pickle("model_department_taxonomy.pkl")
    knn6 = _load_pickle("model_role_taxonomy.pkl")
    thresholds = _load_pickle("thresholds.pkl")

    return model, knn1, knn2, knn3, knn4, knn5, knn6, thresholds
26
+
27
def get_all_labels(job_title):
    """Predict the function, department and role for a single job title.

    The three predictions are chained: the text embedded for each stage is
    the job title followed by every label predicted so far, separated by
    single spaces. Uses the module-level ``model``, ``knn1``, ``knn2`` and
    ``knn3`` objects.

    Args:
        job_title: The job title text to classify.

    Returns:
        tuple: ``(predicted_function, predicted_department, predicted_role)``.
    """
    # Stage 1: the bare title.
    text = job_title
    function_label = knn1.predict(model.encode([text]))[0]

    # Stage 2: title + function.
    text = text + ' ' + function_label
    department_label = knn2.predict(model.encode([text]))[0]

    # Stage 3: title + function + department.
    text = text + ' ' + department_label
    role_label = knn3.predict(model.encode([text]))[0]

    return function_label, department_label, role_label
36
+
37
def _v1_label_by_centroid(embeddings, distance_threshold, classifier):
    """Cluster the rows, classify each cluster centroid, and return one label per row."""
    embeddings = np.asarray(embeddings)
    n_rows = embeddings.shape[0]
    if n_rows == 0:
        return []

    if n_rows < 2:
        # AgglomerativeClustering rejects inputs with fewer than two samples,
        # so a lone row simply forms its own cluster.
        labels = np.zeros(n_rows, dtype=int)
    else:
        clust = AgglomerativeClustering(
            n_clusters=None,
            distance_threshold=distance_threshold,
            metric='cosine',
            linkage='average',
        )
        labels = clust.fit(embeddings).labels_

    # `inverse[i]` is the position of row i's cluster within `cluster_ids`,
    # which avoids recomputing np.unique(...) for every row.
    cluster_ids, inverse = np.unique(labels, return_inverse=True)
    centroids = np.vstack(
        [embeddings[labels == cluster_id].mean(0) for cluster_id in cluster_ids]
    )
    cluster_values = classifier.predict(centroids)

    return [str(cluster_values[position]) for position in inverse]


def get_taxonomy_V1(df):
    """Add ``Function``, ``Department`` and ``Role`` columns via centroid classification.

    For each level the rows are embedded, grouped with average-linkage
    agglomerative clustering on cosine distance (cut at the level's entry in
    the module-level ``thresholds``), and every row receives the label that
    the level's predictor (``knn4``/``knn5``/``knn6``) assigns to the mean
    embedding of its cluster. The text embedded at each level is the job
    title followed by the labels already assigned at the previous levels.

    Args:
        df: DataFrame with a ``'Job Title'`` column of strings. It is
            modified in place.

    Returns:
        The same ``df`` with the three label columns added.
    """
    stages = (
        ('Function', 'function', knn4),
        ('Department', 'department', knn5),
        ('Role', 'role', knn6),
    )

    text = df['Job Title']
    for column, threshold_key, classifier in stages:
        # Pass a plain list: it is independent of the frame's index.
        embeddings = model.encode(text.tolist())
        df[column] = _v1_label_by_centroid(
            embeddings, thresholds[threshold_key], classifier
        )
        # The next level sees the title plus every label assigned so far.
        text = text + ' ' + df[column]

    return df
90
+
91
def _v2_label_by_vote(embeddings, row_predictions, distance_threshold):
    """Cluster the rows and give each row the most common prediction in its cluster."""
    embeddings = np.asarray(embeddings)
    row_predictions = np.asarray(row_predictions)
    n_rows = embeddings.shape[0]
    if n_rows == 0:
        return []

    if n_rows < 2:
        # AgglomerativeClustering rejects inputs with fewer than two samples,
        # so a lone row simply forms its own cluster.
        labels = np.zeros(n_rows, dtype=int)
    else:
        clust = AgglomerativeClustering(
            n_clusters=None,
            distance_threshold=distance_threshold,
            metric='cosine',
            linkage='average',
        )
        labels = clust.fit(embeddings).labels_

    # `inverse[i]` is the position of row i's cluster within `cluster_ids`,
    # which avoids recomputing np.unique(...) for every row.
    cluster_ids, inverse = np.unique(labels, return_inverse=True)
    cluster_values = [
        mode(row_predictions[labels == cluster_id].tolist())
        for cluster_id in cluster_ids
    ]

    return [str(cluster_values[position]) for position in inverse]


def get_taxonomy_V2(df, distance_threshold=0.22):
    """Add ``Function``, ``Department`` and ``Role`` columns via per-cluster voting.

    For each level every row is first classified individually (``knn1``,
    ``knn2``, ``knn3`` respectively). The rows are then re-embedded with that
    prediction appended, grouped with average-linkage agglomerative
    clustering on cosine distance, and each row receives the most common
    individual prediction within its cluster. The text used at each level is
    the job title followed by the labels already assigned at previous levels.

    Args:
        df: DataFrame with exactly one column, which is renamed to
            ``'Job Title'``. It is modified in place; the working columns
            ``'Pred1'`` and ``'text'`` from the last level are left on it.
        distance_threshold: Cosine-distance cut-off passed to the clustering
            at every level. Defaults to ``0.22``.

    Returns:
        The same ``df`` with the three label columns added.
    """
    df.columns = ['Job Title']

    stages = (
        ('Function', knn1),
        ('Department', knn2),
        ('Role', knn3),
    )

    if df.empty:
        # Nothing to embed or classify; still provide the expected columns.
        for column in ('Pred1', 'text', 'Function', 'Department', 'Role'):
            df[column] = []
        return df

    context = df['Job Title']
    for column, classifier in stages:
        # Row-level prediction from the title plus labels assigned so far.
        row_predictions = np.asarray(
            classifier.predict(model.encode(context.tolist()))
        )

        df['Pred1'] = row_predictions
        df['text'] = context + ' ' + df['Pred1']

        # Cluster on text that includes the row-level prediction, then vote.
        embeddings = model.encode(df['text'].tolist())
        df[column] = _v2_label_by_vote(
            embeddings, row_predictions, distance_threshold
        )

        context = context + ' ' + df[column]

    return df
169
+
170
if __name__ == '__main__':
    # These names are bound at module level because the helper functions
    # defined above read them as globals.
    model, knn1, knn2, knn3, knn4, knn5, knn6, thresholds = get_artifacts()

    # ---- Single job title ------------------------------------------------
    job_title = st.text_input('Put the job title here - ', value="")
    if job_title.strip():
        predicted_function, predicted_department, predicted_role = get_all_labels(job_title)
        st.markdown(f"Function: {predicted_function}")
        st.markdown(f"Department: {predicted_department}")
        st.markdown(f"Role: {predicted_role}")

    # ---- Batch taxonomy from an uploaded CSV -------------------------------
    uploaded_file = st.file_uploader("Or, choose a csv file to see taxonomy")
    if uploaded_file is not None:
        try:
            raw = pd.read_csv(uploaded_file, header=None)
        except pd.errors.EmptyDataError:
            # A zero-byte upload has nothing to parse.
            raw = pd.DataFrame()

        if raw.empty:
            titles = []
        else:
            # Only the first column holds job titles; blank rows are dropped
            # and values are coerced to text so string concatenation works.
            titles = raw.iloc[:, 0].dropna().astype(str).tolist()

        if not titles:
            st.warning("The uploaded file does not contain any job titles.")
        else:
            df = pd.DataFrame({'Job Title': titles})
            df = get_taxonomy_V2(df)
            df = df[['Job Title', 'Function', 'Department', 'Role']]

            st.table(df)

            st.download_button(
                "Press to Download",
                df.to_csv(index=False).encode('utf-8'),
                "job_titles.csv",
                "text/csv",
                key='download-csv'
            )

            fig = px.sunburst(df, path=["Function", 'Department', 'Role', 'Job Title'])
            st.plotly_chart(fig, use_container_width=True)
model_department.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:19d9f57b668aaf7714eca2f4bc39f542323a6e6ed0db5da97ebd76af9cef42d7
3
+ size 132
model_department_taxonomy.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c6b269010f05da6056c75034c17908b7ecb686d38a29430435725e6469a45e39
3
+ size 131
model_function.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e48e629f9405ba2350d2aa73ac1162605d9ca281439defe277726f365a77e97a
3
+ size 132
model_function_taxonomy.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d1647d488e084340df4ac83a9541c4b7fa95055c70f9837f6deafa5504d6575a
3
+ size 131
model_role.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:486ea545e5ef30cbc6cfdec3f601ee9e408e2166b6a2a2bdac2254094d11fe16
3
+ size 132
model_role_taxonomy.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a319f73cf80d752db7219aa59d9d3dea1c0d2467a151578273c01ec17e3a3e17
3
+ size 131
requirements.txt ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ sentence-transformers
2
+ pandas
3
+ scikit-learn
4
+ streamlit
5
+ plotly
thresholds.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:28caa04070bdba2e91e25a424c0bd2822fb9b5befa8422a670e8790ce23121c3
3
+ size 128