edouardlgp committed on
Commit
e0dce9c
·
verified ·
1 Parent(s): 5d87b03

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +201 -0
app.py CHANGED
@@ -160,6 +160,207 @@ def classify_occupational_group_by_level(responsibilities: List[str]) -> dict:
160
  result['error'] = str(e)
161
  return result
162
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
163
  def get_skills_info_esco(Level_5_code):
164
 
165
  try:
 
160
  result['error'] = str(e)
161
  return result
162
 
163
+
164
+
165
# Instruction lines appended to the prompt for levels 1-4.
_ESCO_DEFAULT_RULES = (
    "- Return ONLY the code, nothing else.\n"
    "- The code should be exactly as shown in the list.\n"
    "- Do not include any additional text or explanation."
)

# Stricter wording used for level 5 (the occupation level).
_ESCO_LEVEL5_RULES = (
    "- Return ONLY the code as stated in the provided list, nothing else.\n"
    "- The code should be exactly as shown in the list.\n"
    "- Do not include any additional text, occupation code or explanation."
)


def _ask_esco_code(task, responsibilities, candidates_df, intro, level,
                   rules=_ESCO_DEFAULT_RULES):
    """Ask the LLM to pick one code out of ``candidates_df`` and sanitize it.

    Args:
        task: Short task description passed as first argument to ``gpt_call``.
        responsibilities: Job responsibilities, interpolated into the prompt.
        candidates_df: Rows offered to the model; must have the columns
            ``code``, ``preferredLabel`` and ``description``.
        intro: Sentence introducing the list of candidates.
        level: Level number mentioned in the prompt (1-5).
        rules: Bullet list of output constraints appended to the prompt.

    Returns:
        Whatever ``code_sanitize`` returns for the stripped model answer and
        the list of candidate codes.
    """
    codes = candidates_df["code"].tolist()
    code_list = ", ".join(map(str, codes))
    options = "\n".join(
        f"- {row['code']}: {row['preferredLabel']} - {row['description']}"
        for _, row in candidates_df.iterrows()
    )
    user_prompt = (
        "Here is a list of job responsibilities:\n"
        f"{responsibilities}\n"
        "\n"
        f"{intro}\n"
        f"{options}\n"
        "\n"
        f"Based on the responsibilities, suggest the most relevant level {level} "
        f"Occupation code from within this list: {code_list}.\n"
        "**Important:**\n"
        f"{rules}\n"
    )
    # gpt_call and code_sanitize are defined elsewhere in this file.
    # NOTE(review): code_sanitize presumably maps the raw answer onto one of
    # `codes` (or a falsy value) - confirm against its definition.
    answer = gpt_call(task, user_prompt).strip()
    return code_sanitize(answer, codes)


def classify_esco_by_hierarchical_level(responsibilities: List[str]) -> dict:
    """
    Classifies job responsibilities into occupational groups, level by level,
    using the European Skills, Competences, Qualifications and Occupations
    (ESCO) classification (https://esco.ec.europa.eu/en).

    Levels 1-4 are read from "ISCOGroups_en.csv"; at each level the candidates
    are the codes that extend the previously chosen code by exactly one
    character. Level 5 is read from "occupations_en.csv", keeping the rows
    whose ``iscoGroup`` starts with the chosen level-4 code. Descent stops as
    soon as a level has no candidates or no code is selected.

    Args:
        responsibilities: List of job responsibility strings

    Returns:
        Dictionary with ``Level_<n>_ESCO_code``, ``Level_<n>_ESCO_name`` and
        ``Level_<n>_ESCO_desc`` entries for every level that was classified
        (empty if no level could be classified).
    """
    esco_df = pd.read_csv(
        "ISCOGroups_en.csv",
        dtype={'code': str},  # Force 'code' to be read as string
    )
    esco_level5_df = pd.read_csv(
        "occupations_en.csv",
        dtype={'code': str, 'iscoGroup': str},  # Keep codes as strings
    )

    result = {}

    ######################## Level 1 ###################
    # Top-level codes are a single alphanumeric character. dropna() guards
    # against rows without a code, on which len() would raise.
    top_level_codes = {
        code for code in esco_df['code'].dropna()
        if len(code) == 1 and code.isalnum()
    }

    current_code = None
    if top_level_codes:
        level1_df = esco_df[esco_df['code'].isin(top_level_codes)]
        current_code = _ask_esco_code(
            "Identify top-level occupational group",
            responsibilities,
            level1_df,
            "Select the most relevant top-level code from these options:",
            1,
        )
        result.update(get_level_ESCO_info(level1_df, current_code, 'Level_1'))

    ######################## Levels 2 to 4 ###################
    all_codes = esco_df['code']
    for level, ordinal in ((2, "second"), (3, "third"), (4, "fourth")):
        if not current_code:
            break
        # Children extend the parent code by exactly one character.
        # na=False keeps the mask strictly boolean when a code is missing.
        children_df = esco_df[
            all_codes.str.startswith(current_code, na=False)
            & (all_codes.str.len() == len(current_code) + 1)
        ]
        if children_df.empty:
            # Nothing below this code: deeper levels cannot be classified.
            current_code = None
            break
        parent_code = current_code
        current_code = _ask_esco_code(
            f"Identify {ordinal}-level occupational group",
            responsibilities,
            children_df,
            f"Here is a list of level {level} Occupation classifications "
            f"within {parent_code}:",
            level,
        )
        result.update(
            get_level_ESCO_info(children_df, current_code, f'Level_{level}')
        )

    ######################## Level 5 ###################
    if current_code:
        level5_df = esco_level5_df[
            esco_level5_df['iscoGroup'].str.startswith(current_code, na=False)
        ]
        if not level5_df.empty:
            # The prompt now says "level 5"; it previously repeated the
            # level-4 wording by mistake.
            level5_code = _ask_esco_code(
                "Identify fifth-level occupational group",
                responsibilities,
                level5_df,
                f"Here is a list of level 5 Occupation classifications "
                f"within {current_code}:",
                5,
                rules=_ESCO_LEVEL5_RULES,
            )
            result.update(get_level_ESCO_info(level5_df, level5_code, 'Level_5'))

    return result
343
+
344
+
345
+
346
def get_level_ESCO_info(df, code, level_name):
    """Look up ``code`` in ``df`` and return its ESCO code, name and description.

    Args:
        df: DataFrame with a ``code`` and a ``preferredLabel`` column and,
            optionally, a ``description`` column.
        code: Code to look up (compared with ``==`` against ``df['code']``).
        level_name: Prefix for the returned keys, e.g. ``'Level_1'``.

    Returns:
        Dictionary with the keys ``<level_name>_ESCO_code``,
        ``<level_name>_ESCO_name`` and ``<level_name>_ESCO_desc``. When no row
        matches, a warning is printed and the name is ``'UNKNOWN'``. When
        several rows match, the first one is used.
    """
    matches = df[df['code'] == code]
    if matches.empty:
        print(f"Warning: No {level_name} found for ESCO code {code}")
        return {
            f'{level_name}_ESCO_code': code,
            f'{level_name}_ESCO_name': 'UNKNOWN',
            f'{level_name}_ESCO_desc': 'No matching occupation found'
        }

    info = matches.iloc[0]
    description = info.get('description', '')
    # An empty CSV cell is read as NaN, which .get()'s default does not cover;
    # normalize it to the same empty string used when the column is absent.
    if pd.isna(description):
        description = ''
    return {
        f'{level_name}_ESCO_code': code,
        f'{level_name}_ESCO_name': info['preferredLabel'],
        f'{level_name}_ESCO_desc': description
    }
362
+
363
+
364
  def get_skills_info_esco(Level_5_code):
365
 
366
  try: