edouardlgp commited on
Commit
8c0497d
·
verified ·
1 Parent(s): c8b21ce

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +929 -4
app.py CHANGED
@@ -4,6 +4,34 @@ import pandas as pd
4
  import re
5
  import warnings
6
  import logging
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7
 
8
  # Configure logging for pdfminer
9
  logging.getLogger('pdfminer').setLevel(logging.ERROR) # Only show errors, not warnings
@@ -38,6 +66,800 @@ def extract_text_from_pdf(pdf_path, suppress_warnings=True):
38
  text += "\n"
39
  return text
40
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
41
  def process_pdf(file):
42
  """
43
  Processes the uploaded PDF file and returns the extracted text.
@@ -47,18 +869,121 @@ def process_pdf(file):
47
 
48
  try:
49
  extracted_text = extract_text_from_pdf(file.name)
50
- return extracted_text
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
51
  except Exception as e:
52
  return f"Error processing PDF: {str(e)}"
53
 
54
  # Create the Gradio interface
55
  with gr.Blocks() as demo:
56
- gr.Markdown("# PDF Text Extractor")
57
- gr.Markdown("Upload a PDF file to extract its text content.")
58
 
59
  with gr.Row():
60
  with gr.Column():
61
- file_input = gr.File(label="Upload PDF", file_types=[".pdf"])
62
  submit_btn = gr.Button("Extract Text")
63
  with gr.Column():
64
  text_output = gr.Textbox(label="Extracted Text", lines=30, max_lines=50, interactive=False)
 
4
import json
import logging
import os
import re
import warnings
from typing import Dict, List

from dotenv import load_dotenv
9
+
10
+ # Load environment variables
11
+ load_dotenv()
12
+
13
+
14
+ import openai
15
def gpt_call(system_prompt: str, user_prompt: str) -> str:
    """Send one system/user prompt pair to the Azure OpenAI chat deployment.

    Connection settings are read from the environment on every call:
    ``AZURE_OPENAI_API_KEY``, ``AZURE_OPENAI_ENDPOINT``, ``OPENAI_API_VERSION``
    and ``AZURE_DEPLOYMENT_NAME``.

    Args:
        system_prompt: Content of the ``system`` message.
        user_prompt: Content of the ``user`` message.

    Returns:
        The stripped text of the first completion choice (an empty string when
        that choice carries no text), or ``"ERROR: <details>"`` when the OpenAI
        client raises.
    """
    try:
        client = openai.AzureOpenAI(
            api_key=os.getenv("AZURE_OPENAI_API_KEY"),
            azure_endpoint=os.getenv("AZURE_OPENAI_ENDPOINT"),
            api_version=os.getenv("OPENAI_API_VERSION"),
        )
        response = client.chat.completions.create(
            model=os.getenv("AZURE_DEPLOYMENT_NAME"),
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_prompt},
            ],
            temperature=0.3,  # setting a low temp to be conservative
        )
        # The message content may be None (e.g. filtered output); never call
        # .strip() on None.
        content = response.choices[0].message.content
        return (content or "").strip()
    except openai.OpenAIError as e:
        # Only `import openai` is in scope, so the exception class has to be
        # qualified; a bare `OpenAIError` would itself raise NameError here.
        return f"ERROR: {e}"
33
+
34
+
35
 
36
  # Configure logging for pdfminer
37
  logging.getLogger('pdfminer').setLevel(logging.ERROR) # Only show errors, not warnings
 
66
  text += "\n"
67
  return text
68
 
69
+
70
+
71
+
72
def extract_section_from_pdf(full_text, section_title):
    """Ask the language model to extract one titled section from a position description.

    Args:
        full_text: Full text of the position description.
        section_title: Title of the section to extract
            (e.g. "Responsibilities and Accountabilities").

    Returns:
        Whatever ``gpt_call`` returns for the prompt: the model's answer, which
        is instructed to be the section content without its title (or an N/A
        marker when the section is missing), or an ``"ERROR: ..."`` string when
        the API call fails.
    """
    # The original prompt read "extract thecontent"; the missing space is fixed.
    user_prompt = f"""

Carefully evaluate the provided position description (PD) document and extract the content of the section titled "{section_title}" from the following text.

Return only the content of the section, without the title.
If the section cannot be found or explicitly mentioned in the text, use ""N/A"" as the default value.
Do not repeat in the extracted text the name of the section.
Extract precisely all the related text.

Text of the position description:
{full_text}

Section to identify: "{section_title}":
"""

    return gpt_call("You are an HR expert working for IOM.", user_prompt)
92
+
93
+
94
def classify_job_family(responsibilities: List[str]) -> str:
    """Ask the language model which job family best fits the responsibilities.

    The candidate families are loaded from ``job_families1.csv`` (columns
    ``Job_family`` and ``Job_subfamily``) and listed in the prompt as
    ``- family: subfamily`` bullets.

    Args:
        responsibilities: Job responsibilities, interpolated as-is in the prompt.

    Returns:
        The raw answer returned by ``gpt_call``.
    """
    families = pd.read_csv("job_families1.csv")

    bullets = []
    for _, entry in families.iterrows():
        bullets.append(f"- {entry['Job_family']}: {entry['Job_subfamily']}")
    job_family_list = "\n".join(bullets)

    user_prompt = f"""

Here is a list of job responsibilities:

{responsibilities}

Here is a list of Job families
{job_family_list}

Based on the responsibilities, suggest the most relevant job family and subfamily from the list above.

**Important:**
- Return ONLY the job family, nothing else.
- The job family should be exactly as shown in the list.
- Do not include any additional text or explanation.
"""

    system_prompt = "Suggest job family and subfamily based on responsibilities."
    return gpt_call(system_prompt, user_prompt)
115
+
116
+
117
def get_level_CCOG_info(df, code, level_name):
    """Look up one CCOG code in ``df`` and describe it under level-prefixed keys.

    Args:
        df: DataFrame with a ``code`` and an ``occupation`` column and,
            optionally, an ``occupation_description`` column.
        code: Code to look up (compared with ``==`` against ``df['code']``).
        level_name: Prefix for the returned keys, e.g. ``"Level_1"``.

    Returns:
        A dict with ``<level_name>_CCOG_code``, ``<level_name>_CCOG_name`` and
        ``<level_name>_CCOG_desc``. When no row matches, a warning is printed,
        the name is ``'UNKNOWN'`` and the description explains that nothing
        matched. When several rows match, the first one is used.
    """
    # The lookup works only on the frame it is given; the previous version also
    # re-read occupational_groups.csv on every call without using the result.
    matches = df[df['code'] == code]
    if matches.empty:
        print(f"Warning: No {level_name} found for CCOG code {code}")
        return {
            f'{level_name}_CCOG_code': code,
            f'{level_name}_CCOG_name': 'UNKNOWN',
            f'{level_name}_CCOG_desc': 'No matching occupation found'
        }
    info = matches.iloc[0]
    return {
        f'{level_name}_CCOG_code': code,
        f'{level_name}_CCOG_name': info['occupation'],
        # .get() tolerates frames that lack the description column.
        f'{level_name}_CCOG_desc': info.get('occupation_description', '')
    }
134
+
135
def code_sanitize(input_string, valid_codes):
    """Map a free-text model reply onto one of ``valid_codes``.

    Matching order:
      1. a code equal to the whitespace-stripped reply;
      2. otherwise the longest code contained in the reply (ties go to the
         code listed first).

    Preferring exact and longest matches matters when one code is a prefix of
    another: the reply "10" must resolve to "10", not to "1".

    Args:
        input_string: Reply text to inspect; ``None`` yields ``None``.
        valid_codes: Iterable of acceptable codes. Codes are compared through
            ``str()`` but returned unchanged.

    Returns:
        The matching element of ``valid_codes``, or ``None`` when nothing matches.
    """
    if input_string is None:
        return None
    text = str(input_string).strip()
    candidates = list(valid_codes)

    for code in candidates:
        if str(code) == text:
            return code

    best = None
    best_len = 0
    for code in candidates:
        needle = str(code)
        # Empty codes would match every reply, so they are ignored.
        if needle and needle in text and len(needle) > best_len:
            best, best_len = code, len(needle)
    return best
144
+
145
def classify_occupational_group_by_level(responsibilities: List[str]) -> dict:
    """
    Classifies job responsibilities into occupational groups at 4 levels of
    the [Common Classification of Occupational Groups (CCOG)](https://icsc.un.org/Resources/HRPD/JobEvaluation/CCOG_9_2015.pdf),
    returning codes, names, and descriptions for each level.

    Each level is chosen by the language model among the rows of
    ``occupational_groups.csv`` whose ``level`` is "Level N" and whose ``code``
    starts with the code chosen at the previous level.

    Args:
        responsibilities: List of job responsibility strings
    Returns:
        Dictionary with ``Level_N_CCOG_code`` / ``_name`` / ``_desc`` entries for
        every level that was processed. When a step fails (API error, or no
        valid code for a level that has sub-levels) the dictionary also
        carries an ``error`` message and deeper levels are omitted.
    """
    occupational_groups_df = pd.read_csv("occupational_groups.csv")
    result = {}
    last_level = _CCOG_LEVELS[-1][0]

    try:
        parent_code = None
        for level_no, system_prompt, desc_sep in _CCOG_LEVELS:
            mask = occupational_groups_df['level'] == f"Level {level_no}"
            if parent_code is not None:
                # na=False keeps rows without a code out of the candidates.
                mask = mask & occupational_groups_df['code'].str.startswith(parent_code, na=False)
            level_df = occupational_groups_df[mask]

            code = _ccog_choose_code(
                responsibilities, level_no, level_df, parent_code, system_prompt, desc_sep
            )
            result.update(get_level_CCOG_info(level_df, code, f'Level_{level_no}'))

            if code is None:
                if level_no != last_level:
                    # Without a code there is no prefix to narrow the next
                    # level; report it instead of failing on startswith(None).
                    raise ValueError(
                        f"No valid level {level_no} CCOG code could be identified"
                    )
                break
            parent_code = code

    except Exception as e:
        print(f"Error during classification: {str(e)}")
        result['error'] = str(e)

    return result


# (level number, system prompt, separator placed before the description, or
# None to list the occupation name only).
_CCOG_LEVELS = (
    (1, "Identify level 1 occupational group", None),
    (2, "Identify level 2 occupational group", " - "),
    (3, "Identify level 3 occupational group", " - "),
    (4, "Identify final occupational group", " : "),
)


def _ccog_choose_code(responsibilities, level_no, level_df, parent_code, system_prompt, desc_sep):
    """Ask the model to pick one code among the rows of ``level_df``; None if it cannot."""
    if level_df.empty:
        return None

    option_lines = []
    for _, row in level_df.iterrows():
        line = f"- {row['code']}: {row['occupation']}"
        if desc_sep is not None:
            line += f"{desc_sep}{row['occupation_description']}"
        option_lines.append(line)
    job_occupation_list = "\n".join(option_lines)

    valid_codes = level_df["code"].tolist()
    code_list = ", ".join(map(str, valid_codes))
    scope = "" if parent_code is None else f" within {parent_code}"

    user_prompt = f"""
Here is a list of job responsibilities:
{responsibilities}

Here is a list of level {level_no} Occupation classifications{scope}:
{job_occupation_list}

Based on the responsibilities, suggest the most relevant level {level_no} Occupation code from within this list: {code_list}.

**Important:**
- Return ONLY the code, nothing else.
- The code should be exactly as shown in the list.
- Do not include any additional text or explanation.
"""
    reply = gpt_call(system_prompt, user_prompt).strip()
    if reply.startswith("ERROR:"):
        # gpt_call reports failures as text; digits inside an error message
        # must not be mistaken for an occupation code.
        raise RuntimeError(reply)
    return code_sanitize(reply, valid_codes)
284
+
285
+ from typing import List, Dict
286
+ import pandas as pd
287
# ISCO group table, loaded once at import time. Codes are kept as strings so
# they can be compared and prefix-matched as text.
esco_df = pd.read_csv(
    "ISCOGroups_en.csv",
    dtype={'code': str}  # Force 'code' to be read as string
)


# ESCO occupations table, loaded once at import time; review_skills() looks up
# occupation labels in it by 'code'.
esco_level5_df = pd.read_csv(
    "occupations_en.csv",
    dtype={'code': str, 'iscoGroup': str, }  # Force 'code' and 'iscoGroup' to be read as strings
)
297
+
298
def get_level_ESCO_info(df, code, level_name):
    """Describe one ESCO code found in ``df`` under level-prefixed keys.

    Returns a dict with ``<level_name>_ESCO_code``, ``<level_name>_ESCO_name``
    and ``<level_name>_ESCO_desc``. The first row whose ``code`` equals
    ``code`` supplies ``preferredLabel`` and (when present) ``description``.
    When no row matches, a warning is printed and placeholder values are
    returned instead.
    """
    hits = df[df['code'] == code]

    if len(hits) > 0:
        first_hit = hits.iloc[0]
        label = first_hit['preferredLabel']
        details = first_hit.get('description', '')
        return {
            f'{level_name}_ESCO_code': code,
            f'{level_name}_ESCO_name': label,
            f'{level_name}_ESCO_desc': details
        }

    print(f"Warning: No {level_name} found for ESCO code {code}")
    return {
        f'{level_name}_ESCO_code': code,
        f'{level_name}_ESCO_name': 'UNKNOWN',
        f'{level_name}_ESCO_desc': 'No matching occupation found'
    }
314
+
315
def classify_esco_by_hierarchical_level(responsibilities: List[str]) -> dict:
    """
    Classifies job responsibilities into occupational groups at 5 levels of
    [European Skills, Competences, Qualifications, and Occupations (ESCO)](https://esco.ec.europa.eu/en),
    returning codes, names, and descriptions for each level.

    Levels 1-4 are chosen among the rows of ``ISCOGroups_en.csv``: level 1
    among the one-character codes, each following level among the codes that
    extend the previous choice by exactly one character. Level 5 is chosen
    among the rows of ``occupations_en.csv`` whose ``iscoGroup`` starts with
    the level 4 code.

    Args:
        responsibilities: List of job responsibility strings
    Returns:
        Dictionary with ``Level_N_ESCO_code`` / ``_name`` / ``_desc`` entries for
        every level that was processed. Processing stops at the first level
        that has no candidates or for which no valid code was obtained.
    """
    esco_df = pd.read_csv(
        "ISCOGroups_en.csv",
        dtype={'code': str}  # Force 'code' to be read as string
    )
    esco_level5_df = pd.read_csv(
        "occupations_en.csv",
        dtype={'code': str, 'iscoGroup': str}  # Force codes to be read as strings
    )

    result = {}

    # Top-level codes are the single-character ones; rows without a code
    # (NaN after parsing) are skipped instead of crashing len().
    top_level_codes = sorted({
        code for code in esco_df['code']
        if isinstance(code, str) and len(code) == 1 and code.isalnum()
    })

    parent_code = None
    for level_no, ordinal in enumerate(_ESCO_LEVEL_ORDINALS, start=1):
        if level_no == 1:
            candidates = esco_df[esco_df['code'].isin(top_level_codes)]
        elif level_no < len(_ESCO_LEVEL_ORDINALS):
            codes = esco_df['code']
            candidates = esco_df[
                codes.str.startswith(parent_code, na=False)
                & (codes.str.len() == len(parent_code) + 1)
            ]
        else:
            candidates = esco_level5_df[
                esco_level5_df['iscoGroup'].str.startswith(parent_code, na=False)
            ]

        if candidates.empty:
            break

        chosen = _esco_choose_code(responsibilities, level_no, ordinal, candidates, parent_code)
        result.update(get_level_ESCO_info(candidates, chosen, f'Level_{level_no}'))
        if not chosen:
            break
        parent_code = chosen

    return result


# Ordinals used in the system prompt of each level ("Identify <ordinal>-level ...").
_ESCO_LEVEL_ORDINALS = ("top", "second", "third", "fourth", "fifth")

_ESCO_DEFAULT_RULES = (
    "- Return ONLY the code, nothing else.",
    "- The code should be exactly as shown in the list.",
    "- Do not include any additional text or explanation.",
)

# Level 5 codes are longer (occupation codes), hence the more insistent wording.
_ESCO_LEVEL5_RULES = (
    "- Return ONLY the code as stated in the provided list, nothing else.",
    "- The code should be exactly as shown in the list.",
    "- Do not include any additional text, occupation code or explanation.",
)


def _esco_choose_code(responsibilities, level_no, ordinal, candidates, parent_code):
    """Ask the model to pick one code among ``candidates``; None if it cannot."""
    options = "\n".join(
        f"- {row['code']}: {row['preferredLabel']} - {row['description']}"
        for _, row in candidates.iterrows()
    )
    valid_codes = candidates["code"].tolist()
    code_list = ", ".join(map(str, valid_codes))

    if parent_code is None:
        heading = "Select the most relevant top-level code from these options:"
    else:
        heading = f"Here is a list of level {level_no} Occupation classifications within {parent_code}:"
    rules = _ESCO_LEVEL5_RULES if level_no == len(_ESCO_LEVEL_ORDINALS) else _ESCO_DEFAULT_RULES
    rules_text = "\n".join(rules)

    user_prompt = f"""
Here is a list of job responsibilities:
{responsibilities}

{heading}
{options}

Based on the responsibilities, suggest the most relevant level {level_no} Occupation code from within this list: {code_list}.
**Important:**
{rules_text}
"""
    reply = gpt_call(f"Identify {ordinal}-level occupational group", user_prompt).strip()
    if reply.startswith("ERROR:"):
        # gpt_call reports failures as text; digits inside an error message
        # must not be mistaken for an occupation code.
        print(f"Warning: level {level_no} ESCO classification failed: {reply}")
        return None
    return code_sanitize(reply, valid_codes)
505
+
506
+
507
+
508
def get_skills_info_esco(Level_5_code):
    """Return the ESCO skills linked to one occupation code.

    Reads three CSV files from the working directory on every call:
    ``occupations_en.csv`` (occupation ``code`` -> ``conceptUri``),
    ``occupationSkillRelations_en.csv`` (``occupationUri`` -> ``skillUri``) and
    ``skills_en.csv`` (skill details keyed by ``conceptUri``).

    Returns:
        A de-duplicated DataFrame with the columns ``skill_name``,
        ``skill_code`` (the skill's concept URI) and ``skill_description``.
    """
    occupations = pd.read_csv(
        "occupations_en.csv",
        dtype={'code': str, 'iscoGroup': str, }  # keep codes as text
    )
    # URI(s) of the occupation(s) carrying the requested code
    is_requested = occupations['code'] == Level_5_code
    occupation_uris = occupations.loc[is_requested, 'conceptUri'].values.tolist()

    relations = pd.read_csv("occupationSkillRelations_en.csv")
    # URIs of every skill related to those occupations
    is_related = relations['occupationUri'].isin(occupation_uris)
    skill_uris = relations.loc[is_related, 'skillUri'].values.tolist()

    all_skills = pd.read_csv("skills_en.csv")
    selected = all_skills[all_skills['conceptUri'].isin(skill_uris)]

    wanted_columns = ['preferredLabel', 'conceptUri', 'description']
    new_names = {
        'preferredLabel': 'skill_name',
        'description': 'skill_description',
        'conceptUri': 'skill_code'
    }
    return selected[wanted_columns].drop_duplicates().rename(columns=new_names)
544
+
545
+
546
def review_skills(Level_5_code: str, top_n: int = 10) -> List[Dict[str, str]]:
    """
    Filter the ESCO skills attached to an occupation down to the most relevant
    ones, using a language model.

    Args:
        Level_5_code: ESCO occupation code (as found in ``occupations_en.csv``).
        top_n (int): The maximum number of skills to return. Defaults to 10.

    Returns:
        List[Dict[str, str]]: At most ``top_n`` skill dictionaries with keys:
            - skill_name
            - skill_description
            - skill_code
        An empty list is returned when the occupation has no linked skills or
        when the model reply cannot be parsed.
    """
    matches = esco_level5_df[esco_level5_df['code'] == Level_5_code]

    # Label(s) of the matched occupation, used to give the model context
    esco_occup = matches['preferredLabel'].values.tolist()
    skill_filtered = get_skills_info_esco(Level_5_code)
    if skill_filtered.empty:
        # Nothing to filter: avoid a pointless model call.
        return []

    # Interpolate plain text rather than the repr of a Python list.
    occupation_label = ", ".join(str(label) for label in esco_occup) or "staff member"

    skill_filtered_options = "\n".join(
        f"- {row['skill_code']}: {row['skill_name']} - {row['skill_description']}"
        for _, row in skill_filtered.iterrows()
    )

    prompt = f"""
Here is a list of skills:

{skill_filtered_options}

Filter the skills that are relevant in the context of the work of the International Organisation for Migration.

Ensure that each skill is relevant in the context of a {occupation_label} working for a non-profit public organisation.

Required JSON structure:
{{
"skills": [
{{
"skill_name": "string",
"skill_description": "string",
"skill_code": "string"
}}
]
}}

**Important:**
- Do not duplicate any records of skills
- keep only the {top_n} most relevant skills
- Return ONLY the JSON object with no other text
- Use double quotes for all strings
- No trailing commas in arrays/objects
- No markdown formatting (no ```json)
- No text before or after the JSON
- Escape all special characters in strings
- Ensure all brackets are properly closed
- No trailing commas in arrays/objects, especially before closing brackets
"""

    raw = gpt_call(
        "You are an HR expert working for the International Organisation for Migration and with in-depth knowledge of the European Skills, Competences, Qualifications and Occupations. Extract skills required for this position.",
        prompt
    )

    json_text = _extract_json(raw)
    if not json_text:
        return []

    try:
        parsed = json.loads(json_text)
    except json.JSONDecodeError as e:
        print(f"❌ JSON parsing error: {e}")
        print(f"🔍 Problematic JSON: {json_text}")
        return []

    # The model is asked for {"skills": [...]} but may answer with a bare list.
    skills = parsed.get("skills", []) if isinstance(parsed, dict) else parsed
    if not isinstance(skills, list):
        return []

    validated_skills = []
    for skill in skills:
        try:
            validated = {
                "skill_name": str(skill["skill_name"]).strip(),
                "skill_description": str(skill["skill_description"]).strip(),
                "skill_code": str(skill["skill_code"]).strip()
            }
            validated_skills.append(validated)
        except (KeyError, TypeError) as e:
            print(f"⚠️ Skipping invalid skill: {skill}. Error: {e}")
            continue

    return validated_skills[:top_n]
633
+
634
+
635
+
636
def extract_skills(responsibilities: List[str], top_n: int = 10) -> List[Dict[str, str]]:
    """
    Extracts ESCO-style skills from job responsibilities using a language model.

    Args:
        responsibilities (List[str]): A list of job responsibility strings.
        top_n (int): The maximum number of skills to return. Defaults to 10.

    Returns:
        List[Dict[str, str]]: At most ``top_n`` skill dictionaries with keys:
            - skill_name
            - skill_description
            - skill_code
        An empty list is returned when the model reply cannot be parsed.
    """

    prompt = f"""
Here is a list of job responsibilities:

{responsibilities}

List the required skills and knowledge as bullet points (without numbers) using ESCO-style terms.

For each Skill:

1. skill_name: precise skills name as used in ESCO framework
2. skill_description: add the long description as mentioned in ESCO framework
3. skill_code: include the detailed corresponding ESCO code for that skill.

Required JSON structure:
{{
"skills": [
{{
"skill_name": "string",
"skill_description": "string",
"skill_code": "string"
}}
]
}}

**Important:**
- Return ONLY the JSON object with no other text
- Use double quotes for all strings
- No trailing commas in arrays/objects
- No markdown formatting (no ```json)
- No text before or after the JSON
- Escape all special characters in strings
- Ensure all brackets are properly closed
"""

    raw = gpt_call(
        "You are an HR expert working for the International Organisation for Migration and with in-depth knowledge of the European Skills, Competences, Qualifications and Occupations. Extract skills required for this position.",
        prompt
    )

    json_text = _extract_json(raw)
    if not json_text:
        return []

    try:
        parsed = json.loads(json_text)
    except json.JSONDecodeError as e:
        print(f"❌ JSON parsing error: {e}")
        print(f"🔍 Problematic JSON: {json_text}")
        return []

    # The model is asked for {"skills": [...]} but may answer with a bare list;
    # calling .get() on a list would raise AttributeError.
    skills = parsed.get("skills", []) if isinstance(parsed, dict) else parsed
    if not isinstance(skills, list):
        return []

    validated_skills = []
    for skill in skills:
        try:
            validated = {
                "skill_name": str(skill["skill_name"]).strip(),
                "skill_description": str(skill["skill_description"]).strip(),
                "skill_code": str(skill["skill_code"]).strip()
            }
            validated_skills.append(validated)
        except (KeyError, TypeError) as e:
            print(f"⚠️ Skipping invalid skill: {skill}. Error: {e}")
            continue

    return validated_skills[:top_n]
716
+
717
+
718
def map_proficiency_and_assessment(skills: List[str], responsibilities: List[str]) -> List[Dict]:
    """
    Maps each skill to its contextual importance, expected proficiency level,
    and assessment strategy based on job responsibilities.

    Args:
        skills (List[str]): List of skill names.
        responsibilities (List[str]): List of job responsibilities.

    Returns:
        List[Dict]: A list of dictionaries containing skill metadata:
            - skill_name
            - importance (essential / optional)
            - type ("skill/competence" or "knowledge")
            - proficiency_level (Basic, Intermediate, Advanced)
            - distinctive_elements
            - resume_signals
            - assessment_method
        Items missing one of these keys are skipped. An empty list is returned
        when the model reply cannot be parsed.
    """

    def _as_text(value) -> str:
        # The model sometimes returns a list (e.g. several resume signals) or a
        # non-string scalar; calling .strip() on those would raise AttributeError.
        if isinstance(value, (list, tuple)):
            return "; ".join(str(part).strip() for part in value)
        return str(value).strip()

    prompt = f"""
Here is a list of job responsibilities: {responsibilities} that have been associated with the following skills: {skills}

For each skill, accounting for the context defined within the responsibilities, return a JSON object with:
- skill_name: the name of the skill
- importance: essential or optional
- type: "skill/competence" or "knowledge"
- proficiency_level: Basic, Intermediate, or Advanced
- distinctive_elements: what specific and distinctive elements are required at this defined proficiency level?
- resume_signals: what to look for in a resume to assess this skill?
- assessment_method: what is the preferred assessment method to accurately assess this skill?

Respond ONLY with a list of dictionaries in valid JSON.
Use double quotes for all strings. No markdown, no commentary, no trailing commas.
"""

    raw = gpt_call("Define proficiency level and assessment for each skill.", prompt)

    json_text = _extract_json_array(raw)
    if not json_text:
        return []

    try:
        results = json.loads(json_text)
    except json.JSONDecodeError as e:
        print(f"❌ JSON parsing error: {e}")
        print(f"🔍 Problematic JSON: {json_text}")
        return []

    if isinstance(results, dict):
        # A single object instead of a list of objects
        results = [results]
    elif not isinstance(results, list):
        return []

    validated = []
    for item in results:
        try:
            validated.append({
                "skill_name": _as_text(item["skill_name"]),
                "importance": _as_text(item["importance"]).lower(),
                "type": _as_text(item["type"]).lower(),
                "proficiency_level": _as_text(item["proficiency_level"]).capitalize(),
                "distinctive_elements": _as_text(item["distinctive_elements"]),
                "resume_signals": _as_text(item["resume_signals"]),
                "assessment_method": _as_text(item["assessment_method"])
            })
        except (KeyError, TypeError) as e:
            print(f"⚠️ Skipping invalid item: {item}. Error: {e}")
            continue

    return validated
784
+
785
def _extract_json_array(raw: str) -> str:
    """
    Attempts to extract a clean JSON array from raw GPT output.

    The span from the first '[' to the last ']' is kept. Trailing commas in
    front of a closing '}' or ']' are removed, and newlines, carriage
    returns and tabs are replaced by single spaces.

    Args:
        raw: Text returned by the model. ``None`` or an empty string is
            treated as "no array found".

    Returns:
        The cleaned array text, or "" when no array can be located.
    """
    if not raw:
        print(f"❌ No JSON array found in response: {raw}")
        return ""

    json_start = raw.find('[')
    json_end = raw.rfind(']') + 1

    # json_end is 0 when there is no ']' at all; json_end <= json_start also
    # rejects a ']' that only occurs before the first '['.
    if json_start == -1 or json_end <= json_start:
        print(f"❌ No JSON array found in response: {raw}")
        return ""

    json_text = raw[json_start:json_end]

    # Cleanup
    json_text = re.sub(r',\s*([}\]])', r'\1', json_text)  # Remove trailing commas
    json_text = re.sub(r'[\n\r\t]', ' ', json_text)       # Remove control chars

    return json_text
804
+
805
def extract_qualification(responsibilities: List[str]) -> List[str]:
    """
    Ask the model for the EQF level and diplomas matching the responsibilities.

    Args:
        responsibilities: Job responsibilities interpolated into the prompt.

    Returns:
        The model's reply split into lines, each with surrounding "-", "•"
        and whitespace removed. Lines that are empty after that cleanup
        (blank lines, separators such as "---") are dropped. An empty list
        is returned when the reply itself is empty.
    """
    prompt = f"""
    Here is a list of job responsibilities: {responsibilities}

    Infer the required level within the European Qualifications Framework (EQF) to implement them.
    Identify the potential diplomas to testify such qualification
    """

    # NOTE(review): this system prompt talks about interview questions and is
    # identical to the one in build_interview — confirm it is intended here.
    raw = gpt_call("You are an HR expert that excel in developing compentency based interview questions.", prompt)
    if not raw:
        return []

    # Filter on the cleaned text rather than the raw line: a line such as
    # "---" or "•" is non-blank but strips down to an empty string.
    cleaned = (line.strip("-• ").strip() for line in raw.splitlines())
    return [entry for entry in cleaned if entry]
816
+
817
def build_interview(responsibilities: List[str], skill_assess: List[str]) -> List[str]:
    """
    Ask the model for a structured 40-minute interview outline.

    Args:
        responsibilities: Job responsibilities interpolated into the prompt.
        skill_assess: Skills related to those responsibilities, interpolated
            into the prompt as-is.

    Returns:
        The model's reply split into lines, each with surrounding "-", "•"
        and whitespace removed. Lines that are empty after that cleanup
        (blank lines, separators such as "---") are dropped. An empty list
        is returned when the reply itself is empty.
    """
    prompt = f"""

    Here is a list of job responsibilities: {responsibilities} and related skills: {skill_assess}

    Output: A structured 40-minute interview with:

    Opening questions (5 min)

    Core competency-based questions (30 min, 5-6 questions)

    Closing & candidate questions (5 min)


    """

    raw = gpt_call("You are an HR expert that excel in developing compentency based interview questions.", prompt)
    if not raw:
        return []

    # Filter on the cleaned text rather than the raw line: a line such as
    # "---" or "•" is non-blank but strips down to an empty string.
    cleaned = (line.strip("-• ").strip() for line in raw.splitlines())
    return [entry for entry in cleaned if entry]
836
+
837
+
838
+
839
def _extract_json(raw: str) -> str:
    """
    Attempts to extract and clean a JSON object from a raw string.

    The span from the first '{' to the last '}' is kept, then cleaned:
    trailing commas before '}' / ']' are removed, newlines/carriage
    returns/tabs become spaces, runs of whitespace collapse to one space,
    and backslashes that do not start a JSON escape are doubled.

    Args:
        raw: Text expected to contain a JSON object. ``None`` or an empty
            string is treated as "no JSON found".

    Returns:
        The cleaned object text, or "" when no object can be located.
    """
    if not raw:
        print(f"❌ No JSON found in response: {raw}")
        return ""

    json_start = raw.find('{')
    json_end = raw.rfind('}') + 1

    # json_end is 0 when there is no '}' at all; json_end <= json_start also
    # rejects a '}' that only occurs before the first '{'.
    if json_start == -1 or json_end <= json_start:
        print(f"❌ No JSON found in response: {raw}")
        return ""

    def _escape_lone_backslash(match):
        # Consume the backslash together with the character after it, so an
        # already-escaped pair ("\\\\" in the text) is treated as one unit
        # and its second backslash is never re-examined on its own.
        follower = match.group(1)
        if follower and follower in '"\\/bfnrtu':
            return match.group(0)
        return '\\\\' + follower

    json_text = raw[json_start:json_end]

    # Clean common issues
    json_text = re.sub(r',\s*([}\]])', r'\1', json_text)  # Remove trailing commas
    json_text = re.sub(r'[\n\r\t]', ' ', json_text)       # Remove control characters
    json_text = re.sub(r'\s{2,}', ' ', json_text)         # Collapse multiple spaces
    json_text = re.sub(r'\\(.?)', _escape_lone_backslash, json_text, flags=re.DOTALL)  # Escape lone backslashes

    return json_text.strip()
860
+
861
+
862
+
863
  def process_pdf(file):
864
  """
865
  Processes the uploaded PDF file and returns the extracted text.
 
869
 
870
  try:
871
  extracted_text = extract_text_from_pdf(file.name)
872
+
873
+ # Extract responsibilities section
874
+ responsibilities = extract_section_from_pdf(full_text, section_title="Responsibilities and Accountabilities")
875
+ if not responsibilities:
876
+ print(f"Skipping {os.path.basename(file_path)} - no responsibilities section found")
877
+ return None
878
+
879
+ # Main processing
880
+ job_family = classify_job_family(responsibilities)
881
+ occ_group = classify_occupational_group_by_level(responsibilities)
882
+ esco_occ = classify_esco_by_hierarchical_level(responsibilities)
883
+ qualification = extract_qualification(responsibilities)
884
+ skills = extract_skills(responsibilities)
885
+ skill_map = map_proficiency_and_assessment(skills, responsibilities)
886
+
887
+ # Check if we have ESCO level 5 code
888
+ has_esco = esco_occ.get("Level_5_ESCO_code") is not None
889
+
890
+ # ESCO-based skills processing (only if we have Level 5 code)
891
+ skill_esco_extract = []
892
+ skill_esco_map = []
893
+ if has_esco:
894
+ Level_5_code = esco_occ["Level_5_ESCO_code"]
895
+ skill_esco_extract = review_skills(Level_5_code)
896
+ skill_esco_map = map_proficiency_and_assessment(skill_esco_extract, responsibilities)
897
+ else:
898
+ print(f"No Level 5 ESCO code found for {os.path.basename(file_path)}, skipping ESCO skills mapping")
899
+
900
+ time.sleep(6) # Rate limiting delay
901
+
902
+ # Join original skills with assessment
903
+ assessment_lookup = {item['skill_name']: item for item in skill_map}
904
+ joined_skills = [
905
+ {
906
+ "skill_name": skill["skill_name"],
907
+ "skill_description": skill["skill_description"],
908
+ "skill_code": skill["skill_code"],
909
+ "importance": assessment_lookup.get(skill["skill_name"], {}).get("importance"),
910
+ "type": assessment_lookup.get(skill["skill_name"], {}).get("type"),
911
+ "proficiency_level": assessment_lookup.get(skill["skill_name"], {}).get("proficiency_level"),
912
+ "distinctive_elements": assessment_lookup.get(skill["skill_name"], {}).get("distinctive_elements"),
913
+ "resume_signals": assessment_lookup.get(skill["skill_name"], {}).get("resume_signals"),
914
+ "assessment_method": assessment_lookup.get(skill["skill_name"], {}).get("assessment_method")
915
+ }
916
+ for skill in skills
917
+ ]
918
+
919
+ # Join ESCO skills with assessment (only if we processed them)
920
+ joined_skills_esco = []
921
+ if has_esco and skill_esco_extract:
922
+ assessment_esco_lookup = {item['skill_name']: item for item in skill_esco_map}
923
+ joined_skills_esco = [
924
+ {
925
+ "skill_name": skill["skill_name"],
926
+ "skill_description": skill["skill_description"],
927
+ "skill_code": skill["skill_code"],
928
+ **assessment_esco_lookup.get(skill["skill_name"], {})
929
+ }
930
+ for skill in skill_esco_extract
931
+ ]
932
+
933
+ interview = build_interview(responsibilities, skills)
934
+
935
+ # Prepare base result dictionary
936
+ result = {
937
+ "file": os.path.basename(file_path),
938
+ "responsibilities": responsibilities,
939
+ "job_family": job_fam1['Job_family'].values[0],
940
+ "job_subfamily": job_fam1['Job_subfamily'].values[0],
941
+ "classified_job_family": job_family,
942
+ **{f"Level_{i}_CCOG_{field}": occ_group.get(f"Level_{i}_CCOG_{field}")
943
+ for i in range(1, 5) for field in ["code", "name", "desc"]},
944
+ "qualification": qualification,
945
+ "interview": interview,
946
+ "skills": {
947
+ "file": os.path.basename(file_path),
948
+ "job_family": job_fam1['Job_family'].values[0],
949
+ "job_subfamily": job_fam1['Job_subfamily'].values[0],
950
+ "skills": joined_skills
951
+ }
952
+ }
953
+
954
+ # Add ESCO fields only if we have them
955
+ if has_esco:
956
+ result.update({
957
+ **{f"Level_{i}_ESCO_{field}": esco_occ.get(f"Level_{i}_ESCO_{field}")
958
+ for i in range(1, 6) for field in ["code", "name", "desc"]},
959
+ "skills_esco": {
960
+ "file": os.path.basename(file_path),
961
+ "job_family": job_fam1['Job_family'].values[0],
962
+ "job_subfamily": job_fam1['Job_subfamily'].values[0],
963
+ "skills": joined_skills_esco
964
+ }
965
+ })
966
+ else:
967
+ # Mark ESCO fields as null if not available
968
+ result.update({
969
+ **{f"Level_{i}_ESCO_{field}": None
970
+ for i in range(1, 6) for field in ["code", "name", "desc"]},
971
+ "skills_esco": None
972
+ })
973
+
974
+ return result
975
+
976
  except Exception as e:
977
  return f"Error processing PDF: {str(e)}"
978
 
979
  # Create the Gradio interface
980
  with gr.Blocks() as demo:
981
+ gr.Markdown("# Standardise Job Description!")
982
+ gr.Markdown("Identify Job Family, Occupation, Qualification, match Skills and suggest interview questions.")
983
 
984
  with gr.Row():
985
  with gr.Column():
986
+ file_input = gr.File(label="Upload a Job Description PDF file", file_types=[".pdf"])
987
  submit_btn = gr.Button("Extract Text")
988
  with gr.Column():
989
  text_output = gr.Textbox(label="Extracted Text", lines=30, max_lines=50, interactive=False)