edouardlgp commited on
Commit
8d94714
·
verified ·
1 Parent(s): e60de6e

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +161 -701
app.py CHANGED
@@ -6,7 +6,6 @@ import warnings
6
  import logging
7
  import os
8
  from dotenv import load_dotenv
9
- import os
10
  import json
11
  from concurrent.futures import ThreadPoolExecutor
12
  from typing import List, Dict, Optional
@@ -16,54 +15,47 @@ import time
16
  # Load environment variables
17
  load_dotenv()
18
 
 
 
 
 
 
19
 
20
- import openai
21
- def gpt_call(system_prompt: str, user_prompt: str) -> str:
22
  try:
23
  client = openai.AzureOpenAI(
24
  api_key=os.getenv("AZURE_OPENAI_API_KEY"),
25
  azure_endpoint=os.getenv("AZURE_OPENAI_ENDPOINT"),
26
  api_version=os.getenv("OPENAI_API_VERSION"),
27
  )
28
- response = client.chat.completions.create(
 
 
 
 
 
 
 
 
29
  model=os.getenv("AZURE_DEPLOYMENT_NAME"),
30
  messages=[
31
- {"role": "system", "content": system_prompt},
32
- {"role": "user", "content": user_prompt}
33
  ],
34
- temperature=0.3 # setting a low temp to be conservative
35
  )
36
  return response.choices[0].message.content.strip()
37
- except OpenAIError as e:
38
  return f"ERROR: {e}"
39
 
40
-
41
-
42
- # Configure logging for pdfminer
43
- logging.getLogger('pdfminer').setLevel(logging.ERROR) # Only show errors, not warnings
44
-
45
- def extract_text_from_pdf(pdf_path, suppress_warnings=True):
46
- """
47
- Extracts all text from a PDF, including text from nested tables and complex layouts.
48
-
49
- Parameters:
50
- pdf_path (str): Path to the PDF file
51
- suppress_warnings (bool): Whether to suppress PDF parsing warnings (default: True)
52
- """
53
  text = ""
54
-
55
- # Create a custom filter for the specific warning
56
- if suppress_warnings:
57
- warnings.filterwarnings("ignore", category=UserWarning, message="CropBox.*")
58
-
59
  with pdfplumber.open(pdf_path) as pdf:
60
  for page in pdf.pages:
61
- # Extract text from the page
62
  page_text = page.extract_text()
63
  if page_text:
64
  text += page_text + "\n"
65
-
66
- # Extract text from tables (if any)
67
  for table in page.extract_tables():
68
  for row in table:
69
  for cell in row:
@@ -72,57 +64,36 @@ def extract_text_from_pdf(pdf_path, suppress_warnings=True):
72
  text += "\n"
73
  return text
74
 
75
-
76
-
77
-
78
- def extract_section_from_pdf(full_text, section_title):
79
- """
80
- Uses OpenAI to extract a specific section (e.g., "Responsibilities and Accountabilities") from the full text.
81
- """
82
  user_prompt = f"""
83
-
84
- Carefully evaluate the provided position description (PD) document and extract thecontent of the section titled "{section_title}" from the following text.
85
-
86
  Return only the content of the section, without the title.
87
- If the section cannot be found or explicitly mentioned in the text, use ""N/A"" as the default value.
88
  Do not repeat in the extracted text the name of the section.
89
  Extract precisely all the related text.
90
-
91
- Text of the position description:
92
  {full_text}
93
-
94
  Section to identify: "{section_title}":
95
  """
96
-
97
  return gpt_call("You are an HR expert working for IOM.", user_prompt)
98
 
99
-
100
  def classify_job_family(responsibilities: List[str]) -> str:
101
  job_families_df = pd.read_csv("job_families1.csv")
102
  job_family_list = "\n".join(f"- {row['Job_family']}: {row['Job_subfamily']}" for _, row in job_families_df.iterrows())
103
  user_prompt = f"""
104
-
105
- Here is a list of job responsibilities:
106
-
107
- {responsibilities}
108
-
109
- Here is a list of Job families
110
- {job_family_list}
111
-
112
- Based on the responsibilities, suggest the most relevant job family and subfamily from the list above.
113
-
114
- **Important:**
115
  - Return ONLY the job family, nothing else.
116
  - The job family should be exactly as shown in the list.
117
  - Do not include any additional text or explanation.
118
- """
119
-
120
  return gpt_call("Suggest job family and subfamily based on responsibilities.", user_prompt)
121
 
122
-
123
  def get_level_CCOG_info(df, code, level_name):
124
- """Helper function to get level info with error handling"""
125
- occupational_groups_df = pd.read_csv("occupational_groups.csv")
126
  matches = df[df['code'] == code]
127
  if len(matches) == 0:
128
  print(f"Warning: No {level_name} found for CCOG code {code}")
@@ -139,481 +110,90 @@ def get_level_CCOG_info(df, code, level_name):
139
  }
140
 
141
  def code_sanitize(input_string, valid_codes):
142
- """
143
- Checks if any of the valid_codes exists as a substring in input_string.
144
- Returns the first matching code, otherwise None.
145
- """
146
  for code in valid_codes:
147
- if code in input_string: # Checks for exact substring match
148
  return code
149
  return None
150
 
151
  def classify_occupational_group_by_level(responsibilities: List[str]) -> dict:
152
- """
153
- Classifies job responsibilities into occupational groups at 4 levels,
154
- The [Common Classification of Occupational Groups (CCOG)](https://icsc.un.org/Resources/HRPD/JobEvaluation/CCOG_9_2015.pdf)
155
- returning codes, names, and descriptions for each level.
156
- Args:
157
- responsibilities: List of job responsibility strings
158
- Returns:
159
- Dictionary containing classification information or error message
160
- """
161
  occupational_groups_df = pd.read_csv("occupational_groups.csv")
162
  result = {}
163
-
164
  try:
165
- ######################## Level 1 ###################
166
- level1_df = occupational_groups_df[occupational_groups_df['level'] == "Level 1"]
167
- job_occupation_list = "\n".join(f"- {row['code']}: {row['occupation']}"
168
- for _, row in level1_df.iterrows())
169
- #print(job_occupation_list)
170
- list1_output = level1_df["code"].tolist() # Convert Series to list
171
- list1 = ", ".join(map(str, list1_output)) # Join elements with comma
172
- #print(list1)
173
-
174
- user_prompt1 = f"""
175
- Here is a list of job responsibilities:
176
- {responsibilities}
177
-
178
- Here is a list of level 1 Occupation classifications:
179
- {job_occupation_list}
180
-
181
- Based on the responsibilities, suggest the most relevant level 1 Occupation code from within this list: {list1}.
182
-
183
- **Important:**
184
- - Return ONLY the code, nothing else.
185
- - The code should be exactly as shown in the list.
186
- - Do not include any additional text or explanation.
187
- """
188
- #print(user_prompt1)
189
- level1_code = gpt_call("Identify level 1 occupational group", user_prompt1).strip()
190
- level1_code = code_sanitize(level1_code, list1_output)
191
- #print(level1_code)
192
- result.update(get_level_CCOG_info(level1_df, level1_code, 'Level_1'))
193
-
194
- ######################## Level 2 ###################
195
- level2_df = occupational_groups_df[
196
- (occupational_groups_df['level'] == "Level 2") &
197
- (occupational_groups_df['code'].str.startswith(level1_code))
198
- ]
199
- job_occupation_list = "\n".join(f"- {row['code']}: {row['occupation']} - {row['occupation_description']}"
200
- for _, row in level2_df.iterrows())
201
- #print(job_occupation_list)
202
- list2_output = level2_df["code"].tolist() # Convert Series to list
203
- list2 = ", ".join(map(str, list2_output)) # Join elements with comma
204
- #print(list2)
205
-
206
- user_prompt2 = f"""
207
- Here is a list of job responsibilities:
208
- {responsibilities}
209
-
210
- Here is a list of level 2 Occupation classifications within {level1_code}:
211
- {job_occupation_list}
212
-
213
- Based on the responsibilities, suggest the most relevant level 2 Occupation code from within this list: {list2}.
214
- **Important:**
215
- - Return ONLY the code, nothing else.
216
- - The code should be exactly as shown in the list.
217
- - Do not include any additional text or explanation.
218
- """
219
- #print(user_prompt2)
220
- level2_code = gpt_call("Identify level 2 occupational group", user_prompt2).strip()
221
- level2_code = code_sanitize(level2_code, list2_output)
222
- #print(level2_code)
223
- result.update(get_level_CCOG_info(level2_df, level2_code, 'Level_2'))
224
-
225
- ######################## Level 3 ###################
226
- level3_df = occupational_groups_df[
227
- (occupational_groups_df['level'] == "Level 3") &
228
- (occupational_groups_df['code'].str.startswith(level2_code))
229
- ]
230
- job_occupation_list = "\n".join(f"- {row['code']}: {row['occupation']} - {row['occupation_description']}"
231
- for _, row in level3_df.iterrows())
232
- #print(job_occupation_list)
233
- list3_output = level3_df["code"].tolist() # Convert Series to list
234
- list3 = ", ".join(map(str, list3_output)) # Join elements with comma
235
- #print(list3)
236
-
237
- user_prompt3 = f"""
238
- Here is a list of job responsibilities:
239
- {responsibilities}
240
-
241
- Here is a list of level 3 Occupation classifications within {level2_code}:
242
- {job_occupation_list}
243
-
244
- Based on the responsibilities, suggest the most relevant level 3 Occupation code from within this list: {list3}.
245
-
246
- **Important:**
247
- - Return ONLY the code, nothing else.
248
- - The code should be exactly as shown in the list.
249
- - Do not include any additional text or explanation.
250
-
251
- """
252
- level3_code = gpt_call("Identify level 3 occupational group", user_prompt3).strip()
253
- level3_code = code_sanitize(level3_code, list3_output)
254
- result.update(get_level_CCOG_info(level3_df, level3_code, 'Level_3'))
255
-
256
- ######################## Level 4 ###################
257
- level4_df = occupational_groups_df[
258
- (occupational_groups_df['level'] == "Level 4") &
259
- (occupational_groups_df['code'].str.startswith(level3_code))
260
- ]
261
- job_occupation_list = "\n".join(f"- {row['code']}: {row['occupation']} : {row['occupation_description']}"
262
- for _, row in level4_df.iterrows())
263
- #print(job_occupation_list)
264
- list4_output = level4_df["code"].tolist() # Convert Series to list
265
- list4 = ", ".join(map(str, list4_output)) # Join elements with comma
266
- #print(list4)
267
- user_prompt4 = f"""
268
- Here is a list of job responsibilities:
269
- {responsibilities}
270
-
271
- Here is a list of level 4 Occupation classifications within {level3_code}:
272
- {job_occupation_list}
273
-
274
- Based on the responsibilities, suggest the most relevant level 4 Occupation code from within this list: {list4}.
275
- **Important:**
276
- - Return ONLY the code, nothing else.
277
- - The code should be exactly as shown in the list.
278
- - Do not include any additional text or explanation.
279
- """
280
-
281
- level4_code = gpt_call("Identify final occupational group", user_prompt4).strip()
282
- level4_code = code_sanitize(level4_code, list4_output)
283
- result.update(get_level_CCOG_info(level4_df, level4_code, 'Level_4'))
284
-
285
  except Exception as e:
286
  print(f"Error during classification: {str(e)}")
287
  result['error'] = str(e)
288
-
289
- return result
290
-
291
- from typing import List, Dict
292
- import pandas as pd
293
- esco_df = pd.read_csv(
294
- "ISCOGroups_en.csv",
295
- dtype={'code': str} # Force 'code' to be read as string
296
- )
297
-
298
-
299
- esco_level5_df = pd.read_csv(
300
- "occupations_en.csv",
301
- dtype={'code': str, 'iscoGroup': str, } # Force 'code' to be read as string
302
- )
303
-
304
- def get_level_ESCO_info(df, code, level_name):
305
- """Helper function to get level info with error handling"""
306
- matches = df[df['code'] == code]
307
- if len(matches) == 0:
308
- print(f"Warning: No {level_name} found for ESCO code {code}")
309
- return {
310
- f'{level_name}_ESCO_code': code,
311
- f'{level_name}_ESCO_name': 'UNKNOWN',
312
- f'{level_name}_ESCO_desc': 'No matching occupation found'
313
- }
314
- info = matches.iloc[0]
315
- return {
316
- f'{level_name}_ESCO_code': code,
317
- f'{level_name}_ESCO_name': info['preferredLabel'],
318
- f'{level_name}_ESCO_desc': info.get('description', '')
319
- }
320
-
321
- def classify_esco_by_hierarchical_level(responsibilities: List[str]) -> dict:
322
- """
323
- Classifies job responsibilities into occupational groups at 4 levels,
324
- [European Skills, Competences, Qualifications, and Occupations (ESCO)](https://esco.ec.europa.eu/en)
325
- returning codes, names, and descriptions for each level.
326
- Args:
327
- responsibilities: List of job responsibility strings
328
- Returns:
329
- Dictionary containing classification information or error message
330
- """
331
-
332
- esco_df = pd.read_csv(
333
- "ISCOGroups_en.csv",
334
- dtype={'code': str} # Force 'code' to be read as string
335
- )
336
- # print(esco_df.columns)
337
-
338
- esco_level5_df = pd.read_csv(
339
- "occupations_en.csv",
340
- dtype={'code': str, 'iscoGroup': str, } # Force 'code' to be read as string
341
- )
342
- # print(esco_level5_df.columns)
343
-
344
- result = {}
345
- ######################## Level 1 ###################
346
- # Get all top-level codes (single character/digit)
347
- top_level_codes = sorted({
348
- code for code in esco_df['code']
349
- if len(code) == 1 and code.isalnum()
350
- })
351
-
352
- level1_code = None
353
- if top_level_codes:
354
- level1_df = esco_df[esco_df['code'].isin(top_level_codes)]
355
- job_occupation_list = "\n".join(f"- {row['code']}: {row['preferredLabel']} - {row['description']}"
356
- for _, row in level1_df.iterrows())
357
- #print(job_occupation_list)
358
- list1_output = level1_df["code"].tolist() # Convert Series to list
359
- list1 = ", ".join(map(str, list1_output)) # Join elements with comma
360
- #print(list1)
361
-
362
- user_prompt1 = f"""
363
- Here is a list of job responsibilities:
364
- {responsibilities}
365
-
366
- Select the most relevant top-level code from these options:
367
- {job_occupation_list}
368
-
369
- Based on the responsibilities, suggest the most relevant level 1 Occupation code from within this list: {list1}.
370
- **Important:**
371
- - Return ONLY the code, nothing else.
372
- - The code should be exactly as shown in the list.
373
- - Do not include any additional text or explanation.
374
- """
375
- level1_code = gpt_call("Identify top-level occupational group", user_prompt1).strip()
376
- level1_code = code_sanitize(level1_code, list1_output)
377
- result.update(get_level_ESCO_info(level1_df, level1_code, 'Level_1'))
378
-
379
-
380
- ######################## Level 2 ###################
381
-
382
- level2_code = None
383
- if level1_code:
384
- level2_df = esco_df[
385
- (esco_df['code'].str.startswith(level1_code)) & (esco_df['code'].str.len() == len(level1_code) + 1)
386
- ]
387
- if not level2_df.empty:
388
- level2_options = "\n".join(f"- {row['code']}: {row['preferredLabel']} - {row['description']}"
389
- for _, row in level2_df.iterrows())
390
- #print(job_occupation_list)
391
- list2_output = level2_df["code"].tolist() # Convert Series to list
392
- list2 = ", ".join(map(str, list2_output)) # Join elements with comma
393
- #print(list2)
394
-
395
- user_prompt2 = f"""
396
- Here is a list of job responsibilities:
397
- {responsibilities}
398
-
399
- Here is a list of level 2 Occupation classifications within {level1_code}:
400
- {level2_options}
401
-
402
- Based on the responsibilities, suggest the most relevant level 2 Occupation code from within this list: {list2}.
403
- **Important:**
404
- - Return ONLY the code, nothing else.
405
- - The code should be exactly as shown in the list.
406
- - Do not include any additional text or explanation.
407
- """
408
- level2_code = gpt_call("Identify second-level occupational group", user_prompt2).strip()
409
- level2_code = code_sanitize(level2_code, list2_output)
410
- result.update(get_level_ESCO_info(level2_df, level2_code, 'Level_2'))
411
-
412
-
413
- ######################## Level 3 ###################
414
- level3_code = None
415
- if level2_code:
416
- level3_df = esco_df[
417
- (esco_df['code'].str.startswith(level2_code)) & (esco_df['code'].str.len() == len(level2_code) + 1)
418
- ]
419
- if not level3_df.empty:
420
- level3_options = "\n".join(f"- {row['code']}: {row['preferredLabel']} - {row['description']}"
421
- for _, row in level3_df.iterrows())
422
- #print(job_occupation_list)
423
- list3_output = level3_df["code"].tolist() # Convert Series to list
424
- list3 = ", ".join(map(str, list3_output)) # Join elements with comma
425
- #print(list3)
426
-
427
- user_prompt3 = f"""
428
- Here is a list of job responsibilities:
429
- {responsibilities}
430
-
431
- Here is a list of level 3 Occupation classifications within {level2_code}:
432
- {level3_options}
433
-
434
- Based on the responsibilities, suggest the most relevant level 3 Occupation code from within this list: {list3}.
435
-
436
- **Important:**
437
- - Return ONLY the code, nothing else.
438
- - The code should be exactly as shown in the list.
439
- - Do not include any additional text or explanation.
440
-
441
- """
442
- level3_code = gpt_call("Identify third-level occupational group", user_prompt3).strip()
443
- level3_code = code_sanitize(level3_code, list3_output)
444
- result.update(get_level_ESCO_info(level3_df, level3_code, 'Level_3'))
445
-
446
- ######################## Level 4 ###################
447
- level4_code = None
448
- if level3_code:
449
- level4_df = esco_df[
450
- (esco_df['code'].str.startswith(level3_code)) & (esco_df['code'].str.len() == len(level3_code) + 1)
451
- ]
452
- if not level4_df.empty:
453
- level4_options = "\n".join(f"- {row['code']}: {row['preferredLabel']} - {row['description']}"
454
- for _, row in level4_df.iterrows())
455
- #print(job_occupation_list)
456
- list4_output = level4_df["code"].tolist() # Convert Series to list
457
- list4 = ", ".join(map(str, list4_output)) # Join elements with comma
458
- #print(list4)
459
- user_prompt4 = f"""
460
- Here is a list of job responsibilities:
461
- {responsibilities}
462
-
463
- Here is a list of level 4 Occupation classifications within {level3_code}:
464
- {level4_options}
465
-
466
- Based on the responsibilities, suggest the most relevant level 4 Occupation code from within this list: {list4}.
467
- **Important:**
468
- - Return ONLY the code, nothing else.
469
- - The code should be exactly as shown in the list.
470
- - Do not include any additional text or explanation.
471
- """
472
- level4_code = gpt_call("Identify fourth-level occupational group", user_prompt4).strip()
473
- level4_code = code_sanitize(level4_code, list4_output)
474
- result.update(get_level_ESCO_info(level4_df, level4_code, 'Level_4'))
475
-
476
- ######################## Level 5 ###################
477
- level5_code = None
478
- if level4_code:
479
- level5_df = esco_level5_df[
480
- (esco_level5_df['iscoGroup'].str.startswith(level4_code))
481
- ]
482
- if not level5_df.empty:
483
- level5_options = "\n".join(f"- {row['code']}: {row['preferredLabel']} - {row['description']}"
484
- for _, row in level5_df.iterrows())
485
-
486
- #print(job_occupation_list)
487
- list5_output = level5_df["code"].tolist() # Convert Series to list
488
- list5 = ", ".join(map(str, list5_output)) # Join elements with comma
489
- #print(list5)
490
- user_prompt5 = f"""
491
- Here is a list of job responsibilities:
492
- {responsibilities}
493
-
494
- Here is a list of level 4 Occupation classifications within {level4_code}:
495
- {level5_options}
496
-
497
- Based on the responsibilities, suggest the most relevant level 4 Occupation code from within this list: {list5}.
498
- **Important:**
499
- - Return ONLY the code as stated in the provided list, nothing else.
500
- - The code should be exactly as shown in the list.
501
- - Do not include any additional text, occupation code or explanation.
502
- """
503
-
504
- level5_code = gpt_call("Identify fifth-level occupational group", user_prompt5).strip()
505
- # Handle the case where the LLM might return just the code part
506
- level5_code = code_sanitize(level5_code, list5_output)
507
- result.update(get_level_ESCO_info(level5_df, level5_code, 'Level_5'))
508
-
509
- ## Et voila!!
510
  return result
511
-
512
-
513
 
514
  def get_skills_info_esco(Level_5_code):
515
- """Helper function to get level info with error handling"""
516
- esco_level5_df = pd.read_csv(
517
- "occupations_en.csv",
518
- dtype={'code': str, 'iscoGroup': str, } # Force 'code' to be read as string
519
- )
520
-
521
- # Find the matching occupation
522
  matches = esco_level5_df[esco_level5_df['code'] == Level_5_code]
523
-
524
- # Get the conceptUri(s) for the matched occupation
525
  conceptUris = matches['conceptUri'].values.tolist()
526
-
527
- esco_skill_map_df = pd.read_csv(
528
- "occupationSkillRelations_en.csv"
529
- )
530
- # Find all skills related to that occupationUri (using isin to match any from the list)
531
  skills = esco_skill_map_df[esco_skill_map_df['occupationUri'].isin(conceptUris)]
532
-
533
- # Get the list of skillUris
534
  skillUris = skills['skillUri'].values.tolist()
535
-
536
- esco_skill_df = pd.read_csv(
537
- "skills_en.csv"
538
- )
539
- # Get the full skill details from esco_skill_df
540
  thisskillslist = esco_skill_df[esco_skill_df['conceptUri'].isin(skillUris)]
 
 
 
541
 
542
- result= thisskillslist[['preferredLabel','conceptUri', 'description']].drop_duplicates()
543
- result = result.rename(columns={
544
- 'preferredLabel': 'skill_name',
545
- 'description': 'skill_description',
546
- 'conceptUri': 'skill_code'
547
- })
548
-
549
- return result
550
-
551
-
552
- def review_skills( Level_5_code: str, top_n: int = 10) -> List[Dict[str, str]]:
553
- """
554
- Validate relevant ESCO-style skills for a job responsibilities using a language model.
555
-
556
- Args:
557
- Level_5_code: Standard esco occupation code strings..
558
- top_n (int): The number of skills to return. Defaults to 3.
559
-
560
- Returns:
561
- List[Dict[str, str]]: A list of extracted skill dictionaries with keys:
562
- - skill_name
563
- - skill_description
564
- - skill_code
565
- """
566
  matches = esco_level5_df[esco_level5_df['code'] == Level_5_code]
567
-
568
- # Get the conceptUri(s) for the matched occupation
569
  esco_occup = matches['preferredLabel'].values.tolist()
570
  skill_filtered = get_skills_info_esco(Level_5_code)
571
-
572
- skill_filtered_options = "\n".join(f"- {row['skill_code']}: {row['skill_name']} - {row['skill_description']}"
573
- for _, row in skill_filtered.iterrows())
574
-
575
  prompt = f"""
576
- Here is a list of skills:
577
-
578
- {skill_filtered_options}
579
-
580
- Filter the skills that relevant in the context of the work of the International Organisation for Migration.
581
-
582
- Ensure that skills is relevant in the context of a {esco_occup} working for non-profit public organisation.
583
-
584
- Required JSON structure:
585
- {{
586
- "skills": [
587
- {{
588
- "skill_name": "string",
589
- "skill_description": "string",
590
- "skill_code": "string"
591
- }}
592
- ]
593
- }}
594
-
595
- **Important:**
596
- - Do not duplicate any records of skills
597
- - keep only the 10 most relevant skills
598
- - Return ONLY the JSON object with no other text
599
- - Use double quotes for all strings
600
- - No trailing commas in arrays/objects
601
- - No markdown formatting (no ```json)
602
- - No text before or after the JSON
603
- - Escape all special characters in strings
604
- - Ensure all brackets are properly closed
605
- - No trailing commas in arrays/objects, especially before closing brackets
606
- """
607
-
608
- raw = gpt_call(
609
- "You are an HR expert working for the International Organisation for Migration and with in-depth knowledge of the European Skills, Competences, Qualifications and Occupations. Extract skills required for this position.",
610
- prompt
611
- )
612
-
613
  json_text = _extract_json(raw)
614
  if not json_text:
615
  return []
616
-
617
  try:
618
  result = json.loads(json_text)
619
  skills = result.get("skills", [])
@@ -621,7 +201,6 @@ Required JSON structure:
621
  print(f"❌ JSON parsing error: {e}")
622
  print(f"🔍 Problematic JSON: {json_text}")
623
  return []
624
-
625
  validated_skills = []
626
  for skill in skills:
627
  try:
@@ -634,69 +213,40 @@ Required JSON structure:
634
  except (KeyError, TypeError) as e:
635
  print(f"⚠️ Skipping invalid skill: {skill}. Error: {e}")
636
  continue
637
-
638
- return validated_skills[:top_n]
639
-
640
-
641
 
642
  def extract_skills(responsibilities: List[str], top_n: int = 10) -> List[Dict[str, str]]:
643
- """
644
- Extracts ESCO-style skills from job responsibilities using a language model.
645
-
646
- Args:
647
- responsibilities (List[str]): A list of job responsibility strings.
648
- top_n (int): The number of skills to return. Defaults to 3.
649
-
650
- Returns:
651
- List[Dict[str, str]]: A list of extracted skill dictionaries with keys:
652
- - skill_name
653
- - skill_description
654
- - skill_code
655
- """
656
-
657
  prompt = f"""
658
- Here is a list of job responsibilities:
659
-
660
- {responsibilities}
661
-
662
- List the required skills and knowledge as bullet points (without numbers) using ESCO-style terms.
663
-
664
- For each Skill:
665
-
666
- 1. skill_name: precise skills name as used in ESCO framework
667
- 2. skill_description: add the long description as mentioned in ESCO framework
668
- 3. skill_code: include the detailed corresponding ESCO code for that skill.
669
-
670
- Required JSON structure:
671
- {{
672
- "skills": [
673
- {{
674
- "skill_name": "string",
675
- "skill_description": "string",
676
- "skill_code": "string"
677
- }}
678
- ]
679
- }}
680
-
681
- **Important:**
682
- - Return ONLY the JSON object with no other text
683
- - Use double quotes for all strings
684
- - No trailing commas in arrays/objects
685
- - No markdown formatting (no ```json)
686
- - No text before or after the JSON
687
- - Escape all special characters in strings
688
- - Ensure all brackets are properly closed
689
- """
690
-
691
- raw = gpt_call(
692
- "You are an HR expert working for the International Organisation for Migration and with in-depth knowledge of the European Skills, Competences, Qualifications and Occupations. Extract skills required for this position.",
693
- prompt
694
- )
695
-
696
  json_text = _extract_json(raw)
697
  if not json_text:
698
  return []
699
-
700
  try:
701
  result = json.loads(json_text)
702
  skills = result.get("skills", [])
@@ -704,7 +254,6 @@ Required JSON structure:
704
  print(f"❌ JSON parsing error: {e}")
705
  print(f"🔍 Problematic JSON: {json_text}")
706
  return []
707
-
708
  validated_skills = []
709
  for skill in skills:
710
  try:
@@ -717,59 +266,32 @@ Required JSON structure:
717
  except (KeyError, TypeError) as e:
718
  print(f"⚠️ Skipping invalid skill: {skill}. Error: {e}")
719
  continue
720
-
721
  return validated_skills[:top_n]
722
 
723
-
724
  def map_proficiency_and_assessment(skills: List[str], responsibilities: List[str]) -> List[Dict]:
725
- """
726
- Maps each skill to its contextual importance, expected proficiency level,
727
- and assessment strategy based on job responsibilities.
728
-
729
- Args:
730
- skills (List[str]): List of skill names.
731
- responsibilities (List[str]): List of job responsibilities.
732
-
733
- Returns:
734
- List[Dict]: A list of dictionaries containing skill metadata:
735
- - skill_name
736
- - importance (essential / optional)
737
- - type ("skill/competence" or "knowledge")
738
- - proficiency_level (Basic, Intermediate, Advanced)
739
- - distinctive_elements
740
- - resume_signals
741
- - assessment_method
742
- """
743
-
744
  prompt = f"""
745
- Here is a list of job responsibilities: {responsibilities} that have been associated with the following skills: {skills}
746
-
747
- For each skill, accounting for the context defined within the responsibilities, return a JSON object with:
748
- - skill_name: the name of the skill
749
- - importance: essential or optional
750
- - type: "skill/competence" or "knowledge"
751
- - proficiency_level: Basic, Intermediate, or Advanced
752
- - distinctive_elements: what specific and distinctive elements are required at this defined proficiency level?
753
- - resume_signals: what to look for in a resume to assess this skill?
754
- - assessment_method: what is the preferred assessment method to accurately assess this skill?
755
-
756
- Respond ONLY with a list of dictionaries in valid JSON.
757
- Use double quotes for all strings. No markdown, no commentary, no trailing commas.
758
- """
759
-
760
  raw = gpt_call("Define proficiency level and assessment for each skill.", prompt)
761
-
762
  json_text = _extract_json_array(raw)
763
  if not json_text:
764
  return []
765
-
766
  try:
767
  results = json.loads(json_text)
768
  except json.JSONDecodeError as e:
769
  print(f"❌ JSON parsing error: {e}")
770
  print(f"🔍 Problematic JSON: {json_text}")
771
  return []
772
-
773
  validated = []
774
  for item in results:
775
  try:
@@ -785,115 +307,70 @@ Use double quotes for all strings. No markdown, no commentary, no trailing comma
785
  except (KeyError, TypeError) as e:
786
  print(f"⚠️ Skipping invalid item: {item}. Error: {e}")
787
  continue
788
-
789
  return validated
790
 
791
  def _extract_json_array(raw: str) -> str:
792
- """
793
- Attempts to extract a clean JSON array from raw GPT output.
794
- """
795
  json_start = raw.find('[')
796
  json_end = raw.rfind(']') + 1
797
-
798
  if json_start == -1 or json_end == 0:
799
  print(f"❌ No JSON array found in response: {raw}")
800
  return ""
801
-
802
  json_text = raw[json_start:json_end]
803
-
804
- # Cleanup
805
- json_text = re.sub(r',\s*([}\]])', r'\1', json_text) # Remove trailing commas
806
- json_text = re.sub(r'[\n\r\t]', ' ', json_text) # Remove control chars
807
- json_text = re.sub(r'(?<!\\)"', '"', json_text) # Fix quotes if needed
808
-
809
  return json_text
810
 
811
  def extract_qualification(responsibilities: List[str]) -> List[str]:
812
-
813
  prompt = f"""
814
- Here is a list of job responsibilities: {responsibilities}
815
-
816
- Infer the required level within the European Qualifications Framework (EQF) to implement them.
817
- Identify the potential diplomas to testify such qualification
818
- """
819
-
820
- raw = gpt_call("You are an HR expert that excel in developing compentency based interview questions.", prompt)
821
  return [line.strip("-• ").strip() for line in raw.splitlines() if line.strip()]
822
 
823
  def build_interview(responsibilities: List[str], skill_assess: List[str]) -> List[str]:
824
-
825
  prompt = f"""
826
-
827
- Here is a list of job responsibilities: {responsibilities} and related skills: {skill_assess}
828
-
829
- Output: A structured 40-minute interview with:
830
-
831
- Opening questions (5 min)
832
-
833
- Core competency-based questions (30 min, 5-6 questions)
834
-
835
- Closing & candidate questions (5 min)
836
-
837
-
838
- """
839
-
840
- raw = gpt_call("You are an HR expert that excel in developing compentency based interview questions.", prompt)
841
  return [line.strip("-• ").strip() for line in raw.splitlines() if line.strip()]
842
 
843
-
844
-
845
  def _extract_json(raw: str) -> str:
846
- """
847
- Attempts to extract and clean a JSON object from a raw string.
848
- """
849
  json_start = raw.find('{')
850
  json_end = raw.rfind('}') + 1
851
-
852
  if json_start == -1 or json_end == 0:
853
  print(f"❌ No JSON found in response: {raw}")
854
  return ""
855
-
856
  json_text = raw[json_start:json_end]
857
-
858
- # Clean common issues
859
- json_text = re.sub(r',\s*([}\]])', r'\1', json_text) # Remove trailing commas
860
- json_text = re.sub(r'[\n\r\t]', ' ', json_text) # Remove control characters
861
- json_text = re.sub(r'\s{2,}', ' ', json_text) # Collapse multiple spaces
862
- json_text = re.sub(r'\\(?!["\\/bfnrtu])', r'\\\\', json_text) # Escape lone backslashes
863
  json_text = json_text.strip()
864
-
865
- return json_text
866
-
867
-
868
 
869
  def process_pdf(file):
870
- """
871
- Processes the uploaded PDF file and returns the extracted text.
872
- """
873
  if file is None:
874
  return "Please upload a PDF file."
875
-
876
  try:
877
  extracted_text = extract_text_from_pdf(file.name)
878
-
879
- # Extract responsibilities section
880
- responsibilities = extract_section_from_pdf(full_text, section_title="Responsibilities and Accountabilities")
881
  if not responsibilities:
882
- print(f"Skipping {os.path.basename(file_path)} - no responsibilities section found")
883
  return None
884
-
885
- # Main processing
886
  job_family = classify_job_family(responsibilities)
887
  occ_group = classify_occupational_group_by_level(responsibilities)
888
  esco_occ = classify_esco_by_hierarchical_level(responsibilities)
889
  qualification = extract_qualification(responsibilities)
890
  skills = extract_skills(responsibilities)
891
  skill_map = map_proficiency_and_assessment(skills, responsibilities)
892
-
893
- # Check if we have ESCO level 5 code
894
  has_esco = esco_occ.get("Level_5_ESCO_code") is not None
895
-
896
- # ESCO-based skills processing (only if we have Level 5 code)
897
  skill_esco_extract = []
898
  skill_esco_map = []
899
  if has_esco:
@@ -901,11 +378,8 @@ def process_pdf(file):
901
  skill_esco_extract = review_skills(Level_5_code)
902
  skill_esco_map = map_proficiency_and_assessment(skill_esco_extract, responsibilities)
903
  else:
904
- print(f"No Level 5 ESCO code found for {os.path.basename(file_path)}, skipping ESCO skills mapping")
905
-
906
- time.sleep(6) # Rate limiting delay
907
-
908
- # Join original skills with assessment
909
  assessment_lookup = {item['skill_name']: item for item in skill_map}
910
  joined_skills = [
911
  {
@@ -921,8 +395,6 @@ def process_pdf(file):
921
  }
922
  for skill in skills
923
  ]
924
-
925
- # Join ESCO skills with assessment (only if we processed them)
926
  joined_skills_esco = []
927
  if has_esco and skill_esco_extract:
928
  assessment_esco_lookup = {item['skill_name']: item for item in skill_esco_map}
@@ -935,71 +407,59 @@ def process_pdf(file):
935
  }
936
  for skill in skill_esco_extract
937
  ]
938
-
939
  interview = build_interview(responsibilities, skills)
940
-
941
- # Prepare base result dictionary
942
  result = {
943
- "file": os.path.basename(file_path),
944
  "responsibilities": responsibilities,
945
  "job_family": job_fam1['Job_family'].values[0],
946
  "job_subfamily": job_fam1['Job_subfamily'].values[0],
947
  "classified_job_family": job_family,
948
- **{f"Level_{i}_CCOG_{field}": occ_group.get(f"Level_{i}_CCOG_{field}")
949
  for i in range(1, 5) for field in ["code", "name", "desc"]},
950
  "qualification": qualification,
951
  "interview": interview,
952
  "skills": {
953
- "file": os.path.basename(file_path),
954
  "job_family": job_fam1['Job_family'].values[0],
955
  "job_subfamily": job_fam1['Job_subfamily'].values[0],
956
  "skills": joined_skills
957
  }
958
  }
959
-
960
- # Add ESCO fields only if we have them
961
  if has_esco:
962
  result.update({
963
- **{f"Level_{i}_ESCO_{field}": esco_occ.get(f"Level_{i}_ESCO_{field}")
964
  for i in range(1, 6) for field in ["code", "name", "desc"]},
965
  "skills_esco": {
966
- "file": os.path.basename(file_path),
967
  "job_family": job_fam1['Job_family'].values[0],
968
  "job_subfamily": job_fam1['Job_subfamily'].values[0],
969
  "skills": joined_skills_esco
970
  }
971
  })
972
  else:
973
- # Mark ESCO fields as null if not available
974
  result.update({
975
- **{f"Level_{i}_ESCO_{field}": None
976
  for i in range(1, 6) for field in ["code", "name", "desc"]},
977
  "skills_esco": None
978
  })
979
-
980
  return result
981
-
982
  except Exception as e:
983
  return f"Error processing PDF: {str(e)}"
984
 
985
- # Create the Gradio interface
986
  with gr.Blocks() as demo:
987
- gr.Markdown("# Standardise Job Description!")
988
  gr.Markdown("Identify Job Family, Occupation, Qualification, match Skills and suggest interview questions.")
989
-
990
  with gr.Row():
991
  with gr.Column():
992
  file_input = gr.File(label="Upload a Job Description PDF file", file_types=[".pdf"])
993
  submit_btn = gr.Button("Extract Text")
994
  with gr.Column():
995
  text_output = gr.Textbox(label="Extracted Text", lines=30, max_lines=50, interactive=False)
996
-
997
  submit_btn.click(
998
  fn=process_pdf,
999
  inputs=file_input,
1000
  outputs=text_output
1001
  )
1002
 
1003
- # Run the app
1004
  if __name__ == "__main__":
1005
- demo.launch()
 
6
  import logging
7
  import os
8
  from dotenv import load_dotenv
 
9
  import json
10
  from concurrent.futures import ThreadPoolExecutor
11
  from typing import List, Dict, Optional
 
15
  # Load environment variables
16
  load_dotenv()
17
 
18
# Silence pdfminer: it logs noisy layout diagnostics for many real-world PDFs.
logging.getLogger('pdfminer').setLevel(logging.ERROR)

# pdfplumber/pdfminer raise a UserWarning for pages with malformed CropBox
# entries; those pages still extract fine, so hide the warning.
warnings.filterwarnings("ignore", category=UserWarning, message="CropBox.*")
23
 
24
# Initialize OpenAI client
def initialize_openai_client():
    """Create the Azure OpenAI client from environment configuration.

    Reads AZURE_OPENAI_API_KEY, AZURE_OPENAI_ENDPOINT and OPENAI_API_VERSION
    from the environment (loaded earlier via dotenv).

    Returns:
        openai.AzureOpenAI: a configured client instance.

    Raises:
        RuntimeError: if the client cannot be constructed; chained to the
            original error so the real cause is preserved in the traceback.
    """
    try:
        return openai.AzureOpenAI(
            api_key=os.getenv("AZURE_OPENAI_API_KEY"),
            azure_endpoint=os.getenv("AZURE_OPENAI_ENDPOINT"),
            api_version=os.getenv("OPENAI_API_VERSION"),
        )
    except Exception as e:
        # Fix: the original `raise Exception(...)` discarded the exception
        # chain and raised the over-broad base class. RuntimeError is still
        # caught by existing `except Exception` handlers.
        raise RuntimeError(f"Failed to initialize OpenAI client: {e}") from e

# Module-level client shared by all gpt_call invocations.
client = initialize_openai_client()
37
+
38
def gpt_call(system_prompt: str, user_prompt: str) -> str:
    """Send one system+user prompt pair to the Azure OpenAI deployment.

    Returns the stripped completion text, or an "ERROR: ..." string when the
    API call fails (callers treat the result as plain text either way).
    """
    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_prompt},
    ]
    try:
        completion = client.chat.completions.create(
            model=os.getenv("AZURE_DEPLOYMENT_NAME"),
            messages=messages,
            temperature=0.3,  # low temperature keeps answers conservative
        )
        return completion.choices[0].message.content.strip()
    except Exception as e:
        return f"ERROR: {e}"
51
 
52
+ def extract_text_from_pdf(pdf_path: str) -> str:
 
 
 
 
 
 
 
 
 
 
 
 
53
  text = ""
 
 
 
 
 
54
  with pdfplumber.open(pdf_path) as pdf:
55
  for page in pdf.pages:
 
56
  page_text = page.extract_text()
57
  if page_text:
58
  text += page_text + "\n"
 
 
59
  for table in page.extract_tables():
60
  for row in table:
61
  for cell in row:
 
64
  text += "\n"
65
  return text
66
 
67
def extract_section_from_pdf(full_text: str, section_title: str) -> str:
    """Return the body of the named section of a position description.

    The model is instructed to answer "N/A" when the section is absent and
    never to repeat the section title in its answer.
    """
    system_prompt = "You are an HR expert working for IOM."
    user_prompt = f"""
Carefully evaluate the provided position description (PD) document and extract the content of the section titled "{section_title}" from the following text.
Return only the content of the section, without the title.
If the section cannot be found or explicitly mentioned in the text, use "N/A" as the default value.
Do not repeat in the extracted text the name of the section.
Extract precisely all the related text.
Text of the position description:
{full_text}
Section to identify: "{section_title}":
"""
    return gpt_call(system_prompt, user_prompt)
79
 
 
80
def classify_job_family(responsibilities: List[str]) -> str:
    """Ask the model for the best-matching job family/subfamily.

    Candidate families are read from job_families1.csv; the model must answer
    with one family from that list and nothing else.
    """
    families = pd.read_csv("job_families1.csv")
    options = []
    for _, row in families.iterrows():
        options.append(f"- {row['Job_family']}: {row['Job_subfamily']}")
    job_family_list = "\n".join(options)
    user_prompt = f"""
Here is a list of job responsibilities:
{responsibilities}
Here is a list of Job families:
{job_family_list}
Based on the responsibilities, suggest the most relevant job family and subfamily from the list above.
**Important:**
- Return ONLY the job family, nothing else.
- The job family should be exactly as shown in the list.
- Do not include any additional text or explanation.
"""
    return gpt_call("Suggest job family and subfamily based on responsibilities.", user_prompt)
95
 
 
96
  def get_level_CCOG_info(df, code, level_name):
 
 
97
  matches = df[df['code'] == code]
98
  if len(matches) == 0:
99
  print(f"Warning: No {level_name} found for CCOG code {code}")
 
110
  }
111
 
112
def code_sanitize(input_string, valid_codes):
    """Extract a valid code from a free-form model response.

    The model is asked to answer with a bare code, but may echo extra text.
    An exact (whitespace-stripped) match wins; otherwise fall back to the
    first valid code appearing as a substring, preserving the original scan
    order for backward compatibility.

    Returns:
        The matching element of ``valid_codes`` (not its string form), or
        None when nothing matches.
    """
    text = input_string.strip() if isinstance(input_string, str) else str(input_string)
    # Fix: an exact answer must win. With pure substring scanning, an input
    # of "12" against valid_codes ["1", "12"] wrongly returned "1".
    for code in valid_codes:
        if text == str(code):
            return code
    for code in valid_codes:
        # str(code) also tolerates non-string codes (e.g. ints from pandas).
        if str(code) in text:
            return code
    return None
117
 
118
def classify_occupational_group_by_level(responsibilities: List[str]) -> dict:
    """Classify responsibilities into CCOG occupational groups, level by level.

    Walks levels 1-4 of the CCOG hierarchy; each level's candidate codes are
    restricted to children (prefix matches) of the code chosen at the
    previous level.

    Returns:
        dict with Level_{i}_CCOG_{code,name,desc} entries for the levels that
        could be resolved; an 'error' key is added if classification aborts.
    """
    occupational_groups_df = pd.read_csv("occupational_groups.csv")
    result = {}
    try:
        for level in range(1, 5):
            level_df = occupational_groups_df[occupational_groups_df['level'] == f"Level {level}"]
            if level > 1:
                prev_level_code = result.get(f'Level_{level-1}_CCOG_code')
                # Fix: stop descending when the previous level could not be
                # resolved -- Series.str.startswith(None) raises and the broad
                # except below would replace the partial result with a generic
                # error instead of returning what was already classified.
                if prev_level_code is None:
                    break
                level_df = level_df[level_df['code'].str.startswith(prev_level_code)]
            job_occupation_list = "\n".join(f"- {row['code']}: {row['occupation']} - {row.get('occupation_description', '')}" for _, row in level_df.iterrows())
            list_output = level_df["code"].tolist()
            user_prompt = f"""
Here is a list of job responsibilities:
{responsibilities}
Here is a list of level {level} Occupation classifications:
{job_occupation_list}
Based on the responsibilities, suggest the most relevant level {level} Occupation code from within this list: {', '.join(map(str, list_output))}.
**Important:**
- Return ONLY the code, nothing else.
- The code should be exactly as shown in the list.
- Do not include any additional text or explanation.
"""
            level_code = gpt_call(f"Identify level {level} occupational group", user_prompt).strip()
            level_code = code_sanitize(level_code, list_output)
            result.update(get_level_CCOG_info(level_df, level_code, f'Level_{level}'))
    except Exception as e:
        print(f"Error during classification: {str(e)}")
        result['error'] = str(e)
    return result
 
 
147
 
148
def get_skills_info_esco(Level_5_code):
    """Look up the ESCO skills linked to a level-5 ESCO occupation code.

    Joins occupations -> occupation/skill relations -> skills across the
    three ESCO CSV exports and returns a de-duplicated DataFrame with the
    columns skill_name, skill_code (concept URI) and skill_description.
    """
    # Occupation code -> occupation concept URIs.
    occupations = pd.read_csv("occupations_en.csv", dtype={'code': str, 'iscoGroup': str})
    occupation_uris = occupations.loc[occupations['code'] == Level_5_code, 'conceptUri'].values.tolist()

    # Occupation URIs -> related skill URIs.
    relations = pd.read_csv("occupationSkillRelations_en.csv")
    skill_uris = relations.loc[relations['occupationUri'].isin(occupation_uris), 'skillUri'].values.tolist()

    # Skill URIs -> skill records.
    skills = pd.read_csv("skills_en.csv")
    matched = skills[skills['conceptUri'].isin(skill_uris)]
    result = matched[['preferredLabel', 'conceptUri', 'description']].drop_duplicates()
    return result.rename(columns={'preferredLabel': 'skill_name', 'description': 'skill_description', 'conceptUri': 'skill_code'})
 
161
+ def review_skills(Level_5_code: str, top_n: int = 10) -> List[Dict[str, str]]:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
162
  matches = esco_level5_df[esco_level5_df['code'] == Level_5_code]
 
 
163
  esco_occup = matches['preferredLabel'].values.tolist()
164
  skill_filtered = get_skills_info_esco(Level_5_code)
165
+ skill_filtered_options = "\n".join(f"- {row['skill_code']}: {row['skill_name']} - {row['skill_description']}" for _, row in skill_filtered.iterrows())
 
 
 
166
  prompt = f"""
167
+ Here is a list of skills:
168
+ {skill_filtered_options}
169
+ Filter the skills that are relevant in the context of the work of the International Organisation for Migration.
170
+ Ensure that skills are relevant in the context of a {esco_occup} working for a non-profit public organization.
171
+ Required JSON structure:
172
+ {{
173
+ "skills": [
174
+ {{
175
+ "skill_name": "string",
176
+ "skill_description": "string",
177
+ "skill_code": "string"
178
+ }}
179
+ ]
180
+ }}
181
+ **Important:**
182
+ - Do not duplicate any records of skills
183
+ - Keep only the 10 most relevant skills
184
+ - Return ONLY the JSON object with no other text
185
+ - Use double quotes for all strings
186
+ - No trailing commas in arrays/objects
187
+ - No markdown formatting (no ```json)
188
+ - No text before or after the JSON
189
+ - Escape all special characters in strings
190
+ - Ensure all brackets are properly closed
191
+ - No trailing commas in arrays/objects, especially before closing brackets
192
+ """
193
+ raw = gpt_call("You are an HR expert working for the International Organisation for Migration and with in-depth knowledge of the European Skills, Competences, Qualifications and Occupations. Extract skills required for this position.", prompt)
 
 
 
 
 
 
 
 
 
 
194
  json_text = _extract_json(raw)
195
  if not json_text:
196
  return []
 
197
  try:
198
  result = json.loads(json_text)
199
  skills = result.get("skills", [])
 
201
  print(f"❌ JSON parsing error: {e}")
202
  print(f"🔍 Problematic JSON: {json_text}")
203
  return []
 
204
  validated_skills = []
205
  for skill in skills:
206
  try:
 
213
  except (KeyError, TypeError) as e:
214
  print(f"⚠️ Skipping invalid skill: {skill}. Error: {e}")
215
  continue
216
+ return validated_skills[\:top_n]
 
 
 
217
 
218
  def extract_skills(responsibilities: List[str], top_n: int = 10) -> List[Dict[str, str]]:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
219
  prompt = f"""
220
+ Here is a list of job responsibilities:
221
+ {responsibilities}
222
+ List the required skills and knowledge as bullet points (without numbers) using ESCO-style terms.
223
+ For each Skill:
224
+ 1. skill_name: precise skills name as used in ESCO framework
225
+ 2. skill_description: add the long description as mentioned in ESCO framework
226
+ 3. skill_code: include the detailed corresponding ESCO code for that skill.
227
+ Required JSON structure:
228
+ {{
229
+ "skills": [
230
+ {{
231
+ "skill_name": "string",
232
+ "skill_description": "string",
233
+ "skill_code": "string"
234
+ }}
235
+ ]
236
+ }}
237
+ **Important:**
238
+ - Return ONLY the JSON object with no other text
239
+ - Use double quotes for all strings
240
+ - No trailing commas in arrays/objects
241
+ - No markdown formatting (no ```json)
242
+ - No text before or after the JSON
243
+ - Escape all special characters in strings
244
+ - Ensure all brackets are properly closed
245
+ """
246
+ raw = gpt_call("You are an HR expert working for the International Organisation for Migration and with in-depth knowledge of the European Skills, Competences, Qualifications and Occupations. Extract skills required for this position.", prompt)
 
 
 
 
 
 
 
 
 
 
 
247
  json_text = _extract_json(raw)
248
  if not json_text:
249
  return []
 
250
  try:
251
  result = json.loads(json_text)
252
  skills = result.get("skills", [])
 
254
  print(f"❌ JSON parsing error: {e}")
255
  print(f"🔍 Problematic JSON: {json_text}")
256
  return []
 
257
  validated_skills = []
258
  for skill in skills:
259
  try:
 
266
  except (KeyError, TypeError) as e:
267
  print(f"⚠️ Skipping invalid skill: {skill}. Error: {e}")
268
  continue
 
269
  return validated_skills[:top_n]
270
 
 
271
  def map_proficiency_and_assessment(skills: List[str], responsibilities: List[str]) -> List[Dict]:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
272
  prompt = f"""
273
+ Here is a list of job responsibilities: {responsibilities} that have been associated with the following skills: {skills}
274
+ For each skill, accounting for the context defined within the responsibilities, return a JSON object with:
275
+ - skill_name: the name of the skill
276
+ - importance: essential or optional
277
+ - type: "skill/competence" or "knowledge"
278
+ - proficiency_level: Basic, Intermediate, or Advanced
279
+ - distinctive_elements: what specific and distinctive elements are required at this defined proficiency level?
280
+ - resume_signals: what to look for in a resume to assess this skill?
281
+ - assessment_method: what is the preferred assessment method to accurately assess this skill?
282
+ Respond ONLY with a list of dictionaries in valid JSON.
283
+ Use double quotes for all strings. No markdown, no commentary, no trailing commas.
284
+ """
 
 
 
285
  raw = gpt_call("Define proficiency level and assessment for each skill.", prompt)
 
286
  json_text = _extract_json_array(raw)
287
  if not json_text:
288
  return []
 
289
  try:
290
  results = json.loads(json_text)
291
  except json.JSONDecodeError as e:
292
  print(f"❌ JSON parsing error: {e}")
293
  print(f"🔍 Problematic JSON: {json_text}")
294
  return []
 
295
  validated = []
296
  for item in results:
297
  try:
 
307
  except (KeyError, TypeError) as e:
308
  print(f"⚠️ Skipping invalid item: {item}. Error: {e}")
309
  continue
 
310
  return validated
311
 
312
  def _extract_json_array(raw: str) -> str:
 
 
 
313
  json_start = raw.find('[')
314
  json_end = raw.rfind(']') + 1
 
315
  if json_start == -1 or json_end == 0:
316
  print(f"❌ No JSON array found in response: {raw}")
317
  return ""
 
318
  json_text = raw[json_start:json_end]
319
+ json_text = re.sub(r',\s*([}\]])', r'\1', json_text)
320
+ json_text = re.sub(r'[\n\r\t]', ' ', json_text)
321
+ json_text = re.sub(r'(?<!\\)"', '"', json_text)
 
 
 
322
  return json_text
323
 
324
def extract_qualification(responsibilities: List[str]) -> List[str]:
    """Infer the EQF level (and matching diplomas) for the responsibilities.

    Returns the model's answer split into cleaned, non-empty lines with
    leading bullet characters removed.
    """
    prompt = f"""
Here is a list of job responsibilities: {responsibilities}
Infer the required level within the European Qualifications Framework (EQF) to implement them.
Identify the potential diplomas to testify such qualification
"""
    raw = gpt_call("You are an HR expert that excel in developing competency-based interview questions.", prompt)
    lines = []
    for line in raw.splitlines():
        if line.strip():
            lines.append(line.strip("-• ").strip())
    return lines
332
 
333
def build_interview(responsibilities: List[str], skill_assess: List[str]) -> List[str]:
    """Draft a structured 40-minute interview plan for the given role.

    Returns the model's plan split into cleaned, non-empty lines with
    leading bullet characters removed.
    """
    prompt = f"""
Here is a list of job responsibilities: {responsibilities} and related skills: {skill_assess}
Output: A structured 40-minute interview with:
Opening questions (5 min)
Core competency-based questions (30 min, 5-6 questions)
Closing & candidate questions (5 min)
"""
    raw = gpt_call("You are an HR expert that excel in developing competency-based interview questions.", prompt)
    questions = []
    for line in raw.splitlines():
        if line.strip():
            questions.append(line.strip("-• ").strip())
    return questions
343
 
 
 
344
  def _extract_json(raw: str) -> str:
 
 
 
345
  json_start = raw.find('{')
346
  json_end = raw.rfind('}') + 1
 
347
  if json_start == -1 or json_end == 0:
348
  print(f"❌ No JSON found in response: {raw}")
349
  return ""
 
350
  json_text = raw[json_start:json_end]
351
+ json_text = re.sub(r',\s*([}\]])', r'\1', json_text)
352
+ json_text = re.sub(r'[\n\r\t]', ' ', json_text)
353
+ json_text = re.sub(r'\s{2,}', ' ', json_text)
354
+ json_text = re.sub(r'\\(?!["\\/bfnrtu])', r'\\\\', json_text)
 
 
355
  json_text = json_text.strip()
356
+ return json_text
 
 
 
357
 
358
  def process_pdf(file):
 
 
 
359
  if file is None:
360
  return "Please upload a PDF file."
 
361
  try:
362
  extracted_text = extract_text_from_pdf(file.name)
363
+ responsibilities = extract_section_from_pdf(extracted_text, section_title="Responsibilities and Accountabilities")
 
 
364
  if not responsibilities:
365
+ print(f"Skipping {os.path.basename(file.name)} - no responsibilities section found")
366
  return None
 
 
367
  job_family = classify_job_family(responsibilities)
368
  occ_group = classify_occupational_group_by_level(responsibilities)
369
  esco_occ = classify_esco_by_hierarchical_level(responsibilities)
370
  qualification = extract_qualification(responsibilities)
371
  skills = extract_skills(responsibilities)
372
  skill_map = map_proficiency_and_assessment(skills, responsibilities)
 
 
373
  has_esco = esco_occ.get("Level_5_ESCO_code") is not None
 
 
374
  skill_esco_extract = []
375
  skill_esco_map = []
376
  if has_esco:
 
378
  skill_esco_extract = review_skills(Level_5_code)
379
  skill_esco_map = map_proficiency_and_assessment(skill_esco_extract, responsibilities)
380
  else:
381
+ print(f"No Level 5 ESCO code found for {os.path.basename(file.name)}, skipping ESCO skills mapping")
382
+ time.sleep(6)
 
 
 
383
  assessment_lookup = {item['skill_name']: item for item in skill_map}
384
  joined_skills = [
385
  {
 
395
  }
396
  for skill in skills
397
  ]
 
 
398
  joined_skills_esco = []
399
  if has_esco and skill_esco_extract:
400
  assessment_esco_lookup = {item['skill_name']: item for item in skill_esco_map}
 
407
  }
408
  for skill in skill_esco_extract
409
  ]
 
410
  interview = build_interview(responsibilities, skills)
 
 
411
  result = {
412
+ "file": os.path.basename(file.name),
413
  "responsibilities": responsibilities,
414
  "job_family": job_fam1['Job_family'].values[0],
415
  "job_subfamily": job_fam1['Job_subfamily'].values[0],
416
  "classified_job_family": job_family,
417
+ **{f"Level_{i}_CCOG_{field}": occ_group.get(f"Level_{i}_CCOG_{field}")
418
  for i in range(1, 5) for field in ["code", "name", "desc"]},
419
  "qualification": qualification,
420
  "interview": interview,
421
  "skills": {
422
+ "file": os.path.basename(file.name),
423
  "job_family": job_fam1['Job_family'].values[0],
424
  "job_subfamily": job_fam1['Job_subfamily'].values[0],
425
  "skills": joined_skills
426
  }
427
  }
 
 
428
  if has_esco:
429
  result.update({
430
+ **{f"Level_{i}_ESCO_{field}": esco_occ.get(f"Level_{i}_ESCO_{field}")
431
  for i in range(1, 6) for field in ["code", "name", "desc"]},
432
  "skills_esco": {
433
+ "file": os.path.basename(file.name),
434
  "job_family": job_fam1['Job_family'].values[0],
435
  "job_subfamily": job_fam1['Job_subfamily'].values[0],
436
  "skills": joined_skills_esco
437
  }
438
  })
439
  else:
 
440
  result.update({
441
+ **{f"Level_{i}_ESCO_{field}": None
442
  for i in range(1, 6) for field in ["code", "name", "desc"]},
443
  "skills_esco": None
444
  })
 
445
  return result
 
446
  except Exception as e:
447
  return f"Error processing PDF: {str(e)}"
448
 
 
449
# ---- Gradio UI -----------------------------------------------------------
with gr.Blocks() as demo:
    gr.Markdown("# Standardize Job Description!")
    gr.Markdown("Identify Job Family, Occupation, Qualification, match Skills and suggest interview questions.")

    with gr.Row():
        # Left column: upload + trigger button.
        with gr.Column():
            file_input = gr.File(label="Upload a Job Description PDF file", file_types=[".pdf"])
            submit_btn = gr.Button("Extract Text")
        # Right column: pipeline output.
        with gr.Column():
            text_output = gr.Textbox(label="Extracted Text", lines=30, max_lines=50, interactive=False)

    # Run the full standardization pipeline on click.
    submit_btn.click(fn=process_pdf, inputs=file_input, outputs=text_output)

if __name__ == "__main__":
    demo.launch()