Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
|
@@ -4,6 +4,34 @@ import pandas as pd
|
|
| 4 |
import re
|
| 5 |
import warnings
|
| 6 |
import logging
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 7 |
|
| 8 |
# Configure logging for pdfminer
|
| 9 |
logging.getLogger('pdfminer').setLevel(logging.ERROR) # Only show errors, not warnings
|
|
@@ -38,6 +66,800 @@ def extract_text_from_pdf(pdf_path, suppress_warnings=True):
|
|
| 38 |
text += "\n"
|
| 39 |
return text
|
| 40 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 41 |
def process_pdf(file):
|
| 42 |
"""
|
| 43 |
Processes the uploaded PDF file and returns the extracted text.
|
|
@@ -47,18 +869,121 @@ def process_pdf(file):
|
|
| 47 |
|
| 48 |
try:
|
| 49 |
extracted_text = extract_text_from_pdf(file.name)
|
| 50 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 51 |
except Exception as e:
|
| 52 |
return f"Error processing PDF: {str(e)}"
|
| 53 |
|
| 54 |
# Create the Gradio interface
|
| 55 |
with gr.Blocks() as demo:
|
| 56 |
-
gr.Markdown("#
|
| 57 |
-
gr.Markdown("
|
| 58 |
|
| 59 |
with gr.Row():
|
| 60 |
with gr.Column():
|
| 61 |
-
file_input = gr.File(label="Upload PDF", file_types=[".pdf"])
|
| 62 |
submit_btn = gr.Button("Extract Text")
|
| 63 |
with gr.Column():
|
| 64 |
text_output = gr.Textbox(label="Extracted Text", lines=30, max_lines=50, interactive=False)
|
|
|
|
| 4 |
import re
|
| 5 |
import warnings
|
| 6 |
import logging
|
| 7 |
+
import os
|
| 8 |
+
from dotenv import load_dotenv
|
| 9 |
+
|
| 10 |
+
# Load environment variables
|
| 11 |
+
load_dotenv()
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
import openai
|
| 15 |
+
def gpt_call(system_prompt: str, user_prompt: str) -> str:
|
| 16 |
+
try:
|
| 17 |
+
client = openai.AzureOpenAI(
|
| 18 |
+
api_key=os.getenv("AZURE_OPENAI_API_KEY"),
|
| 19 |
+
azure_endpoint=os.getenv("AZURE_OPENAI_ENDPOINT"),
|
| 20 |
+
api_version=os.getenv("OPENAI_API_VERSION"),
|
| 21 |
+
)
|
| 22 |
+
response = client.chat.completions.create(
|
| 23 |
+
model=os.getenv("AZURE_DEPLOYMENT_NAME"),
|
| 24 |
+
messages=[
|
| 25 |
+
{"role": "system", "content": system_prompt},
|
| 26 |
+
{"role": "user", "content": user_prompt}
|
| 27 |
+
],
|
| 28 |
+
temperature=0.3 # setting a low temp to be conservative
|
| 29 |
+
)
|
| 30 |
+
return response.choices[0].message.content.strip()
|
| 31 |
+
except OpenAIError as e:
|
| 32 |
+
return f"ERROR: {e}"
|
| 33 |
+
|
| 34 |
+
|
| 35 |
|
| 36 |
# Configure logging for pdfminer
|
| 37 |
logging.getLogger('pdfminer').setLevel(logging.ERROR) # Only show errors, not warnings
|
|
|
|
| 66 |
text += "\n"
|
| 67 |
return text
|
| 68 |
|
| 69 |
+
|
| 70 |
+
|
| 71 |
+
|
| 72 |
+
def extract_section_from_pdf(full_text, section_title):
|
| 73 |
+
"""
|
| 74 |
+
Uses OpenAI to extract a specific section (e.g., "Responsibilities and Accountabilities") from the full text.
|
| 75 |
+
"""
|
| 76 |
+
user_prompt = f"""
|
| 77 |
+
|
| 78 |
+
Carefully evaluate the provided position description (PD) document and extract thecontent of the section titled "{section_title}" from the following text.
|
| 79 |
+
|
| 80 |
+
Return only the content of the section, without the title.
|
| 81 |
+
If the section cannot be found or explicitly mentioned in the text, use ""N/A"" as the default value.
|
| 82 |
+
Do not repeat in the extracted text the name of the section.
|
| 83 |
+
Extract precisely all the related text.
|
| 84 |
+
|
| 85 |
+
Text of the position description:
|
| 86 |
+
{full_text}
|
| 87 |
+
|
| 88 |
+
Section to identify: "{section_title}":
|
| 89 |
+
"""
|
| 90 |
+
|
| 91 |
+
return gpt_call("You are an HR expert working for IOM.", user_prompt)
|
| 92 |
+
|
| 93 |
+
|
| 94 |
+
def classify_job_family(responsibilities: List[str]) -> str:
|
| 95 |
+
job_families_df = pd.read_csv("job_families1.csv")
|
| 96 |
+
job_family_list = "\n".join(f"- {row['Job_family']}: {row['Job_subfamily']}" for _, row in job_families_df.iterrows())
|
| 97 |
+
user_prompt = f"""
|
| 98 |
+
|
| 99 |
+
Here is a list of job responsibilities:
|
| 100 |
+
|
| 101 |
+
{responsibilities}
|
| 102 |
+
|
| 103 |
+
Here is a list of Job families
|
| 104 |
+
{job_family_list}
|
| 105 |
+
|
| 106 |
+
Based on the responsibilities, suggest the most relevant job family and subfamily from the list above.
|
| 107 |
+
|
| 108 |
+
**Important:**
|
| 109 |
+
- Return ONLY the job family, nothing else.
|
| 110 |
+
- The job family should be exactly as shown in the list.
|
| 111 |
+
- Do not include any additional text or explanation.
|
| 112 |
+
"""
|
| 113 |
+
|
| 114 |
+
return gpt_call("Suggest job family and subfamily based on responsibilities.", user_prompt)
|
| 115 |
+
|
| 116 |
+
|
| 117 |
+
def get_level_CCOG_info(df, code, level_name):
|
| 118 |
+
"""Helper function to get level info with error handling"""
|
| 119 |
+
occupational_groups_df = pd.read_csv("occupational_groups.csv")
|
| 120 |
+
matches = df[df['code'] == code]
|
| 121 |
+
if len(matches) == 0:
|
| 122 |
+
print(f"Warning: No {level_name} found for CCOG code {code}")
|
| 123 |
+
return {
|
| 124 |
+
f'{level_name}_CCOG_code': code,
|
| 125 |
+
f'{level_name}_CCOG_name': 'UNKNOWN',
|
| 126 |
+
f'{level_name}_CCOG_desc': 'No matching occupation found'
|
| 127 |
+
}
|
| 128 |
+
info = matches.iloc[0]
|
| 129 |
+
return {
|
| 130 |
+
f'{level_name}_CCOG_code': code,
|
| 131 |
+
f'{level_name}_CCOG_name': info['occupation'],
|
| 132 |
+
f'{level_name}_CCOG_desc': info.get('occupation_description', '')
|
| 133 |
+
}
|
| 134 |
+
|
| 135 |
+
def code_sanitize(input_string, valid_codes):
|
| 136 |
+
"""
|
| 137 |
+
Checks if any of the valid_codes exists as a substring in input_string.
|
| 138 |
+
Returns the first matching code, otherwise None.
|
| 139 |
+
"""
|
| 140 |
+
for code in valid_codes:
|
| 141 |
+
if code in input_string: # Checks for exact substring match
|
| 142 |
+
return code
|
| 143 |
+
return None
|
| 144 |
+
|
| 145 |
+
def classify_occupational_group_by_level(responsibilities: List[str]) -> dict:
|
| 146 |
+
"""
|
| 147 |
+
Classifies job responsibilities into occupational groups at 4 levels,
|
| 148 |
+
The [Common Classification of Occupational Groups (CCOG)](https://icsc.un.org/Resources/HRPD/JobEvaluation/CCOG_9_2015.pdf)
|
| 149 |
+
returning codes, names, and descriptions for each level.
|
| 150 |
+
Args:
|
| 151 |
+
responsibilities: List of job responsibility strings
|
| 152 |
+
Returns:
|
| 153 |
+
Dictionary containing classification information or error message
|
| 154 |
+
"""
|
| 155 |
+
occupational_groups_df = pd.read_csv("occupational_groups.csv")
|
| 156 |
+
result = {}
|
| 157 |
+
|
| 158 |
+
try:
|
| 159 |
+
######################## Level 1 ###################
|
| 160 |
+
level1_df = occupational_groups_df[occupational_groups_df['level'] == "Level 1"]
|
| 161 |
+
job_occupation_list = "\n".join(f"- {row['code']}: {row['occupation']}"
|
| 162 |
+
for _, row in level1_df.iterrows())
|
| 163 |
+
#print(job_occupation_list)
|
| 164 |
+
list1_output = level1_df["code"].tolist() # Convert Series to list
|
| 165 |
+
list1 = ", ".join(map(str, list1_output)) # Join elements with comma
|
| 166 |
+
#print(list1)
|
| 167 |
+
|
| 168 |
+
user_prompt1 = f"""
|
| 169 |
+
Here is a list of job responsibilities:
|
| 170 |
+
{responsibilities}
|
| 171 |
+
|
| 172 |
+
Here is a list of level 1 Occupation classifications:
|
| 173 |
+
{job_occupation_list}
|
| 174 |
+
|
| 175 |
+
Based on the responsibilities, suggest the most relevant level 1 Occupation code from within this list: {list1}.
|
| 176 |
+
|
| 177 |
+
**Important:**
|
| 178 |
+
- Return ONLY the code, nothing else.
|
| 179 |
+
- The code should be exactly as shown in the list.
|
| 180 |
+
- Do not include any additional text or explanation.
|
| 181 |
+
"""
|
| 182 |
+
#print(user_prompt1)
|
| 183 |
+
level1_code = gpt_call("Identify level 1 occupational group", user_prompt1).strip()
|
| 184 |
+
level1_code = code_sanitize(level1_code, list1_output)
|
| 185 |
+
#print(level1_code)
|
| 186 |
+
result.update(get_level_CCOG_info(level1_df, level1_code, 'Level_1'))
|
| 187 |
+
|
| 188 |
+
######################## Level 2 ###################
|
| 189 |
+
level2_df = occupational_groups_df[
|
| 190 |
+
(occupational_groups_df['level'] == "Level 2") &
|
| 191 |
+
(occupational_groups_df['code'].str.startswith(level1_code))
|
| 192 |
+
]
|
| 193 |
+
job_occupation_list = "\n".join(f"- {row['code']}: {row['occupation']} - {row['occupation_description']}"
|
| 194 |
+
for _, row in level2_df.iterrows())
|
| 195 |
+
#print(job_occupation_list)
|
| 196 |
+
list2_output = level2_df["code"].tolist() # Convert Series to list
|
| 197 |
+
list2 = ", ".join(map(str, list2_output)) # Join elements with comma
|
| 198 |
+
#print(list2)
|
| 199 |
+
|
| 200 |
+
user_prompt2 = f"""
|
| 201 |
+
Here is a list of job responsibilities:
|
| 202 |
+
{responsibilities}
|
| 203 |
+
|
| 204 |
+
Here is a list of level 2 Occupation classifications within {level1_code}:
|
| 205 |
+
{job_occupation_list}
|
| 206 |
+
|
| 207 |
+
Based on the responsibilities, suggest the most relevant level 2 Occupation code from within this list: {list2}.
|
| 208 |
+
**Important:**
|
| 209 |
+
- Return ONLY the code, nothing else.
|
| 210 |
+
- The code should be exactly as shown in the list.
|
| 211 |
+
- Do not include any additional text or explanation.
|
| 212 |
+
"""
|
| 213 |
+
#print(user_prompt2)
|
| 214 |
+
level2_code = gpt_call("Identify level 2 occupational group", user_prompt2).strip()
|
| 215 |
+
level2_code = code_sanitize(level2_code, list2_output)
|
| 216 |
+
#print(level2_code)
|
| 217 |
+
result.update(get_level_CCOG_info(level2_df, level2_code, 'Level_2'))
|
| 218 |
+
|
| 219 |
+
######################## Level 3 ###################
|
| 220 |
+
level3_df = occupational_groups_df[
|
| 221 |
+
(occupational_groups_df['level'] == "Level 3") &
|
| 222 |
+
(occupational_groups_df['code'].str.startswith(level2_code))
|
| 223 |
+
]
|
| 224 |
+
job_occupation_list = "\n".join(f"- {row['code']}: {row['occupation']} - {row['occupation_description']}"
|
| 225 |
+
for _, row in level3_df.iterrows())
|
| 226 |
+
#print(job_occupation_list)
|
| 227 |
+
list3_output = level3_df["code"].tolist() # Convert Series to list
|
| 228 |
+
list3 = ", ".join(map(str, list3_output)) # Join elements with comma
|
| 229 |
+
#print(list3)
|
| 230 |
+
|
| 231 |
+
user_prompt3 = f"""
|
| 232 |
+
Here is a list of job responsibilities:
|
| 233 |
+
{responsibilities}
|
| 234 |
+
|
| 235 |
+
Here is a list of level 3 Occupation classifications within {level2_code}:
|
| 236 |
+
{job_occupation_list}
|
| 237 |
+
|
| 238 |
+
Based on the responsibilities, suggest the most relevant level 3 Occupation code from within this list: {list3}.
|
| 239 |
+
|
| 240 |
+
**Important:**
|
| 241 |
+
- Return ONLY the code, nothing else.
|
| 242 |
+
- The code should be exactly as shown in the list.
|
| 243 |
+
- Do not include any additional text or explanation.
|
| 244 |
+
|
| 245 |
+
"""
|
| 246 |
+
level3_code = gpt_call("Identify level 3 occupational group", user_prompt3).strip()
|
| 247 |
+
level3_code = code_sanitize(level3_code, list3_output)
|
| 248 |
+
result.update(get_level_CCOG_info(level3_df, level3_code, 'Level_3'))
|
| 249 |
+
|
| 250 |
+
######################## Level 4 ###################
|
| 251 |
+
level4_df = occupational_groups_df[
|
| 252 |
+
(occupational_groups_df['level'] == "Level 4") &
|
| 253 |
+
(occupational_groups_df['code'].str.startswith(level3_code))
|
| 254 |
+
]
|
| 255 |
+
job_occupation_list = "\n".join(f"- {row['code']}: {row['occupation']} : {row['occupation_description']}"
|
| 256 |
+
for _, row in level4_df.iterrows())
|
| 257 |
+
#print(job_occupation_list)
|
| 258 |
+
list4_output = level4_df["code"].tolist() # Convert Series to list
|
| 259 |
+
list4 = ", ".join(map(str, list4_output)) # Join elements with comma
|
| 260 |
+
#print(list4)
|
| 261 |
+
user_prompt4 = f"""
|
| 262 |
+
Here is a list of job responsibilities:
|
| 263 |
+
{responsibilities}
|
| 264 |
+
|
| 265 |
+
Here is a list of level 4 Occupation classifications within {level3_code}:
|
| 266 |
+
{job_occupation_list}
|
| 267 |
+
|
| 268 |
+
Based on the responsibilities, suggest the most relevant level 4 Occupation code from within this list: {list4}.
|
| 269 |
+
**Important:**
|
| 270 |
+
- Return ONLY the code, nothing else.
|
| 271 |
+
- The code should be exactly as shown in the list.
|
| 272 |
+
- Do not include any additional text or explanation.
|
| 273 |
+
"""
|
| 274 |
+
|
| 275 |
+
level4_code = gpt_call("Identify final occupational group", user_prompt4).strip()
|
| 276 |
+
level4_code = code_sanitize(level4_code, list4_output)
|
| 277 |
+
result.update(get_level_CCOG_info(level4_df, level4_code, 'Level_4'))
|
| 278 |
+
|
| 279 |
+
except Exception as e:
|
| 280 |
+
print(f"Error during classification: {str(e)}")
|
| 281 |
+
result['error'] = str(e)
|
| 282 |
+
|
| 283 |
+
return result
|
| 284 |
+
|
| 285 |
+
from typing import List, Dict
|
| 286 |
+
import pandas as pd
|
| 287 |
+
esco_df = pd.read_csv(
|
| 288 |
+
"ISCOGroups_en.csv",
|
| 289 |
+
dtype={'code': str} # Force 'code' to be read as string
|
| 290 |
+
)
|
| 291 |
+
|
| 292 |
+
|
| 293 |
+
esco_level5_df = pd.read_csv(
|
| 294 |
+
"occupations_en.csv",
|
| 295 |
+
dtype={'code': str, 'iscoGroup': str, } # Force 'code' to be read as string
|
| 296 |
+
)
|
| 297 |
+
|
| 298 |
+
def get_level_ESCO_info(df, code, level_name):
|
| 299 |
+
"""Helper function to get level info with error handling"""
|
| 300 |
+
matches = df[df['code'] == code]
|
| 301 |
+
if len(matches) == 0:
|
| 302 |
+
print(f"Warning: No {level_name} found for ESCO code {code}")
|
| 303 |
+
return {
|
| 304 |
+
f'{level_name}_ESCO_code': code,
|
| 305 |
+
f'{level_name}_ESCO_name': 'UNKNOWN',
|
| 306 |
+
f'{level_name}_ESCO_desc': 'No matching occupation found'
|
| 307 |
+
}
|
| 308 |
+
info = matches.iloc[0]
|
| 309 |
+
return {
|
| 310 |
+
f'{level_name}_ESCO_code': code,
|
| 311 |
+
f'{level_name}_ESCO_name': info['preferredLabel'],
|
| 312 |
+
f'{level_name}_ESCO_desc': info.get('description', '')
|
| 313 |
+
}
|
| 314 |
+
|
| 315 |
+
def classify_esco_by_hierarchical_level(responsibilities: List[str]) -> dict:
|
| 316 |
+
"""
|
| 317 |
+
Classifies job responsibilities into occupational groups at 4 levels,
|
| 318 |
+
[European Skills, Competences, Qualifications, and Occupations (ESCO)](https://esco.ec.europa.eu/en)
|
| 319 |
+
returning codes, names, and descriptions for each level.
|
| 320 |
+
Args:
|
| 321 |
+
responsibilities: List of job responsibility strings
|
| 322 |
+
Returns:
|
| 323 |
+
Dictionary containing classification information or error message
|
| 324 |
+
"""
|
| 325 |
+
|
| 326 |
+
esco_df = pd.read_csv(
|
| 327 |
+
"ISCOGroups_en.csv",
|
| 328 |
+
dtype={'code': str} # Force 'code' to be read as string
|
| 329 |
+
)
|
| 330 |
+
# print(esco_df.columns)
|
| 331 |
+
|
| 332 |
+
esco_level5_df = pd.read_csv(
|
| 333 |
+
"occupations_en.csv",
|
| 334 |
+
dtype={'code': str, 'iscoGroup': str, } # Force 'code' to be read as string
|
| 335 |
+
)
|
| 336 |
+
# print(esco_level5_df.columns)
|
| 337 |
+
|
| 338 |
+
result = {}
|
| 339 |
+
######################## Level 1 ###################
|
| 340 |
+
# Get all top-level codes (single character/digit)
|
| 341 |
+
top_level_codes = sorted({
|
| 342 |
+
code for code in esco_df['code']
|
| 343 |
+
if len(code) == 1 and code.isalnum()
|
| 344 |
+
})
|
| 345 |
+
|
| 346 |
+
level1_code = None
|
| 347 |
+
if top_level_codes:
|
| 348 |
+
level1_df = esco_df[esco_df['code'].isin(top_level_codes)]
|
| 349 |
+
job_occupation_list = "\n".join(f"- {row['code']}: {row['preferredLabel']} - {row['description']}"
|
| 350 |
+
for _, row in level1_df.iterrows())
|
| 351 |
+
#print(job_occupation_list)
|
| 352 |
+
list1_output = level1_df["code"].tolist() # Convert Series to list
|
| 353 |
+
list1 = ", ".join(map(str, list1_output)) # Join elements with comma
|
| 354 |
+
#print(list1)
|
| 355 |
+
|
| 356 |
+
user_prompt1 = f"""
|
| 357 |
+
Here is a list of job responsibilities:
|
| 358 |
+
{responsibilities}
|
| 359 |
+
|
| 360 |
+
Select the most relevant top-level code from these options:
|
| 361 |
+
{job_occupation_list}
|
| 362 |
+
|
| 363 |
+
Based on the responsibilities, suggest the most relevant level 1 Occupation code from within this list: {list1}.
|
| 364 |
+
**Important:**
|
| 365 |
+
- Return ONLY the code, nothing else.
|
| 366 |
+
- The code should be exactly as shown in the list.
|
| 367 |
+
- Do not include any additional text or explanation.
|
| 368 |
+
"""
|
| 369 |
+
level1_code = gpt_call("Identify top-level occupational group", user_prompt1).strip()
|
| 370 |
+
level1_code = code_sanitize(level1_code, list1_output)
|
| 371 |
+
result.update(get_level_ESCO_info(level1_df, level1_code, 'Level_1'))
|
| 372 |
+
|
| 373 |
+
|
| 374 |
+
######################## Level 2 ###################
|
| 375 |
+
|
| 376 |
+
level2_code = None
|
| 377 |
+
if level1_code:
|
| 378 |
+
level2_df = esco_df[
|
| 379 |
+
(esco_df['code'].str.startswith(level1_code)) & (esco_df['code'].str.len() == len(level1_code) + 1)
|
| 380 |
+
]
|
| 381 |
+
if not level2_df.empty:
|
| 382 |
+
level2_options = "\n".join(f"- {row['code']}: {row['preferredLabel']} - {row['description']}"
|
| 383 |
+
for _, row in level2_df.iterrows())
|
| 384 |
+
#print(job_occupation_list)
|
| 385 |
+
list2_output = level2_df["code"].tolist() # Convert Series to list
|
| 386 |
+
list2 = ", ".join(map(str, list2_output)) # Join elements with comma
|
| 387 |
+
#print(list2)
|
| 388 |
+
|
| 389 |
+
user_prompt2 = f"""
|
| 390 |
+
Here is a list of job responsibilities:
|
| 391 |
+
{responsibilities}
|
| 392 |
+
|
| 393 |
+
Here is a list of level 2 Occupation classifications within {level1_code}:
|
| 394 |
+
{level2_options}
|
| 395 |
+
|
| 396 |
+
Based on the responsibilities, suggest the most relevant level 2 Occupation code from within this list: {list2}.
|
| 397 |
+
**Important:**
|
| 398 |
+
- Return ONLY the code, nothing else.
|
| 399 |
+
- The code should be exactly as shown in the list.
|
| 400 |
+
- Do not include any additional text or explanation.
|
| 401 |
+
"""
|
| 402 |
+
level2_code = gpt_call("Identify second-level occupational group", user_prompt2).strip()
|
| 403 |
+
level2_code = code_sanitize(level2_code, list2_output)
|
| 404 |
+
result.update(get_level_ESCO_info(level2_df, level2_code, 'Level_2'))
|
| 405 |
+
|
| 406 |
+
|
| 407 |
+
######################## Level 3 ###################
|
| 408 |
+
level3_code = None
|
| 409 |
+
if level2_code:
|
| 410 |
+
level3_df = esco_df[
|
| 411 |
+
(esco_df['code'].str.startswith(level2_code)) & (esco_df['code'].str.len() == len(level2_code) + 1)
|
| 412 |
+
]
|
| 413 |
+
if not level3_df.empty:
|
| 414 |
+
level3_options = "\n".join(f"- {row['code']}: {row['preferredLabel']} - {row['description']}"
|
| 415 |
+
for _, row in level3_df.iterrows())
|
| 416 |
+
#print(job_occupation_list)
|
| 417 |
+
list3_output = level3_df["code"].tolist() # Convert Series to list
|
| 418 |
+
list3 = ", ".join(map(str, list3_output)) # Join elements with comma
|
| 419 |
+
#print(list3)
|
| 420 |
+
|
| 421 |
+
user_prompt3 = f"""
|
| 422 |
+
Here is a list of job responsibilities:
|
| 423 |
+
{responsibilities}
|
| 424 |
+
|
| 425 |
+
Here is a list of level 3 Occupation classifications within {level2_code}:
|
| 426 |
+
{level3_options}
|
| 427 |
+
|
| 428 |
+
Based on the responsibilities, suggest the most relevant level 3 Occupation code from within this list: {list3}.
|
| 429 |
+
|
| 430 |
+
**Important:**
|
| 431 |
+
- Return ONLY the code, nothing else.
|
| 432 |
+
- The code should be exactly as shown in the list.
|
| 433 |
+
- Do not include any additional text or explanation.
|
| 434 |
+
|
| 435 |
+
"""
|
| 436 |
+
level3_code = gpt_call("Identify third-level occupational group", user_prompt3).strip()
|
| 437 |
+
level3_code = code_sanitize(level3_code, list3_output)
|
| 438 |
+
result.update(get_level_ESCO_info(level3_df, level3_code, 'Level_3'))
|
| 439 |
+
|
| 440 |
+
######################## Level 4 ###################
|
| 441 |
+
level4_code = None
|
| 442 |
+
if level3_code:
|
| 443 |
+
level4_df = esco_df[
|
| 444 |
+
(esco_df['code'].str.startswith(level3_code)) & (esco_df['code'].str.len() == len(level3_code) + 1)
|
| 445 |
+
]
|
| 446 |
+
if not level4_df.empty:
|
| 447 |
+
level4_options = "\n".join(f"- {row['code']}: {row['preferredLabel']} - {row['description']}"
|
| 448 |
+
for _, row in level4_df.iterrows())
|
| 449 |
+
#print(job_occupation_list)
|
| 450 |
+
list4_output = level4_df["code"].tolist() # Convert Series to list
|
| 451 |
+
list4 = ", ".join(map(str, list4_output)) # Join elements with comma
|
| 452 |
+
#print(list4)
|
| 453 |
+
user_prompt4 = f"""
|
| 454 |
+
Here is a list of job responsibilities:
|
| 455 |
+
{responsibilities}
|
| 456 |
+
|
| 457 |
+
Here is a list of level 4 Occupation classifications within {level3_code}:
|
| 458 |
+
{level4_options}
|
| 459 |
+
|
| 460 |
+
Based on the responsibilities, suggest the most relevant level 4 Occupation code from within this list: {list4}.
|
| 461 |
+
**Important:**
|
| 462 |
+
- Return ONLY the code, nothing else.
|
| 463 |
+
- The code should be exactly as shown in the list.
|
| 464 |
+
- Do not include any additional text or explanation.
|
| 465 |
+
"""
|
| 466 |
+
level4_code = gpt_call("Identify fourth-level occupational group", user_prompt4).strip()
|
| 467 |
+
level4_code = code_sanitize(level4_code, list4_output)
|
| 468 |
+
result.update(get_level_ESCO_info(level4_df, level4_code, 'Level_4'))
|
| 469 |
+
|
| 470 |
+
######################## Level 5 ###################
|
| 471 |
+
level5_code = None
|
| 472 |
+
if level4_code:
|
| 473 |
+
level5_df = esco_level5_df[
|
| 474 |
+
(esco_level5_df['iscoGroup'].str.startswith(level4_code))
|
| 475 |
+
]
|
| 476 |
+
if not level5_df.empty:
|
| 477 |
+
level5_options = "\n".join(f"- {row['code']}: {row['preferredLabel']} - {row['description']}"
|
| 478 |
+
for _, row in level5_df.iterrows())
|
| 479 |
+
|
| 480 |
+
#print(job_occupation_list)
|
| 481 |
+
list5_output = level5_df["code"].tolist() # Convert Series to list
|
| 482 |
+
list5 = ", ".join(map(str, list5_output)) # Join elements with comma
|
| 483 |
+
#print(list5)
|
| 484 |
+
user_prompt5 = f"""
|
| 485 |
+
Here is a list of job responsibilities:
|
| 486 |
+
{responsibilities}
|
| 487 |
+
|
| 488 |
+
Here is a list of level 4 Occupation classifications within {level4_code}:
|
| 489 |
+
{level5_options}
|
| 490 |
+
|
| 491 |
+
Based on the responsibilities, suggest the most relevant level 4 Occupation code from within this list: {list5}.
|
| 492 |
+
**Important:**
|
| 493 |
+
- Return ONLY the code as stated in the provided list, nothing else.
|
| 494 |
+
- The code should be exactly as shown in the list.
|
| 495 |
+
- Do not include any additional text, occupation code or explanation.
|
| 496 |
+
"""
|
| 497 |
+
|
| 498 |
+
level5_code = gpt_call("Identify fifth-level occupational group", user_prompt5).strip()
|
| 499 |
+
# Handle the case where the LLM might return just the code part
|
| 500 |
+
level5_code = code_sanitize(level5_code, list5_output)
|
| 501 |
+
result.update(get_level_ESCO_info(level5_df, level5_code, 'Level_5'))
|
| 502 |
+
|
| 503 |
+
## Et voila!!
|
| 504 |
+
return result
|
| 505 |
+
|
| 506 |
+
|
| 507 |
+
|
| 508 |
+
def get_skills_info_esco(Level_5_code):
|
| 509 |
+
"""Helper function to get level info with error handling"""
|
| 510 |
+
esco_level5_df = pd.read_csv(
|
| 511 |
+
"occupations_en.csv",
|
| 512 |
+
dtype={'code': str, 'iscoGroup': str, } # Force 'code' to be read as string
|
| 513 |
+
)
|
| 514 |
+
|
| 515 |
+
# Find the matching occupation
|
| 516 |
+
matches = esco_level5_df[esco_level5_df['code'] == Level_5_code]
|
| 517 |
+
|
| 518 |
+
# Get the conceptUri(s) for the matched occupation
|
| 519 |
+
conceptUris = matches['conceptUri'].values.tolist()
|
| 520 |
+
|
| 521 |
+
esco_skill_map_df = pd.read_csv(
|
| 522 |
+
"occupationSkillRelations_en.csv"
|
| 523 |
+
)
|
| 524 |
+
# Find all skills related to that occupationUri (using isin to match any from the list)
|
| 525 |
+
skills = esco_skill_map_df[esco_skill_map_df['occupationUri'].isin(conceptUris)]
|
| 526 |
+
|
| 527 |
+
# Get the list of skillUris
|
| 528 |
+
skillUris = skills['skillUri'].values.tolist()
|
| 529 |
+
|
| 530 |
+
esco_skill_df = pd.read_csv(
|
| 531 |
+
"skills_en.csv"
|
| 532 |
+
)
|
| 533 |
+
# Get the full skill details from esco_skill_df
|
| 534 |
+
thisskillslist = esco_skill_df[esco_skill_df['conceptUri'].isin(skillUris)]
|
| 535 |
+
|
| 536 |
+
result= thisskillslist[['preferredLabel','conceptUri', 'description']].drop_duplicates()
|
| 537 |
+
result = result.rename(columns={
|
| 538 |
+
'preferredLabel': 'skill_name',
|
| 539 |
+
'description': 'skill_description',
|
| 540 |
+
'conceptUri': 'skill_code'
|
| 541 |
+
})
|
| 542 |
+
|
| 543 |
+
return result
|
| 544 |
+
|
| 545 |
+
|
| 546 |
+
def review_skills(Level_5_code: str, top_n: int = 10) -> List[Dict[str, str]]:
    """
    Validate relevant ESCO-style skills for an occupation using a language model.

    Args:
        Level_5_code: Standard ESCO occupation code string.
        top_n (int): The maximum number of skills to return. Defaults to 10.
            (The previous docstring incorrectly said 3.)

    Returns:
        List[Dict[str, str]]: A list of validated skill dictionaries with keys:
            - skill_name
            - skill_description
            - skill_code
    """
    # NOTE(review): esco_level5_df is expected to be a module-level DataFrame of
    # ESCO occupations loaded earlier in the file — confirm it is defined there.
    matches = esco_level5_df[esco_level5_df['code'] == Level_5_code]

    # Preferred label(s) of the matched occupation, used to give the LLM context.
    esco_occup = matches['preferredLabel'].values.tolist()
    skill_filtered = get_skills_info_esco(Level_5_code)

    # One "- code: name - description" bullet per candidate skill.
    skill_filtered_options = "\n".join(f"- {row['skill_code']}: {row['skill_name']} - {row['skill_description']}"
                                       for _, row in skill_filtered.iterrows())

    # The candidate count was hard-coded to 10 in the prompt; it now follows top_n.
    prompt = f"""
Here is a list of skills:

{skill_filtered_options}

Filter the skills that are relevant in the context of the work of the International Organisation for Migration.

Ensure that each skill is relevant in the context of a {esco_occup} working for a non-profit public organisation.

Required JSON structure:
{{
    "skills": [
        {{
            "skill_name": "string",
            "skill_description": "string",
            "skill_code": "string"
        }}
    ]
}}

**Important:**
- Do not duplicate any records of skills
- keep only the {top_n} most relevant skills
- Return ONLY the JSON object with no other text
- Use double quotes for all strings
- No trailing commas in arrays/objects
- No markdown formatting (no ```json)
- No text before or after the JSON
- Escape all special characters in strings
- Ensure all brackets are properly closed
- No trailing commas in arrays/objects, especially before closing brackets
"""

    raw = gpt_call(
        "You are an HR expert working for the International Organisation for Migration and with in-depth knowledge of the European Skills, Competences, Qualifications and Occupations. Extract skills required for this position.",
        prompt
    )

    # Pull the JSON object out of the raw response; bail out gracefully
    # when the model returned nothing parseable.
    json_text = _extract_json(raw)
    if not json_text:
        return []

    try:
        result = json.loads(json_text)
        skills = result.get("skills", [])
    except json.JSONDecodeError as e:
        print(f"❌ JSON parsing error: {e}")
        print(f"🔍 Problematic JSON: {json_text}")
        return []

    # Keep only well-formed skill records; anything missing a key is skipped.
    validated_skills = []
    for skill in skills:
        try:
            validated = {
                "skill_name": str(skill["skill_name"]).strip(),
                "skill_description": str(skill["skill_description"]).strip(),
                "skill_code": str(skill["skill_code"]).strip()
            }
            validated_skills.append(validated)
        except (KeyError, TypeError) as e:
            print(f"⚠️ Skipping invalid skill: {skill}. Error: {e}")
            continue

    return validated_skills[:top_n]
|
| 633 |
+
|
| 634 |
+
|
| 635 |
+
|
| 636 |
+
def extract_skills(responsibilities: List[str], top_n: int = 10) -> List[Dict[str, str]]:
    """
    Extracts ESCO-style skills from job responsibilities using a language model.

    Args:
        responsibilities (List[str]): A list of job responsibility strings.
        top_n (int): The maximum number of skills to return. Defaults to 10.
            (The previous docstring incorrectly said 3.)

    Returns:
        List[Dict[str, str]]: A list of extracted skill dictionaries with keys:
            - skill_name
            - skill_description
            - skill_code
    """
    prompt = f"""
Here is a list of job responsibilities:

{responsibilities}

List the required skills and knowledge as bullet points (without numbers) using ESCO-style terms.

For each Skill:

1. skill_name: precise skills name as used in ESCO framework
2. skill_description: add the long description as mentioned in ESCO framework
3. skill_code: include the detailed corresponding ESCO code for that skill.

Required JSON structure:
{{
    "skills": [
        {{
            "skill_name": "string",
            "skill_description": "string",
            "skill_code": "string"
        }}
    ]
}}

**Important:**
- Return ONLY the JSON object with no other text
- Use double quotes for all strings
- No trailing commas in arrays/objects
- No markdown formatting (no ```json)
- No text before or after the JSON
- Escape all special characters in strings
- Ensure all brackets are properly closed
"""

    raw = gpt_call(
        "You are an HR expert working for the International Organisation for Migration and with in-depth knowledge of the European Skills, Competences, Qualifications and Occupations. Extract skills required for this position.",
        prompt
    )

    # Pull the JSON object out of the raw response; bail out gracefully
    # when the model returned nothing parseable.
    json_text = _extract_json(raw)
    if not json_text:
        return []

    try:
        result = json.loads(json_text)
        skills = result.get("skills", [])
    except json.JSONDecodeError as e:
        print(f"❌ JSON parsing error: {e}")
        print(f"🔍 Problematic JSON: {json_text}")
        return []

    # Keep only well-formed skill records; anything missing a key is skipped.
    validated_skills = []
    for skill in skills:
        try:
            validated = {
                "skill_name": str(skill["skill_name"]).strip(),
                "skill_description": str(skill["skill_description"]).strip(),
                "skill_code": str(skill["skill_code"]).strip()
            }
            validated_skills.append(validated)
        except (KeyError, TypeError) as e:
            print(f"⚠️ Skipping invalid skill: {skill}. Error: {e}")
            continue

    return validated_skills[:top_n]
|
| 716 |
+
|
| 717 |
+
|
| 718 |
+
def map_proficiency_and_assessment(skills: List[str], responsibilities: List[str]) -> List[Dict]:
    """
    Maps each skill to its contextual importance, expected proficiency level,
    and assessment strategy based on job responsibilities.

    Args:
        skills (List[str]): List of skill names.
        responsibilities (List[str]): List of job responsibilities.

    Returns:
        List[Dict]: A list of dictionaries containing skill metadata:
            - skill_name
            - importance (essential / optional)
            - type ("skill/competence" or "knowledge")
            - proficiency_level (Basic, Intermediate, Advanced)
            - distinctive_elements
            - resume_signals
            - assessment_method
    """
    prompt = f"""
Here is a list of job responsibilities: {responsibilities} that have been associated with the following skills: {skills}

For each skill, accounting for the context defined within the responsibilities, return a JSON object with:
- skill_name: the name of the skill
- importance: essential or optional
- type: "skill/competence" or "knowledge"
- proficiency_level: Basic, Intermediate, or Advanced
- distinctive_elements: what specific and distinctive elements are required at this defined proficiency level?
- resume_signals: what to look for in a resume to assess this skill?
- assessment_method: what is the preferred assessment method to accurately assess this skill?

Respond ONLY with a list of dictionaries in valid JSON.
Use double quotes for all strings. No markdown, no commentary, no trailing commas.
"""

    raw = gpt_call("Define proficiency level and assessment for each skill.", prompt)

    # Extract the JSON array portion of the response; empty string means failure.
    json_text = _extract_json_array(raw)
    if not json_text:
        return []

    try:
        results = json.loads(json_text)
    except json.JSONDecodeError as e:
        print(f"❌ JSON parsing error: {e}")
        print(f"🔍 Problematic JSON: {json_text}")
        return []

    # Keep only well-formed items. AttributeError is included because the model
    # may return non-string values (e.g. null), and .strip() on those would
    # otherwise crash the whole call instead of skipping the one bad item.
    validated = []
    for item in results:
        try:
            validated.append({
                "skill_name": str(item["skill_name"]).strip(),
                "importance": item["importance"].strip().lower(),
                "type": item["type"].strip().lower(),
                "proficiency_level": item["proficiency_level"].strip().capitalize(),
                "distinctive_elements": item["distinctive_elements"].strip(),
                "resume_signals": item["resume_signals"].strip(),
                "assessment_method": item["assessment_method"].strip()
            })
        except (KeyError, TypeError, AttributeError) as e:
            print(f"⚠️ Skipping invalid item: {item}. Error: {e}")
            continue

    return validated
|
| 784 |
+
|
| 785 |
+
def _extract_json_array(raw: str) -> str:
|
| 786 |
+
"""
|
| 787 |
+
Attempts to extract a clean JSON array from raw GPT output.
|
| 788 |
+
"""
|
| 789 |
+
json_start = raw.find('[')
|
| 790 |
+
json_end = raw.rfind(']') + 1
|
| 791 |
+
|
| 792 |
+
if json_start == -1 or json_end == 0:
|
| 793 |
+
print(f"❌ No JSON array found in response: {raw}")
|
| 794 |
+
return ""
|
| 795 |
+
|
| 796 |
+
json_text = raw[json_start:json_end]
|
| 797 |
+
|
| 798 |
+
# Cleanup
|
| 799 |
+
json_text = re.sub(r',\s*([}\]])', r'\1', json_text) # Remove trailing commas
|
| 800 |
+
json_text = re.sub(r'[\n\r\t]', ' ', json_text) # Remove control chars
|
| 801 |
+
json_text = re.sub(r'(?<!\\)"', '"', json_text) # Fix quotes if needed
|
| 802 |
+
|
| 803 |
+
return json_text
|
| 804 |
+
|
| 805 |
+
def extract_qualification(responsibilities: List[str]) -> List[str]:
    """
    Infer the required European Qualifications Framework (EQF) level for a role.

    Args:
        responsibilities (List[str]): A list of job responsibility strings.

    Returns:
        List[str]: Non-empty lines of the model's answer with leading
        bullet characters stripped.
    """
    prompt = f"""
Here is a list of job responsibilities: {responsibilities}

Infer the required level within the European Qualifications Framework (EQF) to implement them.
Identify the potential diplomas to testify such qualification
"""

    # Typo fix in the system prompt: "excel in developing compentency"
    # -> "excels in developing competency".
    raw = gpt_call("You are an HR expert who excels in developing competency based interview questions.", prompt)
    return [line.strip("-• ").strip() for line in raw.splitlines() if line.strip()]
|
| 816 |
+
|
| 817 |
+
def build_interview(responsibilities: List[str], skill_assess: List[str]) -> List[str]:
    """
    Generate a structured competency-based interview outline.

    Args:
        responsibilities (List[str]): A list of job responsibility strings.
        skill_assess (List[str]): The skills the interview should probe.

    Returns:
        List[str]: Non-empty lines of the model's answer with leading
        bullet characters stripped.
    """
    prompt = f"""

Here is a list of job responsibilities: {responsibilities} and related skills: {skill_assess}

Output: A structured 40-minute interview with:

Opening questions (5 min)

Core competency-based questions (30 min, 5-6 questions)

Closing & candidate questions (5 min)

"""

    # Typo fix in the system prompt: "excel in developing compentency"
    # -> "excels in developing competency".
    raw = gpt_call("You are an HR expert who excels in developing competency based interview questions.", prompt)
    return [line.strip("-• ").strip() for line in raw.splitlines() if line.strip()]
|
| 836 |
+
|
| 837 |
+
|
| 838 |
+
|
| 839 |
+
def _extract_json(raw: str) -> str:
|
| 840 |
+
"""
|
| 841 |
+
Attempts to extract and clean a JSON object from a raw string.
|
| 842 |
+
"""
|
| 843 |
+
json_start = raw.find('{')
|
| 844 |
+
json_end = raw.rfind('}') + 1
|
| 845 |
+
|
| 846 |
+
if json_start == -1 or json_end == 0:
|
| 847 |
+
print(f"❌ No JSON found in response: {raw}")
|
| 848 |
+
return ""
|
| 849 |
+
|
| 850 |
+
json_text = raw[json_start:json_end]
|
| 851 |
+
|
| 852 |
+
# Clean common issues
|
| 853 |
+
json_text = re.sub(r',\s*([}\]])', r'\1', json_text) # Remove trailing commas
|
| 854 |
+
json_text = re.sub(r'[\n\r\t]', ' ', json_text) # Remove control characters
|
| 855 |
+
json_text = re.sub(r'\s{2,}', ' ', json_text) # Collapse multiple spaces
|
| 856 |
+
json_text = re.sub(r'\\(?!["\\/bfnrtu])', r'\\\\', json_text) # Escape lone backslashes
|
| 857 |
+
json_text = json_text.strip()
|
| 858 |
+
|
| 859 |
+
return json_text
|
| 860 |
+
|
| 861 |
+
|
| 862 |
+
|
| 863 |
def process_pdf(file):
|
| 864 |
"""
|
| 865 |
Processes the uploaded PDF file and returns the extracted text.
|
|
|
|
| 869 |
|
| 870 |
try:
|
| 871 |
extracted_text = extract_text_from_pdf(file.name)
|
| 872 |
+
|
| 873 |
+
# Extract responsibilities section
|
| 874 |
+
responsibilities = extract_section_from_pdf(full_text, section_title="Responsibilities and Accountabilities")
|
| 875 |
+
if not responsibilities:
|
| 876 |
+
print(f"Skipping {os.path.basename(file_path)} - no responsibilities section found")
|
| 877 |
+
return None
|
| 878 |
+
|
| 879 |
+
# Main processing
|
| 880 |
+
job_family = classify_job_family(responsibilities)
|
| 881 |
+
occ_group = classify_occupational_group_by_level(responsibilities)
|
| 882 |
+
esco_occ = classify_esco_by_hierarchical_level(responsibilities)
|
| 883 |
+
qualification = extract_qualification(responsibilities)
|
| 884 |
+
skills = extract_skills(responsibilities)
|
| 885 |
+
skill_map = map_proficiency_and_assessment(skills, responsibilities)
|
| 886 |
+
|
| 887 |
+
# Check if we have ESCO level 5 code
|
| 888 |
+
has_esco = esco_occ.get("Level_5_ESCO_code") is not None
|
| 889 |
+
|
| 890 |
+
# ESCO-based skills processing (only if we have Level 5 code)
|
| 891 |
+
skill_esco_extract = []
|
| 892 |
+
skill_esco_map = []
|
| 893 |
+
if has_esco:
|
| 894 |
+
Level_5_code = esco_occ["Level_5_ESCO_code"]
|
| 895 |
+
skill_esco_extract = review_skills(Level_5_code)
|
| 896 |
+
skill_esco_map = map_proficiency_and_assessment(skill_esco_extract, responsibilities)
|
| 897 |
+
else:
|
| 898 |
+
print(f"No Level 5 ESCO code found for {os.path.basename(file_path)}, skipping ESCO skills mapping")
|
| 899 |
+
|
| 900 |
+
time.sleep(6) # Rate limiting delay
|
| 901 |
+
|
| 902 |
+
# Join original skills with assessment
|
| 903 |
+
assessment_lookup = {item['skill_name']: item for item in skill_map}
|
| 904 |
+
joined_skills = [
|
| 905 |
+
{
|
| 906 |
+
"skill_name": skill["skill_name"],
|
| 907 |
+
"skill_description": skill["skill_description"],
|
| 908 |
+
"skill_code": skill["skill_code"],
|
| 909 |
+
"importance": assessment_lookup.get(skill["skill_name"], {}).get("importance"),
|
| 910 |
+
"type": assessment_lookup.get(skill["skill_name"], {}).get("type"),
|
| 911 |
+
"proficiency_level": assessment_lookup.get(skill["skill_name"], {}).get("proficiency_level"),
|
| 912 |
+
"distinctive_elements": assessment_lookup.get(skill["skill_name"], {}).get("distinctive_elements"),
|
| 913 |
+
"resume_signals": assessment_lookup.get(skill["skill_name"], {}).get("resume_signals"),
|
| 914 |
+
"assessment_method": assessment_lookup.get(skill["skill_name"], {}).get("assessment_method")
|
| 915 |
+
}
|
| 916 |
+
for skill in skills
|
| 917 |
+
]
|
| 918 |
+
|
| 919 |
+
# Join ESCO skills with assessment (only if we processed them)
|
| 920 |
+
joined_skills_esco = []
|
| 921 |
+
if has_esco and skill_esco_extract:
|
| 922 |
+
assessment_esco_lookup = {item['skill_name']: item for item in skill_esco_map}
|
| 923 |
+
joined_skills_esco = [
|
| 924 |
+
{
|
| 925 |
+
"skill_name": skill["skill_name"],
|
| 926 |
+
"skill_description": skill["skill_description"],
|
| 927 |
+
"skill_code": skill["skill_code"],
|
| 928 |
+
**assessment_esco_lookup.get(skill["skill_name"], {})
|
| 929 |
+
}
|
| 930 |
+
for skill in skill_esco_extract
|
| 931 |
+
]
|
| 932 |
+
|
| 933 |
+
interview = build_interview(responsibilities, skills)
|
| 934 |
+
|
| 935 |
+
# Prepare base result dictionary
|
| 936 |
+
result = {
|
| 937 |
+
"file": os.path.basename(file_path),
|
| 938 |
+
"responsibilities": responsibilities,
|
| 939 |
+
"job_family": job_fam1['Job_family'].values[0],
|
| 940 |
+
"job_subfamily": job_fam1['Job_subfamily'].values[0],
|
| 941 |
+
"classified_job_family": job_family,
|
| 942 |
+
**{f"Level_{i}_CCOG_{field}": occ_group.get(f"Level_{i}_CCOG_{field}")
|
| 943 |
+
for i in range(1, 5) for field in ["code", "name", "desc"]},
|
| 944 |
+
"qualification": qualification,
|
| 945 |
+
"interview": interview,
|
| 946 |
+
"skills": {
|
| 947 |
+
"file": os.path.basename(file_path),
|
| 948 |
+
"job_family": job_fam1['Job_family'].values[0],
|
| 949 |
+
"job_subfamily": job_fam1['Job_subfamily'].values[0],
|
| 950 |
+
"skills": joined_skills
|
| 951 |
+
}
|
| 952 |
+
}
|
| 953 |
+
|
| 954 |
+
# Add ESCO fields only if we have them
|
| 955 |
+
if has_esco:
|
| 956 |
+
result.update({
|
| 957 |
+
**{f"Level_{i}_ESCO_{field}": esco_occ.get(f"Level_{i}_ESCO_{field}")
|
| 958 |
+
for i in range(1, 6) for field in ["code", "name", "desc"]},
|
| 959 |
+
"skills_esco": {
|
| 960 |
+
"file": os.path.basename(file_path),
|
| 961 |
+
"job_family": job_fam1['Job_family'].values[0],
|
| 962 |
+
"job_subfamily": job_fam1['Job_subfamily'].values[0],
|
| 963 |
+
"skills": joined_skills_esco
|
| 964 |
+
}
|
| 965 |
+
})
|
| 966 |
+
else:
|
| 967 |
+
# Mark ESCO fields as null if not available
|
| 968 |
+
result.update({
|
| 969 |
+
**{f"Level_{i}_ESCO_{field}": None
|
| 970 |
+
for i in range(1, 6) for field in ["code", "name", "desc"]},
|
| 971 |
+
"skills_esco": None
|
| 972 |
+
})
|
| 973 |
+
|
| 974 |
+
return result
|
| 975 |
+
|
| 976 |
except Exception as e:
|
| 977 |
return f"Error processing PDF: {str(e)}"
|
| 978 |
|
| 979 |
# Create the Gradio interface
|
| 980 |
with gr.Blocks() as demo:
|
| 981 |
+
gr.Markdown("# Standardise Job Description!")
|
| 982 |
+
gr.Markdown("Identify Job Family, Occupation, Qualification, match Skills and suggest interview questions.")
|
| 983 |
|
| 984 |
with gr.Row():
|
| 985 |
with gr.Column():
|
| 986 |
+
file_input = gr.File(label="Upload a Job Description PDF file", file_types=[".pdf"])
|
| 987 |
submit_btn = gr.Button("Extract Text")
|
| 988 |
with gr.Column():
|
| 989 |
text_output = gr.Textbox(label="Extracted Text", lines=30, max_lines=50, interactive=False)
|