Spaces:

edouardlgp
/

Job_Classification

Running

File size: 44,040 Bytes

import gradio as gr
import pdfplumber
import pandas as pd
import re
import warnings
import logging
import os
from dotenv import load_dotenv
import json
from concurrent.futures import ThreadPoolExecutor
from typing import List, Dict, Optional
import traceback
import time
import openai

# Debugging setup
DEBUG = True
debug_messages = []

def log_debug(message):
    """Record a timestamped debug line and return the recent debug history.

    When ``DEBUG`` is on, the line is printed and appended to the module-level
    ``debug_messages`` buffer; the oldest entry is dropped once the buffer
    holds more than 20 lines.

    Returns:
        The buffered lines joined with newlines, or ``""`` when ``DEBUG`` is off.
    """
    if not DEBUG:
        return ""

    stamp = time.strftime("%Y-%m-%d %H:%M:%S")
    entry = f"[{stamp}] {message}"
    debug_messages.append(entry)
    print(entry)  # mirror to the console

    # Trim from the front so only the most recent 20 lines are kept.
    if len(debug_messages) > 20:
        del debug_messages[0]
    return "\n".join(debug_messages)

# Initialize debug logging: records the first entry in the debug buffer
log_debug("Application starting...")

# Load environment variables (python-dotenv); the Azure OpenAI settings read
# later through os.getenv are expected to come from here or the real environment
load_dotenv()

# Configure logging for pdfminer: only ERROR and above are emitted
logging.getLogger('pdfminer').setLevel(logging.ERROR)

# Suppress specific warnings: UserWarnings whose message matches "CropBox.*"
warnings.filterwarnings("ignore", category=UserWarning, message="CropBox.*")

 
# ================= DataFrame initializations =================
def _load_csv(path, **read_kwargs):
    """Read a reference CSV, falling back to an empty DataFrame on failure.

    Args:
        path: CSV file to read.
        **read_kwargs: Extra keyword arguments forwarded to ``pd.read_csv``
            (malformed lines are always skipped).

    Returns:
        The parsed DataFrame, or an empty ``pd.DataFrame()`` when reading
        fails; the failure is reported through ``log_debug``.
    """
    try:
        return pd.read_csv(path, on_bad_lines='skip', **read_kwargs)
    except Exception as e:  # best-effort: a missing table must not stop start-up
        log_debug(f"Error reading {path}: {e}")
        return pd.DataFrame()


job_families_df = _load_csv("job_families1.csv")
occupational_groups_df = _load_csv("occupational_groups.csv")
# Code columns are forced to str so they are not parsed as numbers.
esco_df = _load_csv("ISCOGroups_en.csv", dtype={'code': str})
esco_level5_df = _load_csv("occupations_en.csv", dtype={'code': str, 'iscoGroup': str})
esco_skill_df = _load_csv("skills_en.csv")
esco_skill_map_df = _load_csv("occupationSkillRelations_en.csv")


# ================= LLM API =================
def initialize_openai_client():
    """Build the Azure OpenAI client from environment variables.

    Reads ``AZURE_OPENAI_API_KEY``, ``AZURE_OPENAI_ENDPOINT`` and
    ``OPENAI_API_VERSION`` from the environment.

    Returns:
        The constructed ``openai.AzureOpenAI`` client.

    Raises:
        RuntimeError: If the client cannot be constructed. The underlying
            exception is attached as ``__cause__``.
    """
    try:
        return openai.AzureOpenAI(
            api_key=os.getenv("AZURE_OPENAI_API_KEY"),
            azure_endpoint=os.getenv("AZURE_OPENAI_ENDPOINT"),
            api_version=os.getenv("OPENAI_API_VERSION"),
        )
    except Exception as e:
        # Chain explicitly so the original traceback stays attached.
        raise RuntimeError(f"Failed to initialize OpenAI client: {e}") from e

# Module-level client used by gpt_call; created once at import time.
client = initialize_openai_client()

def gpt_call(system_prompt: str, user_prompt: str) -> str:
    """Send one system + user exchange to the chat deployment and return the reply.

    The deployment name is read from ``AZURE_DEPLOYMENT_NAME``.

    Args:
        system_prompt: Content of the "system" message.
        user_prompt: Content of the "user" message.

    Returns:
        The stripped reply text. This function never raises: any failure
        (including a reply without message content) is logged and returned
        as a string of the form ``"ERROR: <details>"``.
    """
    try:
        response = client.chat.completions.create(
            model=os.getenv("AZURE_DEPLOYMENT_NAME"),
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_prompt}
            ],
            temperature=0.3
        )
        content = response.choices[0].message.content
        if content is None:
            # Surface a readable reason instead of an AttributeError on None.
            raise ValueError("model returned no message content")
        return content.strip()
    except Exception as e:
        # Callers only see a string, so record the failure where it happens.
        log_debug(f"LLM call failed: {e}")
        return f"ERROR: {e}"
        
# ================= Extract text =================
def extract_text_from_pdf(pdf_path: str) -> str:
    """Return the text of every page of a PDF, followed by its table cells.

    For each page the extracted text (when any) is emitted with a trailing
    newline, then every table row is emitted as its string cells, each
    followed by a space, with a newline closing the row. Non-string cells
    are skipped.
    """
    chunks = []
    with pdfplumber.open(pdf_path) as document:
        for page in document.pages:
            body = page.extract_text()
            if body:
                chunks.append(body + "\n")
            for table in page.extract_tables():
                for row in table:
                    chunks.extend(cell + " " for cell in row if isinstance(cell, str))
                    chunks.append("\n")
    return "".join(chunks)

# ================= AI Functions =================
def extract_section_from_pdf(full_text: str, section_title: str) -> str:
    """Ask the LLM to extract one titled section from a position description.

    Args:
        full_text: Full text of the position description.
        section_title: Title of the section to extract.

    Returns:
        Whatever ``gpt_call`` returns for the request; the prompt asks for
        the section content only, or "N/A" when the section is missing.
    """
    system_role = "You are an HR expert working for IOM."
    section_request = f"""
    Carefully evaluate the provided position description (PD) document and extract the content of the section titled "{section_title}" from the following text.
    Return only the content of the section, without the title.
    If the section cannot be found or explicitly mentioned in the text, use "N/A" as the default value.
    Do not repeat in the extracted text the name of the section.
    Extract precisely all the related text.
    Text of the position description:
    {full_text}
    Section to identify: "{section_title}":
    """
    return gpt_call(system_role, section_request)

def classify_job_family(responsibilities: List[str]) -> str:
    """Ask the LLM which job family best matches the responsibilities.

    The candidate list is built from the ``Job_family`` / ``Job_subfamily``
    columns of ``job_families_df``, one "- family: subfamily" bullet per row.

    Returns:
        The raw reply from ``gpt_call``.
    """
    family_bullets = [
        f"- {row['Job_family']}: {row['Job_subfamily']}"
        for _, row in job_families_df.iterrows()
    ]
    job_family_list = "\n".join(family_bullets)
    system_role = "Suggest job family and subfamily based on responsibilities."
    family_request = f"""
    Here is a list of job responsibilities:
    {responsibilities}
    Here is a list of Job families:
    {job_family_list}
    Based on the responsibilities, suggest the most relevant job family and subfamily from the list above.
    **Important:**
    - Return ONLY the job family, nothing else.
    - The job family should be exactly as shown in the list.
    - Do not include any additional text or explanation.
    """
    return gpt_call(system_role, family_request)

def get_level_CCOG_info(df, code, level_name):
    """Look up a CCOG code in *df* and describe it under level-prefixed keys.

    Args:
        df: DataFrame with a ``code`` and an ``occupation`` column; an
            ``occupation_description`` column is used when present.
        code: Code to look up.
        level_name: Key prefix, e.g. ``"Level_1"``.

    Returns:
        A dict with ``<level_name>_CCOG_code``, ``_name`` and ``_desc``. When
        no row matches, a warning is logged and the name is ``'UNKNOWN'``.
    """
    prefix = f'{level_name}_CCOG'
    hits = df[df['code'] == code]

    if hits.empty:
        log_debug(f"Warning: No {level_name} found for CCOG code {code}")
        name, desc = 'UNKNOWN', 'No matching occupation found'
    else:
        first = hits.iloc[0]  # first match wins when codes are duplicated
        name = first['occupation']
        desc = first.get('occupation_description', '')

    return {
        f'{prefix}_code': code,
        f'{prefix}_name': name,
        f'{prefix}_desc': desc,
    }

def code_sanitize(input_string, valid_codes):
    """Pick the valid code that an LLM reply refers to.

    An exact match with the stripped reply wins. Otherwise the longest valid
    code contained in the reply is returned, so a reply of "2654.1.1" is not
    mistaken for its prefix "2654.1". Ties keep the order of *valid_codes*.

    Args:
        input_string: Raw reply text; ``None`` or empty yields ``None``.
        valid_codes: Iterable of acceptable codes. Non-string codes are
            compared by their ``str()`` form and returned unchanged.

    Returns:
        The matching element of *valid_codes*, or ``None`` when none matches.
    """
    if not input_string:
        return None

    text = str(input_string)
    stripped = text.strip()
    best = None
    best_len = 0
    for code in valid_codes:
        code_text = str(code)
        if not code_text:
            continue  # an empty code would match every reply
        if code_text == stripped:
            return code
        if len(code_text) > best_len and code_text in text:
            best, best_len = code, len(code_text)
    return best

def classify_occupational_group_by_level(responsibilities: List[str]) -> dict:
    """Classify responsibilities into CCOG occupational groups, level 1 to 4.

    Each level's candidates are the rows of ``occupational_groups_df`` whose
    ``level`` is "Level <n>" and, below level 1, whose ``code`` starts with
    the code chosen at the previous level. The LLM reply is validated with
    ``code_sanitize``.

    Args:
        responsibilities: Job responsibilities, interpolated into the prompt.

    Returns:
        A dict with ``Level_<n>_CCOG_code`` / ``_name`` / ``_desc`` for every
        level reached. If a level cannot be classified, or any exception
        occurs, descent stops and an ``error`` key holds the reason.
    """
    result = {}
    try:
        parent_code = None
        for level in range(1, 5):
            level_df = occupational_groups_df[occupational_groups_df['level'] == f"Level {level}"]
            if parent_code is not None:
                level_df = level_df[level_df['code'].str.startswith(parent_code)]
            list_output = level_df["code"].tolist()

            level_code = None
            if list_output:  # nothing to choose from -> no point asking the LLM
                job_occupation_list = "\n".join(f"- {row['code']}: {row['occupation']} - {row.get('occupation_description', '')}" for _, row in level_df.iterrows())
                user_prompt = f"""
            Here is a list of job responsibilities:
            {responsibilities}
            Here is a list of level {level} Occupation classifications:
            {job_occupation_list}
            Based on the responsibilities, suggest the most relevant level {level} Occupation code from within this list: {', '.join(map(str, list_output))}.
            **Important:**
            - Return ONLY the code, nothing else.
            - The code should be exactly as shown in the list.
            - Do not include any additional text or explanation.
            """
                level_code = gpt_call(f"Identify level {level} occupational group", user_prompt).strip()
                level_code = code_sanitize(level_code, list_output)
            result.update(get_level_CCOG_info(level_df, level_code, f'Level_{level}'))

            if level_code is None:
                # Without a code there is no prefix to narrow the next level,
                # so stop explicitly instead of failing inside str.startswith.
                message = f"No valid level {level} CCOG code could be identified"
                log_debug(f"Error during classification: {message}")
                result['error'] = message
                break
            parent_code = level_code
    except Exception as e:
        log_debug(f"Error during classification: {str(e)}")
        result['error'] = str(e)
    return result


 
def _esco_option_lines(level_df):
    """Render one "- code: label - description" bullet per row of *level_df*."""
    return "\n".join(
        f"- {row['code']}: {row['preferredLabel']} - {row['description']}"
        for _, row in level_df.iterrows()
    )


def _esco_children(parent_code):
    """Return the rows of ``esco_df`` whose code is *parent_code* plus one character."""
    codes = esco_df['code']
    return esco_df[
        codes.str.startswith(parent_code, na=False) & (codes.str.len() == len(parent_code) + 1)
    ]


def _choose_esco_code(system_prompt, user_prompt, level_df, level_name, result):
    """Ask the LLM for a code, validate it against *level_df* and record it in *result*.

    Returns the validated code, or None when the reply names no listed code.
    """
    valid_codes = level_df["code"].tolist()
    reply = gpt_call(system_prompt, user_prompt).strip()
    chosen = code_sanitize(reply, valid_codes)
    result.update(get_level_ESCO_info(level_df, chosen, level_name))
    return chosen


def classify_esco_by_hierarchical_level(responsibilities: List[str]) -> dict:
    """
    Classifies job responsibilities into occupational groups at up to 5 levels,
    [European Skills, Competences, Qualifications, and Occupations (ESCO)](https://esco.ec.europa.eu/en)
    returning codes, names, and descriptions for each level.

    Levels 1-4 are chosen from ``esco_df`` (each level's codes extend the
    previous code by one character); level 5 is chosen from the rows of
    ``esco_level5_df`` whose ``iscoGroup`` starts with the level 4 code.

    Args:
        responsibilities: List of job responsibility strings
    Returns:
        Dictionary with ``Level_<n>_ESCO_code`` / ``_name`` / ``_desc`` for
        every level reached. Descent stops (returning what was found so far)
        as soon as a level has no candidates or no valid code is returned.
    """
    result = {}

    # The tables fall back to empty frames when their CSV could not be read.
    if esco_df.empty or 'code' not in esco_df.columns:
        log_debug("ESCO classification skipped: ISCO group table is not loaded")
        return result

    ######################## Level 1 ###################
    # Get all top-level codes (single character/digit); skip missing values
    top_level_codes = sorted({
        code for code in esco_df['code']
        if isinstance(code, str) and len(code) == 1 and code.isalnum()
    })
    if not top_level_codes:
        log_debug("ESCO classification skipped: no top-level ISCO codes found")
        return result

    level1_df = esco_df[esco_df['code'].isin(top_level_codes)]
    job_occupation_list = _esco_option_lines(level1_df)
    list1 = ", ".join(map(str, level1_df["code"].tolist()))

    user_prompt1 = f"""
    Here is a list of job responsibilities:
    {responsibilities}

    Select the most relevant top-level code from these options:
    {job_occupation_list}

    Based on the responsibilities, suggest the most relevant level 1 Occupation code from within this list: {list1}.
        **Important:**
        - Return ONLY the code, nothing else.
        - The code should be exactly as shown in the list.
        - Do not include any additional text or explanation.
    """
    level1_code = _choose_esco_code("Identify top-level occupational group", user_prompt1, level1_df, 'Level_1', result)
    if not level1_code:
        return result

    ######################## Level 2 ###################
    level2_df = _esco_children(level1_code)
    if level2_df.empty:
        log_debug(f"No level 2 ESCO groups found under {level1_code}")
        return result
    level2_options = _esco_option_lines(level2_df)
    list2 = ", ".join(map(str, level2_df["code"].tolist()))

    user_prompt2 = f"""
    Here is a list of job responsibilities:
    {responsibilities}

    Here is a list of level 2 Occupation classifications within {level1_code}:
    {level2_options}

    Based on the responsibilities, suggest the most relevant level 2 Occupation code from within this list: {list2}.
        **Important:**
        - Return ONLY the code, nothing else.
        - The code should be exactly as shown in the list.
        - Do not include any additional text or explanation.
    """
    level2_code = _choose_esco_code("Identify second-level occupational group", user_prompt2, level2_df, 'Level_2', result)
    if not level2_code:
        return result

    ######################## Level 3 ###################
    level3_df = _esco_children(level2_code)
    if level3_df.empty:
        log_debug(f"No level 3 ESCO groups found under {level2_code}")
        return result
    level3_options = _esco_option_lines(level3_df)
    list3 = ", ".join(map(str, level3_df["code"].tolist()))

    user_prompt3 = f"""
    Here is a list of job responsibilities:
    {responsibilities}

    Here is a list of level 3 Occupation classifications within {level2_code}:
    {level3_options}

    Based on the responsibilities, suggest the most relevant level 3 Occupation code from within this list: {list3}.

        **Important:**
        - Return ONLY the code, nothing else.
        - The code should be exactly as shown in the list.
        - Do not include any additional text or explanation.

    """
    level3_code = _choose_esco_code("Identify third-level occupational group", user_prompt3, level3_df, 'Level_3', result)
    if not level3_code:
        return result

    ######################## Level 4 ###################
    level4_df = _esco_children(level3_code)
    if level4_df.empty:
        log_debug(f"No level 4 ESCO groups found under {level3_code}")
        return result
    level4_options = _esco_option_lines(level4_df)
    list4 = ", ".join(map(str, level4_df["code"].tolist()))

    user_prompt4 = f"""
    Here is a list of job responsibilities:
    {responsibilities}

    Here is a list of level 4 Occupation classifications within {level3_code}:
    {level4_options}

    Based on the responsibilities, suggest the most relevant level 4 Occupation code from within this list: {list4}.
        **Important:**
        - Return ONLY the code, nothing else.
        - The code should be exactly as shown in the list.
        - Do not include any additional text or explanation.
    """
    level4_code = _choose_esco_code("Identify fourth-level occupational group", user_prompt4, level4_df, 'Level_4', result)
    if not level4_code:
        return result

    ######################## Level 5 ###################
    if esco_level5_df.empty or 'iscoGroup' not in esco_level5_df.columns:
        log_debug("ESCO level 5 skipped: occupations table is not loaded")
        return result
    level5_df = esco_level5_df[
        esco_level5_df['iscoGroup'].str.startswith(level4_code, na=False)
    ]
    if level5_df.empty:
        log_debug(f"No level 5 ESCO occupations found under {level4_code}")
        return result
    level5_options = _esco_option_lines(level5_df)
    list5 = ", ".join(map(str, level5_df["code"].tolist()))

    # The prompt now says "level 5"; it previously repeated "level 4".
    user_prompt5 = f"""
    Here is a list of job responsibilities:
    {responsibilities}

    Here is a list of level 5 Occupation classifications within {level4_code}:
    {level5_options}

    Based on the responsibilities, suggest the most relevant level 5 Occupation code from within this list: {list5}.
        **Important:**
        - Return ONLY the code as stated in the provided list, nothing else.
        - The code should be exactly as shown in the list.
        - Do not include any additional text, occupation code or explanation.
    """
    _choose_esco_code("Identify fifth-level occupational group", user_prompt5, level5_df, 'Level_5', result)

    return result



def get_level_ESCO_info(df, code, level_name):
    """Look up an ESCO code in *df* and describe it under level-prefixed keys.

    Args:
        df: DataFrame with a ``code`` and a ``preferredLabel`` column; a
            ``description`` column is used when present.
        code: Code to look up.
        level_name: Key prefix, e.g. ``"Level_1"``.

    Returns:
        A dict with ``<level_name>_ESCO_code``, ``_name`` and ``_desc``. When
        no row matches, a warning is logged and the name is ``'UNKNOWN'``.
    """
    prefix = f'{level_name}_ESCO'
    hits = df[df['code'] == code]

    if hits.empty:
        log_debug(f"Warning: No {level_name} found for ESCO code {code}")
        label, desc = 'UNKNOWN', 'No matching occupation found'
    else:
        first = hits.iloc[0]  # first match wins when codes are duplicated
        label = first['preferredLabel']
        desc = first.get('description', '')

    return {
        f'{prefix}_code': code,
        f'{prefix}_name': label,
        f'{prefix}_desc': desc,
    }


def get_skills_info_esco(Level_5_code):
    """Return the skills linked to an occupation code as a DataFrame.

    Follows occupation code -> ``conceptUri`` (``esco_level5_df``) ->
    ``skillUri`` (``esco_skill_map_df``) -> skill rows (``esco_skill_df``).

    Returns:
        De-duplicated DataFrame with columns ``skill_name``, ``skill_code``
        and ``skill_description``.
    """
    occupation_rows = esco_level5_df[esco_level5_df['code'] == Level_5_code]
    occupation_uris = occupation_rows['conceptUri'].values.tolist()

    relation_rows = esco_skill_map_df[esco_skill_map_df['occupationUri'].isin(occupation_uris)]
    skill_uris = relation_rows['skillUri'].values.tolist()

    skill_rows = esco_skill_df[esco_skill_df['conceptUri'].isin(skill_uris)]
    unique_skills = skill_rows[['preferredLabel', 'conceptUri', 'description']].drop_duplicates()

    renamed_columns = {
        'preferredLabel': 'skill_name',
        'description': 'skill_description',
        'conceptUri': 'skill_code',
    }
    return unique_skills.rename(columns=renamed_columns)

def _parse_skills_response(raw: str, top_n: int, tag: str = "") -> List[Dict[str, str]]:
    """Turn an LLM reply of the form {"skills": [...]} into validated skill dicts.

    Args:
        raw: Raw reply text.
        top_n: Maximum number of skills to return.
        tag: Suffix inserted into log messages to identify the caller.

    Returns:
        Up to *top_n* dicts with string ``skill_name``, ``skill_description``
        and ``skill_code``; an empty list when the reply holds no usable JSON.
    """
    json_text = _extract_json(raw)
    if not json_text:
        return []
    try:
        payload = json.loads(json_text)
    except json.JSONDecodeError as e:
        log_debug(f"❌ JSON Skills{tag} parsing error: {e}")
        log_debug(f"🔍 Problematic JSON Skills{tag}: {json_text}")
        return []

    skills = payload.get("skills", []) if isinstance(payload, dict) else None
    if not isinstance(skills, list):
        # e.g. "skills": null - iterating it would raise outside the per-item handler
        log_debug(f"❌ JSON Skills{tag} has no usable \"skills\" list: {json_text}")
        return []

    validated_skills = []
    for skill in skills:
        try:
            validated_skills.append({
                "skill_name": str(skill["skill_name"]).strip(),
                "skill_description": str(skill["skill_description"]).strip(),
                "skill_code": str(skill["skill_code"]).strip()
            })
        except (KeyError, TypeError) as e:
            log_debug(f"⚠️ Skipping invalid skill{tag}: {skill}. Error: {e}")
    return validated_skills[:top_n]


def review_skills(Level_5_code: str, top_n: int = 10) -> List[Dict[str, str]]:
    """Ask the LLM to keep the ESCO skills of an occupation that are relevant to IOM.

    The candidate skills come from ``get_skills_info_esco(Level_5_code)``.

    Args:
        Level_5_code: Occupation code looked up in ``esco_level5_df``.
        top_n: Number of skills requested in the prompt and the maximum returned.

    Returns:
        Up to *top_n* dicts with ``skill_name``, ``skill_description`` and
        ``skill_code``; an empty list when the reply cannot be parsed.
    """
    matches = esco_level5_df[esco_level5_df['code'] == Level_5_code]
    # Interpolate the label text itself rather than a Python list repr.
    esco_occup = ", ".join(map(str, matches['preferredLabel'].values.tolist()))
    skill_filtered = get_skills_info_esco(Level_5_code)
    skill_filtered_options = "\n".join(f"- {row['skill_code']}: {row['skill_name']} - {row['skill_description']}" for _, row in skill_filtered.iterrows())
    prompt = f"""
    Here is a list of skills:
    {skill_filtered_options}
    Filter the skills that are relevant in the context of the work of the International Organisation for Migration.
    Ensure that skills are relevant in the context of a {esco_occup} working for a non-profit public organization.
    Required JSON structure:
    {{
        "skills": [
            {{
                "skill_name": "string",
                "skill_description": "string",
                "skill_code": "string"
            }}
        ]
    }}
    **Important:**
    - Do not duplicate any records of skills
    - Keep only the {top_n} most relevant skills
    - Return ONLY the JSON object with no other text
    - Use double quotes for all strings
    - No trailing commas in arrays/objects
    - No markdown formatting (no ```json)
    - No text before or after the JSON
    - Escape all special characters in strings
    - Ensure all brackets are properly closed
    - No trailing commas in arrays/objects, especially before closing brackets
    """
    raw = gpt_call("You are an HR expert working for the International Organisation for Migration and with in-depth knowledge of the European Skills, Competences, Qualifications and Occupations. Extract skills required for this position.", prompt)
    return _parse_skills_response(raw, top_n)

def extract_skills(responsibilities: List[str], top_n: int = 10) -> List[Dict[str, str]]:
    """Ask the LLM to derive ESCO-style skills directly from the responsibilities.

    Args:
        responsibilities: Job responsibilities, interpolated into the prompt.
        top_n: Maximum number of skills returned.

    Returns:
        Up to *top_n* dicts with ``skill_name``, ``skill_description`` and
        ``skill_code``; an empty list when the reply cannot be parsed.
    """
    prompt = f"""
    Here is a list of job responsibilities:
    {responsibilities}
    List the required skills and knowledge as bullet points (without numbers) using ESCO-style terms.
    For each Skill:
    1. skill_name: precise skills name as used in ESCO framework
    2. skill_description: add the long description as mentioned in ESCO framework
    3. skill_code: include the detailed corresponding ESCO code for that skill.
    Required JSON structure:
    {{
        "skills": [
            {{
                "skill_name": "string",
                "skill_description": "string",
                "skill_code": "string"
            }}
        ]
    }}
    **Important:**
    - Return ONLY the JSON object with no other text
    - Use double quotes for all strings
    - No trailing commas in arrays/objects
    - No markdown formatting (no ```json)
    - No text before or after the JSON
    - Escape all special characters in strings
    - Ensure all brackets are properly closed
    """
    raw = gpt_call("You are an HR expert working for the International Organisation for Migration and with in-depth knowledge of the European Skills, Competences, Qualifications and Occupations. Extract skills required for this position.", prompt)
    return _parse_skills_response(raw, top_n, tag=" extract")

def map_proficiency_and_assessment(skills: List[str], responsibilities: List[str]) -> List[Dict]:
    """Ask the LLM for importance, proficiency and assessment details per skill.

    Args:
        skills: Skills to assess, interpolated into the prompt as-is.
        responsibilities: Job responsibilities giving the context.

    Returns:
        One dict per valid reply item with ``skill_name``, ``importance``,
        ``type``, ``proficiency_level``, ``distinctive_elements``,
        ``resume_signals`` and ``assessment_method``. Items missing a field
        are skipped; an empty list is returned when the reply has no
        parseable JSON array.
    """
    prompt = f"""
    Here is a list of job responsibilities: {responsibilities} that have been associated with the following skills: {skills}
    For each skill, accounting for the context defined within the responsibilities, return a JSON object with:
        - skill_name: the name of the skill
        - importance: essential or optional
        - type: "skill/competence" or "knowledge"
        - proficiency_level: Basic, Intermediate, or Advanced
        - distinctive_elements: what specific and distinctive elements are required at this defined proficiency level?
        - resume_signals: what to look for in a resume to assess this skill?
        - assessment_method: what is the preferred assessment method to accurately assess this skill?
    Respond ONLY with a list of dictionaries in valid JSON.
    Use double quotes for all strings. No markdown, no commentary, no trailing commas.
    """
    raw = gpt_call("Define proficiency level and assessment for each skill.", prompt)
    json_text = _extract_json_array(raw)
    if not json_text:
        return []
    try:
        results = json.loads(json_text)
    except json.JSONDecodeError as e:
        log_debug(f"❌ JSON proficiency parsing error: {e}")
        log_debug(f"🔍 Problematic JSON proficiency: {json_text}")
        return []

    def as_text(value) -> str:
        # The reply may hold lists or null where a string was requested;
        # calling .strip() on those would raise AttributeError.
        if value is None:
            return ""
        if isinstance(value, (list, tuple)):
            return "; ".join(str(part).strip() for part in value)
        return str(value).strip()

    validated = []
    for item in results:
        try:
            validated.append({
                "skill_name": str(item["skill_name"]).strip(),
                "importance": as_text(item["importance"]).lower(),
                "type": as_text(item["type"]).lower(),
                "proficiency_level": as_text(item["proficiency_level"]).capitalize(),
                "distinctive_elements": as_text(item["distinctive_elements"]),
                "resume_signals": as_text(item["resume_signals"]),
                "assessment_method": as_text(item["assessment_method"])
            })
        except (KeyError, TypeError) as e:
            log_debug(f"⚠️ Skipping invalid profiency item: {item}. Error: {e}")
            continue
    return validated

def _extract_json_array(raw: str) -> str:
    json_start = raw.find('[')
    json_end = raw.rfind(']') + 1
    if json_start == -1 or json_end == 0:
        log_debug(f"❌ No JSON array found in response: {raw}")
        return ""
    json_text = raw[json_start:json_end]
    json_text = re.sub(r',\s*([}\]])', r'\1', json_text)
    json_text = re.sub(r'[\n\r\t]', ' ', json_text)
    json_text = re.sub(r'(?<!\\)"', '"', json_text)
    return json_text

def extract_qualification(responsibilities: List[str]) -> List[str]:
    """Ask the LLM for the EQF level and diplomas implied by the responsibilities.

    Returns:
        The non-blank lines of the reply, with leading/trailing dashes,
        bullets and spaces stripped.
    """
    # NOTE(review): the system prompt mentions interview questions - it looks
    # shared with build_interview; confirm that this is intended.
    system_role = "You are an HR expert that excel in developing competency-based interview questions."
    prompt = f"""
    Here is a list of job responsibilities: {responsibilities}
    Infer the required level within the European Qualifications Framework (EQF) to implement them.
    Identify the potential diplomas to testify such qualification
    """
    reply = gpt_call(system_role, prompt)

    cleaned = []
    for line in reply.splitlines():
        if line.strip():
            cleaned.append(line.strip("-• ").strip())
    return cleaned

def build_interview(responsibilities: List[str], skill_assess: List[str]) -> List[str]:
    """Ask the LLM for a structured 40-minute interview outline.

    Args:
        responsibilities: Job responsibilities, interpolated into the prompt.
        skill_assess: Related skills, interpolated into the prompt.

    Returns:
        The non-blank lines of the reply, with leading/trailing dashes,
        bullets and spaces stripped.
    """
    system_role = "You are an HR expert that excel in developing competency-based interview questions."
    prompt = f"""
    Here is a list of job responsibilities: {responsibilities} and related skills: {skill_assess}
    Output: A structured 40-minute interview with:
        Opening questions (5 min)
        Core competency-based questions (30 min, 5-6 questions)
        Closing & candidate questions (5 min)
    """
    reply = gpt_call(system_role, prompt)

    cleaned = []
    for line in reply.splitlines():
        if line.strip():
            cleaned.append(line.strip("-• ").strip())
    return cleaned

def _extract_json(raw: str) -> str:
    json_start = raw.find('{')
    json_end = raw.rfind('}') + 1
    if json_start == -1 or json_end == 0:
        log_debug(f"❌ No JSON found in response: {raw}")
        return ""
    json_text = raw[json_start:json_end]
    json_text = re.sub(r',\s*([}\]])', r'\1', json_text)
    json_text = re.sub(r'[\n\r\t]', ' ', json_text)
    json_text = re.sub(r'\s{2,}', ' ', json_text)
    json_text = re.sub(r'\\(?!["\\/bfnrtu])', r'\\\\', json_text)
    json_text = json_text.strip()
    return json_text

# ================= Process Analysis =================
from concurrent.futures import ThreadPoolExecutor

def _blank_outputs(headline, status):
    """Build the 10-field output tuple used when no analysis results are available."""
    return (headline, "", "", "", {}, "", [], {}, {}, status)


def process_pdf(file):
    """Run the full analysis of an uploaded position-description PDF.

    Args:
        file: Uploaded file object exposing the path as ``file.name``, or
            ``None`` when nothing was uploaded.

    Returns:
        A 10-tuple: (file name, responsibilities text, job family,
        qualification text, CCOG levels dict, interview text, skills list,
        ESCO levels dict, ESCO skills dict or None, status message).
        When nothing can be analysed (no file, no responsibilities section,
        failed extraction, or any exception) the middle fields are empty and
        the first and last fields carry the explanation.
    """
    if file is None:
        return _blank_outputs("Please upload a PDF file.", "No file uploaded.")

    try:
        file_label = os.path.basename(file.name)
        extracted_text = extract_text_from_pdf(file.name)
        responsibilities = extract_section_from_pdf(extracted_text, section_title="Responsibilities and Accountabilities")

        # gpt_call reports failures as "ERROR: ..." text; feeding that into
        # the classifiers would only produce meaningless results.
        if responsibilities.startswith("ERROR:"):
            log_debug(f"Skipping {file_label} - responsibilities extraction failed: {responsibilities}")
            return _blank_outputs(file_label, responsibilities)

        # The extraction prompt asks for "N/A" when the section is missing,
        # so an empty-string check alone never detects that case.
        if responsibilities.strip().strip("\"'.").upper() in ("", "N/A"):
            log_debug(f"Skipping {file_label} - no responsibilities section found")
            return _blank_outputs(file_label, "No responsibilities section found.")

        # Use ThreadPoolExecutor to parallelize independent tasks
        with ThreadPoolExecutor() as executor:
            job_family_future = executor.submit(classify_job_family, responsibilities)
            occ_group_future = executor.submit(classify_occupational_group_by_level, responsibilities)
            esco_occ_future = executor.submit(classify_esco_by_hierarchical_level, responsibilities)
            qualification_future = executor.submit(extract_qualification, responsibilities)
            skills_future = executor.submit(extract_skills, responsibilities)

            job_family = job_family_future.result()
            occ_group = occ_group_future.result()
            esco_occ = esco_occ_future.result()
            qualification = qualification_future.result()
            skills = skills_future.result()

        log_debug(f"Identified {job_family}")

        skill_map = map_proficiency_and_assessment(skills, responsibilities)

        has_esco = esco_occ.get("Level_5_ESCO_code") is not None
        skill_esco_extract = []
        skill_esco_map = []
        if has_esco:
            Level_5_code = esco_occ["Level_5_ESCO_code"]
            skill_esco_extract = review_skills(Level_5_code)
            skill_esco_map = map_proficiency_and_assessment(skill_esco_extract, responsibilities)
        else:
            log_debug(f"No Level 5 ESCO code found for {file_label}, skipping ESCO skills mapping")

        # NOTE(review): fixed pause kept from the original - presumably API
        # rate-limit throttling; confirm whether it is still needed.
        time.sleep(6)

        # Join each extracted skill with its assessment (None where missing).
        assessment_fields = (
            "importance", "type", "proficiency_level",
            "distinctive_elements", "resume_signals", "assessment_method",
        )
        assessment_lookup = {item['skill_name']: item for item in skill_map}
        joined_skills = []
        for skill in skills:
            assessment = assessment_lookup.get(skill["skill_name"], {})
            joined_skills.append({
                "skill_name": skill["skill_name"],
                "skill_description": skill["skill_description"],
                "skill_code": skill["skill_code"],
                **{field: assessment.get(field) for field in assessment_fields}
            })

        joined_skills_esco = []
        if has_esco and skill_esco_extract:
            assessment_esco_lookup = {item['skill_name']: item for item in skill_esco_map}
            joined_skills_esco = [
                {
                    "skill_name": skill["skill_name"],
                    "skill_description": skill["skill_description"],
                    "skill_code": skill["skill_code"],
                    **assessment_esco_lookup.get(skill["skill_name"], {})
                }
                for skill in skill_esco_extract
            ]

        interview = build_interview(responsibilities, skills)

        # Prepare the results for each output component
        level_fields = ["code", "name", "desc"]
        ccoq_levels = {f"Level_{i}_CCOG_{field}": occ_group.get(f"Level_{i}_CCOG_{field}")
                       for i in range(1, 5) for field in level_fields}

        if has_esco:
            esco_levels = {f"Level_{i}_ESCO_{field}": esco_occ.get(f"Level_{i}_ESCO_{field}")
                           for i in range(1, 6) for field in level_fields}
            esco_skills = {
                "file": file_label,
                "classified_job_family": job_family,
                "skills": joined_skills_esco
            }
        else:
            esco_levels = {f"Level_{i}_ESCO_{field}": None
                           for i in range(1, 6) for field in level_fields}
            esco_skills = None

        debug_message = "Processing completed successfully."
        return (
            file_label,
            responsibilities,
            job_family,
            "\n".join(qualification),
            ccoq_levels,
            "\n".join(interview),
            joined_skills,
            esco_levels,
            esco_skills,
            debug_message if DEBUG else None
        )

    except Exception as e:
        error_message = f"Error processing PDF: {str(e)}"
        # Keep the traceback in the debug log; the caller only gets the summary.
        log_debug(f"{error_message}\n{traceback.format_exc()}")
        return _blank_outputs(error_message, error_message)
# ================= Build Word Report =================
from docx import Document

# (paragraph label, skill dict key) pairs for the optional per-skill details.
_SKILL_DETAIL_FIELDS = (
    ("Importance", "importance"),
    ("Type", "type"),
    ("Proficiency Level", "proficiency_level"),
    ("Distinctive Elements", "distinctive_elements"),
    ("Resume Signals", "resume_signals"),
    ("Assessment Method", "assessment_method"),
)


def _as_text(value):
    """Return *value* as one string, joining a sequence of lines with newlines."""
    if value is None:
        return ""
    if isinstance(value, str):
        # Joining a plain string would insert a newline between every character.
        return value
    return "\n".join(str(line) for line in value)


def _skill_entries(skills):
    """Return the skill dicts from a ``{"skills": [...]}`` mapping or a bare list."""
    if isinstance(skills, dict):
        return skills.get("skills") or []
    return skills or []


def _add_level_paragraphs(doc, result, scheme, max_level):
    """Add one ``key: value`` paragraph per ``Level_<i>_<scheme>_<field>`` key present."""
    for level in range(1, max_level + 1):
        for field in ("code", "name", "desc"):
            key = f"Level_{level}_{scheme}_{field}"
            if key in result:
                doc.add_paragraph(f"{key}: {result[key]}")


def _add_skill_section(doc, heading, skills):
    """Add a level-2 heading followed by one group of paragraphs per skill."""
    doc.add_heading(heading, level=2)
    for skill in _skill_entries(skills):
        doc.add_paragraph(f"Skill Name: {skill['skill_name']}")
        doc.add_paragraph(f"Description: {skill['skill_description']}")
        doc.add_paragraph(f"Code: {skill['skill_code']}")
        for label, key in _SKILL_DETAIL_FIELDS:
            doc.add_paragraph(f"{label}: {skill.get(key, 'N/A')}")
        doc.add_paragraph("")  # Empty paragraph separates consecutive skills


def generate_word_document(result):
    """Render an analysis result as a Word (.docx) report and return its path.

    Args:
        result: Mapping holding the keys ``file``, ``responsibilities``,
            ``classified_job_family``, ``qualification``, ``interview`` and
            ``skills``, optionally ``skills_esco``, plus any flat
            ``Level_<i>_CCOG_<field>`` / ``Level_<i>_ESCO_<field>`` entries.
            ``qualification`` and ``interview`` may be a list of lines or an
            already-joined string; ``skills`` and ``skills_esco`` may be a
            ``{"skills": [...]}`` mapping or a bare list of skill dicts.

    Returns:
        Path of the saved ``job_description_analysis.docx`` file.

    Raises:
        KeyError: If a required top-level key, or a skill's ``skill_name``,
            ``skill_description`` or ``skill_code``, is missing.
    """
    import tempfile  # Local import so this block does not rely on a new file-level import

    doc = Document()

    # Add a title
    doc.add_heading('Job Description Analysis', level=1)

    # Add file name
    doc.add_heading('File Name', level=2)
    doc.add_paragraph(result["file"])

    # Add responsibilities
    doc.add_heading('Responsibilities', level=2)
    doc.add_paragraph(result["responsibilities"])

    # Add job family
    doc.add_heading('Classified Job Family', level=2)
    doc.add_paragraph(result["classified_job_family"])

    # Add qualifications
    doc.add_heading('Qualification', level=2)
    doc.add_paragraph(_as_text(result["qualification"]))

    # Add CCOG Levels (levels 1 to 4)
    doc.add_heading('CCOG Levels', level=2)
    _add_level_paragraphs(doc, result, "CCOG", 4)

    # Add interview questions
    doc.add_heading('Interview Questions', level=2)
    doc.add_paragraph(_as_text(result["interview"]))

    # Add skills
    _add_skill_section(doc, 'Skills', result["skills"])

    # ESCO levels (1 to 5) and ESCO skills are only written when ESCO skills exist
    esco_skills = result["skills_esco"] if "skills_esco" in result else None
    if esco_skills:
        doc.add_heading('ESCO Levels', level=2)
        _add_level_paragraphs(doc, result, "ESCO", 5)
        _add_skill_section(doc, 'ESCO Skills', esco_skills)

    # Save into a fresh temporary directory: a fixed path in the working
    # directory would be overwritten by a concurrent request. The file name is
    # kept so the download keeps its readable name. The directory is not
    # removed here.
    output_dir = tempfile.mkdtemp(prefix="job_description_")
    report_path = os.path.join(output_dir, "job_description_analysis.docx")
    doc.save(report_path)

    return report_path



# ================= GRADIO INTERFACE =================
with gr.Blocks(
    title="AI-powered tool to review Job Position Description",
css="""
    @import url('https://fonts.googleapis.com/css2?family=Lato:wght@400;700&display=swap');
    @import url('https://cdnjs.cloudflare.com/ajax/libs/font-awesome/6.4.0/css/all.min.css');
/* Completely disable Gradio's dark theme */
.gradio-container.dark {
    --body-background-fill: white !important;
    --background-fill-primary: white !important;
    --background-fill-secondary: #f8f9fa !important;
    --block-background-fill: white !important;
    --input-background-fill: white !important;
    --block-label-text-color: #212529 !important;
    --body-text-color: #212529 !important;
    --block-title-text-color: var(--primary-color) !important;
    --border-color-primary: #dee2e6 !important;
}
.gradio-container.dark .gr-markdown,
.gradio-container.dark .gr-textbox,
.gradio-container.dark .gr-dropdown,
.gradio-container.dark .output-section {
    background: white !important;
    color: #212529 !important;
    border-color: #dee2e6 !important;
}
/* Base Styles */
:root {
    --primary-color: #0033A0;
    --secondary-color: #e67e22;
    --accent-color: #f59e0b;
    --dark-color: #34495e;
    --light-color: #ecf0f1;
    --success-color: #27ae60;
    --warning-color: #f39c12;
    --danger-color: #e74c3c;
    --text-color: #333;
    --text-light: #7f8c8d;
    --border-radius: 8px;
    --box-shadow: 0 4px 6px rgba(0, 0, 0, 0.1);
    --transition: all 0.3s ease;
}
/* Header Styles */
.header {
    text-align: center;
    margin-bottom: 2rem;
    padding: 1rem;
}
.header h1 {
    margin: 0;
    font-family: 'Lato', sans-serif;
    font-size: 2.5rem;
    font-weight: 600;
    color: var(--primary-color);
}
.header p {
    margin: 0.5rem 0 0;
    font-family: 'Lato', sans-serif;
    opacity: 0.9;
    font-size: 1.5rem;
    color: #4b5563;
}
/* Section Titles */
.section-title {
    display: flex;
    align-items: left;
    font-family: 'Lato', sans-serif;
    gap: 0.5rem;
    color: var(--primary-color);
    margin: 1rem 0;
    font-size: 1.25rem;
    font-weight: 600;
}
.section-title i {
    font-size: 1.1em;
    color: var(--accent-color);
}
/* Input Section */
.input-section {
    background: white;
    padding: 0.75rem 0.5rem;
    border: 1px solid #d1d5db;
    border-radius: var(--border-radius);
    box-shadow: var(--box-shadow);
    margin-right: 1rem;
}
/* Output Section */
.output-section {
    background: white;
    padding: 1.5rem;
    border-radius: var(--border-radius);
    box-shadow: var(--box-shadow);
}
/* Form Elements */
.gr-textbox, .gr-dropdown {
    border: 1px solid #ddd;
    border-radius: var(--border-radius) !important;
    padding: 0.75rem 1rem !important;
    transition: var(--transition);
}
.gr-textbox:focus, .gr-dropdown:focus {
    border-color: var(--primary-color) !important;
    box-shadow: 0 0 0 2px rgba(44, 110, 203, 0.2) !important;
    outline: none !important;
}
.gr-textbox::placeholder {
    color: var(--text-light) !important;
    opacity: 0.7 !important;
}
label {
    font-weight: 500 !important;
    color: var(--dark-color) !important;
    margin-bottom: 0.5rem !important;
    display: block !important;
}
/* Buttons */
.btn-primary {
    background: var(--primary-color) !important;
    color: white !important;
    border: none !important;
    border-radius: var(--border-radius) !important;
    padding: 0.75rem 1.5rem !important;
    font-weight: 500 !important;
    transition: var(--transition) !important;
    text-transform: uppercase !important;
    letter-spacing: 0.5px !important;
}
.btn-primary:hover {
    background: #002080 !important;
    transform: translateY(-2px) !important;
    box-shadow: 0 4px 8px rgba(0, 0, 0, 0.15) !important;
}
.btn-primary:active {
    transform: translateY(0) !important;
}
/* Output Markdown */
.gr-markdown {
    background: #f9f9f9;
    padding: 1.5rem;
    border-radius: var(--border-radius);
    border-left: 4px solid var(--primary-color);
}
/* Debug Console */
.gr-textbox[label="⚠️ Console Log"] {
    font-family: monospace !important;
    background: #2c3e50 !important;
    color: #ecf0f1 !important;
    border-radius: var(--border-radius) !important;
    padding: 1rem !important;
}
/* Responsive Layout */
@media (max-width: 768px) {
    .gr-row {
        flex-direction: column !important;
    }
    
    .input-section {
        margin-right: 0 !important;
        margin-bottom: 1rem !important;
    }
}
    """,
    head='''
    <meta name="description" content="AI-powered tool to review Job Position Description.">
    <meta name="keywords" content="HR, Position, Job, Skills, Qualification, Interview">
    <meta name="author" content="Edouard Legoupil | IOM Chief Data Officer">
    <link rel="author" href="https://edouard-legoupil.github.io/">
    <meta property="og:title" content="AI-powered tool to review Job Position Description">
    <meta property="og:description" content="AI-powered tool to review Job Position Description">
    <meta property="og:type" content="website">
    <link rel="icon" href="https://www.iom.int/themes/custom/phoenix/favicon.ico" type="image/vnd.microsoft.icon">
    <link rel="apple-touch-icon" href="https://www.iom.int/sites/g/files/tmzbdl486/files/favicon.ico">
    '''
) as demo:

    # Header section
    with gr.Column():
        with gr.Row():
            with gr.Column():
                gr.HTML("""
                <div class="header">
                    <h1>Position Description Review (Demo)</h1>
                    <p>Use AI to standardise an initial draft position description and identify related Job Family, Occupation, Qualification, match Skills and suggest interview questions.</p>
                </div>
                """)

    # Input: PDF upload and the button that starts the analysis
    with gr.Row():
        with gr.Column():
            file_input = gr.File(label="Upload a Post Description PDF file", file_types=[".pdf"])
            submit_btn = gr.Button(
                        value="✨ Analyse Post Description",
                        variant="primary",
                        elem_classes="btn-primary"
            )

    with gr.Row():
        with gr.Column():
            # Receives the first value returned by process_pdf (the PDF base
            # name, or the error message) and feeds the Word export below.
            file_name_output = gr.Textbox(label="File Name", interactive=False)
            responsibilities_output = gr.Textbox(label="List of Responsibilities used for the review", lines=5, interactive=False)
            job_family_output = gr.Textbox(label="Classified Job Family", interactive=False)

    with gr.Row():
        with gr.Column():
            gr.Markdown("## CCOG Levels")
            ccoq_levels_output = gr.JSON(label="CCOG Levels")
        with gr.Column():
            gr.Markdown("## ESCO Levels")
            esco_levels_output = gr.JSON(label="ESCO Levels")

    with gr.Row():
        with gr.Column():
            gr.Markdown("## Skills")
            skills_output = gr.JSON(label="Skills")


    with gr.Row():
        with gr.Column():
            gr.Markdown("## ESCO Skills")
            esco_skills_output = gr.JSON(label="ESCO Skills")

    with gr.Row():
        with gr.Column():
            qualification_output = gr.Textbox(label="Qualification", lines=5, interactive=False)


    with gr.Row():
        with gr.Column():
            gr.Markdown("## Interview Questions")
            interview_output = gr.Textbox(label="Interview Questions", lines=10, interactive=False)



    with gr.Row():
        with gr.Column():
            download_btn = gr.Button(
                        value="📄 Download Word Document",
                        variant="primary",
                        elem_classes="btn-primary")

    # The log textbox is always created so that the ten values returned by
    # process_pdf always have ten matching output components; it is only
    # shown when DEBUG is on.
    with gr.Row():
        with gr.Column():
            debug_console = gr.Textbox(
                label="⚠️ Execution Log",
                interactive=False,
                visible=DEBUG,
                elem_classes=["debug-console"]
            )

    # The outputs are listed in the same order as the tuple returned by process_pdf.
    submit_btn.click(
        fn=process_pdf,
        inputs=file_input,
        outputs=[
            file_name_output,
            responsibilities_output,
            job_family_output,
            qualification_output,
            ccoq_levels_output,
            interview_output,
            skills_output,
            esco_levels_output,
            esco_skills_output,
            debug_console
        ]
    )

    def _export_report(file_name, responsibilities, job_family, qualification,
                       ccog_levels, interview, skills, esco_levels, esco_skills):
        """Pack the displayed values into the dict read by generate_word_document.

        Gradio passes one positional argument per input component, whereas
        generate_word_document takes a single mapping.
        """
        report = {}
        # The level dictionaries are flat ("Level_1_CCOG_code", ...); the
        # report looks those keys up at the top level of the mapping.
        report.update(ccog_levels or {})
        report.update(esco_levels or {})
        report.update({
            "file": file_name or "",
            "responsibilities": responsibilities or "",
            "classified_job_family": job_family or "",
            # The textboxes hold newline-joined text; the report joins a list of lines.
            "qualification": (qualification or "").splitlines(),
            "interview": (interview or "").splitlines(),
            # The report iterates result["skills"]["skills"], so wrap a bare list.
            "skills": skills if isinstance(skills, dict) else {"skills": skills or []},
            "skills_esco": esco_skills,
        })
        return generate_word_document(report)

    download_btn.click(
        fn=_export_report,
        inputs=[file_name_output, responsibilities_output, job_family_output, qualification_output,
                ccoq_levels_output, interview_output, skills_output, esco_levels_output, esco_skills_output],
        outputs=gr.File(label="Download Word Document")
    )

# Start the Gradio app only when this file is run as a script, not on import.
if __name__ == "__main__":
    # The module-level DEBUG flag is forwarded as launch()'s debug argument.
    demo.launch(show_error=True, debug=DEBUG)