import gradio as gr
import pdfplumber
import pandas as pd
import re
import warnings
import logging
import os
from dotenv import load_dotenv
import os
import json
from concurrent.futures import ThreadPoolExecutor
from typing import List, Dict, Optional
import traceback
import time

# Load environment variables (python-dotenv) before anything reads them;
# gpt_call() below pulls its Azure OpenAI settings from os.getenv().
load_dotenv()


import openai
def gpt_call(system_prompt: str, user_prompt: str) -> str:
    """Send one system + user message pair to the Azure OpenAI chat deployment.

    Connection settings are read from the environment on every call:
    AZURE_OPENAI_API_KEY, AZURE_OPENAI_ENDPOINT, OPENAI_API_VERSION and
    AZURE_DEPLOYMENT_NAME.

    Args:
        system_prompt: Content of the "system" message.
        user_prompt: Content of the "user" message.

    Returns:
        The stripped text of the first completion choice (an empty string
        when the service returns no text), or a string starting with
        "ERROR: " when the OpenAI client raises.
    """
    try:
        client = openai.AzureOpenAI(
            api_key=os.getenv("AZURE_OPENAI_API_KEY"),
            azure_endpoint=os.getenv("AZURE_OPENAI_ENDPOINT"),
            api_version=os.getenv("OPENAI_API_VERSION"),
        )
        response = client.chat.completions.create(
            model=os.getenv("AZURE_DEPLOYMENT_NAME"),
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_prompt},
            ],
            temperature=0.3,  # low temperature to keep answers conservative
        )
        # `content` may be None (e.g. filtered / empty completions).
        content = response.choices[0].message.content
        return (content or "").strip()
    # The exception class must be qualified: only the `openai` module is
    # imported, so a bare `OpenAIError` would itself raise NameError.
    except openai.OpenAIError as e:
        return f"ERROR: {e}"

        
# Raise the 'pdfminer' logger threshold so that only ERROR and above are
# emitted while PDFs are parsed.
logging.getLogger('pdfminer').setLevel(logging.ERROR)  # Only show errors, not warnings

def extract_text_from_pdf(pdf_path, suppress_warnings=True):
    """
    Return the text of every page of a PDF followed by the text of its tables.

    For each page the plain page text is emitted first (terminated by a
    newline), then every string cell of every extracted table, each cell
    followed by a space and each table row terminated by a newline.

    Parameters:
        pdf_path (str): Path to the PDF file
        suppress_warnings (bool): When true, register an "ignore" filter for
            UserWarning messages matching "CropBox.*" (default: True)

    Returns:
        str: The concatenated text (empty string if nothing was extracted).
    """
    if suppress_warnings:
        warnings.filterwarnings("ignore", category=UserWarning, message="CropBox.*")

    pieces = []
    with pdfplumber.open(pdf_path) as document:
        for current_page in document.pages:
            body = current_page.extract_text()
            if body:
                pieces.append(body + "\n")

            # Table cells are appended in addition to the page text.
            for grid in current_page.extract_tables():
                for cells in grid:
                    pieces.extend(value + " " for value in cells if isinstance(value, str))
                    pieces.append("\n")

    return "".join(pieces)


def extract_section_from_pdf(full_text, section_title):
    """
    Ask the language model to extract one named section from a position description.

    Args:
        full_text: Full text of the position description document.
        section_title: Title of the section to extract
            (e.g., "Responsibilities and Accountabilities").

    Returns:
        Whatever gpt_call returns for the request: the model's answer (the
        prompt asks for "N/A" when the section is absent), or gpt_call's
        "ERROR: ..." string.
    """
    # Prompt fixes: "thecontent" -> "the content", and the doubled quotes
    # around N/A (""N/A"") reduced to a normal quoted "N/A".
    user_prompt = f"""
    
    Carefully evaluate the provided position description (PD) document and extract the content of the section titled "{section_title}" from the following text. 
    
    Return only the content of the section, without the title.
    If the section cannot be found or explicitly mentioned in the text, use "N/A" as the default value.
    Do not repeat in the extracted text the name of the section.
    Extract precisely all the related text.
    
    Text of the position description: 
    {full_text}

    Section to identify: "{section_title}":
    """
    
    return gpt_call("You are an HR expert working for IOM.", user_prompt)


def classify_job_family(responsibilities: List[str]) -> str:
    """
    Ask the language model which job family best matches the responsibilities.

    The candidate list is read from "job_families1.csv" (columns
    'Job_family' and 'Job_subfamily') on every call and rendered as one
    "- family: subfamily" bullet per row.

    Args:
        responsibilities: Job responsibilities, interpolated as-is into the prompt.

    Returns:
        The string returned by gpt_call for the request.
    """
    families = pd.read_csv("job_families1.csv")

    bullets = []
    for _, entry in families.iterrows():
        bullets.append(f"- {entry['Job_family']}: {entry['Job_subfamily']}")
    job_family_list = "\n".join(bullets)

    user_prompt = f"""

Here is a list of job responsibilities:

{responsibilities}

Here is a list of Job families
{job_family_list}

Based on the responsibilities, suggest the most relevant job family and subfamily from the list above.   

 **Important:**
    - Return ONLY the job family, nothing else.
    - The job family should be exactly as shown in the list.
    - Do not include any additional text or explanation.
"""

    system_prompt = "Suggest job family and subfamily based on responsibilities."
    return gpt_call(system_prompt, user_prompt)

    
def get_level_CCOG_info(df, code, level_name):
    """Look up one CCOG code in *df* and return its labelled fields.

    Args:
        df: DataFrame with a 'code' column and an 'occupation' column;
            'occupation_description' is used when present.
        code: The code to look up (compared with ``==`` against df['code']).
        level_name: Prefix for the returned keys, e.g. 'Level_1'.

    Returns:
        A dict with the keys '<level_name>_CCOG_code', '<level_name>_CCOG_name'
        and '<level_name>_CCOG_desc'. When no row matches, a warning is
        printed and the name is 'UNKNOWN' with a placeholder description.
    """
    # The lookup only needs the frame passed in; the previous per-call read
    # of occupational_groups.csv into an unused variable was dropped.
    matches = df[df['code'] == code]
    if matches.empty:
        print(f"Warning: No {level_name} found for CCOG code {code}")
        return {
            f'{level_name}_CCOG_code': code,
            f'{level_name}_CCOG_name': 'UNKNOWN',
            f'{level_name}_CCOG_desc': 'No matching occupation found'
        }
    info = matches.iloc[0]  # first match wins if the code is duplicated
    return {
        f'{level_name}_CCOG_code': code,
        f'{level_name}_CCOG_name': info['occupation'],
        f'{level_name}_CCOG_desc': info.get('occupation_description', '')
    }

def code_sanitize(input_string, valid_codes):
    """
    Map a free-text model answer onto one of the allowed codes.

    Matching order:
      1. a code equal to the whitespace-stripped answer;
      2. otherwise the longest code contained in the answer as a substring
         (ties resolved by position in ``valid_codes``).

    Preferring the longest match matters when one code is a prefix of
    another (e.g. "2422.1" vs "2422.12"): plain first-substring matching
    would return the shorter, wrong code.

    Args:
        input_string: Text returned by the model (may be empty or None).
        valid_codes: Iterable of allowed codes; entries that are not
            non-empty strings are ignored.

    Returns:
        The matching code, or None when nothing matches.
    """
    if not input_string:
        return None
    text = str(input_string)

    # Skip NaN/None/empty entries: "" is a substring of everything and
    # non-strings would raise on the `in` test.
    candidates = [code for code in valid_codes if isinstance(code, str) and code]

    answer = text.strip()
    for code in candidates:
        if code == answer:
            return code

    contained = [code for code in candidates if code in text]
    if not contained:
        return None
    return max(contained, key=len)

def classify_occupational_group_by_level(responsibilities: List[str]) -> dict:
    """
    Classify job responsibilities into CCOG occupational groups, one level at a time.

    Reference: [Common Classification of Occupational Groups (CCOG)](https://icsc.un.org/Resources/HRPD/JobEvaluation/CCOG_9_2015.pdf)

    The function reads "occupational_groups.csv" (columns used: 'level',
    'code', 'occupation', 'occupation_description') and walks "Level 1" to
    "Level 4". At each level it lists the candidate rows in a prompt, asks the
    model for one code via gpt_call, maps the answer onto the candidate codes
    with code_sanitize, and stores the code/name/description returned by
    get_level_CCOG_info in the result. Candidates for levels 2-4 are the rows
    of that level whose code starts with the code chosen at the previous level.

    Args:
        responsibilities: Job responsibilities, interpolated as-is into each prompt.
    Returns:
        Dictionary with 'Level_N_CCOG_code', 'Level_N_CCOG_name' and
        'Level_N_CCOG_desc' for every level that was processed. If any
        exception is raised along the way, it is printed, its message is
        stored under 'error', and the levels gathered so far are returned.
    """
    occupational_groups_df = pd.read_csv("occupational_groups.csv")
    result = {}
    
    try:
        ######################## Level 1 ###################
        level1_df = occupational_groups_df[occupational_groups_df['level'] == "Level 1"]
        # Level 1 bullets carry code and name only (no description).
        job_occupation_list = "\n".join(f"- {row['code']}: {row['occupation']}" 
                                     for _, row in level1_df.iterrows())
        list1_output = level1_df["code"].tolist()  # Allowed codes, used to validate the answer
        list1 = ", ".join(map(str, list1_output))  # Same codes, comma-separated for the prompt

        user_prompt1 = f"""
Here is a list of job responsibilities:
{responsibilities}

Here is a list of level 1 Occupation classifications:
{job_occupation_list}

Based on the responsibilities, suggest the most relevant level 1 Occupation code from within this list: {list1}.

    **Important:**
    - Return ONLY the code, nothing else.
    - The code should be exactly as shown in the list.
    - Do not include any additional text or explanation.
"""
        level1_code = gpt_call("Identify level 1 occupational group", user_prompt1).strip()        
        # code_sanitize returns None when the answer contains no allowed code.
        level1_code = code_sanitize(level1_code, list1_output)
        result.update(get_level_CCOG_info(level1_df, level1_code, 'Level_1'))
        
        ######################## Level 2 ###################
        # NOTE(review): if level1_code is None, str.startswith(None) presumably
        # raises and ends up in result['error'] via the except below — confirm.
        level2_df = occupational_groups_df[
            (occupational_groups_df['level'] == "Level 2") & 
            (occupational_groups_df['code'].str.startswith(level1_code))
        ]
        job_occupation_list = "\n".join(f"- {row['code']}: {row['occupation']} - {row['occupation_description']}" 
                                     for _, row in level2_df.iterrows())
        list2_output = level2_df["code"].tolist()  # Allowed codes, used to validate the answer
        list2 = ", ".join(map(str, list2_output))  # Same codes, comma-separated for the prompt
        
        user_prompt2 = f"""
Here is a list of job responsibilities:
{responsibilities}

Here is a list of level 2 Occupation classifications within {level1_code}:
{job_occupation_list}

Based on the responsibilities, suggest the most relevant level 2 Occupation code from within this list: {list2}.
    **Important:**
    - Return ONLY the code, nothing else.
    - The code should be exactly as shown in the list.
    - Do not include any additional text or explanation.
"""
        level2_code = gpt_call("Identify level 2 occupational group", user_prompt2).strip()
        level2_code = code_sanitize(level2_code, list2_output)
        result.update(get_level_CCOG_info(level2_df, level2_code, 'Level_2'))

        ######################## Level 3 ###################
        # Same pattern: level 3 rows whose code starts with the level 2 code.
        level3_df = occupational_groups_df[
            (occupational_groups_df['level'] == "Level 3") & 
            (occupational_groups_df['code'].str.startswith(level2_code))
        ]
        job_occupation_list = "\n".join(f"- {row['code']}: {row['occupation']} - {row['occupation_description']}" 
                                     for _, row in level3_df.iterrows())
        list3_output = level3_df["code"].tolist()  # Allowed codes, used to validate the answer
        list3 = ", ".join(map(str, list3_output))  # Same codes, comma-separated for the prompt
        
        user_prompt3 = f"""
Here is a list of job responsibilities:
{responsibilities}

Here is a list of level 3 Occupation classifications within {level2_code}:
{job_occupation_list}

Based on the responsibilities, suggest the most relevant level 3 Occupation code from within this list: {list3}.

    **Important:**
    - Return ONLY the code, nothing else.
    - The code should be exactly as shown in the list.
    - Do not include any additional text or explanation.

"""
        level3_code = gpt_call("Identify level 3 occupational group", user_prompt3).strip()
        level3_code = code_sanitize(level3_code, list3_output)
        result.update(get_level_CCOG_info(level3_df, level3_code, 'Level_3'))
        
        ######################## Level 4 ###################
        # Same pattern: level 4 rows whose code starts with the level 3 code.
        level4_df = occupational_groups_df[
            (occupational_groups_df['level'] == "Level 4") & 
            (occupational_groups_df['code'].str.startswith(level3_code))
        ]
        # Note the " : " separator before the description at this level.
        job_occupation_list = "\n".join(f"- {row['code']}: {row['occupation']} : {row['occupation_description']}" 
                                     for _, row in level4_df.iterrows())
        list4_output = level4_df["code"].tolist()  # Allowed codes, used to validate the answer
        list4 = ", ".join(map(str, list4_output))  # Same codes, comma-separated for the prompt
        user_prompt4 = f"""
Here is a list of job responsibilities:
{responsibilities}

Here is a list of level 4 Occupation classifications within {level3_code}:
{job_occupation_list}

Based on the responsibilities, suggest the most relevant level 4 Occupation code from within this list: {list4}.
    **Important:**
    - Return ONLY the code, nothing else.
    - The code should be exactly as shown in the list.
    - Do not include any additional text or explanation.
"""
        
        level4_code = gpt_call("Identify final occupational group", user_prompt4).strip()
        level4_code = code_sanitize(level4_code, list4_output)
        result.update(get_level_CCOG_info(level4_df, level4_code, 'Level_4'))
        
    except Exception as e:
        # Best-effort: keep whatever levels were resolved and report the failure.
        print(f"Error during classification: {str(e)}")
        result['error'] = str(e)
    
    return result

from typing import List, Dict
import pandas as pd
# ESCO/ISCO reference tables, read once when the module is imported.
# 'code' (and 'iscoGroup') are forced to str so the codes are not parsed
# as numbers.
esco_df = pd.read_csv(
    "ISCOGroups_en.csv",
    dtype={'code': str}  # Force 'code' to be read as string
)


# Occupation-level table; review_skills() reads this module-level frame.
esco_level5_df = pd.read_csv(
    "occupations_en.csv",
    dtype={'code': str, 'iscoGroup': str, }  # Force 'code' and 'iscoGroup' to be read as strings
)
    
def get_level_ESCO_info(df, code, level_name):
    """Return the ESCO code/name/description triple for *code* looked up in *df*.

    Rows are selected with ``df['code'] == code`` and the first hit is used.
    The returned keys are '<level_name>_ESCO_code', '<level_name>_ESCO_name'
    and '<level_name>_ESCO_desc'. With no hit a warning is printed and the
    name is 'UNKNOWN' with a placeholder description.
    """
    code_key = f'{level_name}_ESCO_code'
    name_key = f'{level_name}_ESCO_name'
    desc_key = f'{level_name}_ESCO_desc'

    hits = df[df['code'] == code]
    if len(hits) > 0:
        first = hits.iloc[0]
        return {
            code_key: code,
            name_key: first['preferredLabel'],
            desc_key: first.get('description', ''),
        }

    print(f"Warning: No {level_name} found for ESCO code {code}")
    return {
        code_key: code,
        name_key: 'UNKNOWN',
        desc_key: 'No matching occupation found',
    }

def _esco_options(level_df):
    """Render one "- code: label - description" bullet per row of *level_df*."""
    return "\n".join(
        f"- {row['code']}: {row['preferredLabel']} - {row['description']}"
        for _, row in level_df.iterrows()
    )


def _esco_children(parent_code):
    """Return the esco_df rows whose code is *parent_code* plus exactly one character."""
    codes = esco_df['code']
    # na=False keeps missing codes out of the mask instead of propagating NaN.
    is_child = codes.str.startswith(parent_code, na=False) & (codes.str.len() == len(parent_code) + 1)
    return esco_df[is_child]


def _esco_choose(system_prompt, user_prompt, level_df, level_name, result):
    """Ask the model for a code, validate it against *level_df*, record it in *result*.

    Returns the validated code, or None when the answer contains none of the
    candidate codes (the level is then recorded as 'UNKNOWN').
    """
    answer = gpt_call(system_prompt, user_prompt).strip()
    code = code_sanitize(answer, level_df["code"].tolist())
    result.update(get_level_ESCO_info(level_df, code, level_name))
    return code


def classify_esco_by_hierarchical_level(responsibilities: List[str]) -> dict:
    """
    Classify job responsibilities into ESCO occupations across 5 hierarchical levels.

    Reference: [European Skills, Competences, Qualifications, and Occupations (ESCO)](https://esco.ec.europa.eu/en)

    Levels 1-4 come from the module-level ``esco_df`` (ISCO groups): level 1
    is every single-character code, and each following level is the set of
    codes extending the previously chosen code by one character. Level 5
    comes from the module-level ``esco_level5_df`` (occupations) whose
    'iscoGroup' starts with the level 4 code.

    At each level the candidates are listed in a prompt, the model is asked
    for one code, and the answer is validated with code_sanitize.

    Args:
        responsibilities: Job responsibilities, interpolated as-is into each prompt.
    Returns:
        Dictionary with 'Level_N_ESCO_code', 'Level_N_ESCO_name' and
        'Level_N_ESCO_desc' for each level reached. Classification stops at
        the first level that has no candidates (nothing is recorded for it)
        or for which the model gives no valid code (recorded with code None
        and name 'UNKNOWN'); deeper levels are then absent from the result.
    """
    result = {}

    ######################## Level 1 ###################
    # Top-level codes are the single-character ones; skip missing values.
    top_level_codes = sorted({
        code for code in esco_df['code']
        if isinstance(code, str) and len(code) == 1 and code.isalnum()
    })
    level1_df = esco_df[esco_df['code'].isin(top_level_codes)]
    if level1_df.empty:
        return result

    job_occupation_list = _esco_options(level1_df)
    list1 = ", ".join(map(str, level1_df["code"].tolist()))
    user_prompt1 = f"""
Here is a list of job responsibilities:
{responsibilities}

Select the most relevant top-level code from these options:
{job_occupation_list}

Based on the responsibilities, suggest the most relevant level 1 Occupation code from within this list: {list1}.
    **Important:**
    - Return ONLY the code, nothing else.
    - The code should be exactly as shown in the list.
    - Do not include any additional text or explanation.
"""
    level1_code = _esco_choose("Identify top-level occupational group", user_prompt1,
                               level1_df, 'Level_1', result)
    if not level1_code:
        return result

    ######################## Level 2 ###################
    level2_df = _esco_children(level1_code)
    if level2_df.empty:
        return result

    level2_options = _esco_options(level2_df)
    list2 = ", ".join(map(str, level2_df["code"].tolist()))
    user_prompt2 = f"""
Here is a list of job responsibilities:
{responsibilities}

Here is a list of level 2 Occupation classifications within {level1_code}:
{level2_options}

Based on the responsibilities, suggest the most relevant level 2 Occupation code from within this list: {list2}.
    **Important:**
    - Return ONLY the code, nothing else.
    - The code should be exactly as shown in the list.
    - Do not include any additional text or explanation.
"""
    level2_code = _esco_choose("Identify second-level occupational group", user_prompt2,
                               level2_df, 'Level_2', result)
    if not level2_code:
        return result

    ######################## Level 3 ###################
    level3_df = _esco_children(level2_code)
    if level3_df.empty:
        return result

    level3_options = _esco_options(level3_df)
    list3 = ", ".join(map(str, level3_df["code"].tolist()))
    user_prompt3 = f"""
Here is a list of job responsibilities:
{responsibilities}

Here is a list of level 3 Occupation classifications within {level2_code}:
{level3_options}

Based on the responsibilities, suggest the most relevant level 3 Occupation code from within this list: {list3}.

    **Important:**
    - Return ONLY the code, nothing else.
    - The code should be exactly as shown in the list.
    - Do not include any additional text or explanation.

"""
    level3_code = _esco_choose("Identify third-level occupational group", user_prompt3,
                               level3_df, 'Level_3', result)
    if not level3_code:
        return result

    ######################## Level 4 ###################
    level4_df = _esco_children(level3_code)
    if level4_df.empty:
        return result

    level4_options = _esco_options(level4_df)
    list4 = ", ".join(map(str, level4_df["code"].tolist()))
    user_prompt4 = f"""
Here is a list of job responsibilities:
{responsibilities}

Here is a list of level 4 Occupation classifications within {level3_code}:
{level4_options}

Based on the responsibilities, suggest the most relevant level 4 Occupation code from within this list: {list4}.
    **Important:**
    - Return ONLY the code, nothing else.
    - The code should be exactly as shown in the list.
    - Do not include any additional text or explanation.
"""
    level4_code = _esco_choose("Identify fourth-level occupational group", user_prompt4,
                               level4_df, 'Level_4', result)
    if not level4_code:
        return result

    ######################## Level 5 ###################
    # Occupations are linked to their ISCO group through 'iscoGroup'.
    level5_df = esco_level5_df[
        esco_level5_df['iscoGroup'].str.startswith(level4_code, na=False)
    ]
    if level5_df.empty:
        return result

    level5_options = _esco_options(level5_df)
    list5 = ", ".join(map(str, level5_df["code"].tolist()))
    # The prompt previously said "level 4" here (copy-paste slip).
    user_prompt5 = f"""
Here is a list of job responsibilities:
{responsibilities}

Here is a list of level 5 Occupation classifications within {level4_code}:
{level5_options}

Based on the responsibilities, suggest the most relevant level 5 Occupation code from within this list: {list5}.
    **Important:**
    - Return ONLY the code as stated in the provided list, nothing else.
    - The code should be exactly as shown in the list.
    - Do not include any additional text, occupation code or explanation.
"""
    _esco_choose("Identify fifth-level occupational group", user_prompt5,
                 level5_df, 'Level_5', result)

    return result
    

def get_skills_info_esco(Level_5_code):
    """Return the ESCO skills linked to the occupation with code *Level_5_code*.

    Three CSV files are read on every call:
      - "occupations_en.csv": occupation 'code' -> 'conceptUri'
      - "occupationSkillRelations_en.csv": 'occupationUri' -> 'skillUri'
      - "skills_en.csv": skill 'conceptUri', 'preferredLabel', 'description'

    Returns:
        DataFrame with de-duplicated rows and the columns 'skill_name',
        'skill_code' (the skill's conceptUri) and 'skill_description';
        empty when the occupation code is unknown or has no related skills.
    """
    occupations = pd.read_csv(
        "occupations_en.csv",
        dtype={'code': str, 'iscoGroup': str},  # keep codes as text
    )
    # URIs of the occupation(s) carrying this code.
    occupation_uris = occupations.loc[
        occupations['code'] == Level_5_code, 'conceptUri'
    ].values.tolist()

    relations = pd.read_csv("occupationSkillRelations_en.csv")
    # URIs of every skill related to any of those occupations.
    skill_uris = relations.loc[
        relations['occupationUri'].isin(occupation_uris), 'skillUri'
    ].values.tolist()

    catalogue = pd.read_csv("skills_en.csv")
    selected = catalogue[catalogue['conceptUri'].isin(skill_uris)]

    new_names = {
        'preferredLabel': 'skill_name',
        'description': 'skill_description',
        'conceptUri': 'skill_code',
    }
    unique_rows = selected[['preferredLabel', 'conceptUri', 'description']].drop_duplicates()
    return unique_rows.rename(columns=new_names)


def review_skills( Level_5_code: str, top_n: int = 10) -> List[Dict[str, str]]:
    """
    Ask the language model to keep the ESCO skills of an occupation that are relevant for IOM.

    The candidate skills come from get_skills_info_esco(Level_5_code); the
    occupation label is looked up in the module-level ``esco_level5_df``.

    Args:
        Level_5_code: ESCO occupation code.
        top_n (int): Maximum number of skills requested from the model and
            returned. Defaults to 10.

    Returns:
        List[Dict[str, str]]: At most ``top_n`` skill dictionaries with keys:
            - skill_name
            - skill_description
            - skill_code
        An empty list when the occupation has no candidate skills, or when
        the model's answer contains no parseable JSON object.
    """
    skill_filtered = get_skills_info_esco(Level_5_code)
    if skill_filtered.empty:
        # Nothing to review: do not ask the model to filter an empty list.
        return []

    matches = esco_level5_df[esco_level5_df['code'] == Level_5_code]
    # Plain comma-separated label(s) instead of a Python list repr in the prompt.
    esco_occup = ", ".join(map(str, matches['preferredLabel'].tolist()))

    skill_filtered_options = "\n".join(f"- {row['skill_code']}: {row['skill_name']} - {row['skill_description']}"
                                for _, row in skill_filtered.iterrows())
    
    # The requested number of skills follows top_n (previously fixed at 10).
    prompt = f"""
Here is a list of skills:

{skill_filtered_options}

Filter the skills that relevant in the context of the work of the International Organisation for Migration. 

Ensure that skills is relevant in the context of a {esco_occup} working for non-profit public organisation.

Required JSON structure:
{{
    "skills": [
        {{
            "skill_name": "string",
            "skill_description": "string",
            "skill_code": "string" 
        }}
    ]
}}

**Important:**
- Do not duplicate any records of skills
- keep only the {top_n} most relevant skills
- Return ONLY the JSON object with no other text
- Use double quotes for all strings
- No trailing commas in arrays/objects
- No markdown formatting (no ```json)
- No text before or after the JSON
- Escape all special characters in strings
- Ensure all brackets are properly closed
- No trailing commas in arrays/objects, especially before closing brackets 
"""

    raw = gpt_call(
        "You are an HR expert working for the International Organisation for Migration and with in-depth knowledge of the European Skills, Competences, Qualifications and Occupations. Extract skills required for this position.",
        prompt
    )

    json_text = _extract_json(raw)
    if not json_text:
        return []

    try:
        result = json.loads(json_text)
        skills = result.get("skills", [])
    except json.JSONDecodeError as e:
        print(f"❌ JSON parsing error: {e}")
        print(f"🔍 Problematic JSON: {json_text}")
        return []

    if not isinstance(skills, list):
        # e.g. "skills": null — nothing usable to validate.
        return []

    validated_skills = []
    for skill in skills:
        try:
            validated = {
                "skill_name": str(skill["skill_name"]).strip(),
                "skill_description": str(skill["skill_description"]).strip(),
                "skill_code": str(skill["skill_code"]).strip()
            }
            validated_skills.append(validated)
        except (KeyError, TypeError) as e:
            print(f"⚠️ Skipping invalid skill: {skill}. Error: {e}")
            continue

    return validated_skills[:top_n]


def extract_skills(responsibilities: List[str], top_n: int = 10) -> List[Dict[str, str]]:
    """
    Ask the language model for ESCO-style skills implied by job responsibilities.

    The model is asked for a JSON object with a "skills" array; the answer is
    cleaned with _extract_json, parsed, and each entry is normalised to
    stripped strings. Entries lacking a required key (or that are not
    mappings) are reported and skipped.

    Args:
        responsibilities (List[str]): Job responsibilities, interpolated as-is
            into the prompt.
        top_n (int): Maximum number of skills to return. Defaults to 10.

    Returns:
        List[Dict[str, str]]: Up to ``top_n`` dictionaries with keys:
            - skill_name
            - skill_description
            - skill_code
        An empty list when no JSON object is found or it cannot be parsed.
    """
    system_prompt = "You are an HR expert working for the International Organisation for Migration and with in-depth knowledge of the European Skills, Competences, Qualifications and Occupations. Extract skills required for this position."

    prompt = f"""
Here is a list of job responsibilities:

{responsibilities}

List the required skills and knowledge as bullet points (without numbers) using ESCO-style terms. 

For each Skill: 

 1. skill_name: precise skills name as used in ESCO framework
 2. skill_description: add the long description as mentioned in ESCO framework
 3. skill_code: include the detailed corresponding ESCO code for that skill.

Required JSON structure:
{{
    "skills": [
        {{
            "skill_name": "string",
            "skill_description": "string",
            "skill_code": "string" 
        }}
    ]
}}

**Important:**
- Return ONLY the JSON object with no other text
- Use double quotes for all strings
- No trailing commas in arrays/objects
- No markdown formatting (no ```json)
- No text before or after the JSON
- Escape all special characters in strings
- Ensure all brackets are properly closed
"""

    answer = gpt_call(system_prompt, prompt)

    json_text = _extract_json(answer)
    if not json_text:
        return []

    try:
        payload = json.loads(json_text)
        entries = payload.get("skills", [])
    except json.JSONDecodeError as e:
        print(f"❌ JSON parsing error: {e}")
        print(f"🔍 Problematic JSON: {json_text}")
        return []

    required_keys = ("skill_name", "skill_description", "skill_code")
    accepted = []
    for skill in entries:
        try:
            # Dict comprehension evaluates keys in order, like the literal it replaces.
            normalised = {key: str(skill[key]).strip() for key in required_keys}
        except (KeyError, TypeError) as e:
            print(f"⚠️ Skipping invalid skill: {skill}. Error: {e}")
        else:
            accepted.append(normalised)

    return accepted[:top_n]


def map_proficiency_and_assessment(skills: List[str], responsibilities: List[str]) -> List[Dict]:
    """
    Map each skill to its contextual importance, expected proficiency level,
    and assessment strategy based on job responsibilities.

    Args:
        skills (List[str]): Skills to assess, interpolated as-is into the prompt.
        responsibilities (List[str]): Job responsibilities, interpolated as-is
            into the prompt.

    Returns:
        List[Dict]: One dictionary per valid item returned by the model:
            - skill_name
            - importance (lower-cased; essential / optional requested)
            - type (lower-cased; "skill/competence" or "knowledge" requested)
            - proficiency_level (capitalised; Basic, Intermediate, Advanced requested)
            - distinctive_elements
            - resume_signals
            - assessment_method
        An empty list when ``skills`` is empty, when no JSON array is found
        in the answer, or when it cannot be parsed. Items with missing keys
        or non-string fields are reported and skipped.
    """
    if not skills:
        # No skills to assess: skip the model call entirely.
        return []

    prompt = f"""
Here is a list of job responsibilities: {responsibilities} that have been associated with the following skills: {skills}

For each skill, accounting for the context defined within the responsibilities, return a JSON object with:
    - skill_name: the name of the skill
    - importance: essential or optional
    - type: "skill/competence" or "knowledge"
    - proficiency_level: Basic, Intermediate, or Advanced
    - distinctive_elements: what specific and distinctive elements are required at this defined proficiency level?
    - resume_signals: what to look for in a resume to assess this skill?
    - assessment_method: what is the preferred assessment method to accurately assess this skill?

Respond ONLY with a list of dictionaries in valid JSON.
Use double quotes for all strings. No markdown, no commentary, no trailing commas.
"""

    raw = gpt_call("Define proficiency level and assessment for each skill.", prompt)

    json_text = _extract_json_array(raw)
    if not json_text:
        return []

    try:
        results = json.loads(json_text)
    except json.JSONDecodeError as e:
        print(f"❌ JSON parsing error: {e}")
        print(f"🔍 Problematic JSON: {json_text}")
        return []

    validated = []
    for item in results:
        try:
            validated.append({
                "skill_name": str(item["skill_name"]).strip(),
                "importance": item["importance"].strip().lower(),
                "type": item["type"].strip().lower(),
                "proficiency_level": item["proficiency_level"].strip().capitalize(),
                "distinctive_elements": item["distinctive_elements"].strip(),
                "resume_signals": item["resume_signals"].strip(),
                "assessment_method": item["assessment_method"].strip()
            })
        # AttributeError: a field that is not a string (null, list, number)
        # has no .strip(); skip the item instead of aborting the whole batch.
        except (KeyError, TypeError, AttributeError) as e:
            print(f"⚠️ Skipping invalid item: {item}. Error: {e}")
            continue

    return validated

def _extract_json_array(raw: str) -> str:
    """
    Attempts to extract a clean JSON array from raw GPT output.
    """
    json_start = raw.find('[')
    json_end = raw.rfind(']') + 1

    if json_start == -1 or json_end == 0:
        print(f"❌ No JSON array found in response: {raw}")
        return ""

    json_text = raw[json_start:json_end]

    # Cleanup
    json_text = re.sub(r',\s*([}\]])', r'\1', json_text)  # Remove trailing commas
    json_text = re.sub(r'[\n\r\t]', ' ', json_text)       # Remove control chars
    json_text = re.sub(r'(?<!\\)"', '"', json_text)       # Fix quotes if needed

    return json_text

def extract_qualification(responsibilities: List[str]) -> List[str]:
    """
    Ask the language model for the EQF level and diplomas matching the responsibilities.

    Args:
        responsibilities: Job responsibilities, interpolated as-is into the prompt.

    Returns:
        The non-blank lines of the model's answer, each with leading/trailing
        '-', '•' and space characters removed.
    """
    prompt = f"""
Here is a list of job responsibilities: {responsibilities} 

Infer the required level within the European Qualifications Framework (EQF) to implement them.
Identify the potential diplomas to testify such qualification
"""

    system_prompt = "You are an HR expert that excel in developing compentency based interview questions."
    answer = gpt_call(system_prompt, prompt)

    cleaned_lines = []
    for text_line in answer.splitlines():
        if not text_line.strip():
            continue  # drop blank lines
        cleaned_lines.append(text_line.strip("-• ").strip())
    return cleaned_lines

def build_interview(responsibilities: List[str], skill_assess: List[str]) -> List[str]:
    """
    Ask the language model for a structured 40-minute interview outline.

    Args:
        responsibilities: Job responsibilities, interpolated as-is into the prompt.
        skill_assess: Related skills, interpolated as-is into the prompt.

    Returns:
        The non-blank lines of the model's answer, each with leading/trailing
        '-', '•' and space characters removed.
    """
    prompt = f"""

Here is a list of job responsibilities: {responsibilities} and related skills: {skill_assess}

Output: A structured 40-minute interview with:

    Opening questions (5 min)

    Core competency-based questions (30 min, 5-6 questions)

    Closing & candidate questions (5 min)

 
"""

    system_prompt = "You are an HR expert that excel in developing compentency based interview questions."
    answer = gpt_call(system_prompt, prompt)

    outline = []
    for text_line in answer.splitlines():
        if text_line.strip():
            outline.append(text_line.strip("-• ").strip())
    return outline

    
def _extract_json(raw: str) -> str:
    """
    Attempts to extract and clean a JSON object from a raw string.
    """
    json_start = raw.find('{')
    json_end = raw.rfind('}') + 1

    if json_start == -1 or json_end == 0:
        print(f"❌ No JSON found in response: {raw}")
        return ""

    json_text = raw[json_start:json_end]

    # Clean common issues
    json_text = re.sub(r',\s*([}\]])', r'\1', json_text)       # Remove trailing commas
    json_text = re.sub(r'[\n\r\t]', ' ', json_text)            # Remove control characters
    json_text = re.sub(r'\s{2,}', ' ', json_text)              # Collapse multiple spaces
    json_text = re.sub(r'\\(?!["\\/bfnrtu])', r'\\\\', json_text)  # Escape lone backslashes
    json_text = json_text.strip()

    return json_text    


def _reference_job_family():
    """
    Returns (job_family, job_subfamily) read from the module-level ``job_fam1`` table.

    ``job_fam1`` is not created anywhere in process_pdf, so it is only used when
    a global of that name exists; otherwise both values are None.
    NOTE(review): presumably a pandas lookup table from a batch pipeline - confirm.
    """
    try:
        table = job_fam1
    except NameError:
        return None, None
    return table['Job_family'].values[0], table['Job_subfamily'].values[0]


def process_pdf(file):
    """
    Processes the uploaded PDF file and returns the job-description analysis.

    Extracts the text, isolates the "Responsibilities and Accountabilities"
    section, then runs the classification, qualification, skills and interview
    steps on it.

    Parameters:
        file: The upload from the Gradio File component - either an object
            exposing the path as ``.name`` or the path string itself

    Returns:
        dict: The assembled analysis on success
        None: When no responsibilities section is found in the PDF
        str: A message when no file was uploaded or when any step raised
    """
    if file is None:
        return "Please upload a PDF file."

    try:
        # Accept both a temp-file style object (.name) and a plain path string.
        pdf_path = getattr(file, "name", file)
        file_label = os.path.basename(pdf_path)

        extracted_text = extract_text_from_pdf(pdf_path)

        # Extract responsibilities section
        responsibilities = extract_section_from_pdf(extracted_text, section_title="Responsibilities and Accountabilities")
        if not responsibilities:
            print(f"Skipping {file_label} - no responsibilities section found")
            return None

        # Main processing
        job_family = classify_job_family(responsibilities)
        occ_group = classify_occupational_group_by_level(responsibilities)
        esco_occ = classify_esco_by_hierarchical_level(responsibilities)
        qualification = extract_qualification(responsibilities)
        skills = extract_skills(responsibilities)
        skill_map = map_proficiency_and_assessment(skills, responsibilities)

        # Check if we have ESCO level 5 code
        has_esco = esco_occ.get("Level_5_ESCO_code") is not None

        # ESCO-based skills processing (only if we have Level 5 code)
        skill_esco_extract = []
        skill_esco_map = []
        if has_esco:
            Level_5_code = esco_occ["Level_5_ESCO_code"]
            skill_esco_extract = review_skills(Level_5_code)
            skill_esco_map = map_proficiency_and_assessment(skill_esco_extract, responsibilities)
        else:
            print(f"No Level 5 ESCO code found for {file_label}, skipping ESCO skills mapping")

        time.sleep(6)  # Rate limiting delay

        # Join original skills with assessment (one lookup per skill)
        assessment_lookup = {item['skill_name']: item for item in skill_map}
        assessment_fields = (
            "importance",
            "type",
            "proficiency_level",
            "distinctive_elements",
            "resume_signals",
            "assessment_method",
        )
        joined_skills = []
        for skill in skills:
            assessed = assessment_lookup.get(skill["skill_name"], {})
            joined = {
                "skill_name": skill["skill_name"],
                "skill_description": skill["skill_description"],
                "skill_code": skill["skill_code"],
            }
            # Missing assessment fields are kept as explicit None values.
            joined.update({field: assessed.get(field) for field in assessment_fields})
            joined_skills.append(joined)

        # Join ESCO skills with assessment (only if we processed them)
        joined_skills_esco = []
        if has_esco and skill_esco_extract:
            assessment_esco_lookup = {item['skill_name']: item for item in skill_esco_map}
            joined_skills_esco = [
                {
                    "skill_name": skill["skill_name"],
                    "skill_description": skill["skill_description"],
                    "skill_code": skill["skill_code"],
                    **assessment_esco_lookup.get(skill["skill_name"], {})
                }
                for skill in skill_esco_extract
            ]

        interview = build_interview(responsibilities, skills)

        ref_family, ref_subfamily = _reference_job_family()

        # Prepare base result dictionary
        result = {
            "file": file_label,
            "responsibilities": responsibilities,
            "job_family": ref_family,
            "job_subfamily": ref_subfamily,
            "classified_job_family": job_family,
            **{f"Level_{i}_CCOG_{field}": occ_group.get(f"Level_{i}_CCOG_{field}")
               for i in range(1, 5) for field in ["code", "name", "desc"]},
            "qualification": qualification,
            "interview": interview,
            "skills": {
                "file": file_label,
                "job_family": ref_family,
                "job_subfamily": ref_subfamily,
                "skills": joined_skills
            }
        }

        # Add ESCO fields only if we have them
        if has_esco:
            result.update({
                **{f"Level_{i}_ESCO_{field}": esco_occ.get(f"Level_{i}_ESCO_{field}")
                   for i in range(1, 6) for field in ["code", "name", "desc"]},
                "skills_esco": {
                    "file": file_label,
                    "job_family": ref_family,
                    "job_subfamily": ref_subfamily,
                    "skills": joined_skills_esco
                }
            })
        else:
            # Mark ESCO fields as null if not available
            result.update({
                **{f"Level_{i}_ESCO_{field}": None
                   for i in range(1, 6) for field in ["code", "name", "desc"]},
                "skills_esco": None
            })

        return result

    except Exception as e:
        # UI boundary: report the failure as text, but keep the traceback on the
        # console so the underlying cause is not lost.
        traceback.print_exc()
        return f"Error processing PDF: {str(e)}"

# Create the Gradio interface: a title and description above a two-column row
# (upload + button on the left, read-only result box on the right).
with gr.Blocks() as demo:
    gr.Markdown("# Standardise Job Description!")
    gr.Markdown("Identify Job Family, Occupation, Qualification, match Skills and suggest interview questions.")
    
    with gr.Row():
        with gr.Column():
            # Upload is restricted to .pdf files
            file_input = gr.File(label="Upload a Job Description PDF file", file_types=[".pdf"])
            submit_btn = gr.Button("Extract Text")
        with gr.Column():
            # Display only (interactive=False); receives whatever process_pdf returns
            text_output = gr.Textbox(label="Extracted Text", lines=30, max_lines=50, interactive=False)
    
    # Clicking the button passes the uploaded file to process_pdf and writes
    # its return value into the textbox.
    # NOTE(review): the button and textbox labels say "Extract Text" although
    # process_pdf builds a full analysis result - consider renaming.
    submit_btn.click(
        fn=process_pdf,
        inputs=file_input,
        outputs=text_output
    )

# Run the app only when this file is executed as a script (not on import)
if __name__ == "__main__":
    demo.launch()