edouardlgp commited on
Commit
8d94714
·
verified ·
1 Parent(s): e60de6e

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +161 -701
app.py CHANGED
@@ -6,7 +6,6 @@ import warnings
6
  import logging
7
  import os
8
  from dotenv import load_dotenv
9
- import os
10
  import json
11
  from concurrent.futures import ThreadPoolExecutor
12
  from typing import List, Dict, Optional
@@ -16,54 +15,47 @@ import time
16
  # Load environment variables
17
  load_dotenv()
18
 
 
 
 
 
 
19
 
20
- import openai
21
- def gpt_call(system_prompt: str, user_prompt: str) -> str:
22
  try:
23
  client = openai.AzureOpenAI(
24
  api_key=os.getenv("AZURE_OPENAI_API_KEY"),
25
  azure_endpoint=os.getenv("AZURE_OPENAI_ENDPOINT"),
26
  api_version=os.getenv("OPENAI_API_VERSION"),
27
  )
28
- response = client.chat.completions.create(
 
 
 
 
 
 
 
 
29
  model=os.getenv("AZURE_DEPLOYMENT_NAME"),
30
  messages=[
31
- {"role": "system", "content": system_prompt},
32
- {"role": "user", "content": user_prompt}
33
  ],
34
- temperature=0.3 # setting a low temp to be conservative
35
  )
36
  return response.choices[0].message.content.strip()
37
- except OpenAIError as e:
38
  return f"ERROR: {e}"
39
 
40
-
41
-
42
- # Configure logging for pdfminer
43
- logging.getLogger('pdfminer').setLevel(logging.ERROR) # Only show errors, not warnings
44
-
45
- def extract_text_from_pdf(pdf_path, suppress_warnings=True):
46
- """
47
- Extracts all text from a PDF, including text from nested tables and complex layouts.
48
-
49
- Parameters:
50
- pdf_path (str): Path to the PDF file
51
- suppress_warnings (bool): Whether to suppress PDF parsing warnings (default: True)
52
- """
53
  text = ""
54
-
55
- # Create a custom filter for the specific warning
56
- if suppress_warnings:
57
- warnings.filterwarnings("ignore", category=UserWarning, message="CropBox.*")
58
-
59
  with pdfplumber.open(pdf_path) as pdf:
60
  for page in pdf.pages:
61
- # Extract text from the page
62
  page_text = page.extract_text()
63
  if page_text:
64
  text += page_text + "\n"
65
-
66
- # Extract text from tables (if any)
67
  for table in page.extract_tables():
68
  for row in table:
69
  for cell in row:
@@ -72,57 +64,36 @@ def extract_text_from_pdf(pdf_path, suppress_warnings=True):
72
  text += "\n"
73
  return text
74
 
75
-
76
-
77
-
78
- def extract_section_from_pdf(full_text, section_title):
79
- """
80
- Uses OpenAI to extract a specific section (e.g., "Responsibilities and Accountabilities") from the full text.
81
- """
82
  user_prompt = f"""
83
-
84
- Carefully evaluate the provided position description (PD) document and extract thecontent of the section titled "{section_title}" from the following text.
85
-
86
  Return only the content of the section, without the title.
87
- If the section cannot be found or explicitly mentioned in the text, use ""N/A"" as the default value.
88
  Do not repeat in the extracted text the name of the section.
89
  Extract precisely all the related text.
90
-
91
- Text of the position description:
92
  {full_text}
93
-
94
  Section to identify: "{section_title}":
95
  """
96
-
97
  return gpt_call("You are an HR expert working for IOM.", user_prompt)
98
 
99
-
100
  def classify_job_family(responsibilities: List[str]) -> str:
101
  job_families_df = pd.read_csv("job_families1.csv")
102
  job_family_list = "\n".join(f"- {row['Job_family']}: {row['Job_subfamily']}" for _, row in job_families_df.iterrows())
103
  user_prompt = f"""
104
-
105
- Here is a list of job responsibilities:
106
-
107
- {responsibilities}
108
-
109
- Here is a list of Job families
110
- {job_family_list}
111
-
112
- Based on the responsibilities, suggest the most relevant job family and subfamily from the list above.
113
-
114
- **Important:**
115
  - Return ONLY the job family, nothing else.
116
  - The job family should be exactly as shown in the list.
117
  - Do not include any additional text or explanation.
118
- """
119
-
120
  return gpt_call("Suggest job family and subfamily based on responsibilities.", user_prompt)
121
 
122
-
123
  def get_level_CCOG_info(df, code, level_name):
124
- """Helper function to get level info with error handling"""
125
- occupational_groups_df = pd.read_csv("occupational_groups.csv")
126
  matches = df[df['code'] == code]
127
  if len(matches) == 0:
128
  print(f"Warning: No {level_name} found for CCOG code {code}")
@@ -139,481 +110,90 @@ def get_level_CCOG_info(df, code, level_name):
139
  }
140
 
141
  def code_sanitize(input_string, valid_codes):
142
- """
143
- Checks if any of the valid_codes exists as a substring in input_string.
144
- Returns the first matching code, otherwise None.
145
- """
146
  for code in valid_codes:
147
- if code in input_string: # Checks for exact substring match
148
  return code
149
  return None
150
 
151
  def classify_occupational_group_by_level(responsibilities: List[str]) -> dict:
152
- """
153
- Classifies job responsibilities into occupational groups at 4 levels,
154
- The [Common Classification of Occupational Groups (CCOG)](https://icsc.un.org/Resources/HRPD/JobEvaluation/CCOG_9_2015.pdf)
155
- returning codes, names, and descriptions for each level.
156
- Args:
157
- responsibilities: List of job responsibility strings
158
- Returns:
159
- Dictionary containing classification information or error message
160
- """
161
  occupational_groups_df = pd.read_csv("occupational_groups.csv")
162
  result = {}
163
-
164
  try:
165
- ######################## Level 1 ###################
166
- level1_df = occupational_groups_df[occupational_groups_df['level'] == "Level 1"]
167
- job_occupation_list = "\n".join(f"- {row['code']}: {row['occupation']}"
168
- for _, row in level1_df.iterrows())
169
- #print(job_occupation_list)
170
- list1_output = level1_df["code"].tolist() # Convert Series to list
171
- list1 = ", ".join(map(str, list1_output)) # Join elements with comma
172
- #print(list1)
173
-
174
- user_prompt1 = f"""
175
- Here is a list of job responsibilities:
176
- {responsibilities}
177
-
178
- Here is a list of level 1 Occupation classifications:
179
- {job_occupation_list}
180
-
181
- Based on the responsibilities, suggest the most relevant level 1 Occupation code from within this list: {list1}.
182
-
183
- **Important:**
184
- - Return ONLY the code, nothing else.
185
- - The code should be exactly as shown in the list.
186
- - Do not include any additional text or explanation.
187
- """
188
- #print(user_prompt1)
189
- level1_code = gpt_call("Identify level 1 occupational group", user_prompt1).strip()
190
- level1_code = code_sanitize(level1_code, list1_output)
191
- #print(level1_code)
192
- result.update(get_level_CCOG_info(level1_df, level1_code, 'Level_1'))
193
-
194
- ######################## Level 2 ###################
195
- level2_df = occupational_groups_df[
196
- (occupational_groups_df['level'] == "Level 2") &
197
- (occupational_groups_df['code'].str.startswith(level1_code))
198
- ]
199
- job_occupation_list = "\n".join(f"- {row['code']}: {row['occupation']} - {row['occupation_description']}"
200
- for _, row in level2_df.iterrows())
201
- #print(job_occupation_list)
202
- list2_output = level2_df["code"].tolist() # Convert Series to list
203
- list2 = ", ".join(map(str, list2_output)) # Join elements with comma
204
- #print(list2)
205
-
206
- user_prompt2 = f"""
207
- Here is a list of job responsibilities:
208
- {responsibilities}
209
-
210
- Here is a list of level 2 Occupation classifications within {level1_code}:
211
- {job_occupation_list}
212
-
213
- Based on the responsibilities, suggest the most relevant level 2 Occupation code from within this list: {list2}.
214
- **Important:**
215
- - Return ONLY the code, nothing else.
216
- - The code should be exactly as shown in the list.
217
- - Do not include any additional text or explanation.
218
- """
219
- #print(user_prompt2)
220
- level2_code = gpt_call("Identify level 2 occupational group", user_prompt2).strip()
221
- level2_code = code_sanitize(level2_code, list2_output)
222
- #print(level2_code)
223
- result.update(get_level_CCOG_info(level2_df, level2_code, 'Level_2'))
224
-
225
- ######################## Level 3 ###################
226
- level3_df = occupational_groups_df[
227
- (occupational_groups_df['level'] == "Level 3") &
228
- (occupational_groups_df['code'].str.startswith(level2_code))
229
- ]
230
- job_occupation_list = "\n".join(f"- {row['code']}: {row['occupation']} - {row['occupation_description']}"
231
- for _, row in level3_df.iterrows())
232
- #print(job_occupation_list)
233
- list3_output = level3_df["code"].tolist() # Convert Series to list
234
- list3 = ", ".join(map(str, list3_output)) # Join elements with comma
235
- #print(list3)
236
-
237
- user_prompt3 = f"""
238
- Here is a list of job responsibilities:
239
- {responsibilities}
240
-
241
- Here is a list of level 3 Occupation classifications within {level2_code}:
242
- {job_occupation_list}
243
-
244
- Based on the responsibilities, suggest the most relevant level 3 Occupation code from within this list: {list3}.
245
-
246
- **Important:**
247
- - Return ONLY the code, nothing else.
248
- - The code should be exactly as shown in the list.
249
- - Do not include any additional text or explanation.
250
-
251
- """
252
- level3_code = gpt_call("Identify level 3 occupational group", user_prompt3).strip()
253
- level3_code = code_sanitize(level3_code, list3_output)
254
- result.update(get_level_CCOG_info(level3_df, level3_code, 'Level_3'))
255
-
256
- ######################## Level 4 ###################
257
- level4_df = occupational_groups_df[
258
- (occupational_groups_df['level'] == "Level 4") &
259
- (occupational_groups_df['code'].str.startswith(level3_code))
260
- ]
261
- job_occupation_list = "\n".join(f"- {row['code']}: {row['occupation']} : {row['occupation_description']}"
262
- for _, row in level4_df.iterrows())
263
- #print(job_occupation_list)
264
- list4_output = level4_df["code"].tolist() # Convert Series to list
265
- list4 = ", ".join(map(str, list4_output)) # Join elements with comma
266
- #print(list4)
267
- user_prompt4 = f"""
268
- Here is a list of job responsibilities:
269
- {responsibilities}
270
-
271
- Here is a list of level 4 Occupation classifications within {level3_code}:
272
- {job_occupation_list}
273
-
274
- Based on the responsibilities, suggest the most relevant level 4 Occupation code from within this list: {list4}.
275
- **Important:**
276
- - Return ONLY the code, nothing else.
277
- - The code should be exactly as shown in the list.
278
- - Do not include any additional text or explanation.
279
- """
280
-
281
- level4_code = gpt_call("Identify final occupational group", user_prompt4).strip()
282
- level4_code = code_sanitize(level4_code, list4_output)
283
- result.update(get_level_CCOG_info(level4_df, level4_code, 'Level_4'))
284
-
285
  except Exception as e:
286
  print(f"Error during classification: {str(e)}")
287
  result['error'] = str(e)
288
-
289
- return result
290
-
291
- from typing import List, Dict
292
- import pandas as pd
293
- esco_df = pd.read_csv(
294
- "ISCOGroups_en.csv",
295
- dtype={'code': str} # Force 'code' to be read as string
296
- )
297
-
298
-
299
- esco_level5_df = pd.read_csv(
300
- "occupations_en.csv",
301
- dtype={'code': str, 'iscoGroup': str, } # Force 'code' to be read as string
302
- )
303
-
304
- def get_level_ESCO_info(df, code, level_name):
305
- """Helper function to get level info with error handling"""
306
- matches = df[df['code'] == code]
307
- if len(matches) == 0:
308
- print(f"Warning: No {level_name} found for ESCO code {code}")
309
- return {
310
- f'{level_name}_ESCO_code': code,
311
- f'{level_name}_ESCO_name': 'UNKNOWN',
312
- f'{level_name}_ESCO_desc': 'No matching occupation found'
313
- }
314
- info = matches.iloc[0]
315
- return {
316
- f'{level_name}_ESCO_code': code,
317
- f'{level_name}_ESCO_name': info['preferredLabel'],
318
- f'{level_name}_ESCO_desc': info.get('description', '')
319
- }
320
-
321
- def classify_esco_by_hierarchical_level(responsibilities: List[str]) -> dict:
322
- """
323
- Classifies job responsibilities into occupational groups at 4 levels,
324
- [European Skills, Competences, Qualifications, and Occupations (ESCO)](https://esco.ec.europa.eu/en)
325
- returning codes, names, and descriptions for each level.
326
- Args:
327
- responsibilities: List of job responsibility strings
328
- Returns:
329
- Dictionary containing classification information or error message
330
- """
331
-
332
- esco_df = pd.read_csv(
333
- "ISCOGroups_en.csv",
334
- dtype={'code': str} # Force 'code' to be read as string
335
- )
336
- # print(esco_df.columns)
337
-
338
- esco_level5_df = pd.read_csv(
339
- "occupations_en.csv",
340
- dtype={'code': str, 'iscoGroup': str, } # Force 'code' to be read as string
341
- )
342
- # print(esco_level5_df.columns)
343
-
344
- result = {}
345
- ######################## Level 1 ###################
346
- # Get all top-level codes (single character/digit)
347
- top_level_codes = sorted({
348
- code for code in esco_df['code']
349
- if len(code) == 1 and code.isalnum()
350
- })
351
-
352
- level1_code = None
353
- if top_level_codes:
354
- level1_df = esco_df[esco_df['code'].isin(top_level_codes)]
355
- job_occupation_list = "\n".join(f"- {row['code']}: {row['preferredLabel']} - {row['description']}"
356
- for _, row in level1_df.iterrows())
357
- #print(job_occupation_list)
358
- list1_output = level1_df["code"].tolist() # Convert Series to list
359
- list1 = ", ".join(map(str, list1_output)) # Join elements with comma
360
- #print(list1)
361
-
362
- user_prompt1 = f"""
363
- Here is a list of job responsibilities:
364
- {responsibilities}
365
-
366
- Select the most relevant top-level code from these options:
367
- {job_occupation_list}
368
-
369
- Based on the responsibilities, suggest the most relevant level 1 Occupation code from within this list: {list1}.
370
- **Important:**
371
- - Return ONLY the code, nothing else.
372
- - The code should be exactly as shown in the list.
373
- - Do not include any additional text or explanation.
374
- """
375
- level1_code = gpt_call("Identify top-level occupational group", user_prompt1).strip()
376
- level1_code = code_sanitize(level1_code, list1_output)
377
- result.update(get_level_ESCO_info(level1_df, level1_code, 'Level_1'))
378
-
379
-
380
- ######################## Level 2 ###################
381
-
382
- level2_code = None
383
- if level1_code:
384
- level2_df = esco_df[
385
- (esco_df['code'].str.startswith(level1_code)) & (esco_df['code'].str.len() == len(level1_code) + 1)
386
- ]
387
- if not level2_df.empty:
388
- level2_options = "\n".join(f"- {row['code']}: {row['preferredLabel']} - {row['description']}"
389
- for _, row in level2_df.iterrows())
390
- #print(job_occupation_list)
391
- list2_output = level2_df["code"].tolist() # Convert Series to list
392
- list2 = ", ".join(map(str, list2_output)) # Join elements with comma
393
- #print(list2)
394
-
395
- user_prompt2 = f"""
396
- Here is a list of job responsibilities:
397
- {responsibilities}
398
-
399
- Here is a list of level 2 Occupation classifications within {level1_code}:
400
- {level2_options}
401
-
402
- Based on the responsibilities, suggest the most relevant level 2 Occupation code from within this list: {list2}.
403
- **Important:**
404
- - Return ONLY the code, nothing else.
405
- - The code should be exactly as shown in the list.
406
- - Do not include any additional text or explanation.
407
- """
408
- level2_code = gpt_call("Identify second-level occupational group", user_prompt2).strip()
409
- level2_code = code_sanitize(level2_code, list2_output)
410
- result.update(get_level_ESCO_info(level2_df, level2_code, 'Level_2'))
411
-
412
-
413
- ######################## Level 3 ###################
414
- level3_code = None
415
- if level2_code:
416
- level3_df = esco_df[
417
- (esco_df['code'].str.startswith(level2_code)) & (esco_df['code'].str.len() == len(level2_code) + 1)
418
- ]
419
- if not level3_df.empty:
420
- level3_options = "\n".join(f"- {row['code']}: {row['preferredLabel']} - {row['description']}"
421
- for _, row in level3_df.iterrows())
422
- #print(job_occupation_list)
423
- list3_output = level3_df["code"].tolist() # Convert Series to list
424
- list3 = ", ".join(map(str, list3_output)) # Join elements with comma
425
- #print(list3)
426
-
427
- user_prompt3 = f"""
428
- Here is a list of job responsibilities:
429
- {responsibilities}
430
-
431
- Here is a list of level 3 Occupation classifications within {level2_code}:
432
- {level3_options}
433
-
434
- Based on the responsibilities, suggest the most relevant level 3 Occupation code from within this list: {list3}.
435
-
436
- **Important:**
437
- - Return ONLY the code, nothing else.
438
- - The code should be exactly as shown in the list.
439
- - Do not include any additional text or explanation.
440
-
441
- """
442
- level3_code = gpt_call("Identify third-level occupational group", user_prompt3).strip()
443
- level3_code = code_sanitize(level3_code, list3_output)
444
- result.update(get_level_ESCO_info(level3_df, level3_code, 'Level_3'))
445
-
446
- ######################## Level 4 ###################
447
- level4_code = None
448
- if level3_code:
449
- level4_df = esco_df[
450
- (esco_df['code'].str.startswith(level3_code)) & (esco_df['code'].str.len() == len(level3_code) + 1)
451
- ]
452
- if not level4_df.empty:
453
- level4_options = "\n".join(f"- {row['code']}: {row['preferredLabel']} - {row['description']}"
454
- for _, row in level4_df.iterrows())
455
- #print(job_occupation_list)
456
- list4_output = level4_df["code"].tolist() # Convert Series to list
457
- list4 = ", ".join(map(str, list4_output)) # Join elements with comma
458
- #print(list4)
459
- user_prompt4 = f"""
460
- Here is a list of job responsibilities:
461
- {responsibilities}
462
-
463
- Here is a list of level 4 Occupation classifications within {level3_code}:
464
- {level4_options}
465
-
466
- Based on the responsibilities, suggest the most relevant level 4 Occupation code from within this list: {list4}.
467
- **Important:**
468
- - Return ONLY the code, nothing else.
469
- - The code should be exactly as shown in the list.
470
- - Do not include any additional text or explanation.
471
- """
472
- level4_code = gpt_call("Identify fourth-level occupational group", user_prompt4).strip()
473
- level4_code = code_sanitize(level4_code, list4_output)
474
- result.update(get_level_ESCO_info(level4_df, level4_code, 'Level_4'))
475
-
476
- ######################## Level 5 ###################
477
- level5_code = None
478
- if level4_code:
479
- level5_df = esco_level5_df[
480
- (esco_level5_df['iscoGroup'].str.startswith(level4_code))
481
- ]
482
- if not level5_df.empty:
483
- level5_options = "\n".join(f"- {row['code']}: {row['preferredLabel']} - {row['description']}"
484
- for _, row in level5_df.iterrows())
485
-
486
- #print(job_occupation_list)
487
- list5_output = level5_df["code"].tolist() # Convert Series to list
488
- list5 = ", ".join(map(str, list5_output)) # Join elements with comma
489
- #print(list5)
490
- user_prompt5 = f"""
491
- Here is a list of job responsibilities:
492
- {responsibilities}
493
-
494
- Here is a list of level 4 Occupation classifications within {level4_code}:
495
- {level5_options}
496
-
497
- Based on the responsibilities, suggest the most relevant level 4 Occupation code from within this list: {list5}.
498
- **Important:**
499
- - Return ONLY the code as stated in the provided list, nothing else.
500
- - The code should be exactly as shown in the list.
501
- - Do not include any additional text, occupation code or explanation.
502
- """
503
-
504
- level5_code = gpt_call("Identify fifth-level occupational group", user_prompt5).strip()
505
- # Handle the case where the LLM might return just the code part
506
- level5_code = code_sanitize(level5_code, list5_output)
507
- result.update(get_level_ESCO_info(level5_df, level5_code, 'Level_5'))
508
-
509
- ## Et voila!!
510
  return result
511
-
512
-
513
 
514
  def get_skills_info_esco(Level_5_code):
515
- """Helper function to get level info with error handling"""
516
- esco_level5_df = pd.read_csv(
517
- "occupations_en.csv",
518
- dtype={'code': str, 'iscoGroup': str, } # Force 'code' to be read as string
519
- )
520
-
521
- # Find the matching occupation
522
  matches = esco_level5_df[esco_level5_df['code'] == Level_5_code]
523
-
524
- # Get the conceptUri(s) for the matched occupation
525
  conceptUris = matches['conceptUri'].values.tolist()
526
-
527
- esco_skill_map_df = pd.read_csv(
528
- "occupationSkillRelations_en.csv"
529
- )
530
- # Find all skills related to that occupationUri (using isin to match any from the list)
531
  skills = esco_skill_map_df[esco_skill_map_df['occupationUri'].isin(conceptUris)]
532
-
533
- # Get the list of skillUris
534
  skillUris = skills['skillUri'].values.tolist()
535
-
536
- esco_skill_df = pd.read_csv(
537
- "skills_en.csv"
538
- )
539
- # Get the full skill details from esco_skill_df
540
  thisskillslist = esco_skill_df[esco_skill_df['conceptUri'].isin(skillUris)]
 
 
 
541
 
542
- result= thisskillslist[['preferredLabel','conceptUri', 'description']].drop_duplicates()
543
- result = result.rename(columns={
544
- 'preferredLabel': 'skill_name',
545
- 'description': 'skill_description',
546
- 'conceptUri': 'skill_code'
547
- })
548
-
549
- return result
550
-
551
-
552
- def review_skills( Level_5_code: str, top_n: int = 10) -> List[Dict[str, str]]:
553
- """
554
- Validate relevant ESCO-style skills for a job responsibilities using a language model.
555
-
556
- Args:
557
- Level_5_code: Standard esco occupation code strings..
558
- top_n (int): The number of skills to return. Defaults to 3.
559
-
560
- Returns:
561
- List[Dict[str, str]]: A list of extracted skill dictionaries with keys:
562
- - skill_name
563
- - skill_description
564
- - skill_code
565
- """
566
  matches = esco_level5_df[esco_level5_df['code'] == Level_5_code]
567
-
568
- # Get the conceptUri(s) for the matched occupation
569
  esco_occup = matches['preferredLabel'].values.tolist()
570
  skill_filtered = get_skills_info_esco(Level_5_code)
571
-
572
- skill_filtered_options = "\n".join(f"- {row['skill_code']}: {row['skill_name']} - {row['skill_description']}"
573
- for _, row in skill_filtered.iterrows())
574
-
575
  prompt = f"""
576
- Here is a list of skills:
577
-
578
- {skill_filtered_options}
579
-
580
- Filter the skills that relevant in the context of the work of the International Organisation for Migration.
581
-
582
- Ensure that skills is relevant in the context of a {esco_occup} working for non-profit public organisation.
583
-
584
- Required JSON structure:
585
- {{
586
- "skills": [
587
- {{
588
- "skill_name": "string",
589
- "skill_description": "string",
590
- "skill_code": "string"
591
- }}
592
- ]
593
- }}
594
-
595
- **Important:**
596
- - Do not duplicate any records of skills
597
- - keep only the 10 most relevant skills
598
- - Return ONLY the JSON object with no other text
599
- - Use double quotes for all strings
600
- - No trailing commas in arrays/objects
601
- - No markdown formatting (no ```json)
602
- - No text before or after the JSON
603
- - Escape all special characters in strings
604
- - Ensure all brackets are properly closed
605
- - No trailing commas in arrays/objects, especially before closing brackets
606
- """
607
-
608
- raw = gpt_call(
609
- "You are an HR expert working for the International Organisation for Migration and with in-depth knowledge of the European Skills, Competences, Qualifications and Occupations. Extract skills required for this position.",
610
- prompt
611
- )
612
-
613
  json_text = _extract_json(raw)
614
  if not json_text:
615
  return []
616
-
617
  try:
618
  result = json.loads(json_text)
619
  skills = result.get("skills", [])
@@ -621,7 +201,6 @@ Required JSON structure:
621
  print(f"❌ JSON parsing error: {e}")
622
  print(f"🔍 Problematic JSON: {json_text}")
623
  return []
624
-
625
  validated_skills = []
626
  for skill in skills:
627
  try:
@@ -634,69 +213,40 @@ Required JSON structure:
634
  except (KeyError, TypeError) as e:
635
  print(f"⚠️ Skipping invalid skill: {skill}. Error: {e}")
636
  continue
637
-
638
- return validated_skills[:top_n]
639
-
640
-
641
 
642
  def extract_skills(responsibilities: List[str], top_n: int = 10) -> List[Dict[str, str]]:
643
- """
644
- Extracts ESCO-style skills from job responsibilities using a language model.
645
-
646
- Args:
647
- responsibilities (List[str]): A list of job responsibility strings.
648
- top_n (int): The number of skills to return. Defaults to 3.
649
-
650
- Returns:
651
- List[Dict[str, str]]: A list of extracted skill dictionaries with keys:
652
- - skill_name
653
- - skill_description
654
- - skill_code
655
- """
656
-
657
  prompt = f"""
658
- Here is a list of job responsibilities:
659
-
660
- {responsibilities}
661
-
662
- List the required skills and knowledge as bullet points (without numbers) using ESCO-style terms.
663
-
664
- For each Skill:
665
-
666
- 1. skill_name: precise skills name as used in ESCO framework
667
- 2. skill_description: add the long description as mentioned in ESCO framework
668
- 3. skill_code: include the detailed corresponding ESCO code for that skill.
669
-
670
- Required JSON structure:
671
- {{
672
- "skills": [
673
- {{
674
- "skill_name": "string",
675
- "skill_description": "string",
676
- "skill_code": "string"
677
- }}
678
- ]
679
- }}
680
-
681
- **Important:**
682
- - Return ONLY the JSON object with no other text
683
- - Use double quotes for all strings
684
- - No trailing commas in arrays/objects
685
- - No markdown formatting (no ```json)
686
- - No text before or after the JSON
687
- - Escape all special characters in strings
688
- - Ensure all brackets are properly closed
689
- """
690
-
691
- raw = gpt_call(
692
- "You are an HR expert working for the International Organisation for Migration and with in-depth knowledge of the European Skills, Competences, Qualifications and Occupations. Extract skills required for this position.",
693
- prompt
694
- )
695
-
696
  json_text = _extract_json(raw)
697
  if not json_text:
698
  return []
699
-
700
  try:
701
  result = json.loads(json_text)
702
  skills = result.get("skills", [])
@@ -704,7 +254,6 @@ Required JSON structure:
704
  print(f"❌ JSON parsing error: {e}")
705
  print(f"🔍 Problematic JSON: {json_text}")
706
  return []
707
-
708
  validated_skills = []
709
  for skill in skills:
710
  try:
@@ -717,59 +266,32 @@ Required JSON structure:
717
  except (KeyError, TypeError) as e:
718
  print(f"⚠️ Skipping invalid skill: {skill}. Error: {e}")
719
  continue
720
-
721
  return validated_skills[:top_n]
722
 
723
-
724
  def map_proficiency_and_assessment(skills: List[str], responsibilities: List[str]) -> List[Dict]:
725
- """
726
- Maps each skill to its contextual importance, expected proficiency level,
727
- and assessment strategy based on job responsibilities.
728
-
729
- Args:
730
- skills (List[str]): List of skill names.
731
- responsibilities (List[str]): List of job responsibilities.
732
-
733
- Returns:
734
- List[Dict]: A list of dictionaries containing skill metadata:
735
- - skill_name
736
- - importance (essential / optional)
737
- - type ("skill/competence" or "knowledge")
738
- - proficiency_level (Basic, Intermediate, Advanced)
739
- - distinctive_elements
740
- - resume_signals
741
- - assessment_method
742
- """
743
-
744
  prompt = f"""
745
- Here is a list of job responsibilities: {responsibilities} that have been associated with the following skills: {skills}
746
-
747
- For each skill, accounting for the context defined within the responsibilities, return a JSON object with:
748
- - skill_name: the name of the skill
749
- - importance: essential or optional
750
- - type: "skill/competence" or "knowledge"
751
- - proficiency_level: Basic, Intermediate, or Advanced
752
- - distinctive_elements: what specific and distinctive elements are required at this defined proficiency level?
753
- - resume_signals: what to look for in a resume to assess this skill?
754
- - assessment_method: what is the preferred assessment method to accurately assess this skill?
755
-
756
- Respond ONLY with a list of dictionaries in valid JSON.
757
- Use double quotes for all strings. No markdown, no commentary, no trailing commas.
758
- """
759
-
760
  raw = gpt_call("Define proficiency level and assessment for each skill.", prompt)
761
-
762
  json_text = _extract_json_array(raw)
763
  if not json_text:
764
  return []
765
-
766
  try:
767
  results = json.loads(json_text)
768
  except json.JSONDecodeError as e:
769
  print(f"❌ JSON parsing error: {e}")
770
  print(f"🔍 Problematic JSON: {json_text}")
771
  return []
772
-
773
  validated = []
774
  for item in results:
775
  try:
@@ -785,115 +307,70 @@ Use double quotes for all strings. No markdown, no commentary, no trailing comma
785
  except (KeyError, TypeError) as e:
786
  print(f"⚠️ Skipping invalid item: {item}. Error: {e}")
787
  continue
788
-
789
  return validated
790
 
791
  def _extract_json_array(raw: str) -> str:
792
- """
793
- Attempts to extract a clean JSON array from raw GPT output.
794
- """
795
  json_start = raw.find('[')
796
  json_end = raw.rfind(']') + 1
797
-
798
  if json_start == -1 or json_end == 0:
799
  print(f"❌ No JSON array found in response: {raw}")
800
  return ""
801
-
802
  json_text = raw[json_start:json_end]
803
-
804
- # Cleanup
805
- json_text = re.sub(r',\s*([}\]])', r'\1', json_text) # Remove trailing commas
806
- json_text = re.sub(r'[\n\r\t]', ' ', json_text) # Remove control chars
807
- json_text = re.sub(r'(?<!\\)"', '"', json_text) # Fix quotes if needed
808
-
809
  return json_text
810
 
811
  def extract_qualification(responsibilities: List[str]) -> List[str]:
812
-
813
  prompt = f"""
814
- Here is a list of job responsibilities: {responsibilities}
815
-
816
- Infer the required level within the European Qualifications Framework (EQF) to implement them.
817
- Identify the potential diplomas to testify such qualification
818
- """
819
-
820
- raw = gpt_call("You are an HR expert that excel in developing compentency based interview questions.", prompt)
821
  return [line.strip("-• ").strip() for line in raw.splitlines() if line.strip()]
822
 
823
  def build_interview(responsibilities: List[str], skill_assess: List[str]) -> List[str]:
824
-
825
  prompt = f"""
826
-
827
- Here is a list of job responsibilities: {responsibilities} and related skills: {skill_assess}
828
-
829
- Output: A structured 40-minute interview with:
830
-
831
- Opening questions (5 min)
832
-
833
- Core competency-based questions (30 min, 5-6 questions)
834
-
835
- Closing & candidate questions (5 min)
836
-
837
-
838
- """
839
-
840
- raw = gpt_call("You are an HR expert that excel in developing compentency based interview questions.", prompt)
841
  return [line.strip("-• ").strip() for line in raw.splitlines() if line.strip()]
842
 
843
-
844
-
845
  def _extract_json(raw: str) -> str:
846
- """
847
- Attempts to extract and clean a JSON object from a raw string.
848
- """
849
  json_start = raw.find('{')
850
  json_end = raw.rfind('}') + 1
851
-
852
  if json_start == -1 or json_end == 0:
853
  print(f"❌ No JSON found in response: {raw}")
854
  return ""
855
-
856
  json_text = raw[json_start:json_end]
857
-
858
- # Clean common issues
859
- json_text = re.sub(r',\s*([}\]])', r'\1', json_text) # Remove trailing commas
860
- json_text = re.sub(r'[\n\r\t]', ' ', json_text) # Remove control characters
861
- json_text = re.sub(r'\s{2,}', ' ', json_text) # Collapse multiple spaces
862
- json_text = re.sub(r'\\(?!["\\/bfnrtu])', r'\\\\', json_text) # Escape lone backslashes
863
  json_text = json_text.strip()
864
-
865
- return json_text
866
-
867
-
868
 
869
  def process_pdf(file):
870
- """
871
- Processes the uploaded PDF file and returns the extracted text.
872
- """
873
  if file is None:
874
  return "Please upload a PDF file."
875
-
876
  try:
877
  extracted_text = extract_text_from_pdf(file.name)
878
-
879
- # Extract responsibilities section
880
- responsibilities = extract_section_from_pdf(full_text, section_title="Responsibilities and Accountabilities")
881
  if not responsibilities:
882
- print(f"Skipping {os.path.basename(file_path)} - no responsibilities section found")
883
  return None
884
-
885
- # Main processing
886
  job_family = classify_job_family(responsibilities)
887
  occ_group = classify_occupational_group_by_level(responsibilities)
888
  esco_occ = classify_esco_by_hierarchical_level(responsibilities)
889
  qualification = extract_qualification(responsibilities)
890
  skills = extract_skills(responsibilities)
891
  skill_map = map_proficiency_and_assessment(skills, responsibilities)
892
-
893
- # Check if we have ESCO level 5 code
894
  has_esco = esco_occ.get("Level_5_ESCO_code") is not None
895
-
896
- # ESCO-based skills processing (only if we have Level 5 code)
897
  skill_esco_extract = []
898
  skill_esco_map = []
899
  if has_esco:
@@ -901,11 +378,8 @@ def process_pdf(file):
901
  skill_esco_extract = review_skills(Level_5_code)
902
  skill_esco_map = map_proficiency_and_assessment(skill_esco_extract, responsibilities)
903
  else:
904
- print(f"No Level 5 ESCO code found for {os.path.basename(file_path)}, skipping ESCO skills mapping")
905
-
906
- time.sleep(6) # Rate limiting delay
907
-
908
- # Join original skills with assessment
909
  assessment_lookup = {item['skill_name']: item for item in skill_map}
910
  joined_skills = [
911
  {
@@ -921,8 +395,6 @@ def process_pdf(file):
921
  }
922
  for skill in skills
923
  ]
924
-
925
- # Join ESCO skills with assessment (only if we processed them)
926
  joined_skills_esco = []
927
  if has_esco and skill_esco_extract:
928
  assessment_esco_lookup = {item['skill_name']: item for item in skill_esco_map}
@@ -935,71 +407,59 @@ def process_pdf(file):
935
  }
936
  for skill in skill_esco_extract
937
  ]
938
-
939
  interview = build_interview(responsibilities, skills)
940
-
941
- # Prepare base result dictionary
942
  result = {
943
- "file": os.path.basename(file_path),
944
  "responsibilities": responsibilities,
945
  "job_family": job_fam1['Job_family'].values[0],
946
  "job_subfamily": job_fam1['Job_subfamily'].values[0],
947
  "classified_job_family": job_family,
948
- **{f"Level_{i}_CCOG_{field}": occ_group.get(f"Level_{i}_CCOG_{field}")
949
  for i in range(1, 5) for field in ["code", "name", "desc"]},
950
  "qualification": qualification,
951
  "interview": interview,
952
  "skills": {
953
- "file": os.path.basename(file_path),
954
  "job_family": job_fam1['Job_family'].values[0],
955
  "job_subfamily": job_fam1['Job_subfamily'].values[0],
956
  "skills": joined_skills
957
  }
958
  }
959
-
960
- # Add ESCO fields only if we have them
961
  if has_esco:
962
  result.update({
963
- **{f"Level_{i}_ESCO_{field}": esco_occ.get(f"Level_{i}_ESCO_{field}")
964
  for i in range(1, 6) for field in ["code", "name", "desc"]},
965
  "skills_esco": {
966
- "file": os.path.basename(file_path),
967
  "job_family": job_fam1['Job_family'].values[0],
968
  "job_subfamily": job_fam1['Job_subfamily'].values[0],
969
  "skills": joined_skills_esco
970
  }
971
  })
972
  else:
973
- # Mark ESCO fields as null if not available
974
  result.update({
975
- **{f"Level_{i}_ESCO_{field}": None
976
  for i in range(1, 6) for field in ["code", "name", "desc"]},
977
  "skills_esco": None
978
  })
979
-
980
  return result
981
-
982
  except Exception as e:
983
  return f"Error processing PDF: {str(e)}"
984
 
985
- # Create the Gradio interface
986
  with gr.Blocks() as demo:
987
- gr.Markdown("# Standardise Job Description!")
988
  gr.Markdown("Identify Job Family, Occupation, Qualification, match Skills and suggest interview questions.")
989
-
990
  with gr.Row():
991
  with gr.Column():
992
  file_input = gr.File(label="Upload a Job Description PDF file", file_types=[".pdf"])
993
  submit_btn = gr.Button("Extract Text")
994
  with gr.Column():
995
  text_output = gr.Textbox(label="Extracted Text", lines=30, max_lines=50, interactive=False)
996
-
997
  submit_btn.click(
998
  fn=process_pdf,
999
  inputs=file_input,
1000
  outputs=text_output
1001
  )
1002
 
1003
- # Run the app
1004
  if __name__ == "__main__":
1005
- demo.launch()
 
6
  import logging
7
  import os
8
  from dotenv import load_dotenv
 
9
  import json
10
  from concurrent.futures import ThreadPoolExecutor
11
  from typing import List, Dict, Optional
 
15
  # Load environment variables
16
  load_dotenv()
17
 
18
# Silence pdfminer: it logs noisy layout diagnostics for many real-world PDFs.
logging.getLogger('pdfminer').setLevel(logging.ERROR)

# pdfplumber/pdfminer raise a UserWarning for pages with malformed CropBox
# entries; those pages still extract fine, so hide the warning.
warnings.filterwarnings("ignore", category=UserWarning, message="CropBox.*")
23
 
24
# Initialize OpenAI client
def initialize_openai_client():
    """Create the Azure OpenAI client from environment configuration.

    Reads AZURE_OPENAI_API_KEY, AZURE_OPENAI_ENDPOINT and OPENAI_API_VERSION
    from the environment (loaded earlier via dotenv).

    Returns:
        openai.AzureOpenAI: a configured client instance.

    Raises:
        RuntimeError: if the client cannot be constructed; chained to the
            original error so the real cause is preserved in the traceback.
    """
    try:
        return openai.AzureOpenAI(
            api_key=os.getenv("AZURE_OPENAI_API_KEY"),
            azure_endpoint=os.getenv("AZURE_OPENAI_ENDPOINT"),
            api_version=os.getenv("OPENAI_API_VERSION"),
        )
    except Exception as e:
        # Fix: the original `raise Exception(...)` discarded the exception
        # chain and raised the over-broad base class. RuntimeError is still
        # caught by existing `except Exception` handlers.
        raise RuntimeError(f"Failed to initialize OpenAI client: {e}") from e

# Module-level client shared by all gpt_call invocations.
client = initialize_openai_client()
37
+
38
def gpt_call(system_prompt: str, user_prompt: str) -> str:
    """Send one system+user prompt pair to the Azure OpenAI deployment.

    Returns the stripped completion text, or an "ERROR: ..." string when the
    API call fails (callers treat the result as plain text either way).
    """
    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_prompt},
    ]
    try:
        completion = client.chat.completions.create(
            model=os.getenv("AZURE_DEPLOYMENT_NAME"),
            messages=messages,
            temperature=0.3,  # low temperature keeps answers conservative
        )
        return completion.choices[0].message.content.strip()
    except Exception as e:
        return f"ERROR: {e}"
51
 
52
+ def extract_text_from_pdf(pdf_path: str) -> str:
 
 
 
 
 
 
 
 
 
 
 
 
53
  text = ""
 
 
 
 
 
54
  with pdfplumber.open(pdf_path) as pdf:
55
  for page in pdf.pages:
 
56
  page_text = page.extract_text()
57
  if page_text:
58
  text += page_text + "\n"
 
 
59
  for table in page.extract_tables():
60
  for row in table:
61
  for cell in row:
 
64
  text += "\n"
65
  return text
66
 
67
def extract_section_from_pdf(full_text: str, section_title: str) -> str:
    """Return the body of the named section of a position description.

    The model is instructed to answer "N/A" when the section is absent and
    never to repeat the section title in its answer.
    """
    system_prompt = "You are an HR expert working for IOM."
    user_prompt = f"""
Carefully evaluate the provided position description (PD) document and extract the content of the section titled "{section_title}" from the following text.
Return only the content of the section, without the title.
If the section cannot be found or explicitly mentioned in the text, use "N/A" as the default value.
Do not repeat in the extracted text the name of the section.
Extract precisely all the related text.
Text of the position description:
{full_text}
Section to identify: "{section_title}":
"""
    return gpt_call(system_prompt, user_prompt)
79
 
 
80
def classify_job_family(responsibilities: List[str]) -> str:
    """Ask the model for the best-matching job family/subfamily.

    Candidate families are read from job_families1.csv; the model must answer
    with one family from that list and nothing else.
    """
    families = pd.read_csv("job_families1.csv")
    options = []
    for _, row in families.iterrows():
        options.append(f"- {row['Job_family']}: {row['Job_subfamily']}")
    job_family_list = "\n".join(options)
    user_prompt = f"""
Here is a list of job responsibilities:
{responsibilities}
Here is a list of Job families:
{job_family_list}
Based on the responsibilities, suggest the most relevant job family and subfamily from the list above.
**Important:**
- Return ONLY the job family, nothing else.
- The job family should be exactly as shown in the list.
- Do not include any additional text or explanation.
"""
    return gpt_call("Suggest job family and subfamily based on responsibilities.", user_prompt)
95
 
 
96
  def get_level_CCOG_info(df, code, level_name):
 
 
97
  matches = df[df['code'] == code]
98
  if len(matches) == 0:
99
  print(f"Warning: No {level_name} found for CCOG code {code}")
 
110
  }
111
 
112
def code_sanitize(input_string, valid_codes):
    """Extract a valid code from a free-form model response.

    The model is asked to answer with a bare code, but may echo extra text.
    An exact (whitespace-stripped) match wins; otherwise fall back to the
    first valid code appearing as a substring, preserving the original scan
    order for backward compatibility.

    Returns:
        The matching element of ``valid_codes`` (not its string form), or
        None when nothing matches.
    """
    text = input_string.strip() if isinstance(input_string, str) else str(input_string)
    # Fix: an exact answer must win. With pure substring scanning, an input
    # of "12" against valid_codes ["1", "12"] wrongly returned "1".
    for code in valid_codes:
        if text == str(code):
            return code
    for code in valid_codes:
        # str(code) also tolerates non-string codes (e.g. ints from pandas).
        if str(code) in text:
            return code
    return None
117
 
118
def classify_occupational_group_by_level(responsibilities: List[str]) -> dict:
    """Classify responsibilities into CCOG occupational groups, level by level.

    Walks levels 1-4 of the CCOG hierarchy; each level's candidate codes are
    restricted to children (prefix matches) of the code chosen at the
    previous level.

    Returns:
        dict with Level_{i}_CCOG_{code,name,desc} entries for the levels that
        could be resolved; an 'error' key is added if classification aborts.
    """
    occupational_groups_df = pd.read_csv("occupational_groups.csv")
    result = {}
    try:
        for level in range(1, 5):
            level_df = occupational_groups_df[occupational_groups_df['level'] == f"Level {level}"]
            if level > 1:
                prev_level_code = result.get(f'Level_{level-1}_CCOG_code')
                # Fix: stop descending when the previous level could not be
                # resolved -- Series.str.startswith(None) raises and the broad
                # except below would replace the partial result with a generic
                # error instead of returning what was already classified.
                if prev_level_code is None:
                    break
                level_df = level_df[level_df['code'].str.startswith(prev_level_code)]
            job_occupation_list = "\n".join(f"- {row['code']}: {row['occupation']} - {row.get('occupation_description', '')}" for _, row in level_df.iterrows())
            list_output = level_df["code"].tolist()
            user_prompt = f"""
Here is a list of job responsibilities:
{responsibilities}
Here is a list of level {level} Occupation classifications:
{job_occupation_list}
Based on the responsibilities, suggest the most relevant level {level} Occupation code from within this list: {', '.join(map(str, list_output))}.
**Important:**
- Return ONLY the code, nothing else.
- The code should be exactly as shown in the list.
- Do not include any additional text or explanation.
"""
            level_code = gpt_call(f"Identify level {level} occupational group", user_prompt).strip()
            level_code = code_sanitize(level_code, list_output)
            result.update(get_level_CCOG_info(level_df, level_code, f'Level_{level}'))
    except Exception as e:
        print(f"Error during classification: {str(e)}")
        result['error'] = str(e)
    return result
 
 
147
 
148
def get_skills_info_esco(Level_5_code):
    """Look up the ESCO skills linked to a level-5 ESCO occupation code.

    Joins occupations -> occupation/skill relations -> skills across the
    three ESCO CSV exports and returns a de-duplicated DataFrame with the
    columns skill_name, skill_code (concept URI) and skill_description.
    """
    # Occupation code -> occupation concept URIs.
    occupations = pd.read_csv("occupations_en.csv", dtype={'code': str, 'iscoGroup': str})
    occupation_uris = occupations.loc[occupations['code'] == Level_5_code, 'conceptUri'].values.tolist()

    # Occupation URIs -> related skill URIs.
    relations = pd.read_csv("occupationSkillRelations_en.csv")
    skill_uris = relations.loc[relations['occupationUri'].isin(occupation_uris), 'skillUri'].values.tolist()

    # Skill URIs -> skill records.
    skills = pd.read_csv("skills_en.csv")
    matched = skills[skills['conceptUri'].isin(skill_uris)]
    result = matched[['preferredLabel', 'conceptUri', 'description']].drop_duplicates()
    return result.rename(columns={'preferredLabel': 'skill_name', 'description': 'skill_description', 'conceptUri': 'skill_code'})
 
161
+ def review_skills(Level_5_code: str, top_n: int = 10) -> List[Dict[str, str]]:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
162
  matches = esco_level5_df[esco_level5_df['code'] == Level_5_code]
 
 
163
  esco_occup = matches['preferredLabel'].values.tolist()
164
  skill_filtered = get_skills_info_esco(Level_5_code)
165
+ skill_filtered_options = "\n".join(f"- {row['skill_code']}: {row['skill_name']} - {row['skill_description']}" for _, row in skill_filtered.iterrows())
 
 
 
166
  prompt = f"""
167
+ Here is a list of skills:
168
+ {skill_filtered_options}
169
+ Filter the skills that are relevant in the context of the work of the International Organisation for Migration.
170
+ Ensure that skills are relevant in the context of a {esco_occup} working for a non-profit public organization.
171
+ Required JSON structure:
172
+ {{
173
+ "skills": [
174
+ {{
175
+ "skill_name": "string",
176
+ "skill_description": "string",
177
+ "skill_code": "string"
178
+ }}
179
+ ]
180
+ }}
181
+ **Important:**
182
+ - Do not duplicate any records of skills
183
+ - Keep only the 10 most relevant skills
184
+ - Return ONLY the JSON object with no other text
185
+ - Use double quotes for all strings
186
+ - No trailing commas in arrays/objects
187
+ - No markdown formatting (no ```json)
188
+ - No text before or after the JSON
189
+ - Escape all special characters in strings
190
+ - Ensure all brackets are properly closed
191
+ - No trailing commas in arrays/objects, especially before closing brackets
192
+ """
193
+ raw = gpt_call("You are an HR expert working for the International Organisation for Migration and with in-depth knowledge of the European Skills, Competences, Qualifications and Occupations. Extract skills required for this position.", prompt)
 
 
 
 
 
 
 
 
 
 
194
  json_text = _extract_json(raw)
195
  if not json_text:
196
  return []
 
197
  try:
198
  result = json.loads(json_text)
199
  skills = result.get("skills", [])
 
201
  print(f"❌ JSON parsing error: {e}")
202
  print(f"🔍 Problematic JSON: {json_text}")
203
  return []
 
204
  validated_skills = []
205
  for skill in skills:
206
  try:
 
213
  except (KeyError, TypeError) as e:
214
  print(f"⚠️ Skipping invalid skill: {skill}. Error: {e}")
215
  continue
216
+ return validated_skills[\:top_n]
 
 
 
217
 
218
  def extract_skills(responsibilities: List[str], top_n: int = 10) -> List[Dict[str, str]]:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
219
  prompt = f"""
220
+ Here is a list of job responsibilities:
221
+ {responsibilities}
222
+ List the required skills and knowledge as bullet points (without numbers) using ESCO-style terms.
223
+ For each Skill:
224
+ 1. skill_name: precise skills name as used in ESCO framework
225
+ 2. skill_description: add the long description as mentioned in ESCO framework
226
+ 3. skill_code: include the detailed corresponding ESCO code for that skill.
227
+ Required JSON structure:
228
+ {{
229
+ "skills": [
230
+ {{
231
+ "skill_name": "string",
232
+ "skill_description": "string",
233
+ "skill_code": "string"
234
+ }}
235
+ ]
236
+ }}
237
+ **Important:**
238
+ - Return ONLY the JSON object with no other text
239
+ - Use double quotes for all strings
240
+ - No trailing commas in arrays/objects
241
+ - No markdown formatting (no ```json)
242
+ - No text before or after the JSON
243
+ - Escape all special characters in strings
244
+ - Ensure all brackets are properly closed
245
+ """
246
+ raw = gpt_call("You are an HR expert working for the International Organisation for Migration and with in-depth knowledge of the European Skills, Competences, Qualifications and Occupations. Extract skills required for this position.", prompt)
 
 
 
 
 
 
 
 
 
 
 
247
  json_text = _extract_json(raw)
248
  if not json_text:
249
  return []
 
250
  try:
251
  result = json.loads(json_text)
252
  skills = result.get("skills", [])
 
254
  print(f"❌ JSON parsing error: {e}")
255
  print(f"🔍 Problematic JSON: {json_text}")
256
  return []
 
257
  validated_skills = []
258
  for skill in skills:
259
  try:
 
266
  except (KeyError, TypeError) as e:
267
  print(f"⚠️ Skipping invalid skill: {skill}. Error: {e}")
268
  continue
 
269
  return validated_skills[:top_n]
270
 
 
271
  def map_proficiency_and_assessment(skills: List[str], responsibilities: List[str]) -> List[Dict]:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
272
  prompt = f"""
273
+ Here is a list of job responsibilities: {responsibilities} that have been associated with the following skills: {skills}
274
+ For each skill, accounting for the context defined within the responsibilities, return a JSON object with:
275
+ - skill_name: the name of the skill
276
+ - importance: essential or optional
277
+ - type: "skill/competence" or "knowledge"
278
+ - proficiency_level: Basic, Intermediate, or Advanced
279
+ - distinctive_elements: what specific and distinctive elements are required at this defined proficiency level?
280
+ - resume_signals: what to look for in a resume to assess this skill?
281
+ - assessment_method: what is the preferred assessment method to accurately assess this skill?
282
+ Respond ONLY with a list of dictionaries in valid JSON.
283
+ Use double quotes for all strings. No markdown, no commentary, no trailing commas.
284
+ """
 
 
 
285
  raw = gpt_call("Define proficiency level and assessment for each skill.", prompt)
 
286
  json_text = _extract_json_array(raw)
287
  if not json_text:
288
  return []
 
289
  try:
290
  results = json.loads(json_text)
291
  except json.JSONDecodeError as e:
292
  print(f"❌ JSON parsing error: {e}")
293
  print(f"🔍 Problematic JSON: {json_text}")
294
  return []
 
295
  validated = []
296
  for item in results:
297
  try:
 
307
  except (KeyError, TypeError) as e:
308
  print(f"⚠️ Skipping invalid item: {item}. Error: {e}")
309
  continue
 
310
  return validated
311
 
312
  def _extract_json_array(raw: str) -> str:
 
 
 
313
  json_start = raw.find('[')
314
  json_end = raw.rfind(']') + 1
 
315
  if json_start == -1 or json_end == 0:
316
  print(f"❌ No JSON array found in response: {raw}")
317
  return ""
 
318
  json_text = raw[json_start:json_end]
319
+ json_text = re.sub(r',\s*([}\]])', r'\1', json_text)
320
+ json_text = re.sub(r'[\n\r\t]', ' ', json_text)
321
+ json_text = re.sub(r'(?<!\\)"', '"', json_text)
 
 
 
322
  return json_text
323
 
324
def extract_qualification(responsibilities: List[str]) -> List[str]:
    """Infer the EQF level (and matching diplomas) for the responsibilities.

    Returns the model's answer split into cleaned, non-empty lines with
    leading bullet characters removed.
    """
    prompt = f"""
Here is a list of job responsibilities: {responsibilities}
Infer the required level within the European Qualifications Framework (EQF) to implement them.
Identify the potential diplomas to testify such qualification
"""
    raw = gpt_call("You are an HR expert that excel in developing competency-based interview questions.", prompt)
    lines = []
    for line in raw.splitlines():
        if line.strip():
            lines.append(line.strip("-• ").strip())
    return lines
332
 
333
def build_interview(responsibilities: List[str], skill_assess: List[str]) -> List[str]:
    """Draft a structured 40-minute interview plan for the given role.

    Returns the model's plan split into cleaned, non-empty lines with
    leading bullet characters removed.
    """
    prompt = f"""
Here is a list of job responsibilities: {responsibilities} and related skills: {skill_assess}
Output: A structured 40-minute interview with:
Opening questions (5 min)
Core competency-based questions (30 min, 5-6 questions)
Closing & candidate questions (5 min)
"""
    raw = gpt_call("You are an HR expert that excel in developing competency-based interview questions.", prompt)
    questions = []
    for line in raw.splitlines():
        if line.strip():
            questions.append(line.strip("-• ").strip())
    return questions
343
 
 
 
344
  def _extract_json(raw: str) -> str:
 
 
 
345
  json_start = raw.find('{')
346
  json_end = raw.rfind('}') + 1
 
347
  if json_start == -1 or json_end == 0:
348
  print(f"❌ No JSON found in response: {raw}")
349
  return ""
 
350
  json_text = raw[json_start:json_end]
351
+ json_text = re.sub(r',\s*([}\]])', r'\1', json_text)
352
+ json_text = re.sub(r'[\n\r\t]', ' ', json_text)
353
+ json_text = re.sub(r'\s{2,}', ' ', json_text)
354
+ json_text = re.sub(r'\\(?!["\\/bfnrtu])', r'\\\\', json_text)
 
 
355
  json_text = json_text.strip()
356
+ return json_text
 
 
 
357
 
358
  def process_pdf(file):
 
 
 
359
  if file is None:
360
  return "Please upload a PDF file."
 
361
  try:
362
  extracted_text = extract_text_from_pdf(file.name)
363
+ responsibilities = extract_section_from_pdf(extracted_text, section_title="Responsibilities and Accountabilities")
 
 
364
  if not responsibilities:
365
+ print(f"Skipping {os.path.basename(file.name)} - no responsibilities section found")
366
  return None
 
 
367
  job_family = classify_job_family(responsibilities)
368
  occ_group = classify_occupational_group_by_level(responsibilities)
369
  esco_occ = classify_esco_by_hierarchical_level(responsibilities)
370
  qualification = extract_qualification(responsibilities)
371
  skills = extract_skills(responsibilities)
372
  skill_map = map_proficiency_and_assessment(skills, responsibilities)
 
 
373
  has_esco = esco_occ.get("Level_5_ESCO_code") is not None
 
 
374
  skill_esco_extract = []
375
  skill_esco_map = []
376
  if has_esco:
 
378
  skill_esco_extract = review_skills(Level_5_code)
379
  skill_esco_map = map_proficiency_and_assessment(skill_esco_extract, responsibilities)
380
  else:
381
+ print(f"No Level 5 ESCO code found for {os.path.basename(file.name)}, skipping ESCO skills mapping")
382
+ time.sleep(6)
 
 
 
383
  assessment_lookup = {item['skill_name']: item for item in skill_map}
384
  joined_skills = [
385
  {
 
395
  }
396
  for skill in skills
397
  ]
 
 
398
  joined_skills_esco = []
399
  if has_esco and skill_esco_extract:
400
  assessment_esco_lookup = {item['skill_name']: item for item in skill_esco_map}
 
407
  }
408
  for skill in skill_esco_extract
409
  ]
 
410
  interview = build_interview(responsibilities, skills)
 
 
411
  result = {
412
+ "file": os.path.basename(file.name),
413
  "responsibilities": responsibilities,
414
  "job_family": job_fam1['Job_family'].values[0],
415
  "job_subfamily": job_fam1['Job_subfamily'].values[0],
416
  "classified_job_family": job_family,
417
+ **{f"Level_{i}_CCOG_{field}": occ_group.get(f"Level_{i}_CCOG_{field}")
418
  for i in range(1, 5) for field in ["code", "name", "desc"]},
419
  "qualification": qualification,
420
  "interview": interview,
421
  "skills": {
422
+ "file": os.path.basename(file.name),
423
  "job_family": job_fam1['Job_family'].values[0],
424
  "job_subfamily": job_fam1['Job_subfamily'].values[0],
425
  "skills": joined_skills
426
  }
427
  }
 
 
428
  if has_esco:
429
  result.update({
430
+ **{f"Level_{i}_ESCO_{field}": esco_occ.get(f"Level_{i}_ESCO_{field}")
431
  for i in range(1, 6) for field in ["code", "name", "desc"]},
432
  "skills_esco": {
433
+ "file": os.path.basename(file.name),
434
  "job_family": job_fam1['Job_family'].values[0],
435
  "job_subfamily": job_fam1['Job_subfamily'].values[0],
436
  "skills": joined_skills_esco
437
  }
438
  })
439
  else:
 
440
  result.update({
441
+ **{f"Level_{i}_ESCO_{field}": None
442
  for i in range(1, 6) for field in ["code", "name", "desc"]},
443
  "skills_esco": None
444
  })
 
445
  return result
 
446
  except Exception as e:
447
  return f"Error processing PDF: {str(e)}"
448
 
 
449
# ---- Gradio UI -----------------------------------------------------------
with gr.Blocks() as demo:
    gr.Markdown("# Standardize Job Description!")
    gr.Markdown("Identify Job Family, Occupation, Qualification, match Skills and suggest interview questions.")

    with gr.Row():
        # Left column: upload + trigger button.
        with gr.Column():
            file_input = gr.File(label="Upload a Job Description PDF file", file_types=[".pdf"])
            submit_btn = gr.Button("Extract Text")
        # Right column: pipeline output.
        with gr.Column():
            text_output = gr.Textbox(label="Extracted Text", lines=30, max_lines=50, interactive=False)

    # Run the full standardization pipeline on click.
    submit_btn.click(fn=process_pdf, inputs=file_input, outputs=text_output)

if __name__ == "__main__":
    demo.launch()