Spaces:

edouardlgp
/

Job_Classification

Running

App Files Files Community

edouardlgp committed on May 10, 2025

Commit

e518da9

verified ·

1 Parent(s): 1735116

Update app.py

Browse files

Files changed (1) hide show

app.py +62 -25

app.py CHANGED Viewed

@@ -13,6 +13,30 @@ import traceback
 import time
 import openai
 # Load environment variables
 load_dotenv()
@@ -32,31 +56,31 @@ except Exception as e:
 try:
     occupational_groups_df = pd.read_csv("occupational_groups.csv", on_bad_lines='skip')
 except Exception as e:
-    print(f"Error reading occupational_groups.csv: {e}")
     occupational_groups_df = pd.DataFrame()  # Fallback to an empty DataFrame or handle the error appropriately
 try:
     esco_df = pd.read_csv("ISCOGroups_en.csv", on_bad_lines='skip', dtype={'code': str}  ) # Force 'code' to be read as string
 except Exception as e:
-    print(f"Error reading ISCOGroups_en.csv: {e}")
     esco_df = pd.DataFrame()  # Fallback to an empty DataFrame or handle the error appropriately
 try:
     esco_level5_df = pd.read_csv("occupations_en.csv", on_bad_lines='skip',  dtype={'code': str, 'iscoGroup': str, }  ) # Force 'code' to be read as string
 except Exception as e:
-    print(f"Error reading occupations_en.csv: {e}")
     esco_level5_df = pd.DataFrame()  # Fallback to an empty DataFrame or handle the error appropriately
 try:
     esco_skill_df = pd.read_csv("skills_en.csv", on_bad_lines='skip')
 except Exception as e:
-    print(f"Error reading skills_en.csv: {e}")
     esco_skill_df = pd.DataFrame()  # Fallback to an empty DataFrame or handle the error appropriately
 try:
      esco_skill_map_df = pd.read_csv("occupationSkillRelations_en.csv", on_bad_lines='skip')
 except Exception as e:
-    print(f"Error reading occupationSkillRelations_en.csv: {e}")
     esco_skill_map_df = pd.DataFrame()  # Fallback to an empty DataFrame or handle the error appropriately
@@ -138,7 +162,7 @@ def classify_job_family(responsibilities: List[str]) -> str:
 def get_level_CCOG_info(df, code, level_name):
     matches = df[df['code'] == code]
     if len(matches) == 0:
-        print(f"Warning: No {level_name} found for CCOG code {code}")
         return {
             f'{level_name}_CCOG_code': code,
             f'{level_name}_CCOG_name': 'UNKNOWN',
@@ -184,7 +208,7 @@ def classify_occupational_group_by_level(responsibilities: List[str]) -> dict:
             level_code = code_sanitize(level_code, list_output)
             result.update(get_level_CCOG_info(level_df, level_code, f'Level_{level}'))
     except Exception as e:
-        print(f"Error during classification: {str(e)}")
         result['error'] = str(e)
     return result
@@ -367,7 +391,7 @@ def get_level_ESCO_info(df, code, level_name):
     """Helper function to get level info with error handling"""
     matches = df[df['code'] == code]
     if len(matches) == 0:
-        print(f"Warning: No {level_name} found for ESCO code {code}")
         return {
             f'{level_name}_ESCO_code': code,
             f'{level_name}_ESCO_name': 'UNKNOWN',
@@ -435,8 +459,8 @@ def review_skills(Level_5_code: str, top_n: int = 10) -> List[Dict[str, str]]:
         result = json.loads(json_text)
         skills = result.get("skills", [])
     except json.JSONDecodeError as e:
-        print(f"❌ JSON parsing error: {e}")
-        print(f"🔍 Problematic JSON: {json_text}")
         return []
     validated_skills = []
     for skill in skills:
@@ -448,7 +472,7 @@ def review_skills(Level_5_code: str, top_n: int = 10) -> List[Dict[str, str]]:
             }
             validated_skills.append(validated)
         except (KeyError, TypeError) as e:
-            print(f"⚠️ Skipping invalid skill: {skill}. Error: {e}")
             continue
     return validated_skills[:top_n]
@@ -488,8 +512,8 @@ def extract_skills(responsibilities: List[str], top_n: int = 10) -> List[Dict[st
         result = json.loads(json_text)
         skills = result.get("skills", [])
     except json.JSONDecodeError as e:
-        print(f"❌ JSON parsing error: {e}")
-        print(f"🔍 Problematic JSON: {json_text}")
         return []
     validated_skills = []
     for skill in skills:
@@ -501,7 +525,7 @@ def extract_skills(responsibilities: List[str], top_n: int = 10) -> List[Dict[st
             }
             validated_skills.append(validated)
         except (KeyError, TypeError) as e:
-            print(f"⚠️ Skipping invalid skill: {skill}. Error: {e}")
             continue
     return validated_skills[:top_n]
@@ -526,8 +550,8 @@ def map_proficiency_and_assessment(skills: List[str], responsibilities: List[str
     try:
         results = json.loads(json_text)
     except json.JSONDecodeError as e:
-        print(f"❌ JSON parsing error: {e}")
-        print(f"🔍 Problematic JSON: {json_text}")
         return []
     validated = []
     for item in results:
@@ -542,7 +566,7 @@ def map_proficiency_and_assessment(skills: List[str], responsibilities: List[str
                 "assessment_method": item["assessment_method"].strip()
             })
         except (KeyError, TypeError) as e:
-            print(f"⚠️ Skipping invalid item: {item}. Error: {e}")
             continue
     return validated
@@ -550,7 +574,7 @@ def _extract_json_array(raw: str) -> str:
     json_start = raw.find('[')
     json_end = raw.rfind(']') + 1
     if json_start == -1 or json_end == 0:
-        print(f"❌ No JSON array found in response: {raw}")
         return ""
     json_text = raw[json_start:json_end]
     json_text = re.sub(r',\s*([}\]])', r'\1', json_text)
@@ -582,7 +606,7 @@ def _extract_json(raw: str) -> str:
     json_start = raw.find('{')
     json_end = raw.rfind('}') + 1
     if json_start == -1 or json_end == 0:
-        print(f"❌ No JSON found in response: {raw}")
         return ""
     json_text = raw[json_start:json_end]
     json_text = re.sub(r',\s*([}\]])', r'\1', json_text)
@@ -599,9 +623,10 @@ def process_pdf(file):
         extracted_text = extract_text_from_pdf(file.name)
         responsibilities = extract_section_from_pdf(extracted_text, section_title="Responsibilities and Accountabilities")
         if not responsibilities:
-            print(f"Skipping {os.path.basename(file.name)} - no responsibilities section found")
             return None
         job_family = classify_job_family(responsibilities)
         occ_group = classify_occupational_group_by_level(responsibilities)
         esco_occ = classify_esco_by_hierarchical_level(responsibilities)
         qualification = extract_qualification(responsibilities)
@@ -615,7 +640,7 @@ def process_pdf(file):
             skill_esco_extract = review_skills(Level_5_code)
             skill_esco_map = map_proficiency_and_assessment(skill_esco_extract, responsibilities)
         else:
-            print(f"No Level 5 ESCO code found for {os.path.basename(file.name)}, skipping ESCO skills mapping")
         time.sleep(6)
         assessment_lookup = {item['skill_name']: item for item in skill_map}
         joined_skills = [
@@ -688,15 +713,27 @@ with gr.Blocks() as demo:
     gr.Markdown("Identify Job Family, Occupation, Qualification, match Skills and suggest interview questions.")
     with gr.Row():
         with gr.Column():
-            file_input = gr.File(label="Upload a Job Description PDF file", file_types=[".pdf"])
-            submit_btn = gr.Button("Extract Text")
         with gr.Column():
             text_output = gr.Textbox(label="Extracted Text", lines=30, max_lines=50, interactive=False)
     submit_btn.click(
         fn=process_pdf,
         inputs=file_input,
-        outputs=text_output
     )
 if __name__ == "__main__":
-    demo.launch()

 import time
 import openai
+# Debugging setup
+DEBUG = True
+debug_messages = []
+def log_debug(message):  # returns the newline-joined retained log, or "" when DEBUG is falsy
+    """Log debug messages and keep last 20 entries"""
+    if DEBUG:
+        timestamp = time.strftime("%Y-%m-%d %H:%M:%S")  # no time tuple passed, so the current local time is formatted
+        full_message = f"[{timestamp}] {message}"
+        debug_messages.append(full_message)  # mutates the module-level list in place
+        print(full_message)  # Print to console
+        # Keep only the last 20 messages
+        if len(debug_messages) > 20:
+            debug_messages.pop(0)  # index 0 is the oldest entry, since new ones are appended at the end
+        return "\n".join(debug_messages)  # one log entry per line, oldest first
+    return ""  # logging disabled: nothing is recorded or printed
+# Initialize debug logging
+log_debug("Application starting...")
 # Load environment variables
 load_dotenv()
 try:
     occupational_groups_df = pd.read_csv("occupational_groups.csv", on_bad_lines='skip')
 except Exception as e:
+    log_debug(f"Error reading occupational_groups.csv: {e}")
     occupational_groups_df = pd.DataFrame()  # Fallback to an empty DataFrame or handle the error appropriately
 try:
     esco_df = pd.read_csv("ISCOGroups_en.csv", on_bad_lines='skip', dtype={'code': str}  ) # Force 'code' to be read as string
 except Exception as e:
+    log_debug(f"Error reading ISCOGroups_en.csv: {e}")
     esco_df = pd.DataFrame()  # Fallback to an empty DataFrame or handle the error appropriately
 try:
     esco_level5_df = pd.read_csv("occupations_en.csv", on_bad_lines='skip',  dtype={'code': str, 'iscoGroup': str, }  ) # Force 'code' to be read as string
 except Exception as e:
+    log_debug(f"Error reading occupations_en.csv: {e}")
     esco_level5_df = pd.DataFrame()  # Fallback to an empty DataFrame or handle the error appropriately
 try:
     esco_skill_df = pd.read_csv("skills_en.csv", on_bad_lines='skip')
 except Exception as e:
+    log_debug(f"Error reading skills_en.csv: {e}")
     esco_skill_df = pd.DataFrame()  # Fallback to an empty DataFrame or handle the error appropriately
 try:
      esco_skill_map_df = pd.read_csv("occupationSkillRelations_en.csv", on_bad_lines='skip')
 except Exception as e:
+    log_debug(f"Error reading occupationSkillRelations_en.csv: {e}")
     esco_skill_map_df = pd.DataFrame()  # Fallback to an empty DataFrame or handle the error appropriately
 def get_level_CCOG_info(df, code, level_name):
     matches = df[df['code'] == code]
     if len(matches) == 0:
+        log_debug(f"Warning: No {level_name} found for CCOG code {code}")
         return {
             f'{level_name}_CCOG_code': code,
             f'{level_name}_CCOG_name': 'UNKNOWN',
             level_code = code_sanitize(level_code, list_output)
             result.update(get_level_CCOG_info(level_df, level_code, f'Level_{level}'))
     except Exception as e:
+        log_debug(f"Error during classification: {str(e)}")
         result['error'] = str(e)
     return result
     """Helper function to get level info with error handling"""
     matches = df[df['code'] == code]
     if len(matches) == 0:
+        log_debug(f"Warning: No {level_name} found for ESCO code {code}")
         return {
             f'{level_name}_ESCO_code': code,
             f'{level_name}_ESCO_name': 'UNKNOWN',
         result = json.loads(json_text)
         skills = result.get("skills", [])
     except json.JSONDecodeError as e:
+        log_debug(f"❌ JSON Skills parsing error: {e}")
+        log_debug(f"🔍 Problematic JSON Skills: {json_text}")
         return []
     validated_skills = []
     for skill in skills:
             }
             validated_skills.append(validated)
         except (KeyError, TypeError) as e:
+            log_debug(f"⚠️ Skipping invalid skill: {skill}. Error: {e}")
             continue
     return validated_skills[:top_n]
         result = json.loads(json_text)
         skills = result.get("skills", [])
     except json.JSONDecodeError as e:
+        log_debug(f"❌ JSON Skills extrac parsing error: {e}")
+        log_debug(f"🔍 Problematic JSON Skills extract: {json_text}")
         return []
     validated_skills = []
     for skill in skills:
             }
             validated_skills.append(validated)
         except (KeyError, TypeError) as e:
+            log_debug(f"⚠️ Skipping invalid skill extract: {skill}. Error: {e}")
             continue
     return validated_skills[:top_n]
     try:
         results = json.loads(json_text)
     except json.JSONDecodeError as e:
+        log_debug(f"❌ JSON proficiency parsing error: {e}")
+        log_debug(f"🔍 Problematic JSON proficiency: {json_text}")
         return []
     validated = []
     for item in results:
                 "assessment_method": item["assessment_method"].strip()
             })
         except (KeyError, TypeError) as e:
+            log_debug(f"⚠️ Skipping invalid profiency item: {item}. Error: {e}")
             continue
     return validated
     json_start = raw.find('[')
     json_end = raw.rfind(']') + 1
     if json_start == -1 or json_end == 0:
+        log_debug(f"❌ No JSON array found in response: {raw}")
         return ""
     json_text = raw[json_start:json_end]
     json_text = re.sub(r',\s*([}\]])', r'\1', json_text)
     json_start = raw.find('{')
     json_end = raw.rfind('}') + 1
     if json_start == -1 or json_end == 0:
+        log_debug(f"❌ No JSON found in response: {raw}")
         return ""
     json_text = raw[json_start:json_end]
     json_text = re.sub(r',\s*([}\]])', r'\1', json_text)
         extracted_text = extract_text_from_pdf(file.name)
         responsibilities = extract_section_from_pdf(extracted_text, section_title="Responsibilities and Accountabilities")
         if not responsibilities:
+            log_debug(f"Skipping {os.path.basename(file.name)} - no responsibilities section found")
             return None
         job_family = classify_job_family(responsibilities)
+        log_debug(f"Identified {job_family} ")
         occ_group = classify_occupational_group_by_level(responsibilities)
         esco_occ = classify_esco_by_hierarchical_level(responsibilities)
         qualification = extract_qualification(responsibilities)
             skill_esco_extract = review_skills(Level_5_code)
             skill_esco_map = map_proficiency_and_assessment(skill_esco_extract, responsibilities)
         else:
+            log_debug(f"No Level 5 ESCO code found for {os.path.basename(file.name)}, skipping ESCO skills mapping")
         time.sleep(6)
         assessment_lookup = {item['skill_name']: item for item in skill_map}
         joined_skills = [
     gr.Markdown("Identify Job Family, Occupation, Qualification, match Skills and suggest interview questions.")
     with gr.Row():
         with gr.Column():
+            file_input = gr.File(label="Upload a Post Description PDF file", file_types=[".pdf"])
+            submit_btn = gr.Button("Analyse Post Description")
+       # with gr.Column():
+       #     text_output = gr.Textbox(label="Extracted Text", lines=30, max_lines=50, interactive=False)
         with gr.Column():
             text_output = gr.Textbox(label="Extracted Text", lines=30, max_lines=50, interactive=False)
+            if DEBUG:
+                gr.Markdown("### Debug Console", elem_classes=["debug-title"])
+                debug_console = gr.Textbox(
+                    label="",
+                    interactive=False,
+                    lines=15,
+                    elem_classes=["debug-console"]
+                )
     submit_btn.click(
         fn=process_pdf,
         inputs=file_input,
+        #outputs=text_output
+        outputs=[text_output, debug_console] if DEBUG else [text_output],
     )
 if __name__ == "__main__":
+    demo.launch(show_error=True,
+                debug=DEBUG)