Spaces:

edouardlgp
/

Job_Classification

Running

App Files Files Community

edouardlgp committed on May 11, 2025

Commit

b950432

verified ·

1 Parent(s): ec8a468

Update app.py

Browse files

Files changed (1) hide show

app.py +247 -83

app.py CHANGED Viewed

@@ -46,36 +46,42 @@ warnings.filterwarnings("ignore", category=UserWarning, message="CropBox.*")
 # ================= DataFrame initializations =================
 try:
     job_families_df = pd.read_csv("job_families1.csv", on_bad_lines='skip')
 except Exception as e:
-    print(f"Error reading job_families1.csv: {e}")
     job_families_df = pd.DataFrame()  # Fallback to an empty DataFrame or handle the error appropriately
 try:
     occupational_groups_df = pd.read_csv("occupational_groups.csv", on_bad_lines='skip')
 except Exception as e:
     log_debug(f"Error reading occupational_groups.csv: {e}")
     occupational_groups_df = pd.DataFrame()  # Fallback to an empty DataFrame or handle the error appropriately
 try:
     esco_df = pd.read_csv("ISCOGroups_en.csv", on_bad_lines='skip', dtype={'code': str}  ) # Force 'code' to be read as string
 except Exception as e:
     log_debug(f"Error reading ISCOGroups_en.csv: {e}")
     esco_df = pd.DataFrame()  # Fallback to an empty DataFrame or handle the error appropriately
 try:
     esco_level5_df = pd.read_csv("occupations_en.csv", on_bad_lines='skip',  dtype={'code': str, 'iscoGroup': str, }  ) # Force 'code' to be read as string
 except Exception as e:
     log_debug(f"Error reading occupations_en.csv: {e}")
     esco_level5_df = pd.DataFrame()  # Fallback to an empty DataFrame or handle the error appropriately
 try:
     esco_skill_df = pd.read_csv("skills_en.csv", on_bad_lines='skip')
 except Exception as e:
     log_debug(f"Error reading skills_en.csv: {e}")
     esco_skill_df = pd.DataFrame()  # Fallback to an empty DataFrame or handle the error appropriately
 try:
      esco_skill_map_df = pd.read_csv("occupationSkillRelations_en.csv", on_bad_lines='skip')
 except Exception as e:
     log_debug(f"Error reading occupationSkillRelations_en.csv: {e}")
     esco_skill_map_df = pd.DataFrame()  # Fallback to an empty DataFrame or handle the error appropriately
@@ -756,84 +762,231 @@ def process_pdf(file):
         )
 # ================= Build Word Report =================
 from docx import Document
-def generate_word_document(result):
     doc = Document()
-    # Add a title
-    doc.add_heading('Job Description Analysis', level=1)
-    # Add file name
-    doc.add_heading('File Name', level=2)
-    doc.add_paragraph(result["file"])
-    # Add responsibilities
-    doc.add_heading('Responsibilities', level=2)
-    doc.add_paragraph(result["responsibilities"])
-    # Add job family
-    doc.add_heading('Classified Job Family', level=2)
-    doc.add_paragraph(result["classified_job_family"])
-    # Add qualifications
-    doc.add_heading('Qualification', level=2)
-    doc.add_paragraph("\n".join(result["qualification"]))
-    # Add CCOG Levels
-    doc.add_heading('CCOG Levels', level=2)
-    for i in range(1, 5):
-        for field in ["code", "name", "desc"]:
-            key = f"Level_{i}_CCOG_{field}"
-            if key in result:
-                doc.add_paragraph(f"{key}: {result[key]}")
-    # Add interview questions
-    doc.add_heading('Interview Questions', level=2)
-    doc.add_paragraph("\n".join(result["interview"]))
-    # Add skills
-    doc.add_heading('Skills', level=2)
-    for skill in result["skills"]["skills"]:
-        doc.add_paragraph(f"Skill Name: {skill['skill_name']}")
-        doc.add_paragraph(f"Description: {skill['skill_description']}")
-        doc.add_paragraph(f"Code: {skill['skill_code']}")
-        doc.add_paragraph(f"Importance: {skill.get('importance', 'N/A')}")
-        doc.add_paragraph(f"Type: {skill.get('type', 'N/A')}")
-        doc.add_paragraph(f"Proficiency Level: {skill.get('proficiency_level', 'N/A')}")
-        doc.add_paragraph(f"Distinctive Elements: {skill.get('distinctive_elements', 'N/A')}")
-        doc.add_paragraph(f"Resume Signals: {skill.get('resume_signals', 'N/A')}")
-        doc.add_paragraph(f"Assessment Method: {skill.get('assessment_method', 'N/A')}")
-        doc.add_paragraph("")  # Add an empty line for separation
-    # Add ESCO Levels if available
-    if "skills_esco" in result and result["skills_esco"]:
-        doc.add_heading('ESCO Levels', level=2)
-        for i in range(1, 6):
-            for field in ["code", "name", "desc"]:
-                key = f"Level_{i}_ESCO_{field}"
-                if key in result:
-                    doc.add_paragraph(f"{key}: {result[key]}")
-        # Add ESCO Skills
-        doc.add_heading('ESCO Skills', level=2)
-        for skill in result["skills_esco"]["skills"]:
-            doc.add_paragraph(f"Skill Name: {skill['skill_name']}")
-            doc.add_paragraph(f"Description: {skill['skill_description']}")
-            doc.add_paragraph(f"Code: {skill['skill_code']}")
-            doc.add_paragraph(f"Importance: {skill.get('importance', 'N/A')}")
-            doc.add_paragraph(f"Type: {skill.get('type', 'N/A')}")
-            doc.add_paragraph(f"Proficiency Level: {skill.get('proficiency_level', 'N/A')}")
-            doc.add_paragraph(f"Distinctive Elements: {skill.get('distinctive_elements', 'N/A')}")
-            doc.add_paragraph(f"Resume Signals: {skill.get('resume_signals', 'N/A')}")
-            doc.add_paragraph(f"Assessment Method: {skill.get('assessment_method', 'N/A')}")
-            doc.add_paragraph("")  # Add an empty line for separation
-    # Save the document to a temporary file
-    temp_file_path = "job_description_analysis.docx"
-    doc.save(temp_file_path)
-    return temp_file_path
 # ================= GRADIO INTERFACE =================
@@ -1024,7 +1177,9 @@ label {
     with gr.Row():
         with gr.Column():
-            file_input = gr.File(label="Upload a Post Description PDF file", file_types=[".pdf"])
             submit_btn = gr.Button(
                         value="✨ Analyse Post Description",
                         variant="primary",
@@ -1040,19 +1195,19 @@ label {
     with gr.Row():
         with gr.Column():
             gr.Markdown("### CCOG Levels")
-            ccoq_levels_output = gr.JSON(label="CCOG Levels")
         with gr.Column():
             gr.Markdown("### Skills")
-            skills_output = gr.JSON(label="Skills")
     with gr.Row():
         with gr.Column():
             gr.Markdown("### ESCO Levels")
-            esco_levels_output = gr.JSON(label="ESCO Levels")
         with gr.Column():
             gr.Markdown("### ESCO Skills")
-            esco_skills_output = gr.JSON(label="ESCO Skills")
     with gr.Row():
         with gr.Column():
@@ -1101,8 +1256,17 @@ label {
     download_btn.click(
         fn=generate_word_document,
-        inputs=[file_name_output, responsibilities_output, job_family_output, qualification_output,
-                ccoq_levels_output, interview_output, skills_output, esco_levels_output, esco_skills_output],
         outputs=gr.File(label="Download Word Document")
     )

 # ================= DataFrame initializations =================
 try:
     job_families_df = pd.read_csv("job_families1.csv", on_bad_lines='skip')
+    log_debug(f"Reading {len(job_families_df)} job_families")
 except Exception as e:
+    log_debug(f"Error reading job_families1.csv: {e}")
     job_families_df = pd.DataFrame()  # Fallback to an empty DataFrame or handle the error appropriately
 try:
     occupational_groups_df = pd.read_csv("occupational_groups.csv", on_bad_lines='skip')
+    log_debug(f"Reading  {len(occupational_groups_df)} occupational_groups")
 except Exception as e:
     log_debug(f"Error reading occupational_groups.csv: {e}")
     occupational_groups_df = pd.DataFrame()  # Fallback to an empty DataFrame or handle the error appropriately
 try:
     esco_df = pd.read_csv("ISCOGroups_en.csv", on_bad_lines='skip', dtype={'code': str}  ) # Force 'code' to be read as string
+    log_debug(f"Reading  {len( esco_df)}  esco groups")
 except Exception as e:
     log_debug(f"Error reading ISCOGroups_en.csv: {e}")
     esco_df = pd.DataFrame()  # Fallback to an empty DataFrame or handle the error appropriately
 try:
     esco_level5_df = pd.read_csv("occupations_en.csv", on_bad_lines='skip',  dtype={'code': str, 'iscoGroup': str, }  ) # Force 'code' to be read as string
+    log_debug(f"Reading  {len(sco_level5_df)} sco_level5")
 except Exception as e:
     log_debug(f"Error reading occupations_en.csv: {e}")
     esco_level5_df = pd.DataFrame()  # Fallback to an empty DataFrame or handle the error appropriately
 try:
     esco_skill_df = pd.read_csv("skills_en.csv", on_bad_lines='skip')
+    log_debug(f"Reading  {len(esco_skill_df)}  esco_skill")
 except Exception as e:
     log_debug(f"Error reading skills_en.csv: {e}")
     esco_skill_df = pd.DataFrame()  # Fallback to an empty DataFrame or handle the error appropriately
 try:
      esco_skill_map_df = pd.read_csv("occupationSkillRelations_en.csv", on_bad_lines='skip')
+    log_debug(f"Reading  {len(esco_skill_map_df)} esco_skill_map")
 except Exception as e:
     log_debug(f"Error reading occupationSkillRelations_en.csv: {e}")
     esco_skill_map_df = pd.DataFrame()  # Fallback to an empty DataFrame or handle the error appropriately
         )
 # ================= Build Word Report =================
 from docx import Document
+import os
+import re
+import time
+import tempfile
+from typing import Dict, List, Union
+def _parse_structured(value):
+    """Return *value* as a dict or list, decoding text payloads when necessary.
+    Dicts and lists pass through unchanged. A non-blank string is decoded as JSON
+    first and then as a Python literal (the ``str()`` form of a dict or list).
+    Anything that does not yield a dict or list gives ``None``.
+    """
+    # Imported locally because the import lines next to this function do not provide them.
+    import ast
+    import json
+    if isinstance(value, (dict, list)):
+        return value
+    if not isinstance(value, str) or not value.strip():
+        return None
+    for decode in (json.loads, ast.literal_eval):
+        try:
+            decoded = decode(value)
+        except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError):
+            continue
+        if isinstance(decoded, (dict, list)):
+            return decoded
+    return None
+def _text_lines(value):
+    """Return the stripped, non-blank lines of a string, or items of a list/tuple."""
+    if isinstance(value, str):
+        items = value.split('\n')
+    elif isinstance(value, (list, tuple)):
+        items = value
+    else:
+        return []
+    stripped = (str(item).strip() for item in items if item is not None)
+    return [line for line in stripped if line]
+def _skill_rows(value):
+    """Normalise a skills payload to a list of skill dicts.
+    Accepts a list of dicts, a ``{"skills": [...]}`` mapping, or either one as text.
+    Entries that are not dicts are dropped.
+    """
+    decoded = _parse_structured(value)
+    if isinstance(decoded, dict):
+        decoded = decoded.get("skills")
+    if not isinstance(decoded, list):
+        return []
+    return [entry for entry in decoded if isinstance(entry, dict)]
+def _add_styled_paragraph(doc, text, style_name):
+    """Add *text* as a new paragraph formatted with *style_name* and return it.
+    The name is tried as a paragraph style first and as a character style on the run
+    second. python-docx raises KeyError for an unknown style name and ValueError when
+    the style is of the wrong type; in both cases the text is kept, so a styling
+    problem can never abort the report.
+    """
+    paragraph = doc.add_paragraph()
+    run = paragraph.add_run(text)
+    try:
+        paragraph.style = style_name
+    except (KeyError, ValueError):
+        try:
+            run.style = style_name
+        except (KeyError, ValueError):
+            pass  # Style unavailable in this template: keep the default formatting.
+    return paragraph
+def _add_skills_table(doc, heading, rows):
+    """Add a level-2 heading followed by a four-column table with one row per skill."""
+    doc.add_heading(heading, level=2)
+    if not rows:
+        _add_styled_paragraph(doc, "No skills information available", 'Subtle Emphasis')
+        return
+    skills_table = doc.add_table(rows=1, cols=4)
+    skills_table.style = 'Medium List 2 Accent 1'
+    titles = ('Skill', 'Description', 'Proficiency', 'Assessment')
+    for cell, title in zip(skills_table.rows[0].cells, titles):
+        cell.text = title
+    for skill in rows:
+        # Long descriptions are cut to 100 characters to keep the table readable.
+        description = str(skill.get("skill_description", ""))
+        if len(description) > 100:
+            description = description[:100] + "..."
+        cells = skills_table.add_row().cells
+        cells[0].text = str(skill.get("skill_name", "Unnamed skill"))
+        cells[1].text = description
+        cells[2].text = str(skill.get("proficiency_level", "Not specified"))
+        cells[3].text = str(skill.get("assessment_method", "Not specified"))
+def _add_classification(doc, heading, levels, scheme, depth):
+    """Add one sub-section per classification level found in the *levels* mapping.
+    Keys are looked up as ``Level_<n>_<scheme>_<field>``, the layout the previous
+    version of the report builder read, with ``Level_<scheme>_<n>_<field>`` accepted
+    as an alternative spelling.
+    """
+    doc.add_heading(heading, level=2)
+    found = False
+    for index in range(1, depth + 1):
+        details = []
+        for label, field in (("Code", "code"), ("Name", "name"), ("Description", "desc")):
+            value = levels.get(f"Level_{index}_{scheme}_{field}")
+            if value is None:
+                value = levels.get(f"Level_{scheme}_{index}_{field}")
+            if value:
+                details.append(f"{label}: {value}")
+        if details:
+            found = True
+            doc.add_heading(f"Level {index}", level=3)
+            for line in details:
+                doc.add_paragraph(line)
+    if not found:
+        _add_styled_paragraph(doc, "No classification information available", 'Subtle Emphasis')
+def _save_report(doc, directory, prefix):
+    """Save *doc* under a unique ``.docx`` name in *directory* and return its path.
+    ``tempfile.mkstemp`` reserves the name, so two reports generated for the same
+    source file never overwrite each other. Raises OSError when the directory
+    cannot be created or written.
+    """
+    os.makedirs(directory, exist_ok=True)
+    handle, path = tempfile.mkstemp(prefix=prefix, suffix=".docx", dir=directory)
+    os.close(handle)
+    try:
+        doc.save(path)
+    except Exception:
+        # Do not leave the empty placeholder file behind when saving fails.
+        try:
+            os.remove(path)
+        except OSError:
+            pass
+        raise
+    return path
+def generate_word_document(
+    file_name: str,
+    responsibilities: str,
+    job_family: str,
+    qualification: Union[str, List[str]],
+    ccoq_levels: Union[Dict, str],
+    interview: Union[str, List[str]],
+    skills: Union[List[Dict], Dict, str],
+    esco_levels: Union[Dict, str],
+    esco_skills: Union[Dict, List[Dict], str]
+) -> str:
+    """
+    Build the Word analysis report and return the path of the saved ``.docx`` file.
+    Every argument is normalised on its own, so one malformed value only empties its
+    own section. The structured arguments may be Python objects or their text form
+    (JSON or a Python literal), because UI text components hand their content over
+    as strings.
+    Args:
+        file_name: Original PDF filename; also used to derive the report filename.
+        responsibilities: Extracted responsibilities text.
+        job_family: Identified job family.
+        qualification: Required qualifications, one per line or as a list.
+        ccoq_levels: CCOG classification levels as a mapping (or its text form).
+        interview: Generated interview questions, one per line or as a list.
+        skills: Required skills as a list of dicts, a ``{"skills": [...]}`` mapping,
+            or the text form of either.
+        esco_levels: ESCO classification levels as a mapping (or its text form).
+        esco_skills: ESCO mapped skills, in the same shapes as ``skills``.
+    Returns:
+        Path to the generated Word document. If the report body cannot be built a short
+        "Partial Report" document is saved instead; if saving fails an error document
+        is written to the system temp directory.
+    Raises:
+        OSError: if even the error document cannot be written to the temp directory.
+    """
+    # Initialize document with metadata
+    doc = Document()
+    doc.core_properties.author = "IOM Talent Management System"
+    doc.core_properties.title = "Position Description Analysis Report"
+    # Normalise each input independently, falling back to a readable placeholder
+    has_name = bool(file_name) and isinstance(file_name, str)
+    source_name = file_name if has_name else "Unknown file"
+    family = str(job_family) if job_family else "No job family identified"
+    if isinstance(responsibilities, str) and responsibilities.strip():
+        responsibilities_text = responsibilities
+    else:
+        responsibilities_text = "\n".join(_text_lines(responsibilities)) or "No responsibilities extracted"
+    qualification_lines = _text_lines(qualification) or ["No qualification information available"]
+    questions = _text_lines(interview)
+    ccoq_mapping = _parse_structured(ccoq_levels)
+    if not isinstance(ccoq_mapping, dict):
+        ccoq_mapping = {}
+    esco_mapping = _parse_structured(esco_levels)
+    if not isinstance(esco_mapping, dict):
+        esco_mapping = {}
+    # ================= DOCUMENT CONTENT GENERATION =================
+    try:
+        # Document header
+        doc.add_heading('Job Description Analysis Report', level=0)
+        doc.add_paragraph(f"Generated on {time.strftime('%Y-%m-%d %H:%M:%S')}")
+        doc.add_paragraph("International Organization for Migration", style="Intense Quote")
+        # Metadata table
+        table = doc.add_table(rows=1, cols=2)
+        table.style = 'Light Shading Accent 1'
+        hdr_cells = table.rows[0].cells
+        hdr_cells[0].text = 'Field'
+        hdr_cells[1].text = 'Value'
+        for field, value in (("File Name", source_name), ("Job Family", family)):
+            cells = table.add_row().cells
+            cells[0].text = field
+            cells[1].text = str(value)
+        # Core sections
+        doc.add_heading("1. Responsibilities", level=2)
+        doc.add_paragraph(responsibilities_text)
+        doc.add_heading("2. Qualifications", level=2)
+        for line in qualification_lines:
+            doc.add_paragraph(line)
+        # Skills sections
+        _add_skills_table(doc, "3. Required Skills", _skill_rows(skills))
+        _add_skills_table(doc, "4. ESCO Mapped Skills", _skill_rows(esco_skills))
+        # Classification sections
+        _add_classification(doc, "5. CCOG Classification", ccoq_mapping, "CCOG", 4)
+        _add_classification(doc, "6. ESCO Classification", esco_mapping, "ESCO", 5)
+        # Interview questions: the 'List Number' style supplies the numbering, so no
+        # manual "1." prefix is added and blank lines cannot leave gaps in the sequence.
+        doc.add_heading("7. Suggested Interview Questions", level=2)
+        if questions:
+            for question in questions:
+                doc.add_paragraph(question, style='List Number')
+        else:
+            _add_styled_paragraph(doc, "No interview questions generated", 'Subtle Emphasis')
+        # Footer
+        doc.add_paragraph()
+        _add_styled_paragraph(doc, "Generated by IOM Talent Management AI Tool", 'Footer')
+    except Exception as e:
+        log_debug(f"Error generating document content: {str(e)}")
+        # Fallback to simple error document
+        doc = Document()
+        doc.add_heading("Partial Report Generated", level=1)
+        doc.add_paragraph(f"Some sections could not be generated due to: {str(e)}")
+    # ================= FILE SAVING WITH MULTIPLE FALLBACKS =================
+    try:
+        # Generate appropriate filename prefix
+        if has_name:
+            base_name = os.path.splitext(os.path.basename(file_name))[0]
+            clean_name = re.sub(r'[^\w\-]', '_', base_name)[:50]  # Sanitize and truncate
+            prefix = f"{clean_name}_analysis_{time.strftime('%Y%m%d')}_"
+        else:
+            prefix = f"job_analysis_{time.strftime('%Y%m%d_%H%M%S')}_"
+        # Try saving to reports directory first
+        try:
+            return _save_report(doc, "generated_reports", prefix)
+        except OSError as e:
+            # Any filesystem failure (not only a permission error) falls back to the temp directory
+            log_debug(f"Could not save report in generated_reports: {str(e)}")
+            return _save_report(doc, tempfile.gettempdir(), prefix)
+    except Exception as e:
+        # Ultimate fallback with error document
+        log_debug(f"Error saving report: {str(e)}")
+        error_doc = Document()
+        error_doc.add_heading("Error Generating Report", level=1)
+        error_doc.add_paragraph(f"Could not save report due to: {str(e)}")
+        error_prefix = f"error_report_{time.strftime('%Y%m%d_%H%M%S')}_"
+        return _save_report(error_doc, tempfile.gettempdir(), error_prefix)
 # ================= GRADIO INTERFACE =================
     with gr.Row():
         with gr.Column():
+            file_input = gr.File(
+                        label="Upload a Post Description PDF file",
+                        file_types=[".pdf"])
             submit_btn = gr.Button(
                         value="✨ Analyse Post Description",
                         variant="primary",
     with gr.Row():
         with gr.Column():
             gr.Markdown("### CCOG Levels")
+            ccoq_levels_output = gr.Textbox(label="CCOG Levels")
         with gr.Column():
             gr.Markdown("### Skills")
+            skills_output = gr.Textbox(label="Skills")
     with gr.Row():
         with gr.Column():
             gr.Markdown("### ESCO Levels")
+            esco_levels_output = gr.Textbox(label="ESCO Levels")
         with gr.Column():
             gr.Markdown("### ESCO Skills")
+            esco_skills_output = gr.Textbox(label="ESCO Skills")
     with gr.Row():
         with gr.Column():
+    # Wire the download button to the Word-report builder. The components listed in
+    # `inputs` are in the same order as generate_word_document's parameters
+    # (file_name, responsibilities, job_family, qualification, ccoq_levels, interview,
+    # skills, esco_levels, esco_skills), so keep the two lists aligned.
+    # NOTE(review): the CCOG/ESCO level and skills components are declared as gr.Textbox
+    # in this layout, so they presumably deliver text rather than dict/list objects —
+    # confirm generate_word_document handles string input for those parameters.
     download_btn.click(
         fn=generate_word_document,
+        inputs=[
+            file_name_output,
+            responsibilities_output,
+            job_family_output,
+            qualification_output,
+            ccoq_levels_output,
+            interview_output,
+            skills_output,
+            esco_levels_output,
+            esco_skills_output
+        ],
         outputs=gr.File(label="Download Word Document")
     )