Spaces:

arun-misra
/

my_env

Sleeping

File size: 7,235 Bytes

import os
import json
import glob
import random

_INVALID_RATIONALE = "Candidate submission contains invalid or nonsensical text (gibberish)."


def _as_dict(value):
    """Return *value* if it is a dict, otherwise an empty dict (handles JSON null)."""
    return value if isinstance(value, dict) else {}


def _as_list(value):
    """Return *value* as a list: falsy -> [], list/tuple -> list, scalar -> [scalar]."""
    if not value:
        return []
    if isinstance(value, (list, tuple)):
        return list(value)
    return [value]


def _normalized_score(output_data):
    """Return the macro score scaled from 0-10 down to 0-1, or 0.0 if unusable."""
    scores = _as_dict(output_data.get("scores"))
    aggregated = _as_dict(scores.get("aggregated_scores"))
    try:
        return float(aggregated.get("macro_scores", 0.0)) / 10.0
    except (TypeError, ValueError):
        return 0.0


def _passes_task_filter(score, task):
    """Return True if a scenario with *score* belongs to the requested difficulty."""
    if task == "easy":
        # Easy mode keeps only the obvious extremes.
        return not (0.35 <= score <= 0.70)
    if task == "hard":
        # Hard mode keeps only ambiguous profiles near the decision thresholds.
        return 0.20 <= score <= 0.85
    # "medium", None and unknown values apply no filtering.
    return True


def _decision_for_score(score):
    """Map a 0-1 score to an ``(expected_decision, rationale)`` pair."""
    if score > 0.65:
        return "shortlist", f"High GPT-4 rated compatibility (Score: {score*10:.1f}/10.0)."
    if score > 0.40:
        return (
            "flag_for_review",
            f"Partial match (Score: {score*10:.1f}/10.0) requiring manual review.",
        )
    return "reject", f"Low factor compatibility (Score: {score*10:.1f}/10.0)."


def _build_job_description(input_data):
    """Return ``(job_title, jd_text)`` assembled from the row's input block."""
    jd_text = input_data.get("job_description", "N/A")
    if not isinstance(jd_text, str):
        jd_text = "N/A"

    # The title is only derived when the description has more than one line.
    if "\n" in jd_text:
        job_title = jd_text.split("\n")[0][:50] + "..."
    else:
        job_title = "Target Role"

    min_reqs = _as_list(input_data.get("minimum_requirements"))
    if min_reqs:
        # str() guards against non-string requirement entries.
        jd_text += "\n\nMinimum Requirements:\n- " + "\n- ".join(str(r) for r in min_reqs)

    add_info = input_data.get("additional_info", "")
    if add_info:
        jd_text += f"\n\nAdditional Info:\n{add_info}"

    return job_title, jd_text


def _join_labels(items, keys):
    """Join items as a comma-separated string; dict items use the first key present."""
    labels = []
    for item in _as_list(items):
        if isinstance(item, dict):
            # Fall back to the dict's own repr when none of the keys exist.
            label = next((item[key] for key in keys if key in item), item)
        else:
            label = item
        labels.append(str(label))
    return ", ".join(labels)


def _build_resume_text(raw_resume, details):
    """Return a structured resume string built from parsed details plus the raw text."""
    parts = []

    # Structured header info
    if details.get("name"):
        parts.append(f"Name: {details['name']}")
    if details.get("email_id"):
        parts.append(f"Email: {details['email_id']}")
    if details.get("location"):
        parts.append(f"Location: {details['location']}")

    # Executive summary
    if details.get("executive_summary"):
        parts.append(f"\nSUMMARY:\n{details['executive_summary']}")

    # Employment history (non-dict entries are ignored)
    jobs = [job for job in _as_list(details.get("employment_history")) if isinstance(job, dict)]
    if details.get("employment_history"):
        parts.append("\nEXPERIENCE:")
        for job in jobs:
            title = job.get("job_title", "")
            company = job.get("company_name", "")
            start = job.get("start_date", "")
            # A missing, null or empty end date means the job is ongoing.
            end = job.get("end_date") or "Present"
            job_details = job.get("details", "")
            parts.append(f"  - {title} at {company} ({start} - {end}): {job_details}")

    # Education (non-dict entries are ignored)
    if details.get("education"):
        parts.append("\nEDUCATION:")
        for edu in _as_list(details["education"]):
            if not isinstance(edu, dict):
                continue
            parts.append(
                f"  - {edu.get('degree_title','')} from {edu.get('university','')} "
                f"(Graduated: {edu.get('end_date','')})"
            )

    # Skills and certifications may each be strings, dicts, or a mix of both.
    if details.get("skills"):
        parts.append(f"\nSKILLS: {_join_labels(details['skills'], ('skill', 'name'))}")
    if details.get("certifications"):
        certs = _join_labels(details["certifications"], ("certification_name", "name"))
        parts.append(f"CERTIFICATIONS: {certs}")

    # Keep the raw text for any additional context
    if raw_resume and raw_resume != "N/A":
        parts.append(f"\n--- Original Resume Text (for additional context) ---\n{raw_resume}")

    return "\n".join(parts) if parts else raw_resume


def load_data(limit=5, split="train", task=None):
    """
    Load resume/job-match scenarios from the ``netsol_raw`` directory next to this file.

    Files are shuffled on every call. Rows flagged as invalid (gibberish) are
    kept as negative test cases with a score of 0 and a dedicated rationale.
    Unreadable files and rows whose JSON root is not an object are skipped
    with a printed warning.

    Args:
        limit: Maximum number of scenarios to return.
        split: Accepted for interface compatibility; not used by this function.
        task: Difficulty filter. ``"easy"`` drops scores in [0.35, 0.70],
            ``"hard"`` drops scores below 0.20 or above 0.85, anything else
            (including ``"medium"`` and ``None``) keeps every row.

    Returns:
        A list of scenario dicts with keys ``id``, ``difficulty``,
        ``job_title``, ``job_description``, ``resume_text``,
        ``macro_criteria``, ``micro_criteria``, ``expected_decision`` and
        ``rationale``. Empty if the directory is missing or has no JSON files.
        The decision is ``shortlist`` above 0.65, ``flag_for_review`` above
        0.40 and ``reject`` otherwise.
    """
    print(f"Loading randomized raw JSON dataset from netsol_raw (Limit: {limit})...")

    local_dir = os.path.join(os.path.dirname(__file__), "netsol_raw")
    files = glob.glob(os.path.join(local_dir, "*.json"))

    if not files:
        print("Warning: netsol_raw directory not found or empty.")
        return []

    random.shuffle(files)  # Randomize matches for every run

    scenarios = []

    for filepath in files:
        # Stop before touching another file once the limit is reached.
        if len(scenarios) >= limit:
            break

        file_id = os.path.basename(filepath)
        try:
            with open(filepath, "r", encoding="utf-8") as f:
                row = json.load(f)
        except (OSError, ValueError) as exc:
            # ValueError covers both malformed JSON and undecodable bytes.
            print(f"Warning: skipping unreadable file {file_id}: {exc}")
            continue

        if not isinstance(row, dict):
            print(f"Warning: skipping {file_id}: top-level JSON value is not an object.")
            continue

        input_data = _as_dict(row.get("input"))
        output_data = _as_dict(row.get("output"))

        # Invalid/gibberish resumes are automatic "reject" cases with score 0.
        is_valid = output_data.get("valid_resume_and_jd", True)
        score = _normalized_score(output_data) if is_valid else 0.0

        if not _passes_task_filter(score, task):
            continue

        if is_valid:
            expected_decision, rationale = _decision_for_score(score)
        else:
            # Keep the gibberish-specific rationale instead of the generic
            # low-score message the thresholds would produce for 0.0.
            expected_decision, rationale = "reject", _INVALID_RATIONALE

        job_title, jd_text = _build_job_description(input_data)

        # Weighted criteria are passed through so the agent can mirror the grading.
        macro_dict = input_data.get("macro_dict", {})
        micro_dict = input_data.get("micro_dict", {})

        raw_resume = input_data.get("resume", "N/A")
        resume_text = _build_resume_text(raw_resume, _as_dict(row.get("details")))

        scenarios.append({
            "id": file_id,
            "difficulty": "Multi-Factor",
            "job_title": job_title,
            "job_description": jd_text,
            "resume_text": resume_text,
            "macro_criteria": json.dumps(macro_dict),
            "micro_criteria": json.dumps(micro_dict),
            "expected_decision": expected_decision,
            "rationale": rationale,
        })

    return scenarios