import os
import json
import glob
import random


def _as_dict(value):
    """Return *value* when it is a dict, otherwise an empty dict."""
    return value if isinstance(value, dict) else {}


def _as_list(value):
    """Coerce *value* to a list; a bare string/scalar becomes a one-item list."""
    if not value:
        return []
    if isinstance(value, (list, tuple)):
        return list(value)
    return [value]


def _normalized_score(output_data):
    """Return the precalculated macro score (out of 10) normalized to 0-1."""
    scores = _as_dict(output_data.get("scores"))
    aggregated = _as_dict(scores.get("aggregated_scores"))
    try:
        score = float(aggregated.get("macro_scores", 0.0)) / 10.0
    except (TypeError, ValueError):
        # Missing, null, or non-numeric score: treat as no match.
        return 0.0
    # NaN is the only float not equal to itself; it would defeat every
    # threshold comparison below, so treat it as no match as well.
    return score if score == score else 0.0


def _grade(output_data):
    """Return ``(score, expected_decision, rationale)`` for one record."""
    # Invalid/gibberish resumes are automatic rejects with their own rationale.
    if not output_data.get("valid_resume_and_jd", True):
        return (
            0.0,
            "reject",
            "Candidate submission contains invalid or nonsensical text (gibberish).",
        )

    score = _normalized_score(output_data)
    if score > 0.65:
        return (
            score,
            "shortlist",
            f"High GPT-4 rated compatibility (Score: {score*10:.1f}/10.0).",
        )
    if score > 0.40:
        return (
            score,
            "flag_for_review",
            f"Partial match (Score: {score*10:.1f}/10.0) requiring manual review.",
        )
    return (
        score,
        "reject",
        f"Low factor compatibility (Score: {score*10:.1f}/10.0).",
    )


def _passes_task_filter(task, score):
    """Return True when *score* belongs to the requested difficulty bucket."""
    if task == "easy":
        # Easy mode keeps only the obvious extremes.
        return not (0.35 <= score <= 0.70)
    if task == "hard":
        # Hard mode keeps ambiguous profiles grouped around the thresholds.
        return 0.20 <= score <= 0.85
    # "medium", None, or any other value: no filtering.
    return True


def _build_job_text(input_data):
    """Return ``(job_title, job_description)`` assembled from the input block."""
    jd_text = input_data.get("job_description", "N/A")
    if not isinstance(jd_text, str):
        jd_text = "N/A"

    # The title is only derived from multi-line descriptions (first line,
    # truncated to 50 characters); single-line ones get a generic title.
    if "\n" in jd_text:
        job_title = jd_text.split("\n")[0][:50] + "..."
    else:
        job_title = "Target Role"

    min_reqs = _as_list(input_data.get("minimum_requirements"))
    if min_reqs:
        jd_text += "\n\nMinimum Requirements:\n- " + "\n- ".join(
            str(req) for req in min_reqs
        )

    add_info = input_data.get("additional_info", "")
    if add_info:
        jd_text += f"\n\nAdditional Info:\n{add_info}"

    return job_title, jd_text


def _join_labels(items, primary_key):
    """Join skill/certification entries (strings or dicts) into one string."""
    labels = []
    for item in _as_list(items):
        if isinstance(item, dict):
            label = item.get(primary_key, item.get("name", str(item)))
        else:
            label = item
        labels.append(str(label))
    return ", ".join(labels)


def _build_resume_text(input_data, details):
    """Return a structured resume string built from parsed details and raw text."""
    raw_resume = input_data.get("resume", "N/A")
    parts = []

    # Structured header info.
    for key, label in (
        ("name", "Name"),
        ("email_id", "Email"),
        ("location", "Location"),
    ):
        if details.get(key):
            parts.append(f"{label}: {details[key]}")

    if details.get("executive_summary"):
        parts.append(f"\nSUMMARY:\n{details['executive_summary']}")

    jobs = _as_list(details.get("employment_history"))
    if jobs:
        parts.append("\nEXPERIENCE:")
        for job in jobs:
            if not isinstance(job, dict):
                # Unstructured entry: keep the information as plain text.
                parts.append(f" - {job}")
                continue
            title = job.get("job_title", "")
            company = job.get("company_name", "")
            start = job.get("start_date", "")
            end = job.get("end_date", "Present")
            job_details = job.get("details", "")
            parts.append(
                f" - {title} at {company} ({start} - {end}): {job_details}"
            )

    education = _as_list(details.get("education"))
    if education:
        parts.append("\nEDUCATION:")
        for edu in education:
            if not isinstance(edu, dict):
                parts.append(f" - {edu}")
                continue
            parts.append(
                f" - {edu.get('degree_title','')} from {edu.get('university','')}"
                f" (Graduated: {edu.get('end_date','')})"
            )

    # Skills and certifications may each be a list of strings or of dicts.
    if details.get("skills"):
        parts.append(f"\nSKILLS: {_join_labels(details['skills'], 'skill')}")

    if details.get("certifications"):
        certs = _join_labels(details["certifications"], "certification_name")
        parts.append(f"CERTIFICATIONS: {certs}")

    # Append the raw text so nothing the parser missed is lost.
    if raw_resume and raw_resume != "N/A":
        parts.append(
            f"\n--- Original Resume Text (for additional context) ---\n{raw_resume}"
        )

    return "\n".join(parts) if parts else raw_resume


def load_data(limit=5, split="train", task=None, data_dir=None):
    """
    Load resume / job-match scenarios from the raw JSON files in ``netsol_raw``.

    Files are visited in random order. Records flagged as invalid (gibberish)
    are kept as negative "reject" cases. Unreadable files, malformed JSON and
    files whose top level is not a JSON object are skipped.

    Args:
        limit: Maximum number of scenarios to return; ``None`` means no limit.
        split: Accepted for interface compatibility; not used by this loader.
        task: Optional difficulty filter. ``"easy"`` drops scores in
            [0.35, 0.70]; ``"hard"`` drops scores below 0.20 or above 0.85;
            any other value (including ``"medium"``) applies no filter.
        data_dir: Directory holding the ``*.json`` files. Defaults to the
            ``netsol_raw`` directory next to this module.

    Returns:
        A list of dicts with the keys ``id``, ``difficulty``, ``job_title``,
        ``job_description``, ``resume_text``, ``macro_criteria``,
        ``micro_criteria``, ``expected_decision`` and ``rationale``. The list
        is empty when no JSON files are found.
    """
    print(f"Loading randomized raw JSON dataset from netsol_raw (Limit: {limit})...")

    if data_dir is None:
        data_dir = os.path.join(os.path.dirname(__file__), "netsol_raw")
    # Escape the directory so glob metacharacters in the path are literal.
    files = glob.glob(os.path.join(glob.escape(data_dir), "*.json"))

    if not files:
        print("Warning: netsol_raw directory not found or empty.")
        return []

    random.shuffle(files)  # Randomize matches for every run

    scenarios = []
    for filepath in files:
        # Stop before reading another file once the limit is reached.
        if limit is not None and len(scenarios) >= limit:
            break

        try:
            with open(filepath, "r", encoding="utf-8") as f:
                row = json.load(f)
        except (OSError, ValueError):
            # ValueError covers both JSON and Unicode decode errors.
            continue
        if not isinstance(row, dict):
            continue

        input_data = _as_dict(row.get("input"))
        output_data = _as_dict(row.get("output"))

        score, expected_decision, rationale = _grade(output_data)
        if not _passes_task_filter(task, score):
            continue

        job_title, jd_text = _build_job_text(input_data)
        resume_text = _build_resume_text(input_data, _as_dict(row.get("details")))

        # Weighted criteria are passed through as JSON strings.
        macro_dict = input_data.get("macro_dict", {})
        micro_dict = input_data.get("micro_dict", {})

        scenarios.append(
            {
                "id": os.path.basename(filepath),
                "difficulty": "Multi-Factor",
                "job_title": job_title,
                "job_description": jd_text,
                "resume_text": resume_text,
                "macro_criteria": json.dumps(macro_dict),
                "micro_criteria": json.dumps(micro_dict),
                "expected_decision": expected_decision,
                "rationale": rationale,
            }
        )

    return scenarios