Spaces:
Sleeping
Sleeping
| import os | |
| import json | |
| import glob | |
| import random | |
def load_data(limit=5, split="train", task=None):
    """
    Load resume / job-match scenarios from the ``netsol_raw`` directory.

    Every ``*.json`` file in the ``netsol_raw`` folder next to this module is
    a candidate record. The file list is shuffled on every call, each record
    is filtered by ``task`` and converted into a scenario dict, and loading
    stops as soon as ``limit`` scenarios have been collected.

    Records whose ``output.valid_resume_and_jd`` flag is false are kept as
    negative test cases: they get a score of 0.0 and an automatic "reject"
    decision with a dedicated rationale.

    Args:
        limit: Maximum number of scenarios to return.
        split: Accepted for interface compatibility; this loader does not
            use it.
        task: Optional difficulty filter applied to the normalized (0-1)
            score. ``"easy"`` drops scores in [0.35, 0.70], ``"hard"`` drops
            scores below 0.20 or above 0.85, and any other value (including
            ``"medium"`` and ``None``) keeps every record.

    Returns:
        A list of scenario dicts with the keys ``id``, ``difficulty``,
        ``job_title``, ``job_description``, ``resume_text``,
        ``macro_criteria``, ``micro_criteria``, ``expected_decision`` and
        ``rationale``. The list is empty when no JSON files are found.
    """
    print(f"Loading randomized raw JSON dataset from netsol_raw (Limit: {limit})...")
    local_dir = os.path.join(os.path.dirname(__file__), "netsol_raw")
    files = glob.glob(os.path.join(local_dir, "*.json"))
    if not files:
        print("Warning: netsol_raw directory not found or empty.")
        return []

    random.shuffle(files)  # Randomize matches for every run

    scenarios = []
    for filepath in files:
        # Stop before touching the disk again once enough scenarios exist.
        if len(scenarios) >= limit:
            break

        try:
            with open(filepath, 'r', encoding='utf-8') as f:
                row = json.load(f)
        except (OSError, ValueError) as exc:
            # ValueError covers both JSONDecodeError and UnicodeDecodeError.
            print(f"Warning: skipping unreadable file {os.path.basename(filepath)}: {exc}")
            continue
        if not isinstance(row, dict):
            # A top-level list/scalar cannot hold the expected sections.
            print(f"Warning: skipping {os.path.basename(filepath)}: top-level JSON value is not an object.")
            continue

        input_data = _as_dict(row.get("input"))
        output_data = _as_dict(row.get("output"))

        if output_data.get("valid_resume_and_jd", True):
            score = _normalized_score(output_data)
            verdict = None  # decided from the score below
        else:
            # Invalid/gibberish resumes are automatic "reject" cases.
            score = 0.0
            verdict = (
                "reject",
                "Candidate submission contains invalid or nonsensical text (gibberish).",
            )

        if not _passes_task_filter(score, task):
            continue

        # Keep the gibberish rationale for invalid records; only valid
        # records are classified by score thresholds.
        expected_decision, rationale = verdict or _decision_for_score(score)
        job_title, jd_text = _build_job_text(input_data)

        scenarios.append({
            "id": os.path.basename(filepath),
            "difficulty": "Multi-Factor",
            "job_title": job_title,
            "job_description": jd_text,
            "resume_text": _build_resume_text(row, input_data),
            # Weighted criteria are passed along as JSON strings.
            "macro_criteria": json.dumps(input_data.get("macro_dict", {})),
            "micro_criteria": json.dumps(input_data.get("micro_dict", {})),
            "expected_decision": expected_decision,
            "rationale": rationale,
        })
    return scenarios


def _as_dict(value):
    """Return ``value`` when it is a dict, otherwise an empty dict."""
    return value if isinstance(value, dict) else {}


def _as_list(value):
    """Return ``value`` as a list; a bare non-list value becomes one item."""
    if value is None:
        return []
    if isinstance(value, (list, tuple)):
        return list(value)
    return [value]


def _normalized_score(output_data):
    """Return the record's macro score (stored out of 10) scaled to 0-1."""
    scores = _as_dict(output_data.get("scores"))
    aggregated_scores = _as_dict(scores.get("aggregated_scores"))
    try:
        return aggregated_scores.get("macro_scores", 0.0) / 10.0
    except TypeError:
        # Missing/None/non-numeric score counts as no match.
        return 0.0


def _passes_task_filter(score, task):
    """Return True when ``score`` is allowed for the given difficulty."""
    if task == "easy":
        # Very obvious extremes only: drop the ambiguous middle band.
        return not (0.35 <= score <= 0.70)
    if task == "hard":
        # Ambiguous profiles only: drop the obvious extremes.
        return 0.20 <= score <= 0.85
    return True


def _decision_for_score(score):
    """Map a normalized score to an (expected_decision, rationale) pair."""
    if score > 0.65:
        return "shortlist", f"High GPT-4 rated compatibility (Score: {score*10:.1f}/10.0)."
    if score > 0.40:
        return "flag_for_review", f"Partial match (Score: {score*10:.1f}/10.0) requiring manual review."
    return "reject", f"Low factor compatibility (Score: {score*10:.1f}/10.0)."


def _build_job_text(input_data):
    """Return (job_title, job_description) assembled from the input section."""
    jd_text = input_data.get("job_description", "N/A")
    if not isinstance(jd_text, str):
        jd_text = "N/A"

    # The title is the first line of a multi-line description, capped at 50
    # characters; single-line descriptions get a generic title.
    if '\n' in jd_text:
        job_title = jd_text.split('\n')[0][:50] + "..."
    else:
        job_title = "Target Role"

    min_reqs = _as_list(input_data.get("minimum_requirements"))
    if min_reqs:
        jd_text += "\n\nMinimum Requirements:\n- " + "\n- ".join(str(req) for req in min_reqs)
    add_info = input_data.get("additional_info", "")
    if add_info:
        jd_text += f"\n\nAdditional Info:\n{add_info}"
    return job_title, jd_text


def _join_labels(items, *keys):
    """Join items with ', ', reading dict items through the first present key."""
    labels = []
    for item in _as_list(items):
        if isinstance(item, dict):
            # Fall back to the dict's repr when none of the keys exist.
            item = next((item[key] for key in keys if key in item), str(item))
        labels.append(str(item))
    return ", ".join(labels)


def _build_resume_text(row, input_data):
    """Return the resume text: parsed details first, then the raw resume."""
    raw_resume = input_data.get("resume", "N/A")
    details = _as_dict(row.get("details"))
    enriched_parts = []

    # Structured header info
    for label, key in (("Name", "name"), ("Email", "email_id"), ("Location", "location")):
        if details.get(key):
            enriched_parts.append(f"{label}: {details[key]}")

    if details.get("executive_summary"):
        enriched_parts.append(f"\nSUMMARY:\n{details['executive_summary']}")

    if details.get("employment_history"):
        enriched_parts.append("\nEXPERIENCE:")
        for job in _as_list(details["employment_history"]):
            if not isinstance(job, dict):
                continue
            title = job.get("job_title", "")
            company = job.get("company_name", "")
            start = job.get("start_date", "")
            # A null/empty end date is shown as "Present".
            end = job.get("end_date") or "Present"
            job_details = job.get("details", "")
            enriched_parts.append(f" - {title} at {company} ({start} - {end}): {job_details}")

    if details.get("education"):
        enriched_parts.append("\nEDUCATION:")
        for edu in _as_list(details["education"]):
            if not isinstance(edu, dict):
                continue
            enriched_parts.append(
                f" - {edu.get('degree_title','')} from {edu.get('university','')} "
                f"(Graduated: {edu.get('end_date','')})"
            )

    # Skills and certifications may be lists of strings, dicts, or a mix.
    if details.get("skills"):
        skills_str = _join_labels(details["skills"], "skill", "name")
        enriched_parts.append(f"\nSKILLS: {skills_str}")
    if details.get("certifications"):
        certs_str = _join_labels(details["certifications"], "certification_name", "name")
        enriched_parts.append(f"CERTIFICATIONS: {certs_str}")

    # The raw text is appended for any additional context.
    if raw_resume and raw_resume != "N/A":
        enriched_parts.append(f"\n--- Original Resume Text (for additional context) ---\n{raw_resume}")

    return "\n".join(enriched_parts) if enriched_parts else raw_resume