# Listing metadata: Spaces status "Sleeping"; file size 7,235 bytes.
import os
import json
import glob
import random
def load_data(limit=5, split="train", task=None, data_dir=None):
    """Load randomized resume/job-match scenarios from raw JSON files.

    Every ``*.json`` file in the data directory is expected to hold one
    object with ``input``, ``output`` and (optionally) ``details`` sections.
    Files are shuffled on every call, so the selection differs between runs.
    Records flagged with ``valid_resume_and_jd == False`` are kept as
    automatic "reject" cases and are never removed by the ``task`` filter.

    Args:
        limit: Maximum number of scenarios to return.
        split: Accepted for interface compatibility; not used by this loader.
        task: Optional difficulty filter applied to valid records only.
            ``"easy"`` drops normalized scores in [0.35, 0.70]; ``"hard"``
            drops scores below 0.20 or above 0.85; any other value
            (including ``"medium"`` and ``None``) keeps everything.
        data_dir: Directory to read from. Defaults to the ``netsol_raw``
            directory located next to this module.

    Returns:
        A list of at most ``limit`` scenario dicts with the keys ``id``,
        ``difficulty``, ``job_title``, ``job_description``, ``resume_text``,
        ``macro_criteria``, ``micro_criteria``, ``expected_decision`` and
        ``rationale``. Returns an empty list when no JSON files are found.
    """
    print(f"Loading randomized raw JSON dataset from netsol_raw (Limit: {limit})...")
    if data_dir is None:
        data_dir = os.path.join(os.path.dirname(__file__), "netsol_raw")
    files = glob.glob(os.path.join(data_dir, "*.json"))
    if not files:
        print("Warning: netsol_raw directory not found or empty.")
        return []
    random.shuffle(files)  # Randomize matches for every run

    scenarios = []
    for filepath in files:
        # Checked before any I/O so the limit bounds *every* kind of record
        # (valid or invalid) and no extra file is read once it is reached.
        if len(scenarios) >= limit:
            break

        try:
            with open(filepath, "r", encoding="utf-8") as f:
                row = json.load(f)
        except (OSError, ValueError) as exc:
            # json.JSONDecodeError and UnicodeDecodeError are ValueErrors.
            print(f"Warning: skipping unreadable file {filepath}: {exc}")
            continue
        if not isinstance(row, dict):
            print(f"Warning: skipping {filepath}: top-level JSON value is not an object.")
            continue

        input_data = _as_dict(row.get("input"))
        output_data = _as_dict(row.get("output"))

        if not output_data.get("valid_resume_and_jd", True):
            # Invalid/gibberish submissions are automatic "reject" cases.
            expected_decision = "reject"
            rationale = "Candidate submission contains invalid or nonsensical text (gibberish)."
        else:
            scores = _as_dict(output_data.get("scores"))
            aggregated_scores = _as_dict(scores.get("aggregated_scores"))
            try:
                # The stored macro score is out of 10; normalize to 0-1.
                score = aggregated_scores.get("macro_scores", 0.0) / 10.0
            except TypeError:
                score = 0.0

            # Difficulty filtering based on the normalized score.
            if task == "easy" and 0.35 <= score <= 0.70:
                continue
            if task == "hard" and (score < 0.20 or score > 0.85):
                continue

            expected_decision, rationale = _decision_for_score(score)

        job_title, jd_text = _build_job_text(input_data)
        resume_text = _build_resume_text(
            input_data.get("resume", "N/A"), _as_dict(row.get("details"))
        )

        scenarios.append({
            "id": os.path.basename(filepath),
            "difficulty": "Multi-Factor",
            "job_title": job_title,
            "job_description": jd_text,
            "resume_text": resume_text,
            # Weighted criteria are passed through as JSON strings.
            "macro_criteria": json.dumps(input_data.get("macro_dict", {})),
            "micro_criteria": json.dumps(input_data.get("micro_dict", {})),
            "expected_decision": expected_decision,
            "rationale": rationale,
        })
    return scenarios


def _as_dict(value):
    """Return ``value`` if it is a dict, otherwise an empty dict."""
    return value if isinstance(value, dict) else {}


def _as_list(value):
    """Return ``value`` as a list, wrapping any non-list/tuple in a list."""
    return list(value) if isinstance(value, (list, tuple)) else [value]


def _join_labels(items, keys):
    """Join items into a comma-separated string, reading dicts via ``keys``.

    Each item is handled on its own, so lists mixing strings and dicts work.
    For a dict the first key from ``keys`` that is present supplies the
    label; a dict with none of them is rendered with ``str``.
    """
    labels = []
    for item in _as_list(items):
        label = item
        if isinstance(item, dict):
            for key in keys:
                if key in item:
                    label = item[key]
                    break
        labels.append(str(label))
    return ", ".join(labels)


def _decision_for_score(score):
    """Map a normalized 0-1 score to ``(expected_decision, rationale)``."""
    if score > 0.65:
        return (
            "shortlist",
            f"High GPT-4 rated compatibility (Score: {score*10:.1f}/10.0).",
        )
    if score > 0.40:
        return (
            "flag_for_review",
            f"Partial match (Score: {score*10:.1f}/10.0) requiring manual review.",
        )
    return "reject", f"Low factor compatibility (Score: {score*10:.1f}/10.0)."


def _build_job_text(input_data):
    """Return ``(job_title, job_description)`` assembled from the input section."""
    jd_text = input_data.get("job_description", "N/A")
    if not isinstance(jd_text, str):
        jd_text = "N/A"

    # The first line (truncated to 50 chars) serves as the title, but only
    # for multi-line descriptions.
    if "\n" in jd_text:
        job_title = jd_text.split("\n")[0][:50] + "..."
    else:
        job_title = "Target Role"

    min_reqs = input_data.get("minimum_requirements", [])
    if min_reqs:
        jd_text += "\n\nMinimum Requirements:\n- " + "\n- ".join(
            str(req) for req in _as_list(min_reqs)
        )
    add_info = input_data.get("additional_info", "")
    if add_info:
        jd_text += f"\n\nAdditional Info:\n{add_info}"
    return job_title, jd_text


def _build_resume_text(raw_resume, details):
    """Build the enriched resume string from parsed details plus the raw text."""
    parts = []

    # Structured header info
    for label, key in (("Name", "name"), ("Email", "email_id"), ("Location", "location")):
        if details.get(key):
            parts.append(f"{label}: {details[key]}")

    if details.get("executive_summary"):
        parts.append(f"\nSUMMARY:\n{details['executive_summary']}")

    if details.get("employment_history"):
        parts.append("\nEXPERIENCE:")
        for job in _as_list(details["employment_history"]):
            if not isinstance(job, dict):
                parts.append(f" - {job}")
                continue
            title = job.get("job_title", "")
            company = job.get("company_name", "")
            start = job.get("start_date", "")
            end = job.get("end_date", "Present")
            job_details = job.get("details", "")
            parts.append(f" - {title} at {company} ({start} - {end}): {job_details}")

    if details.get("education"):
        parts.append("\nEDUCATION:")
        for edu in _as_list(details["education"]):
            if not isinstance(edu, dict):
                parts.append(f" - {edu}")
                continue
            parts.append(f" - {edu.get('degree_title','')} from {edu.get('university','')} (Graduated: {edu.get('end_date','')})")

    # Skills and certifications may be lists of strings or of dicts.
    if details.get("skills"):
        skills_str = _join_labels(details["skills"], ("skill", "name"))
        parts.append(f"\nSKILLS: {skills_str}")
    if details.get("certifications"):
        certs_str = _join_labels(details["certifications"], ("certification_name", "name"))
        parts.append(f"CERTIFICATIONS: {certs_str}")

    # Append the raw text for any additional context.
    if raw_resume and raw_resume != "N/A":
        parts.append(f"\n--- Original Resume Text (for additional context) ---\n{raw_resume}")

    return "\n".join(parts) if parts else raw_resume
|