File size: 7,235 Bytes
da51dd3
 
dc14d7c
027bb87
433802a
 
 
dc14d7c
027bb87
433802a
027bb87
433802a
dc14d7c
 
433802a
dc14d7c
 
 
 
027bb87
 
 
 
 
dc14d7c
 
 
027bb87
dc14d7c
 
 
da51dd3
 
 
027bb87
 
 
 
da51dd3
027bb87
 
 
 
 
 
 
 
 
 
 
 
da51dd3
f0717b6
433802a
 
f0717b6
433802a
 
 
 
f0717b6
 
433802a
f0717b6
 
 
 
433802a
da51dd3
f0717b6
433802a
da51dd3
433802a
 
da51dd3
433802a
da51dd3
 
 
 
 
dc14d7c
 
 
 
 
 
 
 
 
 
f0717b6
e05112a
 
 
 
1726711
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b8d3138
 
 
 
 
 
1726711
 
b8d3138
1726711
b8d3138
 
 
 
 
 
 
 
1726711
 
 
 
 
 
 
433802a
027bb87
f0717b6
da51dd3
dc14d7c
1726711
e05112a
 
433802a
f0717b6
433802a
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
import os
import json
import glob
import random

def load_data(limit=5, split="train", task=None):
    """Load resume/job-match scenarios from the ``netsol_raw`` directory.

    Every ``*.json`` file found in the ``netsol_raw`` directory next to this
    module is a candidate record. Files are shuffled on each call, so the
    selection differs from run to run. Records flagged as invalid
    (``output.valid_resume_and_jd`` is falsy) are kept as negative test cases
    with a score of 0 and a "reject" decision.

    Args:
        limit: Maximum number of scenarios to return.
        split: Accepted for interface compatibility; not used by this loader.
        task: Optional difficulty filter. ``"easy"`` drops records whose
            normalized score lies in [0.35, 0.70]; ``"hard"`` drops records
            scoring below 0.20 or above 0.85; any other value (including
            ``"medium"`` and ``None``) keeps everything.

    Returns:
        A list of at most ``limit`` scenario dicts with the keys ``id``,
        ``difficulty``, ``job_title``, ``job_description``, ``resume_text``,
        ``macro_criteria``, ``micro_criteria``, ``expected_decision`` and
        ``rationale``. Empty when no JSON files are found.
    """
    print(f"Loading randomized raw JSON dataset from netsol_raw (Limit: {limit})...")

    local_dir = os.path.join(os.path.dirname(__file__), "netsol_raw")
    files = glob.glob(os.path.join(local_dir, "*.json"))

    if not files:
        print("Warning: netsol_raw directory not found or empty.")
        return []

    random.shuffle(files)  # Randomize matches for every run

    scenarios = []
    for filepath in files:
        # Check the limit first so no file is read once enough were collected.
        if len(scenarios) >= limit:
            break

        try:
            with open(filepath, "r", encoding="utf-8") as f:
                row = json.load(f)
        except (OSError, ValueError):
            # Unreadable, badly encoded or malformed JSON: best-effort skip.
            continue
        if not isinstance(row, dict):
            # A top-level list/scalar has none of the fields read below.
            continue

        input_data = _as_dict(row.get("input"))
        output_data = _as_dict(row.get("output"))

        # Invalid/gibberish resumes are automatic "reject" cases with score 0.
        is_valid = output_data.get("valid_resume_and_jd", True)
        score = _normalized_score(output_data) if is_valid else 0.0

        if not _keep_for_task(score, task):
            continue

        expected_decision, rationale = _decide(score, is_valid)
        job_title, jd_text = _build_job_description(input_data)
        resume_text = _build_resume_text(
            input_data.get("resume", "N/A"), _as_dict(row.get("details"))
        )

        scenarios.append({
            "id": os.path.basename(filepath),
            "difficulty": "Multi-Factor",
            "job_title": job_title,
            "job_description": jd_text,
            "resume_text": resume_text,
            # Weighted criteria are passed through as JSON strings.
            "macro_criteria": json.dumps(input_data.get("macro_dict", {})),
            "micro_criteria": json.dumps(input_data.get("micro_dict", {})),
            "expected_decision": expected_decision,
            "rationale": rationale,
        })

    return scenarios


def _as_dict(value):
    """Return ``value`` if it is a dict, otherwise an empty dict."""
    return value if isinstance(value, dict) else {}


def _as_list(value):
    """Return ``value`` if it is a list/tuple, otherwise wrap it in a list."""
    return value if isinstance(value, (list, tuple)) else [value]


def _normalized_score(output_data):
    """Return ``scores.aggregated_scores.macro_scores`` divided by 10, or 0.0."""
    scores = _as_dict(output_data.get("scores"))
    aggregated = _as_dict(scores.get("aggregated_scores"))
    try:
        return aggregated.get("macro_scores", 0.0) / 10.0
    except TypeError:
        # Missing (null) or non-numeric score.
        return 0.0


def _keep_for_task(score, task):
    """Return True when ``score`` passes the difficulty filter for ``task``."""
    if task == "easy":
        # Easy mode keeps only the obvious extremes.
        return not (0.35 <= score <= 0.70)
    if task == "hard":
        # Hard mode keeps the profiles clustered around the thresholds.
        return not (score < 0.20 or score > 0.85)
    return True


def _decide(score, is_valid):
    """Map a normalized score to an ``(expected_decision, rationale)`` pair."""
    if not is_valid:
        # Must not fall through to the thresholds below, which would replace
        # this rationale with a generic low-score one.
        return (
            "reject",
            "Candidate submission contains invalid or nonsensical text (gibberish).",
        )
    if score > 0.65:
        return (
            "shortlist",
            f"High GPT-4 rated compatibility (Score: {score*10:.1f}/10.0).",
        )
    if score > 0.40:
        return (
            "flag_for_review",
            f"Partial match (Score: {score*10:.1f}/10.0) requiring manual review.",
        )
    return "reject", f"Low factor compatibility (Score: {score*10:.1f}/10.0)."


def _build_job_description(input_data):
    """Return ``(job_title, jd_text)`` assembled from the record's input block."""
    jd_text = input_data.get("job_description", "N/A")
    if not isinstance(jd_text, str):
        jd_text = "N/A"

    # The title is the first line (truncated) of a multi-line description.
    if "\n" in jd_text:
        job_title = jd_text.split("\n")[0][:50] + "..."
    else:
        job_title = "Target Role"

    min_reqs = input_data.get("minimum_requirements", [])
    if min_reqs:
        # str() each entry so non-string requirements cannot break the join.
        bullets = "\n- ".join(str(req) for req in _as_list(min_reqs))
        jd_text += "\n\nMinimum Requirements:\n- " + bullets

    add_info = input_data.get("additional_info", "")
    if add_info:
        jd_text += f"\n\nAdditional Info:\n{add_info}"

    return job_title, jd_text


def _join_labels(items, keys):
    """Join entries with ", ", reading dict entries via the first key present."""
    labels = []
    for item in _as_list(items):
        if isinstance(item, dict):
            # Fall back to the dict's own string form when no key matches.
            item = next((item[key] for key in keys if key in item), item)
        labels.append(str(item))
    return ", ".join(labels)


def _build_resume_text(raw_resume, details):
    """Return a labelled resume string built from parsed details and raw text."""
    parts = []

    # Structured header info
    if details.get("name"):
        parts.append(f"Name: {details['name']}")
    if details.get("email_id"):
        parts.append(f"Email: {details['email_id']}")
    if details.get("location"):
        parts.append(f"Location: {details['location']}")

    # Executive summary
    if details.get("executive_summary"):
        parts.append(f"\nSUMMARY:\n{details['executive_summary']}")

    # Employment history (clearly labeled)
    if details.get("employment_history"):
        parts.append("\nEXPERIENCE:")
        for job in _as_list(details["employment_history"]):
            if not isinstance(job, dict):
                continue
            title = job.get("job_title", "")
            company = job.get("company_name", "")
            start = job.get("start_date", "")
            # A missing, null or empty end date is shown as "Present".
            end = job.get("end_date") or "Present"
            job_details = job.get("details", "")
            parts.append(f"  - {title} at {company} ({start} - {end}): {job_details}")

    # Education
    if details.get("education"):
        parts.append("\nEDUCATION:")
        for edu in _as_list(details["education"]):
            if not isinstance(edu, dict):
                continue
            parts.append(
                f"  - {edu.get('degree_title','')} from {edu.get('university','')}"
                f" (Graduated: {edu.get('end_date','')})"
            )

    # Skills and certifications may each be lists of strings or of dicts.
    if details.get("skills"):
        skills_str = _join_labels(details["skills"], ("skill", "name"))
        parts.append(f"\nSKILLS: {skills_str}")
    if details.get("certifications"):
        certs_str = _join_labels(
            details["certifications"], ("certification_name", "name")
        )
        parts.append(f"CERTIFICATIONS: {certs_str}")

    # Append the raw text so nothing from the original resume is lost.
    if raw_resume and raw_resume != "N/A":
        parts.append(
            f"\n--- Original Resume Text (for additional context) ---\n{raw_resume}"
        )

    return "\n".join(parts) if parts else raw_resume