"""
Data Loader for the Resume Env Environment.

Loads resume screening scenarios from the netsol/resume-score-details dataset,
stored as individual JSON files in the netsol_raw/ directory.

File naming convention:
  - match_*.json               : Resume genuinely fits the JD → high score → shortlist
  - mismatch_*.json            : Resume intentionally paired with the wrong JD → low score → reject
  - invalid_*.json             : Gibberish/fake resume → automatic reject
  - empty_additional_info_*.json : Valid match but no extra hiring context

Each file structure:
  input.resume                 : Full resume text
  input.job_description        : Full job description
  input.minimum_requirements   : List of hard requirements
  input.additional_info        : Extra recruiter context
  input.macro_dict             : High-level weighted criteria (e.g. {"experience": 70})
  input.micro_dict             : Detailed skill criteria (e.g. {"python": 30})
  output.valid_resume_and_jd   : False for gibberish files
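  output.scores.aggregated_scores.macro_scores : GPT-4 computed score out of 10
  details                      : Pre-parsed structured resume fields (name, email_id, location,
                                 executive_summary, employment_history, education, skills,
                                 certifications) used to build the enriched resume text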
"""

import glob
import json
import os
import random


def _as_dict(value) -> dict:
    """Return ``value`` when it is a dict, otherwise an empty dict."""
    return value if isinstance(value, dict) else {}


def _as_list(value) -> list:
    """Normalize an optional JSON field into a list.

    Falsy values become an empty list and a lone scalar (e.g. a single string)
    is wrapped, so callers never iterate a string character by character.
    """
    if not value:
        return []
    return value if isinstance(value, list) else [value]


def _join_labels(items: list, keys: tuple) -> str:
    """Render a list of skills/certifications as a comma-separated string.

    Dict items contribute the value of the first key from ``keys`` they
    contain (falling back to the dict's own repr); other items are stringified.
    """
    labels = []
    for item in items:
        if isinstance(item, dict):
            item = next((item[key] for key in keys if key in item), item)
        labels.append(str(item))
    return ", ".join(labels)


def _extract_score(output_data: dict) -> float:
    """Return the macro score scaled to 0..1, or 0.0 when missing/non-numeric."""
    scores = _as_dict(output_data.get("scores"))
    aggregated = _as_dict(scores.get("aggregated_scores"))
    try:
        return aggregated.get("macro_scores", 0.0) / 10.0
    except (TypeError, OverflowError):
        return 0.0


def _decide(score: float) -> tuple:
    """Map a 0..1 score to an ``(expected_decision, rationale)`` pair."""
    if score > 0.65:
        return "shortlist", f"High compatibility (Score: {score * 10:.1f}/10.0)."
    if score > 0.40:
        return (
            "flag_for_review",
            f"Partial match (Score: {score * 10:.1f}/10.0) requiring manual review.",
        )
    return "reject", f"Low factor compatibility (Score: {score * 10:.1f}/10.0)."


def _build_job_description(input_data: dict) -> tuple:
    """Return ``(job_title, jd_text)`` with requirements and extra info appended."""
    jd_text = input_data.get("job_description", "N/A")
    if not isinstance(jd_text, str):
        jd_text = "N/A"

    # The title is taken from the first line only when the JD is multi-line;
    # a single-line JD keeps the generic placeholder.
    if "\n" in jd_text:
        job_title = jd_text.split("\n", 1)[0][:60] + "..."
    else:
        job_title = "Target Role"

    min_reqs = _as_list(input_data.get("minimum_requirements"))
    if min_reqs:
        jd_text += "\n\nMinimum Requirements:\n- " + "\n- ".join(str(req) for req in min_reqs)

    add_info = input_data.get("additional_info", "")
    if add_info:
        jd_text += f"\n\nAdditional Info:\n{add_info}"

    return job_title, jd_text


def _build_resume_text(input_data: dict, details: dict):
    """Build a labeled resume block from ``details`` plus the raw resume text.

    When neither structured details nor raw text yield any part, the raw
    ``resume`` value is returned unchanged.
    """
    raw_resume = input_data.get("resume", "N/A")
    parts = []

    for label, key in (("Name", "name"), ("Email", "email_id"), ("Location", "location")):
        if details.get(key):
            parts.append(f"{label}: {details[key]}")

    if details.get("executive_summary"):
        parts.append(f"\nSUMMARY:\n{details['executive_summary']}")

    jobs = [job for job in _as_list(details.get("employment_history")) if isinstance(job, dict)]
    if jobs:
        parts.append("\nEXPERIENCE:")
        for job in jobs:
            title = job.get("job_title") or ""
            company = job.get("company_name") or ""
            start = job.get("start_date") or ""
            # A missing, null, or empty end date is shown as "Present".
            end = job.get("end_date") or "Present"
            job_details = job.get("details") or ""
            parts.append(f"  - {title} at {company} ({start} - {end}): {job_details}")

    schools = [edu for edu in _as_list(details.get("education")) if isinstance(edu, dict)]
    if schools:
        parts.append("\nEDUCATION:")
        for edu in schools:
            degree = edu.get("degree_title") or ""
            university = edu.get("university") or ""
            graduated = edu.get("end_date") or ""
            parts.append(f"  - {degree} from {university} (Graduated: {graduated})")

    skills = _as_list(details.get("skills"))
    if skills:
        parts.append(f"\nSKILLS: {_join_labels(skills, ('skill', 'name'))}")

    certs = _as_list(details.get("certifications"))
    if certs:
        parts.append(f"CERTIFICATIONS: {_join_labels(certs, ('certification_name', 'name'))}")

    # Always append the raw resume text as additional context.
    if raw_resume and raw_resume != "N/A":
        parts.append(f"\n--- Original Resume Text ---\n{raw_resume}")

    return "\n".join(parts) if parts else raw_resume


def load_data(limit: int = 5, split: str = "train", task: str = None) -> list:
    """
    Loads screening scenarios from netsol_raw/*.json files.

    Files are visited in random order. Files that cannot be read or parsed, or
    whose top-level JSON value is not an object, are skipped with a warning and
    do not count towards ``limit``. Null or wrongly-typed nested fields are
    treated as missing instead of aborting the load.

    Args:
        limit: Maximum number of scenarios to return per episode.
        split: Dataset split (unused but kept for OpenEnv interface compatibility).
        task:  Difficulty filter. "easy" drops valid records whose score lies
               strictly between 0.3 and 0.7; any other value (including
               "medium", "hard" and None) applies no filter.

    Returns:
        List of scenario dicts ready for the environment queue. Each dict has
        the keys id, difficulty, job_title, job_description, resume_text,
        macro_criteria, micro_criteria, expected_decision and rationale.
        Returns an empty list when no JSON files are found.
    """
    print(f"Loading randomized dataset from netsol_raw/ (limit={limit}, task={task})...")

    local_dir = os.path.join(os.path.dirname(__file__), "netsol_raw")
    files = glob.glob(os.path.join(local_dir, "*.json"))

    if not files:
        print("WARNING: netsol_raw/ directory not found or empty.")
        return []

    random.shuffle(files)  # Different candidate order every episode

    scenarios = []

    for filepath in files:
        if len(scenarios) >= limit:
            break

        file_id = os.path.basename(filepath)

        try:
            with open(filepath, "r", encoding="utf-8") as f:
                row = json.load(f)
        except (OSError, ValueError, RecursionError) as exc:
            # ValueError covers both JSON syntax errors and bad UTF-8.
            print(f"WARNING: skipping {file_id}: {exc}")
            continue

        if not isinstance(row, dict):
            print(f"WARNING: skipping {file_id}: top-level JSON value is not an object.")
            continue

        input_data = _as_dict(row.get("input"))
        output_data = _as_dict(row.get("output"))

        # Invalid/gibberish files are automatic reject cases and bypass the
        # task filter below.
        if not output_data.get("valid_resume_and_jd", True):
            expected_decision = "reject"
            rationale = "Candidate submission contains invalid or nonsensical text (gibberish/spam)."
        else:
            score = _extract_score(output_data)

            # Easy mode keeps only clear-cut cases (strong match or clear reject).
            if task == "easy" and 0.3 < score < 0.7:
                continue

            expected_decision, rationale = _decide(score)

        job_title, jd_text = _build_job_description(input_data)
        resume_text = _build_resume_text(input_data, _as_dict(row.get("details")))

        scenarios.append({
            "id": file_id,
            "difficulty": task or "mixed",
            "job_title": job_title,
            "job_description": jd_text,
            "resume_text": resume_text,
            "macro_criteria": json.dumps(input_data.get("macro_dict", {})),
            "micro_criteria": json.dumps(input_data.get("micro_dict", {})),
            "expected_decision": expected_decision,
            "rationale": rationale,
        })

    print(f"Loaded {len(scenarios)} scenarios.")
    return scenarios