File size: 12,092 Bytes

"""
Generate BIO-tagged NER training data from datasetmaster/resumes structured data.
Creates diverse resume text formats + aligned BIO tags.
"""

import json
import random
from pathlib import Path

from datasets import load_dataset

try:
    from training.dataset_utils import dedupe_examples, stable_split_examples, write_dataset
    from training.labels import ID2LABEL, LABEL2ID
    from training.synthetic_assets import COMMON_CERTS, generate_email, generate_phone, load_companies, load_titles
    from training.synthetic_formats import RESUME_FORMATS
    from training.tagging import tag_exact_words
except ModuleNotFoundError:
    from dataset_utils import dedupe_examples, stable_split_examples, write_dataset
    from labels import ID2LABEL, LABEL2ID
    from synthetic_assets import COMMON_CERTS, generate_email, generate_phone, load_companies, load_titles
    from synthetic_formats import RESUME_FORMATS
    from tagging import tag_exact_words

# Data directory located next to this module; passed to the company/title
# loaders and used as the root for the source, gold and output dataset files.
DATA_DIR = Path(__file__).parent / "data"


def tag_skill_individually(tokens, labels, skill_name):
    """Tag the first occurrence of ``skill_name`` in ``tokens`` as a SKILL span.

    Matching is case-insensitive and ignores trailing ``,.;:|`` on both the
    tokens and the skill words. A bullet hyphen glued to the first token of a
    candidate span (``-Python``) is ignored as well.

    Only the first textual occurrence is considered. It is labeled
    ``B-SKILL`` / ``I-SKILL`` only when *every* token of the span is still
    ``"O"``; otherwise nothing is changed, so an overlap with an earlier
    entity can never leave an orphan ``I-SKILL`` or a truncated span behind.

    Args:
        tokens: Whitespace-split resume tokens.
        labels: BIO labels aligned with ``tokens``; modified in place.
        skill_name: Skill text, one or more whitespace-separated words.

    Returns:
        None. ``labels`` is updated in place.
    """
    trailing = ",.;:|"
    skill_words = [
        word.rstrip(trailing).lower()
        for word in skill_name.strip().rstrip(trailing).split()
    ]
    if not skill_words:
        return
    width = len(skill_words)

    for start in range(len(tokens) - width + 1):
        window = [tok.rstrip(trailing).lower() for tok in tokens[start:start + width]]
        # A bullet glued to the first word must not block the match.
        window[0] = window[0].lstrip("-")
        if window != skill_words:
            continue

        span = range(start, start + width)
        # All-or-nothing: never label part of a span that overlaps another entity.
        if all(labels[k] == "O" for k in span):
            labels[start] = "B-SKILL"
            for k in span[1:]:
                labels[k] = "I-SKILL"
        # Only the first occurrence is ever considered.
        return


def _known(value):
    """Return ``value``, or "" when it is falsy or the "Unknown" placeholder."""
    return "" if not value or value == "Unknown" else value


def _as_dict(value):
    """Return ``value`` if it is a dict, otherwise an empty dict."""
    return value if isinstance(value, dict) else {}


def extract_fields(sample):
    """Flatten one structured resume record into the fields used for rendering.

    Missing values, ``None`` and the literal placeholder ``"Unknown"`` are all
    normalized to ``""`` so they can never be rendered into the resume text
    (for example as a ``"None, India"`` location). Email and phone are always
    synthesized; company names and titles are randomly swapped for entries
    from the loaded asset lists.

    Args:
        sample: Mapping with optional ``personal_info``, ``experience``,
            ``education``, ``skills`` and ``certifications`` entries.

    Returns:
        ``None`` when ``personal_info`` is not a dict or has no usable name.
        Otherwise a dict with keys ``name``, ``email``, ``phone``,
        ``location``, ``summary``, ``exp`` (list of dicts with ``title``,
        ``company``, ``location``, ``dates``, ``desc``, ``start``, ``end``),
        ``edu`` (list of dicts with ``level``, ``field``, ``institution``,
        ``line``), ``skills`` and ``certs`` (lists of names).
    """
    personal = sample.get("personal_info") or {}
    if not isinstance(personal, dict):
        return None
    name = _known(personal.get("name"))
    if not name:
        return None

    loc = _as_dict(personal.get("location"))
    city = _known(loc.get("city"))
    country = _known(loc.get("country"))
    location = f"{city}, {country}".strip(", ") if city or country else ""

    # Contact details are always synthetic.
    email = generate_email(name)
    phone = generate_phone()
    summary = _known(personal.get("summary"))

    # Loop-invariant: load the replacement pools once per sample rather than
    # once per experience entry.
    companies = load_companies(DATA_DIR)
    titles = load_titles(DATA_DIR)

    exp_list = []
    for entry in sample.get("experience") or []:
        if not isinstance(entry, dict):
            continue
        title = _known(entry.get("title"))
        if not title:
            continue
        company = _known(entry.get("company"))
        if company == "Fresher":
            # "Fresher" is treated as a placeholder, not a company name.
            company = ""

        period = _as_dict(entry.get("dates"))
        start = _known(period.get("start"))
        end = _known(period.get("end"))
        dates = ""
        if start:
            dates = f"{start} - {end}" if end else start

        # Only the first responsibility is kept as the description.
        resps = entry.get("responsibilities", [])
        desc = ""
        if isinstance(resps, list) and resps and resps[0] != "Unknown":
            desc = resps[0]

        exp_loc = _known(_as_dict(entry.get("company_info")).get("location"))

        # 90% chance to swap with real company names, 50% for titles
        if companies and random.random() < 0.9:
            company = random.choice(companies)
        if titles and random.random() < 0.5:
            title = random.choice(titles)

        exp_list.append({
            "title": title, "company": company, "location": exp_loc,
            "dates": dates, "desc": desc, "start": start, "end": end,
        })

    edu_list = []
    for ed in sample.get("education") or []:
        if not isinstance(ed, dict):
            continue
        degree = _as_dict(ed.get("degree"))
        level = _known(degree.get("level"))
        field = _known(degree.get("field"))
        inst_name = _known(_as_dict(ed.get("institution")).get("name"))
        if not level and not inst_name:
            continue
        line_parts = []
        if level:
            line_parts.append(level)
        if field:
            line_parts.append(f"in {field}")
        if inst_name:
            # "from" only reads naturally when something precedes the institution.
            line_parts.append(f"from {inst_name}" if line_parts else inst_name)
        edu_list.append({
            "level": level, "field": field, "institution": inst_name,
            "line": " ".join(line_parts),
        })

    all_skills = []
    tech = _as_dict(sample.get("skills")).get("technical", {})
    if isinstance(tech, dict):
        for cat in tech.values():
            if isinstance(cat, list):
                all_skills.extend(
                    sk["name"] for sk in cat
                    if isinstance(sk, dict) and _known(sk.get("name"))
                )

    cert_names = []
    certs = sample.get("certifications") or []
    if isinstance(certs, list):
        cert_names = [
            c["name"] for c in certs
            if isinstance(c, dict) and _known(c.get("name"))
        ]

    # Add synthetic certs for variety (some resumes should have them)
    if not cert_names and random.random() < 0.3:
        cert_names = random.sample(COMMON_CERTS, random.randint(1, 2))

    return {
        "name": name, "email": email, "phone": phone, "location": location,
        "summary": summary, "exp": exp_list, "edu": edu_list,
        "skills": all_skills, "certs": cert_names,
    }


def build_resume_and_tags(sample):
    """Render a structured sample as resume text and produce aligned BIO tags.

    A random formatter from ``RESUME_FORMATS`` renders the extracted fields;
    the text is split on whitespace and every known field value is tagged in
    the resulting tokens.

    Returns:
        ``{"tokens": [...], "ner_tags": [...]}`` with label ids from
        ``LABEL2ID`` (labels missing from the mapping become 0), or ``None``
        when the sample has no usable fields, fewer than 2 skills, fewer than
        15 tokens, or fewer than 5 tagged tokens.
    """
    fields = extract_fields(sample)
    if not fields:
        return None
    if len(fields["skills"]) < 2:
        return None

    render = random.choice(RESUME_FORMATS)
    text = render(
        fields["name"],
        fields["email"],
        fields["phone"],
        fields["location"],
        fields["summary"],
        fields["exp"],
        fields["edu"],
        fields["skills"],
        fields["certs"],
    )

    tokens = text.split()
    if len(tokens) < 15:
        return None
    labels = ["O" for _ in tokens]

    def mark_if_present(value, entity):
        # Optional fields are tagged only when they carry a value.
        if value:
            tag_exact_words(tokens, labels, value, entity)

    # Contact block: name, email and phone are always tagged.
    for key, entity in (("name", "NAME"), ("email", "EMAIL"), ("phone", "PHONE")):
        tag_exact_words(tokens, labels, fields[key], entity)
    mark_if_present(fields["location"], "LOCATION")

    experience_entities = (
        ("title", "TITLE"),
        ("company", "COMPANY"),
        ("start", "DATE"),
        ("end", "DATE"),  # covers both real end dates and "Present"
    )
    for job in fields["exp"]:
        for key, entity in experience_entities:
            mark_if_present(job[key], entity)

    education_entities = (
        ("level", "DEGREE"),
        ("field", "FIELD"),
        ("institution", "INSTITUTION"),
    )
    for school in fields["edu"]:
        for key, entity in education_entities:
            mark_if_present(school[key], entity)

    for skill in fields["skills"]:
        tag_skill_individually(tokens, labels, skill)

    for cert in fields["certs"]:
        tag_exact_words(tokens, labels, cert, "CERT")

    # Discard examples that carry too little supervision.
    tagged_count = len(labels) - labels.count("O")
    if tagged_count < 5:
        return None

    ner_tags = [LABEL2ID.get(label, 0) for label in labels]
    return {"tokens": tokens, "ner_tags": ner_tags}


def main():
    """Build the combined NER dataset and write the train/validation split.

    Steps:
        1. Load ``datasetmaster/resumes`` and convert each record with
           ``build_resume_and_tags`` (seeded with 42), keeping at most 4000
           shuffled examples.
        2. Add DataTurks, manual-template, long-resume and (if the gold file
           exists) resume-resource examples.
        3. Drop examples with a placeholder/empty tagged name or fewer than
           5 non-"O" tags, then de-duplicate and split 85/15.
        4. Append noise-augmented copies of the training split and write
           everything under ``DATA_DIR`` together with a manifest.
        5. Print per-entity token counts for the cleaned (pre-augmentation)
           examples.

    JSON inputs are opened explicitly as UTF-8 so the build does not depend
    on the platform's default encoding.
    """
    print("Loading datasetmaster/resumes...")
    ds = load_dataset("datasetmaster/resumes", split="train")
    print(f"Total: {len(ds)}")

    random.seed(42)
    converted = []
    for idx, sample in enumerate(ds):
        result = build_resume_and_tags(sample)
        if not result:
            continue
        result["metadata"] = {
            "source": "datasetmaster_resumes",
            "source_id": f"generated:{idx}",
            "group_id": f"generated:{idx}",
        }
        converted.append(result)

    print(f"Converted: {len(converted)}")

    random.shuffle(converted)
    max_generated = 4000
    selected = converted[:max_generated]

    # Support running both as a package module and as a plain script.
    try:
        from training.convert_dataturks import convert_dataturks_sample
        from training.manual_resumes import build_manual_examples
        from training.build_long_resumes import build_examples as build_long_examples
    except ModuleNotFoundError:
        from convert_dataturks import convert_dataturks_sample
        from manual_resumes import build_manual_examples
        from build_long_resumes import build_examples as build_long_examples

    # The DataTurks dump is JSON Lines: one JSON object per non-empty line.
    dataturks = []
    with open(DATA_DIR / "sources" / "dataturks_raw.json", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            try:
                item = json.loads(line)
                result = convert_dataturks_sample(item, source_id=f"dataturks:{len(dataturks)}")
                if result:
                    dataturks.append(result)
            except json.JSONDecodeError:
                print("Skipped invalid DataTurks JSON line")

    manual = build_manual_examples()
    long_resumes = build_long_examples()

    # The gold resume-resource file is optional.
    resume_resource = []
    rr_path = DATA_DIR / "gold" / "resume_resource_gold.json"
    if rr_path.exists():
        with open(rr_path, encoding="utf-8") as f:
            resume_resource = json.load(f)["data"]

    print(f"DataTurks: {len(dataturks)}")
    print(f"Generated: {len(selected)}")
    print(f"Manual templates: {len(manual)}")
    print(f"Long resumes: {len(long_resumes)}")
    print(f"Resume resource: {len(resume_resource)}")

    all_data = dataturks + selected + manual + long_resumes + resume_resource

    placeholder_names = ("not provided", "unknown", "", "n/a")
    clean = []
    for example in all_data:
        name_tokens = [
            tok
            for tok, tag in zip(example["tokens"], example["ner_tags"])
            if ID2LABEL[tag] in ("B-NAME", "I-NAME")
        ]
        # Skip examples whose tagged name is missing or a placeholder.
        if " ".join(name_tokens).lower() in placeholder_names:
            continue
        # Skip examples with too few tagged tokens.
        if sum(1 for tag in example["ner_tags"] if tag != 0) < 5:
            continue
        clean.append(example)

    clean, duplicates_removed = dedupe_examples(clean)
    train, val = stable_split_examples(clean, train_ratio=0.85)

    try:
        from training.noise_augment import augment_examples
    except ModuleNotFoundError:
        from noise_augment import augment_examples

    # Only the training split is augmented; validation stays untouched.
    augmented = augment_examples(train, multiplier=2, seed=42)
    print(f"Noise augmented: {len(augmented)} (from {len(train)} train examples)")
    train = train + augmented
    write_dataset(
        train,
        val,
        DATA_DIR,
        manifest={
            "builder": "generate_from_structured.py",
            "sources": {
                "dataturks": len(dataturks),
                "generated": len(selected),
                "manual_templates": len(manual),
                "long_resumes": len(long_resumes),
                "resume_resource": len(resume_resource),
                "noise_augmented": len(augmented),
                "duplicates_removed": duplicates_removed,
            },
        },
    )

    print(f"\nFinal: Train={len(train)}, Val={len(val)}")

    from collections import Counter

    # Count tagged tokens per entity type, stripping the "B-"/"I-" prefix.
    counts = Counter()
    for example in clean:
        for tag in example["ner_tags"]:
            label = ID2LABEL[tag]
            if label != "O":
                counts[label[2:]] += 1
    print("\nLabels:")
    for entity, count in counts.most_common():
        print(f"  {entity:15s}: {count}")


# Run the full dataset build only when executed as a script, not on import.
if __name__ == "__main__":
    main()