File size: 12,092 Bytes
496f788
 
 
 
 
 
 
 
750e1a2
496f788
 
750e1a2
 
 
 
 
 
 
 
 
 
 
 
496f788
750e1a2
496f788
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
750e1a2
 
496f788
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
750e1a2
496f788
 
 
 
 
 
 
 
 
 
 
 
 
750e1a2
496f788
 
 
 
 
efba759
496f788
 
 
 
750e1a2
 
 
496f788
750e1a2
496f788
 
 
750e1a2
496f788
750e1a2
496f788
750e1a2
496f788
750e1a2
496f788
750e1a2
496f788
 
 
750e1a2
496f788
750e1a2
496f788
750e1a2
496f788
 
 
 
 
750e1a2
496f788
 
 
 
 
 
 
 
 
 
 
 
 
 
 
750e1a2
 
496f788
750e1a2
 
 
 
 
496f788
 
 
 
750e1a2
496f788
750e1a2
 
 
 
efba759
750e1a2
 
 
efba759
496f788
 
ae7305b
496f788
 
 
 
 
750e1a2
496f788
 
750e1a2
 
496f788
 
efba759
496f788
a11ed90
ae7305b
a11ed90
 
 
 
496f788
 
 
efba759
a11ed90
496f788
a11ed90
496f788
 
 
750e1a2
496f788
 
 
 
 
 
 
750e1a2
 
1652579
 
 
 
 
 
 
 
 
750e1a2
 
 
 
 
 
 
 
 
 
efba759
a11ed90
1652579
750e1a2
 
 
 
496f788
 
 
 
 
 
 
750e1a2
496f788
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
"""
Generate BIO-tagged NER training data from datasetmaster/resumes structured data.
Creates diverse resume text formats + aligned BIO tags.
"""

import json
import random
from pathlib import Path

from datasets import load_dataset

try:
    from training.dataset_utils import dedupe_examples, stable_split_examples, write_dataset
    from training.labels import ID2LABEL, LABEL2ID
    from training.synthetic_assets import COMMON_CERTS, generate_email, generate_phone, load_companies, load_titles
    from training.synthetic_formats import RESUME_FORMATS
    from training.tagging import tag_exact_words
except ModuleNotFoundError:
    from dataset_utils import dedupe_examples, stable_split_examples, write_dataset
    from labels import ID2LABEL, LABEL2ID
    from synthetic_assets import COMMON_CERTS, generate_email, generate_phone, load_companies, load_titles
    from synthetic_formats import RESUME_FORMATS
    from tagging import tag_exact_words

# "data" directory located next to this module. Source files are read from
# its "sources" and "gold" subdirectories, and it is also the directory
# handed to load_companies/load_titles and write_dataset.
DATA_DIR = Path(__file__).parent / "data"


def tag_skill_individually(tokens, labels, skill_name):
    """Tag the first textual occurrence of a skill with B-SKILL / I-SKILL.

    ``labels`` is modified in place; nothing is returned. Matching is
    case-insensitive and ignores trailing ``,.;:|`` on both the tokens and
    the skill words. For single-word skills a leading ``-`` on the token is
    ignored as well.

    Only tokens that are still labelled ``"O"`` are overwritten, and the
    search stops at the first textual match even when that match was
    already labelled by an earlier pass.

    Args:
        tokens: Whitespace-split resume tokens.
        labels: BIO label strings aligned with ``tokens``.
        skill_name: Skill to look for; may contain several words.
    """
    trailing = ",.;:|"
    skill_words = skill_name.strip().rstrip(trailing).split()
    if not skill_words:
        return

    if len(skill_words) == 1:
        # Compare against the split word rather than the raw cleaned string,
        # so whitespace left behind by punctuation stripping (e.g.
        # "Python ,") cannot prevent a match.
        target = skill_words[0].lower()
        for i, tok in enumerate(tokens):
            if tok.rstrip(trailing).lstrip("-").lower() == target:
                if labels[i] == "O":
                    labels[i] = "B-SKILL"
                return
        return

    # Normalise the skill words once instead of once per candidate position.
    targets = [word.rstrip(trailing).lower() for word in skill_words]
    span = len(targets)
    for start in range(len(tokens) - span + 1):
        window = tokens[start:start + span]
        if any(tok.rstrip(trailing).lower() != want
               for tok, want in zip(window, targets)):
            continue

        # Keep the written tags BIO-valid: an I-SKILL is only emitted
        # directly after a SKILL tag written by this call. If a token in the
        # span already carries another label, the next free token starts a
        # new B-SKILL instead of an orphan I-SKILL.
        prev_tagged = False
        for pos in range(start, start + span):
            if labels[pos] != "O":
                prev_tagged = False
                continue
            labels[pos] = "I-SKILL" if prev_tagged else "B-SKILL"
            prev_tagged = True
        return


def _known_text(value):
    """Return ``value`` unchanged, or "" when it is empty, None or "Unknown"."""
    if not value or value == "Unknown":
        return ""
    return value


def extract_fields(sample):
    """Flatten one structured resume sample into the fields used for rendering.

    Missing values and the dataset's ``"Unknown"`` placeholder are normalised
    to empty strings. Email and phone are not read from the sample; they are
    produced by ``generate_email`` / ``generate_phone``.

    Args:
        sample: Mapping with optional ``personal_info``, ``experience``,
            ``education``, ``skills`` and ``certifications`` entries.

    Returns:
        ``None`` when ``personal_info`` is not a dict or has no usable name.
        Otherwise a dict with keys ``name``, ``email``, ``phone``,
        ``location``, ``summary``, ``exp`` (list of dicts), ``edu`` (list of
        dicts), ``skills`` (list of names) and ``certs`` (list of names).
    """
    p = sample.get("personal_info") or {}
    if not isinstance(p, dict):
        return None
    name = _known_text(p.get("name", "Unknown"))
    if not name:
        return None

    # Filter placeholders out of city/country like every other field, so the
    # location never renders as "Unknown, Unknown" or "None, <country>".
    loc = p.get("location") or {}
    if isinstance(loc, dict):
        city = _known_text(loc.get("city"))
        country = _known_text(loc.get("country"))
    else:
        city = country = ""
    location = f"{city}, {country}".strip(", ") if city or country else ""

    email = generate_email(name)
    phone = generate_phone()
    summary = _known_text(p.get("summary"))

    exp_list = []
    # (companies, titles) replacement pools, loaded on the first usable
    # experience entry instead of once per entry.
    # NOTE(review): assumes load_companies/load_titles return the same pool
    # on every call - confirm.
    pools = None
    for e in (sample.get("experience") or []):
        if not isinstance(e, dict):
            continue
        title = _known_text(e.get("title"))
        if not title:
            continue
        company = _known_text(e.get("company"))
        if company == "Fresher":
            company = ""

        dates_d = e.get("dates") or {}
        if isinstance(dates_d, dict):
            start = _known_text(dates_d.get("start"))
            end = _known_text(dates_d.get("end"))
        else:
            start = end = ""
        dates = ""
        if start:
            dates = start
            if end:
                dates += f" - {end}"

        # Only the first responsibility is kept as the description.
        resps = e.get("responsibilities", [])
        desc = _known_text(resps[0]) if isinstance(resps, list) and resps else ""

        company_info = e.get("company_info") or {}
        exp_loc = ""
        if isinstance(company_info, dict):
            exp_loc = _known_text(company_info.get("location"))

        # 90% chance to swap with real company names, 50% for titles
        if pools is None:
            pools = (load_companies(DATA_DIR), load_titles(DATA_DIR))
        companies, titles = pools
        if companies and random.random() < 0.9:
            company = random.choice(companies)
        if titles and random.random() < 0.5:
            title = random.choice(titles)

        exp_list.append({
            "title": title, "company": company, "location": exp_loc,
            "dates": dates, "desc": desc, "start": start, "end": end,
        })

    edu_list = []
    for ed in (sample.get("education") or []):
        if not isinstance(ed, dict):
            continue
        degree = ed.get("degree") or {}
        if isinstance(degree, dict):
            level = _known_text(degree.get("level"))
            field = _known_text(degree.get("field"))
        else:
            level = field = ""
        inst = ed.get("institution") or {}
        inst_name = _known_text(inst.get("name")) if isinstance(inst, dict) else ""
        if not level and not inst_name:
            continue
        # Builds "<level> in <field> from <institution>", dropping missing parts.
        line_parts = []
        if level:
            line_parts.append(level)
        if field:
            line_parts.append(f"in {field}")
        if inst_name:
            line_parts.append(f"from {inst_name}" if line_parts else inst_name)
        edu_list.append({
            "level": level, "field": field, "institution": inst_name,
            "line": " ".join(line_parts),
        })

    skills_data = sample.get("skills") or {}
    tech = skills_data.get("technical", {}) if isinstance(skills_data, dict) else {}
    all_skills = []
    if isinstance(tech, dict):
        for cat in tech.values():
            if not isinstance(cat, list):
                continue
            for sk in cat:
                if isinstance(sk, dict) and sk.get("name") and sk["name"] != "Unknown":
                    all_skills.append(sk["name"])

    certs = sample.get("certifications") or []
    cert_names = []
    if isinstance(certs, list):
        for c in certs:
            if isinstance(c, dict) and c.get("name") and c["name"] != "Unknown":
                cert_names.append(c["name"])

    # Add synthetic certs for variety (some resumes should have them)
    if not cert_names and random.random() < 0.3:
        cert_names = random.sample(COMMON_CERTS, random.randint(1, 2))

    return {
        "name": name, "email": email, "phone": phone, "location": location,
        "summary": summary, "exp": exp_list, "edu": edu_list,
        "skills": all_skills, "certs": cert_names,
    }


def build_resume_and_tags(sample):
    """Render a structured sample as resume text and BIO-tag its tokens.

    A format is picked at random from ``RESUME_FORMATS``, the rendered text
    is split on whitespace, and each extracted field value is tagged through
    ``tag_exact_words`` (skills through ``tag_skill_individually``).

    Returns:
        ``None`` when the sample has no usable fields, fewer than two skills,
        fewer than 15 tokens, or fewer than five tagged tokens. Otherwise a
        dict with ``tokens`` and ``ner_tags``, where labels missing from
        ``LABEL2ID`` fall back to id 0.
    """
    extracted = extract_fields(sample)
    if not extracted:
        return None
    if len(extracted["skills"]) < 2:
        return None

    render = random.choice(RESUME_FORMATS)
    resume_text = render(
        extracted["name"],
        extracted["email"],
        extracted["phone"],
        extracted["location"],
        extracted["summary"],
        extracted["exp"],
        extracted["edu"],
        extracted["skills"],
        extracted["certs"],
    )

    tokens = resume_text.split()
    if len(tokens) < 15:
        return None
    labels = ["O" for _ in tokens]

    # Contact details are always tagged; location only when present.
    for key, entity in (("name", "NAME"), ("email", "EMAIL"), ("phone", "PHONE")):
        tag_exact_words(tokens, labels, extracted[key], entity)
    if extracted["location"]:
        tag_exact_words(tokens, labels, extracted["location"], "LOCATION")

    # Per job: title, company, start date, end date, in that order.
    # A literal "Present" end date is tagged as DATE like any other value.
    job_entities = (("title", "TITLE"), ("company", "COMPANY"), ("start", "DATE"), ("end", "DATE"))
    for job in extracted["exp"]:
        for key, entity in job_entities:
            if job[key]:
                tag_exact_words(tokens, labels, job[key], entity)

    school_entities = (("level", "DEGREE"), ("field", "FIELD"), ("institution", "INSTITUTION"))
    for school in extracted["edu"]:
        for key, entity in school_entities:
            if school[key]:
                tag_exact_words(tokens, labels, school[key], entity)

    for skill in extracted["skills"]:
        tag_skill_individually(tokens, labels, skill)

    for cert in extracted["certs"]:
        tag_exact_words(tokens, labels, cert, "CERT")

    # Drop examples that ended up with too few entity tokens to be useful.
    tagged_count = sum(label != "O" for label in labels)
    if tagged_count < 5:
        return None

    ner_tags = [LABEL2ID.get(label, 0) for label in labels]
    return {"tokens": tokens, "ner_tags": ner_tags}


def main():
    """Build the combined NER dataset and write train/validation splits.

    Steps:
      1. Load ``datasetmaster/resumes`` and convert each sample with
         ``build_resume_and_tags`` under a fixed random seed; keep at most
         4000 shuffled examples.
      2. Load the DataTurks JSON-lines file, the manual templates, the long
         resumes and, if present, the resume-resource gold file.
      3. Drop examples with a placeholder or empty name, or with fewer than
         five non-zero tags; then deduplicate and split 85/15.
      4. Add noise-augmented copies of the training split only, and write
         everything out with a manifest of per-source counts.
      5. Print label counts over the deduplicated, pre-augmentation set.
    """
    print("Loading datasetmaster/resumes...")
    ds = load_dataset("datasetmaster/resumes", split="train")
    print(f"Total: {len(ds)}")

    # Seed before any conversion so the random choices made while building
    # examples, and the shuffle below, are reproducible between runs.
    random.seed(42)
    converted = []
    for idx, sample in enumerate(ds):
        result = build_resume_and_tags(sample)
        if result:
            result["metadata"] = {
                "source": "datasetmaster_resumes",
                "source_id": f"generated:{idx}",
                "group_id": f"generated:{idx}",
            }
            converted.append(result)

    print(f"Converted: {len(converted)}")

    random.shuffle(converted)
    selected = converted[:4000]

    try:
        from training.convert_dataturks import convert_dataturks_sample
        from training.manual_resumes import build_manual_examples
        from training.build_long_resumes import build_examples as build_long_examples
    except ModuleNotFoundError:
        from convert_dataturks import convert_dataturks_sample
        from manual_resumes import build_manual_examples
        from build_long_resumes import build_examples as build_long_examples

    # The DataTurks file holds one JSON object per line. Read it as UTF-8
    # explicitly so the result does not depend on the platform's locale.
    dataturks = []
    with open(DATA_DIR / "sources" / "dataturks_raw.json", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if line:
                try:
                    item = json.loads(line)
                    result = convert_dataturks_sample(item, source_id=f"dataturks:{len(dataturks)}")
                    if result:
                        dataturks.append(result)
                except json.JSONDecodeError:
                    print("Skipped invalid DataTurks JSON line")

    manual = build_manual_examples()
    long_resumes = build_long_examples()

    # The gold file is optional; without it this source contributes nothing.
    resume_resource = []
    rr_path = DATA_DIR / "gold" / "resume_resource_gold.json"
    if rr_path.exists():
        with open(rr_path, encoding="utf-8") as f:
            resume_resource = json.load(f)["data"]

    print(f"DataTurks: {len(dataturks)}")
    print(f"Generated: {len(selected)}")
    print(f"Manual templates: {len(manual)}")
    print(f"Long resumes: {len(long_resumes)}")
    print(f"Resume resource: {len(resume_resource)}")

    all_data = dataturks + selected + manual + long_resumes + resume_resource

    # Filter out examples whose tagged name is a placeholder or missing, and
    # examples with fewer than five non-zero tags.
    clean = []
    for e in all_data:
        name_tokens = [t for t, tag in zip(e["tokens"], e["ner_tags"]) if ID2LABEL[tag] in ("B-NAME", "I-NAME")]
        name = " ".join(name_tokens).lower()
        if name in ("not provided", "unknown", "", "n/a"):
            continue
        if sum(1 for t in e["ner_tags"] if t != 0) < 5:
            continue
        clean.append(e)

    clean, duplicates_removed = dedupe_examples(clean)
    train, val = stable_split_examples(clean, train_ratio=0.85)

    try:
        from training.noise_augment import augment_examples
    except ModuleNotFoundError:
        from noise_augment import augment_examples

    # Augmentation is applied after the split and only to the training
    # examples; the validation split is written unchanged.
    augmented = augment_examples(train, multiplier=2, seed=42)
    print(f"Noise augmented: {len(augmented)} (from {len(train)} train examples)")
    train = train + augmented
    write_dataset(
        train,
        val,
        DATA_DIR,
        manifest={
            "builder": "generate_from_structured.py",
            "sources": {
                "dataturks": len(dataturks),
                "generated": len(selected),
                "manual_templates": len(manual),
                "long_resumes": len(long_resumes),
                "resume_resource": len(resume_resource),
                "noise_augmented": len(augmented),
                "duplicates_removed": duplicates_removed,
            },
        },
    )

    print(f"\nFinal: Train={len(train)}, Val={len(val)}")

    from collections import Counter

    # Count tagged tokens per entity type over the deduplicated examples
    # (before augmentation); label[2:] drops the two-character tag prefix.
    counts = Counter()
    for e in clean:
        for tag in e["ner_tags"]:
            label = ID2LABEL[tag]
            if label != "O":
                counts[label[2:]] += 1
    print("\nLabels:")
    for l, c in counts.most_common():
        print(f"  {l:15s}: {c}")


# Run the dataset build only when this file is executed as a script.
if __name__ == "__main__":
    main()