resume-ner / training /generate_from_structured.py
Somasundaram Ayyappan
Add Kaggle silver training data, retrain model, reorganize data directory
ae7305b
"""
Generate BIO-tagged NER training data from datasetmaster/resumes structured data.
Creates diverse resume text formats + aligned BIO tags.
"""
import json
import random
from pathlib import Path
from datasets import load_dataset
try:
from training.dataset_utils import dedupe_examples, stable_split_examples, write_dataset
from training.labels import ID2LABEL, LABEL2ID
from training.synthetic_assets import COMMON_CERTS, generate_email, generate_phone, load_companies, load_titles
from training.synthetic_formats import RESUME_FORMATS
from training.tagging import tag_exact_words
except ModuleNotFoundError:
from dataset_utils import dedupe_examples, stable_split_examples, write_dataset
from labels import ID2LABEL, LABEL2ID
from synthetic_assets import COMMON_CERTS, generate_email, generate_phone, load_companies, load_titles
from synthetic_formats import RESUME_FORMATS
from tagging import tag_exact_words
# "data" directory located next to this script. main() reads its input files
# from sub-folders of it and passes it to write_dataset() as the output target.
DATA_DIR = Path(__file__).parent / "data"
def tag_skill_individually(tokens, labels, skill_name):
    """Tag the first untagged occurrence of a skill in ``tokens`` as SKILL.

    Matching is case-insensitive and word by word.  Trailing ``,.;:|``
    punctuation is ignored on both the skill words and the tokens, and for a
    single-word skill any leading ``-`` on the token is ignored as well.

    The first matching span whose tokens are all still labelled ``"O"``
    receives ``B-SKILL`` on its first token and ``I-SKILL`` on the remaining
    ones.  A matching span that overlaps an existing label is skipped as a
    whole and the scan continues, so the function never writes an ``I-SKILL``
    without its ``B-SKILL`` and never gives up because an earlier occurrence
    was already claimed by another entity.

    Args:
        tokens: Whitespace-split tokens of the resume text.
        labels: BIO label strings parallel to ``tokens``; modified in place.
        skill_name: Skill text to look for; may consist of several words.

    Returns:
        None.  ``labels`` is updated in place (or left untouched when the
        skill is empty or no fully untagged occurrence exists).
    """
    trailing = ",.;:|"
    skill_clean = skill_name.strip().rstrip(trailing)
    # Normalise the skill once, outside the scan loop.  Working on the split
    # words (rather than the raw string) also drops whitespace that was left
    # behind after removing trailing punctuation, e.g. "Python ,".
    skill_words = [word.rstrip(trailing).lower() for word in skill_clean.split()]
    if not skill_words:
        return

    span = len(skill_words)
    single_word = span == 1

    for start in range(len(tokens) - span + 1):
        candidate = [tok.rstrip(trailing).lower() for tok in tokens[start:start + span]]
        if single_word:
            # Single skills may appear glued to a dash, e.g. "-Python".
            candidate[0] = candidate[0].lstrip("-")
        if candidate != skill_words:
            continue
        if any(labels[start + offset] != "O" for offset in range(span)):
            # Part of this span already belongs to another entity; tagging
            # the rest would produce a broken BIO sequence, so look further.
            continue
        labels[start] = "B-SKILL"
        for offset in range(1, span):
            labels[start + offset] = "I-SKILL"
        return
def _known(value):
    """Return ``value``, or ``""`` when it is ``None`` or the "Unknown" marker."""
    if value is None or value == "Unknown":
        return ""
    return value


def _experience_entry(entry, companies, titles):
    """Build one experience dict from a raw record, or ``None`` to skip it.

    A record is skipped when it is not a dict or has no usable title.
    ``companies`` and ``titles`` are the replacement pools; when non-empty
    the company is swapped with probability 0.9 and the title with 0.5.
    """
    if not isinstance(entry, dict):
        return None
    title = _known(entry.get("title", ""))
    if not title:
        return None
    company = _known(entry.get("company", ""))
    if company == "Fresher":
        company = ""

    dates_d = entry.get("dates") or {}
    if isinstance(dates_d, dict):
        start = _known(dates_d.get("start", ""))
        end = _known(dates_d.get("end", ""))
    else:
        start = end = ""
    dates = ""
    if start:
        dates = start
        if end:
            dates += f" - {end}"

    # Only the first responsibility is kept as the description.
    resps = entry.get("responsibilities", [])
    desc = _known(resps[0]) if isinstance(resps, list) and resps else ""

    exp_loc = ""
    company_info = entry.get("company_info") or {}
    if isinstance(company_info, dict):
        exp_loc = _known(company_info.get("location", "") or "")

    # Draw order (company first, then title) is kept so a seeded run yields
    # the same sequence of random choices.
    if companies and random.random() < 0.9:
        company = random.choice(companies)
    if titles and random.random() < 0.5:
        title = random.choice(titles)

    return {
        "title": title, "company": company, "location": exp_loc,
        "dates": dates, "desc": desc, "start": start, "end": end,
    }


def _education_entry(entry):
    """Build one education dict from a raw record, or ``None`` to skip it.

    A record is skipped when it is not a dict or has neither a degree level
    nor an institution name.
    """
    if not isinstance(entry, dict):
        return None
    degree = entry.get("degree") or {}
    inst = entry.get("institution") or {}
    level = _known(degree.get("level", "")) if isinstance(degree, dict) else ""
    field = _known(degree.get("field", "")) if isinstance(degree, dict) else ""
    inst_name = _known(inst.get("name", "")) if isinstance(inst, dict) else ""
    if not level and not inst_name:
        return None

    line_parts = []
    if level:
        line_parts.append(level)
    if field:
        line_parts.append(f"in {field}")
    if inst_name:
        # "from" only reads well when something precedes the institution.
        line_parts.append(f"from {inst_name}" if line_parts else inst_name)
    return {"level": level, "field": field, "institution": inst_name, "line": " ".join(line_parts)}


def extract_fields(sample):
    """Collect the fields needed to render one resume from a dataset record.

    Values that are ``None`` or equal to ``"Unknown"`` are treated as missing
    for every field, including the city and country that make up the
    location.  Email and phone are always generated via ``generate_email`` /
    ``generate_phone`` rather than read from the record.

    Args:
        sample: Mapping with optional ``personal_info``, ``experience``,
            ``education``, ``skills`` and ``certifications`` entries.

    Returns:
        ``None`` when ``personal_info`` is not a dict or has no usable name.
        Otherwise a dict with the keys ``name``, ``email``, ``phone``,
        ``location``, ``summary``, ``exp`` (list of experience dicts),
        ``edu`` (list of education dicts), ``skills`` (list of names) and
        ``certs`` (list of names).
    """
    p = sample.get("personal_info") or {}
    if not isinstance(p, dict):
        return None
    name = _known(p.get("name", "Unknown"))
    if not name:
        return None

    loc = p.get("location") or {}
    if isinstance(loc, dict):
        loc_parts = [_known(loc.get("city", "")), _known(loc.get("country", ""))]
    else:
        loc_parts = []
    location = ", ".join(part for part in loc_parts if part).strip(", ")

    email = generate_email(name)
    phone = generate_phone()
    summary = _known(p.get("summary", ""))

    # The replacement pools do not depend on the experience entry, so they
    # are loaded once per record instead of once per entry.
    companies = load_companies(DATA_DIR)
    titles = load_titles(DATA_DIR)
    exp_list = []
    for raw_exp in (sample.get("experience") or []):
        exp = _experience_entry(raw_exp, companies, titles)
        if exp is not None:
            exp_list.append(exp)

    edu_list = []
    for raw_edu in (sample.get("education") or []):
        edu = _education_entry(raw_edu)
        if edu is not None:
            edu_list.append(edu)

    skills_data = sample.get("skills") or {}
    tech = skills_data.get("technical", {}) if isinstance(skills_data, dict) else {}
    all_skills = []
    if isinstance(tech, dict):
        for cat in tech.values():
            if isinstance(cat, list):
                all_skills.extend(
                    sk["name"] for sk in cat
                    if isinstance(sk, dict) and sk.get("name") and sk["name"] != "Unknown"
                )

    certs = sample.get("certifications") or []
    cert_names = []
    if isinstance(certs, list):
        cert_names = [
            c["name"] for c in certs
            if isinstance(c, dict) and c.get("name") and c["name"] != "Unknown"
        ]
    # Add synthetic certs for variety (some resumes should have them)
    if not cert_names and random.random() < 0.3:
        cert_names = random.sample(COMMON_CERTS, random.randint(1, 2))

    return {
        "name": name, "email": email, "phone": phone, "location": location,
        "summary": summary, "exp": exp_list, "edu": edu_list,
        "skills": all_skills, "certs": cert_names,
    }
def build_resume_and_tags(sample):
    """Render one dataset record as resume text and align BIO tags to it.

    The record is reduced to fields with ``extract_fields``, rendered with a
    randomly chosen entry of ``RESUME_FORMATS``, split on whitespace, and
    labelled entity by entity.

    Returns:
        ``None`` when the record yields no fields, fewer than 2 skills, fewer
        than 15 tokens, or fewer than 5 non-"O" labels.  Otherwise a dict
        with ``tokens`` and ``ner_tags`` (label ids from ``LABEL2ID``, with 0
        used for labels missing from that mapping).
    """
    fields = extract_fields(sample)
    if not fields:
        return None
    if len(fields["skills"]) < 2:
        return None

    render = random.choice(RESUME_FORMATS)
    text = render(
        fields["name"], fields["email"], fields["phone"], fields["location"],
        fields["summary"], fields["exp"], fields["edu"], fields["skills"], fields["certs"],
    )
    tokens = text.split()
    if len(tokens) < 15:
        return None
    labels = ["O" for _ in tokens]

    def mark(value, entity):
        tag_exact_words(tokens, labels, value, entity)

    def mark_if_set(value, entity):
        if value:
            mark(value, entity)

    # Tagging order matters if earlier tags take precedence, so it is kept:
    # contact details, experience, education, skills, certifications.
    for key, entity in (("name", "NAME"), ("email", "EMAIL"), ("phone", "PHONE")):
        mark(fields[key], entity)
    mark_if_set(fields["location"], "LOCATION")

    experience_keys = (("title", "TITLE"), ("company", "COMPANY"), ("start", "DATE"), ("end", "DATE"))
    for job in fields["exp"]:
        # A non-empty end date is tagged as DATE whether or not it is "Present".
        for key, entity in experience_keys:
            mark_if_set(job[key], entity)

    education_keys = (("level", "DEGREE"), ("field", "FIELD"), ("institution", "INSTITUTION"))
    for school in fields["edu"]:
        for key, entity in education_keys:
            mark_if_set(school[key], entity)

    for skill in fields["skills"]:
        tag_skill_individually(tokens, labels, skill)
    for cert in fields["certs"]:
        mark(cert, "CERT")

    tagged = len(labels) - labels.count("O")
    if tagged < 5:
        return None
    ner_tags = [LABEL2ID.get(label, 0) for label in labels]
    return {"tokens": tokens, "ner_tags": ner_tags}
def main():
    """Build the combined NER dataset and write it under ``DATA_DIR``.

    Steps, in order:
      1. Load the ``datasetmaster/resumes`` train split, seed ``random`` with
         42, convert every record with ``build_resume_and_tags``, shuffle and
         keep the first 4000 converted examples.
      2. Read ``sources/dataturks_raw.json`` one JSON document per line and
         convert each with ``convert_dataturks_sample``; invalid JSON lines
         are reported and skipped.
      3. Add the manual and long-resume examples and, when the file exists,
         the ``data`` list of ``gold/resume_resource_gold.json``.
      4. Drop examples whose tagged name is a placeholder or that have fewer
         than 5 non-zero tags, de-duplicate, split 85/15, and append the
         noise-augmented copies to the training split.
      5. Write the dataset with a manifest of per-source counts and print a
         per-entity token count for the cleaned (pre-augmentation) examples.

    Both JSON inputs are opened as UTF-8 explicitly so that reading does not
    depend on the platform's locale encoding.
    """
    print("Loading datasetmaster/resumes...")
    ds = load_dataset("datasetmaster/resumes", split="train")
    print(f"Total: {len(ds)}")

    random.seed(42)
    converted = []
    for idx, sample in enumerate(ds):
        result = build_resume_and_tags(sample)
        if result:
            result["metadata"] = {
                "source": "datasetmaster_resumes",
                "source_id": f"generated:{idx}",
                "group_id": f"generated:{idx}",
            }
            converted.append(result)
    print(f"Converted: {len(converted)}")

    random.shuffle(converted)
    selected = converted[:4000]

    # Support running both as a package module and as a plain script.
    try:
        from training.convert_dataturks import convert_dataturks_sample
        from training.manual_resumes import build_manual_examples
        from training.build_long_resumes import build_examples as build_long_examples
    except ModuleNotFoundError:
        from convert_dataturks import convert_dataturks_sample
        from manual_resumes import build_manual_examples
        from build_long_resumes import build_examples as build_long_examples

    dataturks = []
    with open(DATA_DIR / "sources" / "dataturks_raw.json", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if line:
                try:
                    item = json.loads(line)
                    # The id is the count of examples converted so far, not
                    # the line number.
                    result = convert_dataturks_sample(item, source_id=f"dataturks:{len(dataturks)}")
                    if result:
                        dataturks.append(result)
                except json.JSONDecodeError:
                    print("Skipped invalid DataTurks JSON line")

    manual = build_manual_examples()
    long_resumes = build_long_examples()

    resume_resource = []
    rr_path = DATA_DIR / "gold" / "resume_resource_gold.json"
    if rr_path.exists():
        with open(rr_path, encoding="utf-8") as f:
            resume_resource = json.load(f)["data"]

    print(f"DataTurks: {len(dataturks)}")
    print(f"Generated: {len(selected)}")
    print(f"Manual templates: {len(manual)}")
    print(f"Long resumes: {len(long_resumes)}")
    print(f"Resume resource: {len(resume_resource)}")

    all_data = dataturks + selected + manual + long_resumes + resume_resource

    clean = []
    for e in all_data:
        name_tokens = [t for t, tag in zip(e["tokens"], e["ner_tags"]) if ID2LABEL[tag] in ("B-NAME", "I-NAME")]
        name = " ".join(name_tokens).lower()
        # Examples without a tagged name (empty string) are dropped as well.
        if name in ("not provided", "unknown", "", "n/a"):
            continue
        if sum(1 for t in e["ner_tags"] if t != 0) < 5:
            continue
        clean.append(e)

    clean, duplicates_removed = dedupe_examples(clean)
    train, val = stable_split_examples(clean, train_ratio=0.85)

    try:
        from training.noise_augment import augment_examples
    except ModuleNotFoundError:
        from noise_augment import augment_examples
    # Only the training split is augmented; validation is left as-is.
    augmented = augment_examples(train, multiplier=2, seed=42)
    print(f"Noise augmented: {len(augmented)} (from {len(train)} train examples)")
    train = train + augmented

    write_dataset(
        train,
        val,
        DATA_DIR,
        manifest={
            "builder": "generate_from_structured.py",
            "sources": {
                "dataturks": len(dataturks),
                "generated": len(selected),
                "manual_templates": len(manual),
                "long_resumes": len(long_resumes),
                "resume_resource": len(resume_resource),
                "noise_augmented": len(augmented),
                "duplicates_removed": duplicates_removed,
            },
        },
    )
    print(f"\nFinal: Train={len(train)}, Val={len(val)}")

    from collections import Counter
    counts = Counter()
    for e in clean:
        for tag in e["ner_tags"]:
            label = ID2LABEL[tag]
            if label != "O":
                # Strip the "B-"/"I-" prefix so both count towards the entity.
                counts[label[2:]] += 1
    print("\nLabels:")
    for entity, count in counts.most_common():
        print(f" {entity:15s}: {count}")
# Build the dataset only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()