resume-ner / training /generate_from_structured.py

Somasundaram Ayyappan

Add Kaggle silver training data, retrain model, reorganize data directory

ae7305b 20 days ago

12.1 kB

	"""
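	Generate BIO-tagged NER training data from the structured datasetmaster/resumes dataset.
	Each record is rendered through a randomly chosen resume text format and BIO tags are
	aligned to the whitespace-split tokens. The generated examples are then merged with the
	DataTurks, manual-template, long-resume and gold resume-resource examples, filtered,
	de-duplicated, split into train/validation, noise-augmented and written to the data directory.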
	"""

	import json
	import random
	from pathlib import Path

	from datasets import load_dataset

	try:
	from training.dataset_utils import dedupe_examples, stable_split_examples, write_dataset
	from training.labels import ID2LABEL, LABEL2ID
	from training.synthetic_assets import COMMON_CERTS, generate_email, generate_phone, load_companies, load_titles
	from training.synthetic_formats import RESUME_FORMATS
	from training.tagging import tag_exact_words
	except ModuleNotFoundError:
	from dataset_utils import dedupe_examples, stable_split_examples, write_dataset
	from labels import ID2LABEL, LABEL2ID
	from synthetic_assets import COMMON_CERTS, generate_email, generate_phone, load_companies, load_titles
	from synthetic_formats import RESUME_FORMATS
	from tagging import tag_exact_words

	# "data" directory located next to this script. main() reads its source files
	# from sub-folders of it, and it is passed to load_companies/load_titles and
	# to write_dataset.
	DATA_DIR = Path(__file__).parent / "data"


	def tag_skill_individually(tokens, labels, skill_name):
	    """Tag the first occurrence of ``skill_name`` in ``tokens`` as a SKILL span.

	    Matching is case-insensitive. Trailing comma, period, semicolon, colon,
	    backslash and pipe characters are ignored on both the skill words and the
	    tokens; for single-word skills a leading ``-`` on the token is ignored as
	    well. Only positions still labelled ``"O"`` are overwritten, and the scan
	    stops at the first textual match whether or not anything was written.

	    Args:
	        tokens: Whitespace-split resume tokens.
	        labels: BIO label strings parallel to ``tokens``; mutated in place.
	        skill_name: Skill text to look for (one or more words).

	    Returns:
	        None. ``labels`` is updated in place.
	    """
	    # The backslash is written as "\\" so it is a real character to strip
	    # rather than part of the invalid escape sequence "\|".
	    trailing = ",.;:\\|"
	    skill_words = skill_name.strip().rstrip(trailing).split()
	    if not skill_words:
	        return

	    if len(skill_words) == 1:
	        # split() has already removed any whitespace exposed by the rstrip.
	        target = skill_words[0].lower()
	        for i, token in enumerate(tokens):
	            if token.rstrip(trailing).lstrip("-").lower() == target:
	                if labels[i] == "O":
	                    labels[i] = "B-SKILL"
	                return
	        return

	    wanted = [word.rstrip(trailing).lower() for word in skill_words]
	    width = len(wanted)
	    # Windows that would run past the end of the token list cannot match.
	    for start in range(len(tokens) - width + 1):
	        window = [tok.rstrip(trailing).lower() for tok in tokens[start:start + width]]
	        if window != wanted:
	            continue
	        for pos in range(start, start + width):
	            if labels[pos] != "O":
	                continue
	            # "I-SKILL" is only valid directly after another SKILL label.
	            # When the previous token of the span belongs to a different
	            # entity, open a new span instead of leaving a dangling "I-".
	            continues_skill = pos > start and labels[pos - 1] in ("B-SKILL", "I-SKILL")
	            labels[pos] = "I-SKILL" if continues_skill else "B-SKILL"
	        return


	def extract_fields(sample):
	    """Flatten one structured resume record into the fields used for rendering.

	    Values that are missing, empty, ``None`` or the ``"Unknown"`` placeholder
	    are normalised to ``""`` so they never reach the rendered text. Email and
	    phone are always produced by ``generate_email`` / ``generate_phone``.

	    The helper call order (email, phone, experience, education, skills,
	    certifications) is fixed so that a seeded run stays reproducible.

	    Args:
	        sample: Mapping with optional ``personal_info``, ``experience``,
	            ``education``, ``skills`` and ``certifications`` entries.

	    Returns:
	        ``None`` when ``personal_info`` is not a mapping or has no usable
	        name; otherwise a dict with the keys ``name``, ``email``, ``phone``,
	        ``location``, ``summary``, ``exp``, ``edu``, ``skills``, ``certs``.
	    """
	    personal = sample.get("personal_info") or {}
	    if not isinstance(personal, dict):
	        return None
	    name = personal.get("name", "Unknown")
	    if not name or name == "Unknown":
	        return None

	    loc = personal.get("location") or {}
	    if isinstance(loc, dict):
	        city = _clean_value(loc.get("city"))
	        country = _clean_value(loc.get("country"))
	    else:
	        city = country = ""
	    location = f"{city}, {country}".strip(", ") if city or country else ""

	    email = generate_email(name)
	    phone = generate_phone()
	    summary = _clean_value(personal.get("summary"))

	    exp_list = _extract_experience(sample)
	    edu_list = _extract_education(sample)
	    all_skills = _extract_skills(sample)
	    cert_names = _extract_certs(sample)

	    return {
	        "name": name, "email": email, "phone": phone, "location": location,
	        "summary": summary, "exp": exp_list, "edu": edu_list,
	        "skills": all_skills, "certs": cert_names,
	    }


	def _clean_value(value):
	    """Return ``value``, or ``""`` when it is falsy or the "Unknown" placeholder."""
	    if not value or value == "Unknown":
	        return ""
	    return value


	def _extract_experience(sample):
	    """Build the experience entries, randomly swapping in company/title names."""
	    entries = []
	    # Loaded on the first usable entry and reused afterwards instead of being
	    # requested again for every experience item.
	    companies = titles = None
	    for raw in (sample.get("experience") or []):
	        if not isinstance(raw, dict):
	            continue
	        title = _clean_value(raw.get("title"))
	        if not title:
	            continue
	        company = _clean_value(raw.get("company"))
	        if company == "Fresher":
	            company = ""

	        span = raw.get("dates") or {}
	        if isinstance(span, dict):
	            start = _clean_value(span.get("start"))
	            end = _clean_value(span.get("end"))
	        else:
	            start = end = ""
	        dates = ""
	        if start:
	            dates = f"{start} - {end}" if end else start

	        # Only the first responsibility is kept as the description.
	        duties = raw.get("responsibilities", [])
	        desc = _clean_value(duties[0]) if isinstance(duties, list) and duties else ""

	        info = raw.get("company_info") or {}
	        exp_loc = _clean_value(info.get("location")) if isinstance(info, dict) else ""

	        if companies is None:
	            companies = load_companies(DATA_DIR)
	            titles = load_titles(DATA_DIR)
	        # 90% chance to swap with real company names, 50% for titles
	        if companies and random.random() < 0.9:
	            company = random.choice(companies)
	        if titles and random.random() < 0.5:
	            title = random.choice(titles)

	        entries.append({
	            "title": title, "company": company, "location": exp_loc,
	            "dates": dates, "desc": desc, "start": start, "end": end,
	        })
	    return entries


	def _extract_education(sample):
	    """Build the education entries together with a one-line rendering of each."""
	    entries = []
	    for raw in (sample.get("education") or []):
	        if not isinstance(raw, dict):
	            continue
	        degree = raw.get("degree") or {}
	        if isinstance(degree, dict):
	            level = _clean_value(degree.get("level"))
	            field = _clean_value(degree.get("field"))
	        else:
	            level = field = ""
	        inst = raw.get("institution") or {}
	        inst_name = _clean_value(inst.get("name")) if isinstance(inst, dict) else ""
	        if not level and not inst_name:
	            continue

	        line_parts = []
	        if level:
	            line_parts.append(level)
	        if field:
	            line_parts.append(f"in {field}")
	        if inst_name:
	            # "from" only reads naturally when something precedes it.
	            line_parts.append(f"from {inst_name}" if line_parts else inst_name)
	        entries.append({
	            "level": level, "field": field, "institution": inst_name,
	            "line": " ".join(line_parts),
	        })
	    return entries


	def _extract_skills(sample):
	    """Collect the names of all technical skills across every category."""
	    skills_data = sample.get("skills") or {}
	    tech = skills_data.get("technical", {}) if isinstance(skills_data, dict) else {}
	    names = []
	    if isinstance(tech, dict):
	        for category in tech.values():
	            if not isinstance(category, list):
	                continue
	            for skill in category:
	                if isinstance(skill, dict) and _clean_value(skill.get("name")):
	                    names.append(skill["name"])
	    return names


	def _extract_certs(sample):
	    """Collect certification names, sometimes substituting synthetic ones."""
	    certs = sample.get("certifications") or []
	    names = []
	    if isinstance(certs, list):
	        names = [
	            cert["name"] for cert in certs
	            if isinstance(cert, dict) and _clean_value(cert.get("name"))
	        ]
	    # Add synthetic certs for variety (some resumes should have them)
	    if not names and random.random() < 0.3:
	        names = random.sample(COMMON_CERTS, random.randint(1, 2))
	    return names


	def build_resume_and_tags(sample):
	    """Render one structured record as resume text and BIO-tag its tokens.

	    The record is flattened with ``extract_fields``, rendered with a randomly
	    chosen entry of ``RESUME_FORMATS``, split on whitespace and tagged field
	    by field.

	    Returns:
	        A dict with ``"tokens"`` and ``"ner_tags"`` (label ids), or ``None``
	        when the record is rejected: no usable fields, fewer than two skills,
	        fewer than 15 tokens, or fewer than five tagged tokens.
	    """
	    fields = extract_fields(sample)
	    if not fields:
	        return None
	    if len(fields["skills"]) < 2:
	        return None

	    render = random.choice(RESUME_FORMATS)
	    text = render(
	        fields["name"], fields["email"], fields["phone"], fields["location"],
	        fields["summary"], fields["exp"], fields["edu"], fields["skills"], fields["certs"],
	    )

	    tokens = text.split()
	    if len(tokens) < 15:
	        return None
	    labels = ["O"] * len(tokens)

	    def mark(value, entity):
	        tag_exact_words(tokens, labels, value, entity)

	    # Name, email and phone are always tagged; location only when present.
	    mark(fields["name"], "NAME")
	    mark(fields["email"], "EMAIL")
	    mark(fields["phone"], "PHONE")
	    if fields["location"]:
	        mark(fields["location"], "LOCATION")

	    job_entities = (("title", "TITLE"), ("company", "COMPANY"), ("start", "DATE"), ("end", "DATE"))
	    for job in fields["exp"]:
	        for key, entity in job_entities:
	            # A non-empty end date is tagged as-is, "Present" included.
	            if job[key]:
	                mark(job[key], entity)

	    school_entities = (("level", "DEGREE"), ("field", "FIELD"), ("institution", "INSTITUTION"))
	    for school in fields["edu"]:
	        for key, entity in school_entities:
	            if school[key]:
	                mark(school[key], entity)

	    for skill in fields["skills"]:
	        tag_skill_individually(tokens, labels, skill)

	    for cert in fields["certs"]:
	        mark(cert, "CERT")

	    tagged = len(labels) - labels.count("O")
	    if tagged < 5:
	        return None

	    ner_tags = [LABEL2ID.get(label, 0) for label in labels]
	    return {"tokens": tokens, "ner_tags": ner_tags}


	def main():
	    """Build the combined NER dataset and write the train/validation split.

	    Steps, in order:
	      1. Load ``datasetmaster/resumes`` and convert every record with
	         ``build_resume_and_tags`` (seeded with 42); keep 4000 after shuffling.
	      2. Load the DataTurks JSON-lines file, the manual templates, the long
	         resumes and, when the file exists, the gold resume-resource data.
	      3. Drop examples with a placeholder/empty name or fewer than five
	         tagged tokens, then de-duplicate and split 85/15.
	      4. Add noise-augmented copies of the training examples and write the
	         dataset with a manifest of per-source counts.
	      5. Print label counts for the cleaned, pre-augmentation examples.
	    """
	    from collections import Counter

	    print("Loading datasetmaster/resumes...")
	    ds = load_dataset("datasetmaster/resumes", split="train")
	    print(f"Total: {len(ds)}")

	    random.seed(42)
	    converted = []
	    for idx, sample in enumerate(ds):
	        example = build_resume_and_tags(sample)
	        if example:
	            example["metadata"] = {
	                "source": "datasetmaster_resumes",
	                "source_id": f"generated:{idx}",
	                "group_id": f"generated:{idx}",
	            }
	            converted.append(example)

	    print(f"Converted: {len(converted)}")

	    random.shuffle(converted)
	    selected = converted[:4000]

	    try:
	        from training.convert_dataturks import convert_dataturks_sample
	        from training.manual_resumes import build_manual_examples
	        from training.build_long_resumes import build_examples as build_long_examples
	    except ModuleNotFoundError:
	        from convert_dataturks import convert_dataturks_sample
	        from manual_resumes import build_manual_examples
	        from build_long_resumes import build_examples as build_long_examples

	    dataturks = []
	    # Explicit UTF-8: without it the file is decoded with the platform's
	    # locale encoding, which differs between machines.
	    with open(DATA_DIR / "sources" / "dataturks_raw.json", encoding="utf-8") as f:
	        for raw_line in f:
	            raw_line = raw_line.strip()
	            if not raw_line:
	                continue
	            # Only the JSON parse is guarded, so a failure inside the
	            # converter is not reported as an invalid line.
	            try:
	                item = json.loads(raw_line)
	            except json.JSONDecodeError:
	                print("Skipped invalid DataTurks JSON line")
	                continue
	            record = convert_dataturks_sample(item, source_id=f"dataturks:{len(dataturks)}")
	            if record:
	                dataturks.append(record)

	    manual = build_manual_examples()
	    long_resumes = build_long_examples()

	    # The gold file is optional; without it this source contributes nothing.
	    resume_resource = []
	    rr_path = DATA_DIR / "gold" / "resume_resource_gold.json"
	    if rr_path.exists():
	        with open(rr_path, encoding="utf-8") as f:
	            resume_resource = json.load(f)["data"]

	    print(f"DataTurks: {len(dataturks)}")
	    print(f"Generated: {len(selected)}")
	    print(f"Manual templates: {len(manual)}")
	    print(f"Long resumes: {len(long_resumes)}")
	    print(f"Resume resource: {len(resume_resource)}")

	    all_data = dataturks + selected + manual + long_resumes + resume_resource

	    clean = []
	    for example in all_data:
	        name = " ".join(
	            tok for tok, tag in zip(example["tokens"], example["ner_tags"])
	            if ID2LABEL[tag] in ("B-NAME", "I-NAME")
	        ).lower()
	        # Skip examples whose tagged name is empty or a placeholder.
	        if name in ("not provided", "unknown", "", "n/a"):
	            continue
	        # Skip examples with fewer than five non-zero tag ids.
	        if sum(1 for tag in example["ner_tags"] if tag != 0) < 5:
	            continue
	        clean.append(example)

	    clean, duplicates_removed = dedupe_examples(clean)
	    train, val = stable_split_examples(clean, train_ratio=0.85)

	    try:
	        from training.noise_augment import augment_examples
	    except ModuleNotFoundError:
	        from noise_augment import augment_examples

	    # Augmentation is applied after the split and only to the training part.
	    augmented = augment_examples(train, multiplier=2, seed=42)
	    print(f"Noise augmented: {len(augmented)} (from {len(train)} train examples)")
	    train = train + augmented
	    write_dataset(
	        train,
	        val,
	        DATA_DIR,
	        manifest={
	            "builder": "generate_from_structured.py",
	            "sources": {
	                "dataturks": len(dataturks),
	                "generated": len(selected),
	                "manual_templates": len(manual),
	                "long_resumes": len(long_resumes),
	                "resume_resource": len(resume_resource),
	                "noise_augmented": len(augmented),
	                "duplicates_removed": duplicates_removed,
	            },
	        },
	    )

	    print(f"\nFinal: Train={len(train)}, Val={len(val)}")

	    # Per-entity token counts with the "B-"/"I-" prefix removed.
	    counts = Counter()
	    for example in clean:
	        for tag in example["ner_tags"]:
	            label = ID2LABEL[tag]
	            if label != "O":
	                counts[label[2:]] += 1
	    print("\nLabels:")
	    for label, count in counts.most_common():
	        print(f" {label:15s}: {count}")


	# Run the full dataset build only when this file is executed as a script.
	if __name__ == "__main__":
	main()