Token Classification
Transformers
ONNX
Safetensors
English
distilbert
resume-parsing
ner
resume
cv
information-extraction
Instructions to use oksomu/resume-ner with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- Transformers
How to use oksomu/resume-ner with Transformers:
```python
# Use a pipeline as a high-level helper
from transformers import pipeline

pipe = pipeline("token-classification", model="oksomu/resume-ner")
```

```python
# Load model directly
from transformers import AutoTokenizer, AutoModelForTokenClassification

tokenizer = AutoTokenizer.from_pretrained("oksomu/resume-ner")
model = AutoModelForTokenClassification.from_pretrained("oksomu/resume-ner")
```
- Notebooks
- Google Colab
- Kaggle
Somasundaram Ayyappan
Add Kaggle silver training data, retrain model, reorganize data directory
ae7305b | """ | |
| Generate BIO-tagged NER training data from datasetmaster/resumes structured data. | |
| Creates diverse resume text formats + aligned BIO tags. | |
| """ | |
| import json | |
| import random | |
| from pathlib import Path | |
| from datasets import load_dataset | |
| try: | |
| from training.dataset_utils import dedupe_examples, stable_split_examples, write_dataset | |
| from training.labels import ID2LABEL, LABEL2ID | |
| from training.synthetic_assets import COMMON_CERTS, generate_email, generate_phone, load_companies, load_titles | |
| from training.synthetic_formats import RESUME_FORMATS | |
| from training.tagging import tag_exact_words | |
| except ModuleNotFoundError: | |
| from dataset_utils import dedupe_examples, stable_split_examples, write_dataset | |
| from labels import ID2LABEL, LABEL2ID | |
| from synthetic_assets import COMMON_CERTS, generate_email, generate_phone, load_companies, load_titles | |
| from synthetic_formats import RESUME_FORMATS | |
| from tagging import tag_exact_words | |
# "data" directory located next to this file; used below to locate the
# sources/ and gold/ inputs and passed to the asset loaders and write_dataset.
DATA_DIR = Path(__file__).parent / "data"
def tag_skill_individually(tokens, labels, skill_name):
    """Tag the first still-untagged occurrence of a skill, in place.

    Matching is case-insensitive. Trailing ``,.;:|`` characters are ignored
    on both the tokens and the skill words, and a leading ``-`` on the first
    token of a candidate span is ignored so bullet-prefixed entries such as
    ``-Python,`` still match.

    A span is tagged only when *every* token in it is currently ``"O"``.
    Occurrences that overlap an existing entity are skipped and the scan
    continues, so the function never emits an ``I-SKILL`` without a
    preceding ``B-SKILL`` and never splices SKILL tags into another entity.

    Args:
        tokens: Whitespace-split resume tokens.
        labels: BIO label strings parallel to ``tokens``; mutated in place.
        skill_name: Skill text, possibly made of several words.

    Returns:
        None. At most one occurrence is tagged per call.
    """
    trailing = ",.;:|"
    skill_words = [
        word.rstrip(trailing).lower()
        for word in skill_name.strip().rstrip(trailing).split()
    ]
    if not skill_words:
        return

    width = len(skill_words)
    # Normalise every token once instead of once per candidate position.
    normalized = [tok.rstrip(trailing).lower() for tok in tokens]

    for start in range(len(tokens) - width + 1):
        stop = start + width
        if normalized[start].lstrip("-") != skill_words[0]:
            continue
        if normalized[start + 1:stop] != skill_words[1:]:
            continue
        # Skip occurrences already claimed (even partially) by another entity.
        if any(label != "O" for label in labels[start:stop]):
            continue
        labels[start] = "B-SKILL"
        for pos in range(start + 1, stop):
            labels[pos] = "I-SKILL"
        return
def _known_text(value):
    """Return ``value`` if it is a usable string, otherwise ``""``.

    ``None``, non-string values, the empty string and the ``"Unknown"``
    placeholder all collapse to ``""``.
    """
    if isinstance(value, str) and value != "Unknown":
        return value
    return ""


def _as_dict(value):
    """Return ``value`` when it is a dict, otherwise an empty dict."""
    return value if isinstance(value, dict) else {}


def extract_fields(sample):
    """Flatten one structured resume record into the fields the formatters use.

    Every text field is normalised through ``_known_text`` so that explicit
    nulls and the ``"Unknown"`` placeholder never reach the rendered resume
    (``dict.get(key, "")`` alone still returns ``None`` for a key that is
    present with a null value).

    Email and phone are always synthesised with ``generate_email`` /
    ``generate_phone``. For each experience entry the company is replaced by
    a random entry from ``load_companies`` with probability 0.9 and the title
    by a random entry from ``load_titles`` with probability 0.5 (when those
    lists are non-empty). If the record has no certifications, one or two
    entries from ``COMMON_CERTS`` are added with probability 0.3.

    Args:
        sample: Mapping with optional ``personal_info``, ``experience``,
            ``education``, ``skills`` and ``certifications`` entries.

    Returns:
        A dict with keys ``name``, ``email``, ``phone``, ``location``,
        ``summary``, ``exp``, ``edu``, ``skills`` and ``certs``; or ``None``
        when ``personal_info`` is not a dict or carries no usable name.
    """
    personal = sample.get("personal_info") or {}
    if not isinstance(personal, dict):
        return None
    name = _known_text(personal.get("name"))
    if not name:
        return None

    loc = _as_dict(personal.get("location"))
    city = _known_text(loc.get("city"))
    country = _known_text(loc.get("country"))
    location = ", ".join(part for part in (city, country) if part).strip(", ")

    # Generated in this order so the seeded random stream stays reproducible.
    email = generate_email(name)
    phone = generate_phone()
    summary = _known_text(personal.get("summary"))

    # Loop-invariant: load once per record rather than once per experience entry.
    companies = load_companies(DATA_DIR)
    titles = load_titles(DATA_DIR)

    exp_list = []
    for entry in sample.get("experience") or []:
        if not isinstance(entry, dict):
            continue
        title = _known_text(entry.get("title"))
        if not title:
            continue
        company = _known_text(entry.get("company"))
        if company == "Fresher":
            company = ""

        dates_info = _as_dict(entry.get("dates"))
        start = _known_text(dates_info.get("start"))
        end = _known_text(dates_info.get("end"))
        dates = ""
        if start:
            dates = f"{start} - {end}" if end else start

        # Only the first responsibility is used as the description.
        resps = entry.get("responsibilities")
        desc = _known_text(resps[0]) if isinstance(resps, list) and resps else ""

        exp_loc = _known_text(_as_dict(entry.get("company_info")).get("location"))

        # 90% chance to swap with real company names, 50% for titles
        if companies and random.random() < 0.9:
            company = random.choice(companies)
        if titles and random.random() < 0.5:
            title = random.choice(titles)

        exp_list.append({
            "title": title,
            "company": company,
            "location": exp_loc,
            "dates": dates,
            "desc": desc,
            "start": start,
            "end": end,
        })

    edu_list = []
    for entry in sample.get("education") or []:
        if not isinstance(entry, dict):
            continue
        degree = _as_dict(entry.get("degree"))
        level = _known_text(degree.get("level"))
        field = _known_text(degree.get("field"))
        inst_name = _known_text(_as_dict(entry.get("institution")).get("name"))
        if not level and not inst_name:
            continue

        line_parts = []
        if level:
            line_parts.append(level)
        if field:
            line_parts.append(f"in {field}")
        if inst_name:
            # "from" only reads naturally when something precedes it.
            line_parts.append(f"from {inst_name}" if line_parts else inst_name)
        edu_list.append({
            "level": level,
            "field": field,
            "institution": inst_name,
            "line": " ".join(line_parts),
        })

    technical = _as_dict(_as_dict(sample.get("skills")).get("technical"))
    all_skills = []
    for category in technical.values():
        if not isinstance(category, list):
            continue
        for skill in category:
            if not isinstance(skill, dict):
                continue
            skill_name = _known_text(skill.get("name"))
            if skill_name:
                all_skills.append(skill_name)

    cert_names = []
    certs = sample.get("certifications") or []
    if isinstance(certs, list):
        for cert in certs:
            if not isinstance(cert, dict):
                continue
            cert_name = _known_text(cert.get("name"))
            if cert_name:
                cert_names.append(cert_name)

    # Add synthetic certs for variety (some resumes should have them)
    if not cert_names and random.random() < 0.3:
        cert_names = random.sample(COMMON_CERTS, random.randint(1, 2))

    return {
        "name": name, "email": email, "phone": phone, "location": location,
        "summary": summary, "exp": exp_list, "edu": edu_list,
        "skills": all_skills, "certs": cert_names,
    }
def build_resume_and_tags(sample):
    """Render one record as resume text and produce aligned BIO tag ids.

    The record is flattened with ``extract_fields``, rendered through a
    randomly chosen entry of ``RESUME_FORMATS``, split on whitespace, and
    each known field value is tagged via ``tag_exact_words`` (skills via
    ``tag_skill_individually``). The tagging calls run in a fixed order:
    name, email, phone, location, experience fields, education fields,
    skills, certifications.

    Returns:
        ``{"tokens": [...], "ner_tags": [...]}`` with label ids looked up in
        ``LABEL2ID`` (0 when a label is missing from it), or ``None`` when
        the record has no usable fields, fewer than 2 skills, fewer than
        15 tokens, or fewer than 5 tagged tokens.
    """
    fields = extract_fields(sample)
    if not fields:
        return None
    if len(fields["skills"]) < 2:
        return None

    render = random.choice(RESUME_FORMATS)
    tokens = render(
        fields["name"], fields["email"], fields["phone"], fields["location"],
        fields["summary"], fields["exp"], fields["edu"], fields["skills"], fields["certs"],
    ).split()
    if len(tokens) < 15:
        return None

    labels = ["O" for _ in tokens]

    def mark(value, entity):
        tag_exact_words(tokens, labels, value, entity)

    # Contact block: name/email/phone are always present, location is optional.
    for key, entity in (("name", "NAME"), ("email", "EMAIL"), ("phone", "PHONE")):
        mark(fields[key], entity)
    if fields["location"]:
        mark(fields["location"], "LOCATION")

    # Experience entries; an end value of "Present" is tagged as a DATE like any other.
    job_entities = (("title", "TITLE"), ("company", "COMPANY"), ("start", "DATE"), ("end", "DATE"))
    for job in fields["exp"]:
        for key, entity in job_entities:
            if job[key]:
                mark(job[key], entity)

    # Education entries.
    school_entities = (("level", "DEGREE"), ("field", "FIELD"), ("institution", "INSTITUTION"))
    for school in fields["edu"]:
        for key, entity in school_entities:
            if school[key]:
                mark(school[key], entity)

    for skill in fields["skills"]:
        tag_skill_individually(tokens, labels, skill)
    for cert in fields["certs"]:
        mark(cert, "CERT")

    tagged_count = len(labels) - labels.count("O")
    if tagged_count < 5:
        return None

    tag_ids = [LABEL2ID.get(label, 0) for label in labels]
    return {"tokens": tokens, "ner_tags": tag_ids}
def main():
    """Build the combined NER dataset and write the train/validation split.

    Steps, in order:
      1. Load ``datasetmaster/resumes`` (train split), seed ``random`` with
         42, render/tag every record with ``build_resume_and_tags``, shuffle,
         and keep at most 4000 generated examples.
      2. Convert the line-delimited JSON in
         ``DATA_DIR/sources/dataturks_raw.json``; unparsable lines are
         reported and skipped.
      3. Add the manual-template and long-resume examples, plus the contents
         of ``DATA_DIR/gold/resume_resource_gold.json`` when that file exists.
      4. Drop examples whose NAME span is empty or a placeholder, and
         examples with fewer than five tags whose id is non-zero.
      5. Deduplicate, split 85/15, noise-augment the training part only, and
         write everything with ``write_dataset`` together with a manifest of
         per-source counts.
      6. Print per-entity token counts for the cleaned, pre-augmentation set.
    """
    print("Loading datasetmaster/resumes...")
    ds = load_dataset("datasetmaster/resumes", split="train")
    print(f"Total: {len(ds)}")

    # Fixed seed so the generated resumes and the shuffle are reproducible.
    random.seed(42)
    converted = []
    for idx, sample in enumerate(ds):
        result = build_resume_and_tags(sample)
        if result:
            result["metadata"] = {
                "source": "datasetmaster_resumes",
                "source_id": f"generated:{idx}",
                "group_id": f"generated:{idx}",
            }
            converted.append(result)
    print(f"Converted: {len(converted)}")
    random.shuffle(converted)
    selected = converted[:4000]

    # Support running both as a package module and as a plain script.
    try:
        from training.convert_dataturks import convert_dataturks_sample
        from training.manual_resumes import build_manual_examples
        from training.build_long_resumes import build_examples as build_long_examples
    except ModuleNotFoundError:
        from convert_dataturks import convert_dataturks_sample
        from manual_resumes import build_manual_examples
        from build_long_resumes import build_examples as build_long_examples

    dataturks = []
    # Explicit UTF-8: JSON input must not depend on the platform's default encoding.
    with open(DATA_DIR / "sources" / "dataturks_raw.json", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            # Keep the try body to the parse itself so converter errors are not masked.
            try:
                item = json.loads(line)
            except json.JSONDecodeError:
                print("Skipped invalid DataTurks JSON line")
                continue
            result = convert_dataturks_sample(item, source_id=f"dataturks:{len(dataturks)}")
            if result:
                dataturks.append(result)

    manual = build_manual_examples()
    long_resumes = build_long_examples()

    resume_resource = []
    rr_path = DATA_DIR / "gold" / "resume_resource_gold.json"
    if rr_path.exists():
        with open(rr_path, encoding="utf-8") as f:
            resume_resource = json.load(f)["data"]

    print(f"DataTurks: {len(dataturks)}")
    print(f"Generated: {len(selected)}")
    print(f"Manual templates: {len(manual)}")
    print(f"Long resumes: {len(long_resumes)}")
    print(f"Resume resource: {len(resume_resource)}")

    all_data = dataturks + selected + manual + long_resumes + resume_resource

    clean = []
    for example in all_data:
        name_tokens = [
            tok
            for tok, tag in zip(example["tokens"], example["ner_tags"])
            if ID2LABEL[tag] in ("B-NAME", "I-NAME")
        ]
        name = " ".join(name_tokens).lower()
        # An empty string here means the example has no NAME tokens at all.
        if name in ("not provided", "unknown", "", "n/a"):
            continue
        if sum(1 for tag in example["ner_tags"] if tag != 0) < 5:
            continue
        clean.append(example)

    clean, duplicates_removed = dedupe_examples(clean)
    train, val = stable_split_examples(clean, train_ratio=0.85)

    try:
        from training.noise_augment import augment_examples
    except ModuleNotFoundError:
        from noise_augment import augment_examples

    # Augment after the split so augmented copies are only added to train.
    augmented = augment_examples(train, multiplier=2, seed=42)
    print(f"Noise augmented: {len(augmented)} (from {len(train)} train examples)")
    train = train + augmented

    write_dataset(
        train,
        val,
        DATA_DIR,
        manifest={
            "builder": "generate_from_structured.py",
            "sources": {
                "dataturks": len(dataturks),
                "generated": len(selected),
                "manual_templates": len(manual),
                "long_resumes": len(long_resumes),
                "resume_resource": len(resume_resource),
                "noise_augmented": len(augmented),
                "duplicates_removed": duplicates_removed,
            },
        },
    )
    print(f"\nFinal: Train={len(train)}, Val={len(val)}")

    from collections import Counter

    counts = Counter()
    for example in clean:
        for tag in example["ner_tags"]:
            label = ID2LABEL[tag]
            if label != "O":
                # Strip the two-character "B-"/"I-" prefix to count per entity type.
                counts[label[2:]] += 1
    print("\nLabels:")
    for label, count in counts.most_common():
        print(f" {label:15s}: {count}")
# Run the dataset build only when this file is executed as a script.
if __name__ == "__main__":
    main()