Token Classification
Transformers
ONNX
Safetensors
English
distilbert
resume-parsing
ner
resume
cv
information-extraction
Instructions to use oksomu/resume-ner with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- Transformers
How to use oksomu/resume-ner with Transformers:
# Use a pipeline as a high-level helper
from transformers import pipeline
pipe = pipeline("token-classification", model="oksomu/resume-ner")

# Load model directly
from transformers import AutoTokenizer, AutoModelForTokenClassification
tokenizer = AutoTokenizer.from_pretrained("oksomu/resume-ner")
model = AutoModelForTokenClassification.from_pretrained("oksomu/resume-ner")
- Notebooks
- Google Colab
- Kaggle
| from __future__ import annotations | |
| import argparse | |
| import json | |
| import re | |
| from collections import Counter, defaultdict | |
| import torch | |
| from transformers import AutoModelForTokenClassification, AutoTokenizer | |
| from training.benchmark_utils import classify_resume_noise | |
| from training.labels import ID2LABEL | |
| from training.structured_postprocess import StructuredPostProcessor, build_text_and_spans | |
# Tokenizer output keys that are forwarded to the model call; every other key
# the tokenizer returns (e.g. the offset mapping) is filtered out beforehand.
ALLOWED_INPUTS = {"attention_mask", "input_ids"}
def predicted_spans_from_text(text: str, offset_mapping: list[tuple[int, int]], pred_ids: list[int]) -> tuple[str, list]:
    """Merge per-token BIO predictions into entity spans over ``text``.

    Args:
        text: The string the character offsets refer to.
        offset_mapping: One ``(start, end)`` character range per token.
        pred_ids: One predicted label id per token, looked up in ``ID2LABEL``.

    Returns:
        ``(text, spans)`` where ``spans`` is a list of ``Span`` objects in
        reading order. Every span is created with ``score=1.0``.
    """
    from training.structured_postprocess import Span

    collected = []
    open_span = None

    for (tok_start, tok_end), label_id in zip(offset_mapping, pred_ids):
        # Zero-width offsets carry no text (presumably special/padding tokens).
        if tok_start == tok_end:
            continue

        tag = ID2LABEL[label_id]
        if tag == "O":
            # An outside tag closes whatever entity was being built.
            if open_span:
                collected.append(open_span)
            open_span = None
            continue

        prefix, entity = tag.split("-", 1)
        extends_open_span = (
            open_span is not None and prefix != "B" and open_span.label == entity
        )
        if extends_open_span:
            # Keep the characters between the two tokens so the span text stays
            # a contiguous slice of the input.
            open_span.text += text[open_span.end:tok_start] + text[tok_start:tok_end]
            open_span.end = tok_end
            continue

        # A "B-" tag, a label change, or no open span: start a new entity.
        if open_span:
            collected.append(open_span)
        open_span = Span(
            label=entity,
            text=text[tok_start:tok_end],
            start=tok_start,
            end=tok_end,
            bio=prefix,
            score=1.0,
        )

    if open_span:
        collected.append(open_span)
    return text, collected
def _split_into_sections(text: str) -> list[str]:
    """Return the non-blank blocks of ``text`` separated by runs of 2+ newlines."""
    blocks = re.split(r"\n{2,}", text)
    # ``str.strip`` yields an empty (falsy) string for whitespace-only blocks,
    # so filtering on it drops them while keeping the original block text.
    return list(filter(str.strip, blocks))
def _section_char_spans(text: str) -> list[tuple[int, int]]:
    """Return ``(start, end)`` offsets of each non-blank block between runs of 2+ newlines."""
    bounds = []
    cursor = 0
    for separator in re.finditer(r"\n{2,}", text):
        if text[cursor:separator.start()].strip():
            bounds.append((cursor, separator.start()))
        cursor = separator.end()
    if text[cursor:].strip():
        bounds.append((cursor, len(text)))
    return bounds


def _predict_chunk_spans(chunk_text: str, model, tokenizer, max_length: int) -> list:
    """Run the model on one chunk and return spans with chunk-relative offsets."""
    tokenized = tokenizer(
        chunk_text,
        return_tensors="pt",
        return_offsets_mapping=True,
        truncation=True,
        max_length=max_length,
    )
    encoded = {k: v for k, v in tokenized.items() if k in ALLOWED_INPUTS}
    with torch.no_grad():
        pred_ids = model(**encoded).logits.argmax(dim=-1).squeeze(0).cpu().tolist()
    # Drop the first and last positions (presumably the tokenizer's special
    # start/end tokens) from both the offsets and the predictions.
    offsets = [tuple(pair) for pair in tokenized["offset_mapping"].squeeze(0).cpu().tolist()][1:-1]
    _, spans = predicted_spans_from_text(chunk_text, offsets, pred_ids[1:-1])
    return spans


def chunked_predicted_spans(
    text: str,
    model,
    tokenizer,
    max_length: int = 512,
) -> tuple[str, list]:
    """Run inference with section-aware chunking for texts exceeding max_length.

    Text that fits within ``max_length`` tokens is processed in one pass.
    Longer text is cut at paragraph boundaries (runs of two or more newlines)
    and consecutive paragraphs are greedily grouped into chunks that fit.

    Each chunk is an exact slice of ``text``, so adding the chunk's start
    offset maps span offsets back onto the original string even when
    paragraphs are separated by more than two newlines or the same paragraph
    text occurs more than once.

    NOTE: a single paragraph longer than ``max_length`` tokens is still
    truncated by the tokenizer; predictions past the limit are lost.

    Args:
        text: Full resume text.
        model: Token-classification model; called with the tokenizer outputs
            whose keys are in ``ALLOWED_INPUTS``.
        tokenizer: Tokenizer that supports ``return_offsets_mapping=True``.
        max_length: Maximum number of tokens per model call.

    Returns:
        ``(text, spans)`` with span offsets relative to ``text``.
    """
    num_tokens = len(tokenizer(text, truncation=False)["input_ids"])
    if num_tokens <= max_length:
        return text, _predict_chunk_spans(text, model, tokenizer, max_length)

    # Greedily grow each chunk one paragraph at a time, tracking character
    # bounds in the original text rather than rebuilding the chunk string.
    chunk_bounds: list[tuple[int, int]] = []
    chunk_start = None
    chunk_end = None
    for section_start, section_end in _section_char_spans(text):
        if chunk_start is None:
            chunk_start, chunk_end = section_start, section_end
            continue
        candidate = text[chunk_start:section_end]
        if len(tokenizer(candidate, truncation=False)["input_ids"]) > max_length:
            chunk_bounds.append((chunk_start, chunk_end))
            chunk_start = section_start
        chunk_end = section_end
    if chunk_start is not None:
        chunk_bounds.append((chunk_start, chunk_end))

    from training.structured_postprocess import Span

    all_spans = []
    for start_offset, end_offset in chunk_bounds:
        chunk_spans = _predict_chunk_spans(text[start_offset:end_offset], model, tokenizer, max_length)
        for span in chunk_spans:
            all_spans.append(Span(
                label=span.label,
                text=span.text,
                start=span.start + start_offset,
                end=span.end + start_offset,
                bio=span.bio,
                score=span.score,
            ))
    return text, all_spans
| def normalize_value(field: str, value: str | None) -> str | None: | |
| if not value: | |
| return None | |
| normalized = " ".join(value.lower().split()).strip() | |
| if not normalized: | |
| return None | |
| if "phone" in field: | |
| normalized = normalized.replace("+", "plus") | |
| normalized = "".join(ch for ch in normalized if ch.isdigit() or ch.isalpha()) | |
| if "email" in field: | |
| normalized = normalized.replace(" ", "") | |
| if "date" in field: | |
| month_map = { | |
| "jan": "january", | |
| "feb": "february", | |
| "mar": "march", | |
| "apr": "april", | |
| "jun": "june", | |
| "jul": "july", | |
| "aug": "august", | |
| "sep": "september", | |
| "oct": "october", | |
| "nov": "november", | |
| "dec": "december", | |
| } | |
| for short, full in month_map.items(): | |
| normalized = normalized.replace(short, full) | |
| normalized = normalized.replace(" - ", "-") | |
| return normalized.strip(" ,.;:|/-") | |
def flatten_resume(parsed: dict) -> dict[str, list[str]]:
    """Flatten a structured resume into ``{field name: [normalised values]}``.

    Reads the ``personal``, ``experience``, ``education``, ``skills`` and
    ``certifications`` entries (all required) plus the optional ``country``
    and ``seniority`` entries of ``parsed``. Each value goes through
    ``normalize_value``; values that normalise to nothing are skipped, so a
    field only appears in the result when it has at least one value.
    """
    flat: dict[str, list[str]] = defaultdict(list)

    def record(name: str, raw: str | None) -> None:
        cleaned = normalize_value(name, raw)
        if cleaned:
            flat[name].append(cleaned)

    personal_fields = (
        ("personal.name", "name"),
        ("personal.email", "email"),
        ("personal.phone", "phone"),
        ("personal.location", "location"),
    )
    experience_fields = (
        ("experience.title", "title"),
        ("experience.company", "company"),
        ("experience.start_date", "start_date"),
        ("experience.end_date", "end_date"),
    )
    education_fields = (
        ("education.degree", "degree"),
        ("education.field", "field"),
        ("education.institution", "institution"),
    )

    contact = parsed["personal"]
    for name, key in personal_fields:
        record(name, contact.get(key))

    for job in parsed["experience"]:
        for name, key in experience_fields:
            record(name, job.get(key))

    for school in parsed["education"]:
        for name, key in education_fields:
            record(name, school.get(key))

    for item in parsed["skills"]:
        record("skills", item)
    for item in parsed["certifications"]:
        record("certifications", item)

    record("country", parsed.get("country"))
    record("seniority", parsed.get("seniority"))
    return flat
def score_field(gold: list[str], pred: list[str]) -> Counter:
    """Count multiset matches between gold and predicted values of one field.

    Returns a ``Counter`` with keys ``tp`` (values present in both, counted
    with multiplicity), ``fp`` (predicted values left unmatched) and ``fn``
    (gold values left unmatched).
    """
    gold_counts = Counter(gold)
    pred_counts = Counter(pred)
    # Multiset intersection: per value, the smaller of the two counts.
    matched = sum((gold_counts & pred_counts).values())
    # Whatever was not matched on each side is a false positive / negative.
    unmatched_pred = sum(pred_counts.values()) - matched
    unmatched_gold = sum(gold_counts.values()) - matched
    return Counter(tp=matched, fp=unmatched_pred, fn=unmatched_gold)
def metrics_from_counts(counts: Counter) -> dict[str, float]:
    """Derive precision, recall and F1 from ``tp`` / ``fp`` / ``fn`` counts.

    Any metric whose denominator is zero is reported as ``0.0``.
    """
    true_pos, false_pos, false_neg = counts["tp"], counts["fp"], counts["fn"]

    predicted_total = true_pos + false_pos
    gold_total = true_pos + false_neg
    precision = true_pos / predicted_total if predicted_total else 0.0
    recall = true_pos / gold_total if gold_total else 0.0

    if precision + recall:
        f1 = 2 * precision * recall / (precision + recall)
    else:
        f1 = 0.0
    return {"precision": precision, "recall": recall, "f1": f1}
def main() -> None:
    """Run the structured-extraction benchmark and print a JSON report.

    Loads validation examples from ``--val-path`` and a model/tokenizer from
    ``--model-dir``. For every example the gold tags and the model predictions
    are each turned into a structured resume by the same post-processor,
    flattened, and compared field by field. The report holds micro counts and
    metrics, macro F1 over fields, per-field results, and results grouped by
    the bucket returned from ``classify_resume_noise``.
    """
    parser = argparse.ArgumentParser(
        description="Structured extraction benchmark using in-repo post-processing. Better than raw span proxy, still internal-facing."
    )
    parser.add_argument("--model-dir", default=".")
    parser.add_argument("--val-path", default="training/data/ner_val.json")
    args = parser.parse_args()

    # Close the file deterministically and decode as UTF-8 regardless of the
    # platform's locale encoding.
    with open(args.val_path, encoding="utf-8") as val_file:
        payload = json.load(val_file)
    examples = payload["data"]

    tokenizer = AutoTokenizer.from_pretrained(args.model_dir)
    model = AutoModelForTokenClassification.from_pretrained(args.model_dir)
    model.eval()
    postprocessor = StructuredPostProcessor(args.model_dir)

    totals_by_field: dict[str, Counter] = {}
    bucket_totals: dict[str, Counter] = defaultdict(lambda: Counter(tp=0, fp=0, fn=0, examples=0))

    for example in examples:
        gold_text, gold_spans = build_text_and_spans(example["tokens"], example["ner_tags"], ID2LABEL)
        gold_structured = postprocessor.build_structured_resume_from_spans(gold_spans, gold_text)
        bucket_info = classify_resume_noise(gold_text)
        bucket = str(bucket_info["bucket"])
        bucket_totals[bucket]["examples"] += 1

        # Predictions are made on the same text the gold spans were built from.
        pred_text, pred_spans = chunked_predicted_spans(gold_text, model, tokenizer)
        pred_structured = postprocessor.build_structured_resume_from_spans(pred_spans, pred_text)

        gold_flat = flatten_resume(gold_structured)
        pred_flat = flatten_resume(pred_structured)
        for field in sorted(set(gold_flat) | set(pred_flat)):
            counts = score_field(gold_flat.get(field, []), pred_flat.get(field, []))
            totals_by_field.setdefault(field, Counter(tp=0, fp=0, fn=0)).update(counts)
            bucket_totals[bucket].update(counts)

    micro = Counter(tp=0, fp=0, fn=0)
    macro_f1 = 0.0
    per_field = {}
    for field in sorted(totals_by_field):
        counts = totals_by_field[field]
        micro.update(counts)
        metrics = metrics_from_counts(counts)
        macro_f1 += metrics["f1"]
        per_field[field] = {**counts, **metrics}

    output = {
        "examples": len(examples),
        "micro": {**micro, **metrics_from_counts(micro)},
        "macro_f1": macro_f1 / len(per_field) if per_field else 0.0,
        "by_bucket": {
            bucket: {
                "examples": counts["examples"],
                "tp": counts["tp"],
                "fp": counts["fp"],
                "fn": counts["fn"],
                **metrics_from_counts(counts),
            }
            for bucket, counts in sorted(bucket_totals.items())
        },
        "per_field": per_field,
        "note": "Uses in-repo structured post-processing for gold spans and predictions. Better than raw span matching, but still internal regression metric.",
    }
    print(json.dumps(output, indent=2))


# Run the benchmark only when executed as a script, not when imported.
if __name__ == "__main__":
    main()