Token Classification
Transformers
ONNX
Safetensors
English
distilbert
resume-parsing
ner
resume
cv
information-extraction
Instructions to use oksomu/resume-ner with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- Transformers
How to use oksomu/resume-ner with Transformers:
```python
# Use a pipeline as a high-level helper
from transformers import pipeline

pipe = pipeline("token-classification", model="oksomu/resume-ner")
```

```python
# Load model directly
from transformers import AutoTokenizer, AutoModelForTokenClassification

tokenizer = AutoTokenizer.from_pretrained("oksomu/resume-ner")
model = AutoModelForTokenClassification.from_pretrained("oksomu/resume-ner")
```
- Notebooks
- Google Colab
- Kaggle
| """Detect resume sections and extract entities from untagged regions. | |
| Rule-based section header detection + heuristic entity extraction for | |
| sections where the NER model has gaps (especially SKILLS, CERTIFICATIONS, | |
| LANGUAGES, and EDUCATION). | |
| Runs AFTER NER inference and BEFORE structured post-processing. | |
| Fills in entities the model missed by using section context. | |
| """ | |
| from __future__ import annotations | |
| import re | |
| from dataclasses import dataclass | |
| from training.structured_postprocess import Span | |
# Maps a canonical section name to the header spellings recognised for it.
# Entries are lowercase and contain only letters, spaces and "&", which is
# the form header lines are normalised to before being compared.
# Key order is significant: the first section whose list contains the
# normalised header wins.
SECTION_PATTERNS: dict[str, list[str]] = {
    "skills": [
        # generic skill headings
        "skills",
        "technical skills",
        "core competencies",
        "competencies",
        "areas of expertise",
        "areas of excellence",
        "proficiencies",
        "technical proficiencies",
        "key skills",
        "professional skills",
        "summary of qualifications",
        "qualifications",
        # tooling / stack headings
        "tools & technologies",
        "tools and technologies",
        "technologies",
        "tech stack",
        "devops tools & technologies",
        "devops tools",
    ],
    "experience": [
        "experience",
        "work experience",
        "professional experience",
        "employment history",
        "work history",
        "career history",
        "professional background",
        "clinical experience",
        "teaching experience",
    ],
    "education": [
        "education",
        "academic background",
        "academic qualifications",
        "educational background",
        "academic history",
    ],
    "certifications": [
        "certifications",
        "licenses & certifications",
        "licenses",
        "professional certifications",
        "credentials",
        "certifications & licenses",
        "awards & certifications",
    ],
    "languages": [
        "languages",
        "language skills",
        "linguistic skills",
    ],
    "projects": [
        "projects",
        "personal projects",
        "key projects",
        "selected projects",
    ],
}
@dataclass
class Section:
    """A detected resume section and the character range it covers.

    Declared as a dataclass so it can be built with keyword arguments
    (``Section(name=..., start=..., end=..., text=...)``), which is how
    ``detect_sections`` constructs it.
    """

    name: str  # canonical section key, e.g. "skills" or "certifications"
    start: int  # character offset in the full text where the header line begins
    end: int  # character offset where the section stops (exclusive)
    text: str  # the slice of the full text between start and end
def detect_sections(text: str) -> list[Section]:
    """Split *text* into sections delimited by recognised header lines.

    A line is a header when, after trimming whitespace, dropping trailing
    colons, lowercasing and removing every character other than ``a-z``,
    whitespace and ``&``, it is non-empty, at most 60 characters long and
    equal to one of the aliases in ``SECTION_PATTERNS``.

    Each section starts at the first character of its header line and runs
    up to the start of the next header line, or to the end of *text* for the
    last one. Text before the first header belongs to no section.
    """
    raw_lines = text.split("\n")

    # Character offset at which every line begins; each "\n" counts as one.
    offsets = []
    cursor = 0
    for raw in raw_lines:
        offsets.append(cursor)
        cursor += len(raw) + 1

    # (start offset, section name) for every header line, in document order.
    headers: list[tuple[int, str]] = []
    for line_no, raw in enumerate(raw_lines):
        key = raw.strip().rstrip(":").lower()
        key = re.sub(r"[^a-z\s&]", "", key).strip()
        if not key or len(key) > 60:
            continue
        # First section (in dict order) that lists this header wins.
        matched = next(
            (name for name, aliases in SECTION_PATTERNS.items() if key in aliases),
            None,
        )
        if matched is not None:
            headers.append((offsets[line_no], matched))

    # A section ends where the following one starts; the last ends at EOF.
    stops = [begin for begin, _ in headers[1:]] + [len(text)]
    return [
        Section(name=name, start=begin, end=stop, text=text[begin:stop])
        for (begin, name), stop in zip(headers, stops)
    ]
| def _extract_list_items(text: str) -> list[str]: | |
| """Extract items from bullet lists, comma/dash/pipe-separated text, or Category: items format.""" | |
| items = [] | |
| for line in text.split("\n"): | |
| line = line.strip() | |
| line = re.sub(r"^[-●•▪■▸►‣⁃]\s*", "", line) | |
| if not line or len(line) > 120: | |
| continue | |
| # Strip "Category:" prefix if present | |
| colon_match = re.match(r"^[A-Za-z\s&/()-]+:\s*(.+)$", line) | |
| if colon_match: | |
| line = colon_match.group(1) | |
| # Split by comma, pipe, dash (but not inside words like "C++") | |
| parts = re.split(r"\s*[,|]\s*|\s+-\s+|\s+\+\s+", line) | |
| for part in parts: | |
| part = part.strip().rstrip(".,;:") | |
| if 2 < len(part) < 50 and not part[0].islower(): | |
| items.append(part) | |
| elif 2 < len(part) < 50: | |
| items.append(part) | |
| # Also handle single bullet items | |
| if len(parts) == 1 and len(line) < 50 and not line.endswith("."): | |
| clean = line.strip().rstrip(".,;:") | |
| if 2 < len(clean) < 50 and clean not in items: | |
| items.append(clean) | |
| return items | |
| def _is_tagged(start: int, end: int, existing_spans: list[Span]) -> bool: | |
| """Check if a character range overlaps any existing span.""" | |
| for span in existing_spans: | |
| if span.start < end and span.end > start: | |
| return True | |
| return False | |
# Same bullet set that _extract_list_items strips, so every section type
# treats list markers identically.
_SECTION_BULLET_RE = re.compile(r"^[-●•▪■▸►‣⁃]\s*")
# One or two capitalised words at the start of a line, e.g. "English" or
# "Mandarin Chinese".
_LANGUAGE_NAME_RE = re.compile(r"^([A-Z][a-z]+(?:\s[A-Z][a-z]+)?)")


def _add_if_untagged(
    text: str,
    label: str,
    value: str,
    section: Section,
    merged: list[Span],
) -> None:
    """Append a heuristic span for *value* at its first free position.

    Searches ``text[section.start:section.end]`` for the first occurrence of
    *value* that overlaps nothing in *merged* (model spans plus heuristic
    spans added so far) and appends a span there. Does nothing when every
    occurrence is already covered.
    """
    idx = text.find(value, section.start, section.end)
    while idx != -1 and _is_tagged(idx, idx + len(value), merged):
        idx = text.find(value, idx + 1, section.end)
    if idx == -1:
        return
    merged.append(Span(
        label=label, text=value,
        start=idx, end=idx + len(value),
        bio="B", score=0.8,  # fixed score given to every heuristic span
    ))


def fill_missing_entities(
    text: str,
    spans: list[Span],
    sections: list[Section] | None = None,
) -> list[Span]:
    """Add entities from detected sections that the NER model missed.

    Runs after NER inference. Candidates are taken per section type:

    * ``skills`` - every item from ``_extract_list_items`` that is not a
      substring of the section's header line becomes a ``SKILL``;
    * ``certifications`` - every de-bulleted line of 5 to 100 characters
      that is not itself a certifications header becomes a ``CERT``;
    * ``languages`` - the leading one or two capitalised words of every
      de-bulleted line of 3 to 60 characters (headers excluded) become a
      ``LANGUAGE``.

    A candidate is added only where it overlaps neither a model span nor a
    heuristic span added earlier in the same call, so repeated items do not
    produce duplicate or overlapping spans.

    Args:
        text: The full resume text the span offsets refer to.
        spans: Spans produced by the model; the list is not modified.
        sections: Pre-computed sections; detected from *text* when None.

    Returns:
        A new list holding the original spans plus the added ones, sorted
        by start offset.
    """
    if sections is None:
        sections = detect_sections(text)

    # Working copy: doubles as the overlap set and as the final result.
    merged: list[Span] = list(spans)

    for section in sections:
        if section.name == "skills":
            header = section.text.split("\n")[0].lower()
            for item in _extract_list_items(section.text):
                # Skip anything that is just (part of) the section header.
                if item.lower() in header:
                    continue
                _add_if_untagged(text, "SKILL", item, section, merged)

        elif section.name == "certifications":
            for raw_line in section.text.split("\n"):
                line = _SECTION_BULLET_RE.sub("", raw_line.strip())
                if not 5 <= len(line) <= 100:
                    continue
                normalized = re.sub(r"[^a-z\s&]", "", line.lower()).strip()
                if normalized in SECTION_PATTERNS["certifications"]:
                    continue
                _add_if_untagged(text, "CERT", line, section, merged)

        elif section.name == "languages":
            for raw_line in section.text.split("\n"):
                line = _SECTION_BULLET_RE.sub("", raw_line.strip())
                if not 3 <= len(line) <= 60:
                    continue
                normalized = re.sub(r"[^a-z\s]", "", line.lower()).strip()
                if normalized in SECTION_PATTERNS["languages"]:
                    continue
                name_match = _LANGUAGE_NAME_RE.match(line)
                if name_match:
                    _add_if_untagged(
                        text, "LANGUAGE", name_match.group(1), section, merged
                    )

    # Stable sort: on equal starts, model spans stay ahead of added ones.
    merged.sort(key=lambda s: s.start)
    return merged