Token Classification
Transformers
ONNX
Safetensors
English
distilbert
resume-parsing
ner
resume
cv
information-extraction
Instructions to use oksomu/resume-ner with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- Transformers
How to use oksomu/resume-ner with Transformers:
# Use a pipeline as a high-level helper
from transformers import pipeline
pipe = pipeline("token-classification", model="oksomu/resume-ner")

# Load model directly
from transformers import AutoTokenizer, AutoModelForTokenClassification
tokenizer = AutoTokenizer.from_pretrained("oksomu/resume-ner")
model = AutoModelForTokenClassification.from_pretrained("oksomu/resume-ner")
- Notebooks
- Google Colab
- Kaggle
| """Text preprocessing for resume text before NER inference. | |
| Normalizes PDF extraction artifacts: CRLF, em-dashes, bullet characters, | |
| flattened two-column tables, and multi-space runs. All rules driven by | |
| resume_config.json pre_processing section. | |
| Usage: | |
| from training.text_preprocess import preprocess_resume_text | |
| clean = preprocess_resume_text(raw_text) | |
| # Or with explicit config: | |
| from training.text_preprocess import ResumeTextPreprocessor | |
| pp = ResumeTextPreprocessor(".") | |
| clean = pp.preprocess(raw_text) | |
| """ | |
| from __future__ import annotations | |
| import json | |
| import re | |
| from pathlib import Path | |
class ResumeTextPreprocessor:
    """Normalize raw resume text before it is passed to NER inference.

    Behaviour is driven by the ``pre_processing`` section of
    ``resume_config.json``. Every step falls back to a built-in default when
    its key is absent, so an empty section still yields a working
    preprocessor.
    """

    # Fallback for table rows whose category is not in the configured list:
    # a short label, a gap of two or more whitespace characters, then the items.
    _GENERIC_ROW_RE = re.compile(
        r"^([A-Za-z][A-Za-z\s&/()-]{2,40}?)\s{2,}([A-Za-z0-9#.+].+)$"
    )
    # A line indented by two or more whitespace characters continues the
    # previous table row.
    _INDENTED_CONTINUATION_RE = re.compile(r"^\s{2,}[A-Za-z0-9#.+]")
    _COMMA_SPACING_RE = re.compile(r"\s*,\s*")
    _SPACE_RUN_RE = re.compile(r" +")

    def __init__(self, config_dir: str | Path):
        """Load ``resume_config.json`` from *config_dir*.

        Args:
            config_dir: Directory that contains ``resume_config.json``.

        Raises:
            OSError: If the config file cannot be opened.
            json.JSONDecodeError: If the config file is not valid JSON.
        """
        config_path = Path(config_dir) / "resume_config.json"
        # The config holds non-ASCII dash/bullet characters, so decode it as
        # UTF-8 explicitly instead of relying on the platform default.
        with open(config_path, encoding="utf-8") as f:
            full_config = json.load(f)
        self.config = full_config.get("pre_processing", {})
        self._build_category_pattern()

    def _build_category_pattern(self):
        """Compile the regex that recognises configured skill-table rows.

        Sets ``self._category_re`` to ``None`` when no
        ``skill_table_categories`` are configured.
        """
        categories = self.config.get("skill_table_categories", [])
        if not categories:
            self._category_re = None
            return
        # Longest first, so a longer category wins over a shorter one that is
        # its prefix in the alternation.
        ordered = sorted(categories, key=len, reverse=True)
        alternation = "|".join(re.escape(c) for c in ordered)
        self._category_re = re.compile(
            r"^(" + alternation + r")\s+([A-Za-z0-9#.+].+)$",
            re.IGNORECASE,
        )

    def preprocess(self, text: str) -> str:
        """Run every enabled normalization step and return the cleaned text.

        Whitespace normalization and label stripping always run. The other
        steps can be switched off with the ``normalize_dashes``,
        ``normalize_bullets``, ``expand_skill_tables`` and
        ``collapse_multi_spaces`` config flags (all default to ``True``).
        """
        text = self._normalize_whitespace(text)
        if self.config.get("normalize_dashes", True):
            text = self._normalize_dashes(text)
        if self.config.get("normalize_bullets", True):
            text = self._normalize_bullets(text)
        text = self._strip_labels(text)
        # Table expansion looks for multi-space gaps, so it has to run before
        # those gaps are collapsed.
        if self.config.get("expand_skill_tables", True):
            text = self._expand_flattened_table(text)
        if self.config.get("collapse_multi_spaces", True):
            text = self._collapse_multi_spaces(text)
        return text.strip()

    def _normalize_whitespace(self, text: str) -> str:
        """Convert CRLF/CR line endings to LF and strip trailing whitespace."""
        text = text.replace("\r\n", "\n").replace("\r", "\n")
        return "\n".join(line.rstrip() for line in text.split("\n"))

    def _normalize_dashes(self, text: str) -> str:
        """Apply ``dash_replacements`` (default: en/em dash to hyphen)."""
        replacements = self.config.get("dash_replacements", {"–": "-", "—": "-"})
        for old, new in replacements.items():
            text = text.replace(old, new)
        return text

    def _normalize_bullets(self, text: str) -> str:
        """Replace a leading bullet character on each line.

        The bullet and any whitespace after it are replaced with
        ``bullet_replacement`` (default ``"- "``).
        """
        chars = self.config.get("bullet_chars", ["●", "•", "▪", "■", "▸", "►", "‣", "⁃"])
        replacement = self.config.get("bullet_replacement", "- ")
        if chars:
            pattern = r"^[" + re.escape("".join(chars)) + r"]\s*"
            text = re.sub(pattern, replacement, text, flags=re.MULTILINE)
        return text

    def _strip_labels(self, text: str) -> str:
        """Remove each configured ``strip_labels`` entry, case-insensitively.

        Whitespace directly after a removed label is removed with it.
        """
        for label in self.config.get("strip_labels", []):
            text = re.sub(r"(?i)\b" + re.escape(label) + r"\s*", "", text)
        return text

    def _expand_flattened_table(self, text: str) -> str:
        """Rewrite flattened ``Category   item, item`` rows as ``Category: item, item``.

        A row is recognised either by a configured category name or, failing
        that, by the generic "label, wide gap, comma-separated items" shape.
        Following continuation lines are folded into the same row. Rows whose
        item part looks like prose are left untouched. Returns *text*
        unchanged when no categories are configured.
        """
        if not self._category_re:
            return text

        max_prose_words = self.config.get("table_prose_max_words", 15)
        max_cont_chars = self.config.get("table_continuation_max_chars", 60)

        lines = text.split("\n")
        result: list[str] = []
        i = 0
        while i < len(lines):
            line = lines[i]
            match = self._category_re.match(line.strip())
            if not match:
                generic = self._GENERIC_ROW_RE.match(line)
                # Without a known category, require a comma so ordinary
                # sentences with a wide gap are not mistaken for table rows.
                if generic and "," in generic.group(2):
                    match = generic
            if not match:
                result.append(line)
                i += 1
                continue

            category = match.group(1).strip()
            items = match.group(2).strip()
            word_count = len(items.split())
            # Long text, or comma-free text of more than five words, is
            # treated as prose rather than an item list.
            if word_count > max_prose_words or ("," not in items and word_count > 5):
                result.append(line)
                i += 1
                continue

            # Fold continuation lines into this row.
            while i + 1 < len(lines):
                raw_next = lines[i + 1]
                next_line = raw_next.strip()
                is_continuation = self._INDENTED_CONTINUATION_RE.match(raw_next) or (
                    next_line
                    and next_line[0].isupper()
                    and "," in next_line
                    and not self._category_re.match(next_line)
                    and len(next_line) < max_cont_chars
                )
                if not is_continuation:
                    break
                i += 1
                items += " " + next_line

            items = self._COMMA_SPACING_RE.sub(", ", items).rstrip(",").strip()
            result.append(f"{category}: {items}")
            i += 1
        return "\n".join(result)

    @staticmethod
    def _collapse_multi_spaces(text: str) -> str:
        """Collapse every run of spaces into a single space.

        Declared static because it takes no ``self``; ``preprocess`` calls it
        through the instance, which would otherwise raise ``TypeError``.
        """
        return ResumeTextPreprocessor._SPACE_RUN_RE.sub(" ", text)
# Lazily created preprocessor shared by preprocess_resume_text(), plus the
# config directory it was built from.
_default_preprocessor: ResumeTextPreprocessor | None = None
_default_config_dir: Path | None = None


def preprocess_resume_text(text: str, config_dir: str | Path = ".") -> str:
    """Preprocess *text* with a cached ``ResumeTextPreprocessor``.

    The preprocessor is built on first use and reused on later calls. It is
    rebuilt when *config_dir* differs from the directory the cached instance
    was loaded from, so a different directory is never silently ignored.

    Args:
        text: Raw resume text.
        config_dir: Directory containing ``resume_config.json``; defaults to
            the current directory.

    Returns:
        The cleaned text.
    """
    global _default_preprocessor, _default_config_dir
    requested_dir = Path(config_dir)
    if _default_preprocessor is None or _default_config_dir != requested_dir:
        _default_preprocessor = ResumeTextPreprocessor(requested_dir)
        _default_config_dir = requested_dir
    return _default_preprocessor.preprocess(text)