Token Classification
Transformers
ONNX
Safetensors
English
distilbert
resume-parsing
ner
resume
cv
information-extraction
Instructions to use oksomu/resume-ner with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- Transformers
How to use oksomu/resume-ner with Transformers:
# Use a pipeline as a high-level helper
from transformers import pipeline
pipe = pipeline("token-classification", model="oksomu/resume-ner")

# Load model directly
from transformers import AutoTokenizer, AutoModelForTokenClassification
tokenizer = AutoTokenizer.from_pretrained("oksomu/resume-ner")
model = AutoModelForTokenClassification.from_pretrained("oksomu/resume-ner")
- Notebooks
- Google Colab
- Kaggle
File size: 5,416 Bytes
b34f048 e6b7b6f b34f048 e6b7b6f b34f048 e6b7b6f b34f048 e6b7b6f b34f048 e6b7b6f b34f048 e6b7b6f b34f048 e6b7b6f b34f048 e6b7b6f | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 | """Text preprocessing for resume text before NER inference.
Normalizes PDF extraction artifacts: CRLF, em-dashes, bullet characters,
flattened two-column tables, and multi-space runs. All rules are driven by
the pre_processing section of resume_config.json.
Usage:
from training.text_preprocess import preprocess_resume_text
clean = preprocess_resume_text(raw_text)
# Or with explicit config:
from training.text_preprocess import ResumeTextPreprocessor
pp = ResumeTextPreprocessor(".")
clean = pp.preprocess(raw_text)
"""
from __future__ import annotations
import json
import re
from pathlib import Path
class ResumeTextPreprocessor:
    """Clean up raw resume text before it is handed to the NER model.

    Every rule is read from the ``pre_processing`` section of
    ``resume_config.json``; options that are missing from the config fall
    back to the defaults hard-coded in the individual methods.
    """

    # Fallback row detector for categories that are not configured:
    # "<label><2+ spaces><items>". The caller additionally requires a comma
    # in the items part before treating the line as a table row.
    _GENERIC_TABLE_ROW_RE = re.compile(
        r"^([A-Za-z][A-Za-z\s&/()-]{2,40}?)\s{2,}([A-Za-z0-9#.+].+)$"
    )
    # A line indented by two or more whitespace characters that starts an item.
    _INDENTED_CONTINUATION_RE = re.compile(r"^\s{2,}[A-Za-z0-9#.+]")
    # Any comma together with the whitespace surrounding it.
    _COMMA_SPACING_RE = re.compile(r"\s*,\s*")

    def __init__(self, config_dir: str | Path):
        """Load ``resume_config.json`` from *config_dir*.

        Args:
            config_dir: Directory that contains ``resume_config.json``.

        Raises:
            OSError: If the config file cannot be opened.
            json.JSONDecodeError: If the file is not valid JSON.
        """
        config_path = Path(config_dir) / "resume_config.json"
        # The config may carry non-ASCII bullet/dash characters, so decode it
        # as UTF-8 explicitly instead of relying on the platform default.
        with open(config_path, encoding="utf-8") as f:
            full_config = json.load(f)
        self.config = full_config.get("pre_processing", {})
        self._build_category_pattern()

    def _build_category_pattern(self):
        """Compile the regex that recognises configured skill-table rows.

        Sets ``self._category_re`` to a case-insensitive pattern matching
        "<category><whitespace><items>", or to ``None`` when
        ``skill_table_categories`` is empty or absent.
        """
        categories = self.config.get("skill_table_categories", [])
        if not categories:
            self._category_re = None
            return
        # Longest names first so the alternation tries a longer category
        # before a shorter one that is a prefix of it.
        alternatives = "|".join(
            re.escape(name) for name in sorted(categories, key=len, reverse=True)
        )
        self._category_re = re.compile(
            r"^(" + alternatives + r")\s+([A-Za-z0-9#.+].+)$", re.IGNORECASE
        )

    def preprocess(self, text: str) -> str:
        """Run the full cleanup pipeline over *text* and return the result.

        Steps, in order: line-ending/trailing-whitespace normalization, dash
        normalization, bullet normalization, label stripping, flattened-table
        expansion, multi-space collapsing, and a final ``strip()``. All steps
        except the first, label stripping and the final strip can be switched
        off through boolean config flags (each defaults to ``True``).
        """
        text = self._normalize_whitespace(text)
        if self.config.get("normalize_dashes", True):
            text = self._normalize_dashes(text)
        if self.config.get("normalize_bullets", True):
            text = self._normalize_bullets(text)
        text = self._strip_labels(text)
        # Table expansion must precede space collapsing: the fallback row
        # detector relies on runs of two or more spaces.
        if self.config.get("expand_skill_tables", True):
            text = self._expand_flattened_table(text)
        if self.config.get("collapse_multi_spaces", True):
            text = self._collapse_multi_spaces(text)
        return text.strip()

    def _normalize_whitespace(self, text: str) -> str:
        """Convert CRLF/CR to LF and drop trailing whitespace on every line."""
        text = text.replace("\r\n", "\n").replace("\r", "\n")
        return "\n".join(line.rstrip() for line in text.split("\n"))

    def _normalize_dashes(self, text: str) -> str:
        """Apply the ``dash_replacements`` mapping (default: en/em dash to ``-``)."""
        replacements = self.config.get("dash_replacements", {"–": "-", "—": "-"})
        for old, new in replacements.items():
            text = text.replace(old, new)
        return text

    def _normalize_bullets(self, text: str) -> str:
        """Replace a bullet character at the start of a line with ``bullet_replacement``.

        Only bullets in the first column are rewritten. The whitespace after
        the bullet is consumed as well.
        """
        chars = self.config.get("bullet_chars", ["●", "•", "▪", "■", "▸", "►", "‣", "⁃"])
        replacement = self.config.get("bullet_replacement", "- ")
        if chars:
            # NOTE: \s* also matches newlines, so a bullet that sits alone on
            # its line is joined with the text that follows it.
            pattern = r"^[" + re.escape("".join(chars)) + r"]\s*"
            text = re.sub(pattern, replacement, text, flags=re.MULTILINE)
        return text

    def _strip_labels(self, text: str) -> str:
        """Remove every configured ``strip_labels`` entry, case-insensitively.

        Each label must start at a word boundary; whitespace directly after
        the label is removed together with it.
        """
        for label in self.config.get("strip_labels", []):
            text = re.sub(r"(?i)\b" + re.escape(label) + r"\s*", "", text)
        return text

    def _expand_flattened_table(self, text: str) -> str:
        """Rewrite flattened two-column skill rows as ``"Category: a, b, c"``.

        A line counts as a table row when it starts with a configured
        category, or when it looks like "<label><2+ spaces><items>" and the
        items contain a comma. Rows whose items read like prose (more than
        ``table_prose_max_words`` words, or more than five words without any
        comma) are left untouched. Following lines are folded into the row
        while they are indented continuations or short, capitalised,
        comma-separated lines that are not category rows themselves.

        Returns *text* unchanged when no categories are configured.
        """
        if not self._category_re:
            return text
        max_prose_words = self.config.get("table_prose_max_words", 15)
        max_cont_chars = self.config.get("table_continuation_max_chars", 60)
        lines = text.split("\n")
        result = []
        i = 0
        while i < len(lines):
            line = lines[i]
            match = self._category_re.match(line.strip())
            if not match:
                generic = self._GENERIC_TABLE_ROW_RE.match(line)
                if generic and "," in generic.group(2):
                    match = generic
            if not match:
                result.append(line)
                i += 1
                continue

            category = match.group(1).strip()
            items = match.group(2).strip()
            word_count = len(items.split())
            if word_count > max_prose_words or ("," not in items and word_count > 5):
                # Looks like a sentence that merely starts with a category word.
                result.append(line)
                i += 1
                continue

            # Fold continuation lines into the current row.
            while i + 1 < len(lines):
                raw_next = lines[i + 1]
                next_line = raw_next.strip()
                is_continuation = self._INDENTED_CONTINUATION_RE.match(raw_next) or (
                    next_line
                    and next_line[0].isupper()
                    and "," in next_line
                    and not self._category_re.match(next_line)
                    and len(next_line) < max_cont_chars
                )
                if not is_continuation:
                    break
                i += 1
                items += " " + next_line

            # Normalizing comma spacing turns a trailing "," into ", ", so the
            # trailing space has to be stripped together with the comma.
            items = self._COMMA_SPACING_RE.sub(", ", items).rstrip(", ").strip()
            result.append(f"{category}: {items}")
            i += 1
        return "\n".join(result)

    @staticmethod
    def _collapse_multi_spaces(text: str) -> str:
        """Collapse every run of space characters into a single space."""
        return re.sub(r" +", " ", text)
# Lazily created preprocessor shared by preprocess_resume_text(), together
# with the config directory it was built from.
_default_preprocessor: ResumeTextPreprocessor | None = None
_default_config_dir: Path | None = None


def preprocess_resume_text(text: str, config_dir: str | Path = ".") -> str:
    """Preprocess *text* with a cached preprocessor for *config_dir*.

    The preprocessor is created on first use and reused on later calls. It is
    rebuilt when a call names a different ``config_dir`` than the cached
    instance was loaded from, so the requested config is never silently
    replaced by the one from an earlier call.

    Args:
        text: Raw resume text.
        config_dir: Directory containing ``resume_config.json``; defaults to
            the current directory.

    Returns:
        The preprocessed text.
    """
    global _default_preprocessor, _default_config_dir
    requested_dir = Path(config_dir)
    if _default_preprocessor is None or _default_config_dir != requested_dir:
        # Construct first: if loading the config raises, the cache stays intact.
        _default_preprocessor = ResumeTextPreprocessor(requested_dir)
        _default_config_dir = requested_dir
    return _default_preprocessor.preprocess(text)
|