resume-ner / training /text_preprocess.py

Somasundaram Ayyappan

Make text preprocessor config-driven via resume_config.json

e6b7b6f 16 days ago

5.42 kB

	"""Text preprocessing for resume text before NER inference.

	Normalizes PDF extraction artifacts: CRLF line endings, en/em dashes,
	bullet characters, flattened two-column tables, and multi-space runs.
	The rules are configured by the ``pre_processing`` section of
	resume_config.json.

	Usage:
	from training.text_preprocess import preprocess_resume_text
	clean = preprocess_resume_text(raw_text)

	# Or with explicit config:
	from training.text_preprocess import ResumeTextPreprocessor
	pp = ResumeTextPreprocessor(".")
	clean = pp.preprocess(raw_text)
	"""

	from __future__ import annotations

	import json
	import re
	from pathlib import Path


	class ResumeTextPreprocessor:
	    """Config-driven cleaner for resume text extracted from PDFs.

	    Settings are read from the ``pre_processing`` section of
	    ``resume_config.json``. A missing (or null) section, or a missing key,
	    falls back to the default used by the corresponding step.
	    """

	    # Fallback table-row shape for labels that are not configured categories:
	    # a short comma-free label, a gap of two or more whitespace characters,
	    # then the items. Compiled once instead of on every line.
	    _GENERIC_ROW_RE = re.compile(
	        r"^([A-Za-z][A-Za-z\s&/()-]{2,40}?)\s{2,}([A-Za-z0-9#.+].+)$"
	    )
	    # A wrapped line indented by at least two whitespace characters.
	    _INDENTED_CONTINUATION_RE = re.compile(r"^\s{2,}[A-Za-z0-9#.+]")

	    def __init__(self, config_dir: str | Path):
	        """Load ``resume_config.json`` from *config_dir*.

	        Raises:
	            OSError: if the config file cannot be opened.
	            json.JSONDecodeError: if the file is not valid JSON.
	        """
	        config_path = Path(config_dir) / "resume_config.json"
	        # JSON is UTF-8; the config holds non-ASCII bullet and dash
	        # characters, so do not depend on the platform default encoding.
	        with open(config_path, encoding="utf-8") as f:
	            full_config = json.load(f)
	        # `or {}` also covers an explicit `"pre_processing": null`.
	        self.config = full_config.get("pre_processing") or {}
	        self._build_category_pattern()

	    def _build_category_pattern(self) -> None:
	        """Compile the regex that recognizes configured skill-table categories.

	        Sets ``self._category_re`` to a case-insensitive pattern matching
	        ``<category><whitespace><items>``, or to ``None`` when no
	        ``skill_table_categories`` are configured.
	        """
	        categories = self.config.get("skill_table_categories", [])
	        if not categories:
	            self._category_re: re.Pattern[str] | None = None
	            return
	        # Longest first so that a category which is a prefix of another
	        # cannot win the alternation.
	        sorted_cats = sorted(categories, key=len, reverse=True)
	        pattern_str = (
	            r"^(" + "|".join(re.escape(c) for c in sorted_cats) + r")\s+([A-Za-z0-9#.+].+)$"
	        )
	        self._category_re = re.compile(pattern_str, re.IGNORECASE)

	    def preprocess(self, text: str) -> str:
	        """Run the cleaning steps over *text* and return the stripped result.

	        Order: line-ending/trailing-whitespace normalization, dashes,
	        bullets, label stripping, table expansion, multi-space collapse.
	        Table expansion runs before space collapsing because it relies on
	        multi-space gaps to detect columns.
	        """
	        text = self._normalize_whitespace(text)
	        if self.config.get("normalize_dashes", True):
	            text = self._normalize_dashes(text)
	        if self.config.get("normalize_bullets", True):
	            text = self._normalize_bullets(text)
	        text = self._strip_labels(text)
	        if self.config.get("expand_skill_tables", True):
	            text = self._expand_flattened_table(text)
	        if self.config.get("collapse_multi_spaces", True):
	            text = self._collapse_multi_spaces(text)
	        return text.strip()

	    def _normalize_whitespace(self, text: str) -> str:
	        """Convert CRLF/CR to LF and drop trailing whitespace on every line."""
	        text = text.replace("\r\n", "\n").replace("\r", "\n")
	        return "\n".join(line.rstrip() for line in text.split("\n"))

	    def _normalize_dashes(self, text: str) -> str:
	        """Apply ``dash_replacements`` (default: en and em dash to ``-``)."""
	        for old, new in self.config.get("dash_replacements", {"–": "-", "—": "-"}).items():
	            text = text.replace(old, new)
	        return text

	    def _normalize_bullets(self, text: str) -> str:
	        """Replace a line-leading bullet character with ``bullet_replacement``.

	        Whitespace after the bullet is consumed too; because ``\\s`` matches
	        newlines, a bullet alone on a line is joined to the following line.
	        """
	        chars = self.config.get("bullet_chars", ["●", "•", "▪", "■", "▸", "►", "‣", "⁃"])
	        replacement = self.config.get("bullet_replacement", "- ")
	        if chars:
	            pattern = r"^[" + re.escape("".join(chars)) + r"]\s*"
	            text = re.sub(pattern, replacement, text, flags=re.MULTILINE)
	        return text

	    def _strip_labels(self, text: str) -> str:
	        """Remove every configured ``strip_labels`` entry from *text*.

	        Matching is case-insensitive, anchored on a word boundary to the
	        left, and also removes any whitespace that follows the label.
	        """
	        for label in self.config.get("strip_labels", []):
	            text = re.sub(r"(?i)\b" + re.escape(label) + r"\s*", "", text)
	        return text

	    @classmethod
	    def _match_generic_row(cls, line: str) -> re.Match[str] | None:
	        """Return the fallback-row match for *line* if its items contain a comma."""
	        found = cls._GENERIC_ROW_RE.match(line)
	        if found and "," in found.group(2):
	            return found
	        return None

	    def _expand_flattened_table(self, text: str) -> str:
	        """Rewrite flattened two-column table rows as ``Category: items``.

	        A row is a line that starts with a configured category, or (as a
	        fallback) a short label separated from comma-separated items by two
	        or more whitespace characters. Following wrapped lines are folded
	        into the row's items. Lines whose "items" look like prose are left
	        untouched. Returns *text* unchanged when no categories are
	        configured.
	        """
	        if not self._category_re:
	            return text
	        max_prose_words = self.config.get("table_prose_max_words", 15)
	        max_cont_chars = self.config.get("table_continuation_max_chars", 60)

	        lines = text.split("\n")
	        result = []
	        i = 0
	        while i < len(lines):
	            line = lines[i]
	            match = self._category_re.match(line.strip()) or self._match_generic_row(line)
	            if not match:
	                result.append(line)
	                i += 1
	                continue

	            category = match.group(1).strip()
	            items = match.group(2).strip()
	            word_count = len(items.split())
	            # Long runs of words, or more than five words without a comma,
	            # read as a sentence rather than a list of skills.
	            if word_count > max_prose_words or ("," not in items and word_count > 5):
	                result.append(line)
	                i += 1
	                continue

	            # Fold wrapped lines into this row.
	            while i + 1 < len(lines):
	                raw_next = lines[i + 1]
	                next_line = raw_next.strip()
	                is_continuation = self._INDENTED_CONTINUATION_RE.match(raw_next) or (
	                    next_line
	                    and next_line[0].isupper()
	                    and "," in next_line
	                    and not self._category_re.match(next_line)
	                    # A line that is itself a fallback table row starts a
	                    # new row; it must not be swallowed as wrapped items.
	                    and not self._match_generic_row(raw_next)
	                    and len(next_line) < max_cont_chars
	                )
	                if not is_continuation:
	                    break
	                i += 1
	                items += " " + next_line

	            items = re.sub(r"\s,\s", ", ", items).rstrip(",").strip()
	            result.append(f"{category}: {items}")
	            i += 1
	        return "\n".join(result)

	    @staticmethod
	    def _collapse_multi_spaces(text: str) -> str:
	        """Collapse every run of spaces into a single space."""
	        return re.sub(r" +", " ", text)


	# Lazily built preprocessor shared by preprocess_resume_text().
	_default_preprocessor: ResumeTextPreprocessor | None = None
	# Directory the cached instance was built from; None until this module has
	# built one itself.
	_default_preprocessor_dir: Path | None = None


	def preprocess_resume_text(text: str, config_dir: str | Path = ".") -> str:
	    """Preprocess *text* with a cached preprocessor loaded from *config_dir*.

	    The preprocessor is built on first use and reused on later calls. It is
	    rebuilt when a call passes a different ``config_dir`` than the one the
	    cached instance was loaded from, so the argument is never silently
	    ignored.

	    Args:
	        text: Raw resume text.
	        config_dir: Directory containing ``resume_config.json``; defaults
	            to the current directory.

	    Returns:
	        The cleaned text.
	    """
	    global _default_preprocessor, _default_preprocessor_dir
	    requested_dir = Path(config_dir)
	    # A recorded directory of None with an instance present means the
	    # instance was installed from outside this function; keep using it.
	    stale = (
	        _default_preprocessor_dir is not None
	        and _default_preprocessor_dir != requested_dir
	    )
	    if _default_preprocessor is None or stale:
	        _default_preprocessor = ResumeTextPreprocessor(requested_dir)
	        _default_preprocessor_dir = requested_dir
	    return _default_preprocessor.preprocess(text)