"""Text preprocessing for resume text before NER inference. Normalizes PDF extraction artifacts: CRLF, em-dashes, bullet characters, flattened two-column tables, and multi-space runs. All rules driven by resume_config.json pre_processing section. Usage: from training.text_preprocess import preprocess_resume_text clean = preprocess_resume_text(raw_text) # Or with explicit config: from training.text_preprocess import ResumeTextPreprocessor pp = ResumeTextPreprocessor(".") clean = pp.preprocess(raw_text) """ from __future__ import annotations import json import re from pathlib import Path class ResumeTextPreprocessor: def __init__(self, config_dir: str | Path): config_path = Path(config_dir) / "resume_config.json" with open(config_path) as f: full_config = json.load(f) self.config = full_config.get("pre_processing", {}) self._build_category_pattern() def _build_category_pattern(self): categories = self.config.get("skill_table_categories", []) if categories: sorted_cats = sorted(categories, key=len, reverse=True) pattern_str = r"^(" + "|".join(re.escape(c) for c in sorted_cats) + r")\s+([A-Za-z0-9#.+].+)$" self._category_re = re.compile(pattern_str, re.IGNORECASE) else: self._category_re = None def preprocess(self, text: str) -> str: text = self._normalize_whitespace(text) if self.config.get("normalize_dashes", True): text = self._normalize_dashes(text) if self.config.get("normalize_bullets", True): text = self._normalize_bullets(text) text = self._strip_labels(text) if self.config.get("expand_skill_tables", True): text = self._expand_flattened_table(text) if self.config.get("collapse_multi_spaces", True): text = self._collapse_multi_spaces(text) return text.strip() def _normalize_whitespace(self, text: str) -> str: text = text.replace("\r\n", "\n").replace("\r", "\n") return "\n".join(line.rstrip() for line in text.split("\n")) def _normalize_dashes(self, text: str) -> str: for old, new in self.config.get("dash_replacements", {"–": "-", "—": "-"}).items(): text = text.replace(old, new) return text def _normalize_bullets(self, text: str) -> str: chars = self.config.get("bullet_chars", ["●", "•", "▪", "■", "▸", "►", "‣", "⁃"]) replacement = self.config.get("bullet_replacement", "- ") if chars: pattern = r"^[" + re.escape("".join(chars)) + r"]\s*" text = re.sub(pattern, replacement, text, flags=re.MULTILINE) return text def _strip_labels(self, text: str) -> str: for label in self.config.get("strip_labels", []): text = re.sub(r"(?i)\b" + re.escape(label) + r"\s*", "", text) return text def _expand_flattened_table(self, text: str) -> str: if not self._category_re: return text max_prose_words = self.config.get("table_prose_max_words", 15) max_cont_chars = self.config.get("table_continuation_max_chars", 60) lines = text.split("\n") result = [] i = 0 while i < len(lines): line = lines[i] match = self._category_re.match(line.strip()) if not match: ms = re.match(r"^([A-Za-z][A-Za-z\s&/()-]{2,40}?)\s{2,}([A-Za-z0-9#.+].+)$", line) if ms and "," in ms.group(2): match = ms if match: category = match.group(1).strip() items = match.group(2).strip() if len(items.split()) > max_prose_words or ("," not in items and len(items.split()) > 5): result.append(line) i += 1 continue while i + 1 < len(lines): next_line = lines[i + 1].strip() is_continuation = ( re.match(r"^\s{2,}[A-Za-z0-9#.+]", lines[i + 1]) or ( next_line and next_line[0].isupper() and "," in next_line and not self._category_re.match(next_line) and len(next_line) < max_cont_chars ) ) if is_continuation: i += 1 items += " " + next_line else: break items = re.sub(r"\s*,\s*", ", ", items).rstrip(",").strip() result.append(f"{category}: {items}") else: result.append(line) i += 1 return "\n".join(result) @staticmethod def _collapse_multi_spaces(text: str) -> str: return re.sub(r" +", " ", text) _default_preprocessor: ResumeTextPreprocessor | None = None def preprocess_resume_text(text: str, config_dir: str | Path = ".") -> str: """Convenience function using default config from current directory.""" global _default_preprocessor if _default_preprocessor is None: _default_preprocessor = ResumeTextPreprocessor(config_dir) return _default_preprocessor.preprocess(text)