File size: 5,416 Bytes
b34f048
 
 
e6b7b6f
 
b34f048
 
 
 
e6b7b6f
 
 
 
 
b34f048
 
 
 
e6b7b6f
b34f048
e6b7b6f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b34f048
e6b7b6f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b34f048
e6b7b6f
 
 
b34f048
 
e6b7b6f
b34f048
 
e6b7b6f
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
"""Text preprocessing for resume text before NER inference.

Normalizes PDF extraction artifacts: CRLF, em-dashes, bullet characters,
flattened two-column tables, and multi-space runs. All rules are driven by the
``pre_processing`` section of resume_config.json.

Usage:
    from training.text_preprocess import preprocess_resume_text
    clean = preprocess_resume_text(raw_text)

    # Or with explicit config:
    from training.text_preprocess import ResumeTextPreprocessor
    pp = ResumeTextPreprocessor(".")
    clean = pp.preprocess(raw_text)
"""

from __future__ import annotations

import json
import re
from pathlib import Path


class ResumeTextPreprocessor:
    """Clean raw resume text using rules from ``resume_config.json``.

    The ``pre_processing`` section of the config file controls every step.
    Keys read by this class (with the defaults used when a key is absent):

    * ``skill_table_categories`` (``[]``) - known left-column labels of
      skill tables; table expansion is skipped entirely when empty.
    * ``normalize_dashes`` (``True``) / ``dash_replacements``
      (en dash and em dash -> ``-``).
    * ``normalize_bullets`` (``True``) / ``bullet_chars`` /
      ``bullet_replacement`` (``"- "``).
    * ``strip_labels`` (``[]``) - literal labels removed from the text.
    * ``expand_skill_tables`` (``True``) / ``table_prose_max_words`` (15) /
      ``table_continuation_max_chars`` (60).
    * ``collapse_multi_spaces`` (``True``).
    """

    # Fallback row shape for categories that are not configured: a short
    # label, a gap of two or more whitespace characters, then the items.
    _GENERIC_ROW_RE = re.compile(
        r"^([A-Za-z][A-Za-z\s&/()-]{2,40}?)\s{2,}([A-Za-z0-9#.+].+)$"
    )
    # A line indented by two or more whitespace characters continues a row.
    _INDENTED_CONTINUATION_RE = re.compile(r"^\s{2,}[A-Za-z0-9#.+]")
    # Any spacing around a comma; rewritten to exactly ", ".
    _COMMA_SPACING_RE = re.compile(r"\s*,\s*")

    def __init__(self, config_dir: str | Path):
        """Load ``resume_config.json`` from *config_dir*.

        Args:
            config_dir: Directory containing ``resume_config.json``.

        Raises:
            OSError: If the config file cannot be opened.
            json.JSONDecodeError: If the file is not valid JSON.
        """
        config_path = Path(config_dir) / "resume_config.json"
        # The config carries non-ASCII bullet/dash characters, so decode it
        # as UTF-8 explicitly instead of relying on the platform default.
        with open(config_path, encoding="utf-8") as f:
            full_config = json.load(f)
        self.config = full_config.get("pre_processing", {})
        self._build_category_pattern()

    def _build_category_pattern(self) -> None:
        """Compile the regex matching ``<category> <items>`` table rows.

        Sets ``self._category_re`` to ``None`` when no categories are
        configured.
        """
        categories = self.config.get("skill_table_categories", [])
        if not categories:
            self._category_re = None
            return
        # Longest first so that a category which is a prefix of another
        # does not win the alternation.
        longest_first = sorted(categories, key=len, reverse=True)
        alternation = "|".join(re.escape(c) for c in longest_first)
        pattern_str = r"^(" + alternation + r")\s+([A-Za-z0-9#.+].+)$"
        self._category_re = re.compile(pattern_str, re.IGNORECASE)

    def preprocess(self, text: str) -> str:
        """Run every enabled normalization step on *text*.

        Line endings and trailing whitespace are always normalized and
        configured labels are always stripped; the remaining steps are
        each gated by a config flag that defaults to enabled.

        Returns:
            The cleaned text with leading/trailing whitespace removed.
        """
        text = self._normalize_whitespace(text)
        if self.config.get("normalize_dashes", True):
            text = self._normalize_dashes(text)
        if self.config.get("normalize_bullets", True):
            text = self._normalize_bullets(text)
        text = self._strip_labels(text)
        if self.config.get("expand_skill_tables", True):
            text = self._expand_flattened_table(text)
        if self.config.get("collapse_multi_spaces", True):
            text = self._collapse_multi_spaces(text)
        return text.strip()

    def _normalize_whitespace(self, text: str) -> str:
        """Convert CRLF/CR to LF and drop trailing whitespace on each line."""
        text = text.replace("\r\n", "\n").replace("\r", "\n")
        return "\n".join(line.rstrip() for line in text.split("\n"))

    def _normalize_dashes(self, text: str) -> str:
        """Apply the configured dash replacements (en/em dash -> ``-``)."""
        replacements = self.config.get("dash_replacements", {"–": "-", "—": "-"})
        for old, new in replacements.items():
            text = text.replace(old, new)
        return text

    def _normalize_bullets(self, text: str) -> str:
        """Replace a bullet character at the start of a line.

        Only bullets in the first column are rewritten. Each entry of
        ``bullet_chars`` is placed in a regex character class, so entries
        are treated character by character.
        """
        chars = self.config.get("bullet_chars", ["●", "•", "▪", "■", "▸", "►", "‣", "⁃"])
        replacement = self.config.get("bullet_replacement", "- ")
        if chars:
            # NOTE(review): ``\s*`` also matches newlines, so a bullet alone
            # on its line is joined with the next non-blank text - confirm
            # this is the intended handling.
            pattern = r"^[" + re.escape("".join(chars)) + r"]\s*"
            text = re.sub(pattern, replacement, text, flags=re.MULTILINE)
        return text

    def _strip_labels(self, text: str) -> str:
        """Remove every configured label (case-insensitive) and the
        whitespace that follows it."""
        for label in self.config.get("strip_labels", []):
            text = re.sub(r"(?i)\b" + re.escape(label) + r"\s*", "", text)
        return text

    def _expand_flattened_table(self, text: str) -> str:
        """Rewrite flattened two-column skill rows as ``Category: items``.

        A row is recognized when the stripped line starts with a configured
        category, or, as a fallback, when the raw line looks like
        ``Label<2+ spaces>items`` and the items contain a comma. Rows whose
        item text looks like prose (more than ``table_prose_max_words``
        words, or more than five words without any comma) are left alone.
        Following lines are merged into the row while they are indented by
        two or more whitespace characters, or are short, capitalized,
        comma-containing lines that do not start a configured category.

        Returns *text* unchanged when no categories are configured.
        """
        if not self._category_re:
            return text
        max_prose_words = self.config.get("table_prose_max_words", 15)
        max_cont_chars = self.config.get("table_continuation_max_chars", 60)

        lines = text.split("\n")
        result = []
        i = 0
        while i < len(lines):
            line = lines[i]
            match = self._category_re.match(line.strip())
            if not match:
                generic = self._GENERIC_ROW_RE.match(line)
                if generic and "," in generic.group(2):
                    match = generic
            if not match:
                result.append(line)
                i += 1
                continue

            category = match.group(1).strip()
            items = match.group(2).strip()
            word_count = len(items.split())
            if word_count > max_prose_words or ("," not in items and word_count > 5):
                # Looks like a sentence, not a list of skills.
                result.append(line)
                i += 1
                continue

            # Absorb continuation lines that belong to the same row.
            while i + 1 < len(lines):
                raw_next = lines[i + 1]
                next_line = raw_next.strip()
                is_continuation = self._INDENTED_CONTINUATION_RE.match(raw_next) or (
                    next_line
                    and next_line[0].isupper()
                    and "," in next_line
                    and not self._category_re.match(next_line)
                    and len(next_line) < max_cont_chars
                )
                if not is_continuation:
                    break
                i += 1
                items += " " + next_line

            # Comma normalization turns a trailing "," into ", ", so strip
            # commas and spaces together; stripping "," alone would leave
            # the dangling comma in place.
            items = self._COMMA_SPACING_RE.sub(", ", items).strip().rstrip(", ")
            result.append(f"{category}: {items}")
            i += 1
        return "\n".join(result)

    @staticmethod
    def _collapse_multi_spaces(text: str) -> str:
        """Collapse every run of two or more spaces into a single space."""
        return re.sub(r"  +", " ", text)


# Lazily created preprocessor shared by preprocess_resume_text().
_default_preprocessor: ResumeTextPreprocessor | None = None
# Config directory the cached preprocessor was built from; None until
# preprocess_resume_text() has built one itself.
_default_preprocessor_dir: Path | None = None


def preprocess_resume_text(text: str, config_dir: str | Path = ".") -> str:
    """Preprocess *text* with a cached ``ResumeTextPreprocessor``.

    The preprocessor is built on first use and reused on later calls. It is
    rebuilt when a later call passes a different ``config_dir``, so the
    argument is never silently ignored. A preprocessor assigned directly to
    ``_default_preprocessor`` by other code is used as-is.

    Args:
        text: Raw resume text.
        config_dir: Directory containing ``resume_config.json``; defaults
            to the current working directory.

    Returns:
        The cleaned text.
    """
    global _default_preprocessor, _default_preprocessor_dir
    requested_dir = Path(config_dir)
    built_for_other_dir = (
        _default_preprocessor_dir is not None
        and _default_preprocessor_dir != requested_dir
    )
    if _default_preprocessor is None or built_for_other_dir:
        # Assign the directory only after construction succeeds so that a
        # failed load leaves the previous cache intact.
        _default_preprocessor = ResumeTextPreprocessor(config_dir)
        _default_preprocessor_dir = requested_dir
    return _default_preprocessor.preprocess(text)