File size: 21,152 Bytes
750e1a2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ae7305b
750e1a2
 
 
 
ae7305b
750e1a2
 
 
a56f909
cdacc8d
 
 
a56f909
 
 
 
 
 
 
 
 
 
 
750e1a2
a05717f
 
 
 
 
 
 
 
 
 
 
 
750e1a2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4129d85
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
750e1a2
 
 
 
 
 
 
a56f909
 
 
750e1a2
 
a56f909
 
750e1a2
a56f909
a11ed90
a56f909
 
 
750e1a2
03116a3
750e1a2
a56f909
 
750e1a2
 
a56f909
 
 
 
 
 
 
 
 
a11ed90
a56f909
 
 
 
 
 
 
 
 
 
 
a11ed90
750e1a2
 
cdacc8d
750e1a2
a56f909
750e1a2
 
 
 
 
 
a56f909
750e1a2
 
a56f909
750e1a2
 
 
 
 
 
 
a56f909
750e1a2
 
 
 
 
 
 
 
 
 
a56f909
750e1a2
 
 
 
 
 
 
 
 
 
 
 
 
 
4129d85
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a56f909
 
 
 
 
 
 
4129d85
 
 
 
 
 
 
 
 
 
 
 
 
 
750e1a2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a56f909
 
750e1a2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4129d85
750e1a2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4129d85
750e1a2
a56f909
 
750e1a2
 
 
 
 
 
4129d85
a56f909
 
 
 
 
 
750e1a2
4129d85
 
 
 
 
 
 
 
 
750e1a2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a56f909
 
 
750e1a2
 
 
 
 
 
 
 
 
 
a56f909
 
 
a05717f
750e1a2
 
 
 
 
 
 
 
 
 
a56f909
750e1a2
 
 
 
 
 
a56f909
750e1a2
 
 
 
 
 
a56f909
750e1a2
4129d85
750e1a2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a56f909
 
 
 
 
750e1a2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
from __future__ import annotations

import json
import re
from dataclasses import dataclass
from datetime import datetime
from pathlib import Path


@dataclass
class Span:
    """A labelled entity span produced from token-level NER tags."""

    # Entity type with the BIO prefix removed (compared against names such as
    # "TITLE", "COMPANY", "SKILL" elsewhere in this file).
    label: str
    # Surface text of the span; tokens are joined with single spaces.
    text: str
    # Character offsets into the space-joined token text; `end` is exclusive.
    start: int
    end: int
    # BIO prefix of the tag that opened the span.
    bio: str = "B"
    # Confidence score; merged spans keep the maximum of their parts.
    score: float = 1.0


class StructuredPostProcessor:
    def __init__(self, model_dir: str | Path):
        self.model_dir = Path(model_dir)
        with open(self.model_dir / "resume_config.json", encoding="utf-8") as fh:
            self.config = json.load(fh)
        companies_path = self.model_dir / "companies.json"
        self.companies = set()
        if companies_path.exists():
            with open(companies_path, encoding="utf-8") as fh:
                data = json.load(fh)
            self.companies = {company.lower() for companies in data.values() for company in companies}
        self.multi_word_skills = {skill.lower() for skill in self.config.get("multi_word_skills", [])}

        pp = self.config.get("post_processing", {})
        self.span_merge_max_gap = pp.get("span_merge_max_gap", 3)
        self.span_merge_labels = set(pp.get("span_merge_labels", ["TITLE", "COMPANY"]))
        self.entity_rules = pp.get("entity_rules", {})
        self.skill_aliases = self.entity_rules.get("SKILL", {}).get("aliases", {})
        self.cert_aliases = self.entity_rules.get("CERT", {}).get("aliases", {})
        self.date_words = set(pp.get("date_words", []))
        self.present_words = set(pp.get("present_words", ["present", "current"]))
        self.gazetteer_match_max_words = pp.get("company_gazetteer_match_max_words", 3)
        self.title_company_separators = pp.get("title_company_separators", [" at "])
        self.max_experience_months = pp.get("max_experience_months", 600)
        self.space_collapse_pairs = pp.get("space_collapse_pairs", [])
        self.country_name_aliases = self.config.get("country_name_aliases", {})
        self.seniority_by_exp_count = self.config.get("seniority_by_experience_count", {"Senior": 4, "Mid": 2, "Junior": 0})

        self.city_country_map = {}
        ccm_file = self.config.get("city_country_map_file")
        if ccm_file:
            ccm_path = self.model_dir / ccm_file
            if ccm_path.exists():
                with open(ccm_path) as fh:
                    data = json.load(fh)
                for region in data.values():
                    self.city_country_map.update(region)
        if not self.city_country_map:
            self.city_country_map = self.config.get("city_country_map", {})

    def build_structured_resume_from_spans(self, spans: list[Span], raw_text: str = "") -> dict:
        spans = [Span(**{**span.__dict__, "text": self.clean_entity(span.label, span.text) or ""}) for span in spans]
        spans = [span for span in spans if span.text]
        spans = self.apply_post_processing(spans)
        grouped = self.group_into_entries(spans)
        years = self.compute_years(grouped["experience"])
        seniority = self.infer_seniority(grouped["experience"], years)
        country = self.infer_country(grouped["personal"].get("location"), grouped["personal"].get("phone"))
        return {
            "personal": {
                **grouped["personal"],
                "name": self.clean_spaces(grouped["personal"].get("name")) if grouped["personal"].get("name") else None,
            },
            "experience": grouped["experience"],
            "education": grouped["education"],
            "skills": [self.clean_spaces(skill) for skill in grouped["skills"]],
            "certifications": grouped["certifications"],
            "seniority": seniority,
            "country": country,
            "experience_years": years,
            "_rawText": raw_text,
        }

    @staticmethod
    def _merge_same_label_neighbors(spans: list[Span], labels: set[str], max_gap: int = 3) -> list[Span]:
        merged: list[Span] = []
        for span in spans:
            if merged and span.label in labels and merged[-1].label == span.label and span.start - merged[-1].end <= max_gap:
                merged[-1] = Span(
                    label=merged[-1].label,
                    text=f"{merged[-1].text} {span.text}",
                    start=merged[-1].start,
                    end=span.end,
                    bio=merged[-1].bio,
                    score=max(merged[-1].score, span.score),
                )
            else:
                merged.append(span)
        return merged

    def clean_entity(self, label: str, raw: str) -> str | None:
        cleaned = re.sub(r"\s+", " ", raw).strip()
        cleaned = re.sub(r"^[,.;:|/\-\s]+|[,.;:|/\-\s]+$", "", cleaned)
        if not cleaned or (len(cleaned) == 1 and not re.search(r"[a-zA-Z]", cleaned)):
            return None
        if re.fullmatch(r"[\W_]+", cleaned):
            return None

        rules = self.entity_rules.get(label, {})

        if label == "EMAIL":
            cleaned = re.sub(r"\s+", "", cleaned)
            for prefix in rules.get("strip_prefixes", []):
                cleaned = re.sub(rf"^{re.escape(prefix)}\s*", "", cleaned, flags=re.I)
            cleaned = re.sub(r"^[^a-zA-Z0-9]+", "", cleaned)
            if rules.get("require") and rules["require"] not in cleaned:
                return None
            for pattern in rules.get("reject_patterns", []):
                if pattern.lower() in cleaned.lower():
                    return None
        elif label == "SKILL":
            cleaned = re.sub(r"[,.;:]+$", "", cleaned)
        elif label == "COMPANY":
            if rules.get("strip_trailing_state_code"):
                cleaned = re.sub(r",?\s+[A-Z]{2}$", "", cleaned).strip()
        elif label == "DATE":
            cleaned = re.sub(r"^[| ]+|[| ]+$", "", cleaned)

        if not cleaned:
            return None

        min_len = rules.get("min_length", 2)
        exceptions = {e.lower() for e in rules.get("exceptions", [])}
        blocked = {w.lower() for w in rules.get("blocked_words", [])}

        if cleaned.lower() in blocked:
            return None

        if cleaned.lower() in exceptions:
            return cleaned

        if len(cleaned) < min_len:
            if rules.get("gazetteer_bypass") and cleaned.lower() in self.companies:
                return cleaned
            if rules.get("uppercase_bypass") and cleaned.isupper():
                return cleaned
            return None

        return cleaned

    def apply_post_processing(self, spans: list[Span]) -> list[Span]:
        spans = self._merge_same_label_neighbors(spans, self.span_merge_labels, self.span_merge_max_gap)
        result = [Span(**{**span.__dict__, "label": "COMPANY"}) if span.label == "TITLE" and span.text.lower().strip() in self.companies else span for span in spans]

        stripped = []
        for span in result:
            if span.label != "COMPANY":
                stripped.append(span)
                continue
            words = span.text.split()
            while len(words) > 1 and (words[-1].lower() in self.date_words or re.fullmatch(r"\d{4}", words[-1])):
                words.pop()
            stripped.append(Span(**{**span.__dict__, "text": " ".join(words)}))

        result = []
        for span in stripped:
            if span.label != "TITLE":
                result.append(span)
                continue
            words = span.text.split()
            split_done = False
            for length in range(min(self.gazetteer_match_max_words, len(words)), 0, -1):
                prefix = " ".join(words[:length])
                if prefix.lower() in self.companies:
                    result.append(Span(**{**span.__dict__, "label": "COMPANY", "text": prefix}))
                    suffix = " ".join(words[length:])
                    if len(suffix) > 1:
                        result.append(Span(**{**span.__dict__, "label": "TITLE", "text": suffix}))
                    split_done = True
                    break
            if not split_done:
                result.append(span)

        merged = []
        i = 0
        while i < len(result):
            current = result[i]
            if current.label == "SKILL" and i + 1 < len(result) and result[i + 1].label == "SKILL":
                combined = f"{current.text} {result[i + 1].text}".rstrip(",.")
                if combined.lower() in self.multi_word_skills:
                    merged.append(Span(**{**current.__dict__, "text": combined, "end": result[i + 1].end}))
                    i += 2
                    continue
            merged.append(current)
            i += 1
        return merged

    def normalize_skill(self, text: str) -> str:
        normalized = self.clean_spaces(text.strip().rstrip(",."))
        alias = self.skill_aliases.get(normalized.lower())
        return alias if alias else normalized

    def normalize_certification(self, text: str) -> str:
        normalized = self.clean_spaces(text.strip().rstrip(",."))
        normalized = re.sub(r"^the\s+", "", normalized, flags=re.I)
        alias = self.cert_aliases.get(normalized.lower())
        return alias if alias else normalized

    @staticmethod
    def _dedupe_dict_items(items: list[dict]) -> list[dict]:
        seen = set()
        deduped = []
        for item in items:
            key = tuple(sorted((k, v) for k, v in item.items() if v))
            if key and key not in seen:
                seen.add(key)
                deduped.append(item)
        return deduped

    def clean_experiences(self, experiences: list[dict]) -> list[dict]:
        cleaned = []
        for exp in experiences:
            exp = {k: self.clean_spaces(v) if isinstance(v, str) else v for k, v in exp.items() if v}
            if "title" in exp and "company" not in exp:
                for sep in self.title_company_separators:
                    if sep.lower() in exp["title"].lower():
                        title, company = re.split(re.escape(sep.strip()), exp["title"], maxsplit=1, flags=re.I)
                        exp["title"] = self.clean_spaces(title)
                        exp["company"] = self.clean_spaces(company)
                        break
            cleaned.append(exp)
        if any(exp.get("company") or exp.get("start_date") for exp in cleaned[1:]):
            while cleaned and cleaned[0].get("title") and not cleaned[0].get("company") and not cleaned[0].get("start_date"):
                cleaned.pop(0)
        return self._dedupe_dict_items(cleaned)

    def clean_education(self, education: list[dict]) -> list[dict]:
        cleaned = []
        for edu in education:
            item = {k: self.clean_spaces(v) if isinstance(v, str) else v for k, v in edu.items() if v}
            if item:
                cleaned.append(item)
        return self._dedupe_dict_items(cleaned)

    def group_into_entries(self, spans: list[Span]) -> dict:
        personal = {"name": None, "email": None, "phone": None, "location": None}
        for span in spans:
            if span.label == "NAME" and not personal["name"]:
                personal["name"] = span.text
            elif span.label == "EMAIL" and not personal["email"]:
                cleaned = self.clean_entity("EMAIL", span.text)
                if cleaned:
                    personal["email"] = cleaned
            elif span.label == "PHONE" and not personal["phone"]:
                personal["phone"] = self.clean_phone(span.text)
            elif span.label == "LOCATION" and not personal["location"]:
                personal["location"] = self.clean_spaces(span.text)

        exp_spans = sorted([span for span in spans if span.label in {"TITLE", "COMPANY", "DATE"}], key=lambda span: span.start)
        experiences = []
        current = {}
        for span in exp_spans:
            if span.label == "TITLE":
                if current.get("title") and (current.get("company") or current.get("start_date")):
                    experiences.append(current)
                    current = {}
                current["title"] = self.clean_spaces(span.text)
            elif span.label == "COMPANY":
                if current.get("company") and (current.get("title") or current.get("start_date")):
                    experiences.append(current)
                    current = {}
                current["company"] = self.clean_spaces(self.clean_entity("COMPANY", span.text) or "")
            elif span.label == "DATE":
                date_text = re.sub(r"^[| ]+|[| ]+$", "", span.text)
                if not date_text:
                    continue
                present_pattern = "|".join(re.escape(w) for w in self.present_words)
                present_match = re.match(rf"^(.+?)\s+({present_pattern})$", date_text, flags=re.I)
                if present_match and not current.get("start_date"):
                    current["start_date"] = present_match.group(1).strip()
                    current["end_date"] = present_match.group(2)
                    continue
                if current.get("start_date") and not current.get("end_date") and re.fullmatch(r"[a-zA-Z]+", current["start_date"]) and re.match(r"^\d{4}", date_text):
                    year_match = re.match(r"^(\d{4})\s*(.*)", date_text)
                    if year_match:
                        current["start_date"] = f"{current['start_date']} {year_match.group(1)}"
                        if year_match.group(2):
                            current["end_date"] = year_match.group(2).strip()
                        continue
                if current.get("start_date") and current.get("end_date"):
                    if current.get("title") or current.get("company"):
                        experiences.append(current)
                        current = {}
                if not current.get("start_date"):
                    current["start_date"] = date_text
                elif not current.get("end_date"):
                    current["end_date"] = date_text
        if current.get("title") or current.get("company"):
            experiences.append(current)
        experiences = self.clean_experiences(experiences)

        edu_spans = sorted([span for span in spans if span.label in {"DEGREE", "FIELD", "INSTITUTION"}], key=lambda span: span.start)
        education = []
        current_edu = {}
        for span in edu_spans:
            if span.label == "DEGREE":
                if current_edu.get("degree"):
                    education.append(current_edu)
                    current_edu = {}
                current_edu["degree"] = self.clean_spaces(span.text)
            elif span.label == "FIELD":
                current_edu["field"] = self.clean_spaces(span.text)
            elif span.label == "INSTITUTION":
                current_edu["institution"] = re.sub(r",?\s*\d{4}\s*$", "", self.clean_spaces(span.text))
                education.append(current_edu)
                current_edu = {}
        if current_edu.get("degree") or current_edu.get("institution"):
            education.append(current_edu)
        education = self.clean_education(education)

        skill_rules = self.entity_rules.get("SKILL", {})
        skill_min = skill_rules.get("min_length", 2)
        skills = []
        seen = set()
        for span in spans:
            if span.label != "SKILL":
                continue
            for part in re.split(r",\s*", span.text):
                clean = self.normalize_skill(part)
                if not clean or clean.lower() in seen:
                    continue
                if len(clean) < skill_min and not clean.isupper() and clean.lower() not in {e.lower() for e in skill_rules.get("exceptions", [])}:
                    continue
                seen.add(clean.lower())
                skills.append(clean)

        certifications = []
        cert_seen = set()
        for span in spans:
            if span.label != "CERT":
                continue
            clean = self.normalize_certification(span.text)
            if len(clean) > 1 and clean.lower() not in cert_seen:
                cert_seen.add(clean.lower())
                certifications.append(clean)
        return {"personal": personal, "experience": experiences, "education": education, "skills": skills, "certifications": certifications}

    def infer_seniority(self, experiences: list[dict], years: int | None) -> str:
        keywords = self.config["seniority_keywords"]
        titles = [(exp.get("title") or "").lower() for exp in experiences if exp.get("title")]
        for level, level_keywords in keywords.items():
            for title in titles:
                for keyword in level_keywords:
                    if keyword in title:
                        return level
        if years is not None:
            bounds = self.config["seniority_by_years"]
            if years >= bounds["Staff"]:
                return "Staff"
            if years >= bounds["Senior"]:
                return "Senior"
            if years >= bounds["Mid"]:
                return "Mid"
            return "Junior"
        for level, min_count in sorted(self.seniority_by_exp_count.items(), key=lambda x: -x[1]):
            if len(experiences) >= min_count:
                return level
        return "Junior"

    def infer_country(self, location: str | None, phone: str | None) -> str | None:
        if phone:
            clean = re.sub(r"[\s\-()]", "", phone)
            for prefix, country in self.config["phone_country_prefixes"].items():
                if clean.startswith(prefix):
                    return country
        if location:
            loc = location.lower()
            for alias, country in self.country_name_aliases.items():
                if alias in loc:
                    return country
            for city, country in self.city_country_map.items():
                if city in loc:
                    return country
            for part in loc.replace(",", " ").split():
                if part.upper() in self.config["us_states"]:
                    return "United States"
        return None

    def compute_years(self, experiences: list[dict]) -> int | None:
        total_months = 0
        now = datetime.now()
        present_re = "|".join(re.escape(w) for w in self.present_words)
        for exp in experiences:
            if not exp.get("start_date"):
                continue
            start = self.parse_date(exp["start_date"])
            if not start:
                continue
            if not exp.get("end_date") or re.search(present_re, exp["end_date"], flags=re.I):
                end = now
            else:
                end = self.parse_date(exp["end_date"])
                if not end:
                    continue
            months = (end.year - start.year) * 12 + (end.month - start.month)
            if 0 < months < self.max_experience_months:
                total_months += months
        return total_months // 12 if total_months > 0 else None

    @staticmethod
    def parse_date(text: str) -> datetime | None:
        months = {
            "january": 1, "february": 2, "march": 3, "april": 4, "may": 5, "june": 6, "july": 7, "august": 8,
            "september": 9, "october": 10, "november": 11, "december": 12, "jan": 1, "feb": 2, "mar": 3,
            "apr": 4, "jun": 6, "jul": 7, "aug": 8, "sep": 9, "oct": 10, "nov": 11, "dec": 12,
        }
        lower = text.lower().strip()
        for name, month in months.items():
            match = re.search(rf"{name}\s+(\d{{4}})", lower)
            if match:
                return datetime(int(match.group(1)), month, 1)
        year_match = re.search(r"\b(19|20)\d{2}\b", text)
        if year_match:
            return datetime(int(year_match.group(0)), 6, 1)
        return None

    def clean_spaces(self, text: str) -> str:
        result = text
        for old, new in self.space_collapse_pairs:
            result = result.replace(old, new)
        return result.rstrip(",").strip()

    @staticmethod
    def clean_phone(phone: str) -> str:
        return re.sub(r"\s+", " ", re.sub(r"\+\s+", "+", re.sub(r"\s+-\s+", "-", re.sub(r"\s+\)", ")", re.sub(r"\(\s+", "(", phone))))).strip()


def build_text_and_spans(tokens: list[str], ner_tags: list[int], id2label: dict[int, str]) -> tuple[str, list[Span]]:
    """Rebuild the text from tokens and decode BIO tags into entity spans.

    Tokens are joined with single spaces. The text is assembled with one
    ``"".join`` while a running length supplies the offsets, instead of
    growing a string with ``+=`` for every token.

    Args:
        tokens: Token strings.
        ner_tags: Tag ids, indexed in parallel with *tokens*.
        id2label: Maps a tag id to ``"O"`` or a ``"<BIO>-<LABEL>"`` string.

    Returns:
        ``(text, spans)`` where each span's ``start``/``end`` are character
        offsets into ``text`` (``end`` exclusive).
    """
    pieces: list[str] = []
    positions: list[tuple[int, int]] = []
    length = 0
    for token in tokens:
        # A separator is added only once some text has been accumulated.
        if length:
            pieces.append(" ")
            length += 1
        positions.append((length, length + len(token)))
        pieces.append(token)
        length += len(token)
    text = "".join(pieces)

    spans: list[Span] = []
    current: Span | None = None
    for i, token in enumerate(tokens):
        label = id2label[ner_tags[i]]
        if label == "O":
            # An outside tag closes any open span.
            if current is not None:
                spans.append(current)
                current = None
            continue
        bio, base = label.split("-", 1)
        start, end = positions[i]
        # A span starts on a "B" tag, on a label change, or when none is open
        # (this also covers an "I" tag with no preceding "B").
        if current is None or bio == "B" or current.label != base:
            if current is not None:
                spans.append(current)
            current = Span(label=base, text=token, start=start, end=end, bio=bio, score=1.0)
        else:
            current.text += f" {token}"
            current.end = end
    if current is not None:
        spans.append(current)
    return text, spans