File size: 21,152 Bytes

from __future__ import annotations

import json
import re
from dataclasses import dataclass
from datetime import datetime
from pathlib import Path


@dataclass
class Span:
    label: str
    text: str
    start: int
    end: int
    bio: str = "B"
    score: float = 1.0


class StructuredPostProcessor:
    def __init__(self, model_dir: str | Path):
        self.model_dir = Path(model_dir)
        with open(self.model_dir / "resume_config.json", encoding="utf-8") as fh:
            self.config = json.load(fh)
        companies_path = self.model_dir / "companies.json"
        self.companies = set()
        if companies_path.exists():
            with open(companies_path, encoding="utf-8") as fh:
                data = json.load(fh)
            self.companies = {company.lower() for companies in data.values() for company in companies}
        self.multi_word_skills = {skill.lower() for skill in self.config.get("multi_word_skills", [])}

        pp = self.config.get("post_processing", {})
        self.span_merge_max_gap = pp.get("span_merge_max_gap", 3)
        self.span_merge_labels = set(pp.get("span_merge_labels", ["TITLE", "COMPANY"]))
        self.entity_rules = pp.get("entity_rules", {})
        self.skill_aliases = self.entity_rules.get("SKILL", {}).get("aliases", {})
        self.cert_aliases = self.entity_rules.get("CERT", {}).get("aliases", {})
        self.date_words = set(pp.get("date_words", []))
        self.present_words = set(pp.get("present_words", ["present", "current"]))
        self.gazetteer_match_max_words = pp.get("company_gazetteer_match_max_words", 3)
        self.title_company_separators = pp.get("title_company_separators", [" at "])
        self.max_experience_months = pp.get("max_experience_months", 600)
        self.space_collapse_pairs = pp.get("space_collapse_pairs", [])
        self.country_name_aliases = self.config.get("country_name_aliases", {})
        self.seniority_by_exp_count = self.config.get("seniority_by_experience_count", {"Senior": 4, "Mid": 2, "Junior": 0})

        self.city_country_map = {}
        ccm_file = self.config.get("city_country_map_file")
        if ccm_file:
            ccm_path = self.model_dir / ccm_file
            if ccm_path.exists():
                with open(ccm_path) as fh:
                    data = json.load(fh)
                for region in data.values():
                    self.city_country_map.update(region)
        if not self.city_country_map:
            self.city_country_map = self.config.get("city_country_map", {})

    def build_structured_resume_from_spans(self, spans: list[Span], raw_text: str = "") -> dict:
        spans = [Span(**{**span.__dict__, "text": self.clean_entity(span.label, span.text) or ""}) for span in spans]
        spans = [span for span in spans if span.text]
        spans = self.apply_post_processing(spans)
        grouped = self.group_into_entries(spans)
        years = self.compute_years(grouped["experience"])
        seniority = self.infer_seniority(grouped["experience"], years)
        country = self.infer_country(grouped["personal"].get("location"), grouped["personal"].get("phone"))
        return {
            "personal": {
                **grouped["personal"],
                "name": self.clean_spaces(grouped["personal"].get("name")) if grouped["personal"].get("name") else None,
            },
            "experience": grouped["experience"],
            "education": grouped["education"],
            "skills": [self.clean_spaces(skill) for skill in grouped["skills"]],
            "certifications": grouped["certifications"],
            "seniority": seniority,
            "country": country,
            "experience_years": years,
            "_rawText": raw_text,
        }

    @staticmethod
    def _merge_same_label_neighbors(spans: list[Span], labels: set[str], max_gap: int = 3) -> list[Span]:
        merged: list[Span] = []
        for span in spans:
            if merged and span.label in labels and merged[-1].label == span.label and span.start - merged[-1].end <= max_gap:
                merged[-1] = Span(
                    label=merged[-1].label,
                    text=f"{merged[-1].text} {span.text}",
                    start=merged[-1].start,
                    end=span.end,
                    bio=merged[-1].bio,
                    score=max(merged[-1].score, span.score),
                )
            else:
                merged.append(span)
        return merged

    def clean_entity(self, label: str, raw: str) -> str | None:
        cleaned = re.sub(r"\s+", " ", raw).strip()
        cleaned = re.sub(r"^[,.;:|/\-\s]+|[,.;:|/\-\s]+$", "", cleaned)
        if not cleaned or (len(cleaned) == 1 and not re.search(r"[a-zA-Z]", cleaned)):
            return None
        if re.fullmatch(r"[\W_]+", cleaned):
            return None

        rules = self.entity_rules.get(label, {})

        if label == "EMAIL":
            cleaned = re.sub(r"\s+", "", cleaned)
            for prefix in rules.get("strip_prefixes", []):
                cleaned = re.sub(rf"^{re.escape(prefix)}\s*", "", cleaned, flags=re.I)
            cleaned = re.sub(r"^[^a-zA-Z0-9]+", "", cleaned)
            if rules.get("require") and rules["require"] not in cleaned:
                return None
            for pattern in rules.get("reject_patterns", []):
                if pattern.lower() in cleaned.lower():
                    return None
        elif label == "SKILL":
            cleaned = re.sub(r"[,.;:]+$", "", cleaned)
        elif label == "COMPANY":
            if rules.get("strip_trailing_state_code"):
                cleaned = re.sub(r",?\s+[A-Z]{2}$", "", cleaned).strip()
        elif label == "DATE":
            cleaned = re.sub(r"^[| ]+|[| ]+$", "", cleaned)

        if not cleaned:
            return None

        min_len = rules.get("min_length", 2)
        exceptions = {e.lower() for e in rules.get("exceptions", [])}
        blocked = {w.lower() for w in rules.get("blocked_words", [])}

        if cleaned.lower() in blocked:
            return None

        if cleaned.lower() in exceptions:
            return cleaned

        if len(cleaned) < min_len:
            if rules.get("gazetteer_bypass") and cleaned.lower() in self.companies:
                return cleaned
            if rules.get("uppercase_bypass") and cleaned.isupper():
                return cleaned
            return None

        return cleaned

    def apply_post_processing(self, spans: list[Span]) -> list[Span]:
        spans = self._merge_same_label_neighbors(spans, self.span_merge_labels, self.span_merge_max_gap)
        result = [Span(**{**span.__dict__, "label": "COMPANY"}) if span.label == "TITLE" and span.text.lower().strip() in self.companies else span for span in spans]

        stripped = []
        for span in result:
            if span.label != "COMPANY":
                stripped.append(span)
                continue
            words = span.text.split()
            while len(words) > 1 and (words[-1].lower() in self.date_words or re.fullmatch(r"\d{4}", words[-1])):
                words.pop()
            stripped.append(Span(**{**span.__dict__, "text": " ".join(words)}))

        result = []
        for span in stripped:
            if span.label != "TITLE":
                result.append(span)
                continue
            words = span.text.split()
            split_done = False
            for length in range(min(self.gazetteer_match_max_words, len(words)), 0, -1):
                prefix = " ".join(words[:length])
                if prefix.lower() in self.companies:
                    result.append(Span(**{**span.__dict__, "label": "COMPANY", "text": prefix}))
                    suffix = " ".join(words[length:])
                    if len(suffix) > 1:
                        result.append(Span(**{**span.__dict__, "label": "TITLE", "text": suffix}))
                    split_done = True
                    break
            if not split_done:
                result.append(span)

        merged = []
        i = 0
        while i < len(result):
            current = result[i]
            if current.label == "SKILL" and i + 1 < len(result) and result[i + 1].label == "SKILL":
                combined = f"{current.text} {result[i + 1].text}".rstrip(",.")
                if combined.lower() in self.multi_word_skills:
                    merged.append(Span(**{**current.__dict__, "text": combined, "end": result[i + 1].end}))
                    i += 2
                    continue
            merged.append(current)
            i += 1
        return merged

    def normalize_skill(self, text: str) -> str:
        normalized = self.clean_spaces(text.strip().rstrip(",."))
        alias = self.skill_aliases.get(normalized.lower())
        return alias if alias else normalized

    def normalize_certification(self, text: str) -> str:
        normalized = self.clean_spaces(text.strip().rstrip(",."))
        normalized = re.sub(r"^the\s+", "", normalized, flags=re.I)
        alias = self.cert_aliases.get(normalized.lower())
        return alias if alias else normalized

    @staticmethod
    def _dedupe_dict_items(items: list[dict]) -> list[dict]:
        seen = set()
        deduped = []
        for item in items:
            key = tuple(sorted((k, v) for k, v in item.items() if v))
            if key and key not in seen:
                seen.add(key)
                deduped.append(item)
        return deduped

    def clean_experiences(self, experiences: list[dict]) -> list[dict]:
        cleaned = []
        for exp in experiences:
            exp = {k: self.clean_spaces(v) if isinstance(v, str) else v for k, v in exp.items() if v}
            if "title" in exp and "company" not in exp:
                for sep in self.title_company_separators:
                    if sep.lower() in exp["title"].lower():
                        title, company = re.split(re.escape(sep.strip()), exp["title"], maxsplit=1, flags=re.I)
                        exp["title"] = self.clean_spaces(title)
                        exp["company"] = self.clean_spaces(company)
                        break
            cleaned.append(exp)
        if any(exp.get("company") or exp.get("start_date") for exp in cleaned[1:]):
            while cleaned and cleaned[0].get("title") and not cleaned[0].get("company") and not cleaned[0].get("start_date"):
                cleaned.pop(0)
        return self._dedupe_dict_items(cleaned)

    def clean_education(self, education: list[dict]) -> list[dict]:
        cleaned = []
        for edu in education:
            item = {k: self.clean_spaces(v) if isinstance(v, str) else v for k, v in edu.items() if v}
            if item:
                cleaned.append(item)
        return self._dedupe_dict_items(cleaned)

    def group_into_entries(self, spans: list[Span]) -> dict:
        personal = {"name": None, "email": None, "phone": None, "location": None}
        for span in spans:
            if span.label == "NAME" and not personal["name"]:
                personal["name"] = span.text
            elif span.label == "EMAIL" and not personal["email"]:
                cleaned = self.clean_entity("EMAIL", span.text)
                if cleaned:
                    personal["email"] = cleaned
            elif span.label == "PHONE" and not personal["phone"]:
                personal["phone"] = self.clean_phone(span.text)
            elif span.label == "LOCATION" and not personal["location"]:
                personal["location"] = self.clean_spaces(span.text)

        exp_spans = sorted([span for span in spans if span.label in {"TITLE", "COMPANY", "DATE"}], key=lambda span: span.start)
        experiences = []
        current = {}
        for span in exp_spans:
            if span.label == "TITLE":
                if current.get("title") and (current.get("company") or current.get("start_date")):
                    experiences.append(current)
                    current = {}
                current["title"] = self.clean_spaces(span.text)
            elif span.label == "COMPANY":
                if current.get("company") and (current.get("title") or current.get("start_date")):
                    experiences.append(current)
                    current = {}
                current["company"] = self.clean_spaces(self.clean_entity("COMPANY", span.text) or "")
            elif span.label == "DATE":
                date_text = re.sub(r"^[| ]+|[| ]+$", "", span.text)
                if not date_text:
                    continue
                present_pattern = "|".join(re.escape(w) for w in self.present_words)
                present_match = re.match(rf"^(.+?)\s+({present_pattern})$", date_text, flags=re.I)
                if present_match and not current.get("start_date"):
                    current["start_date"] = present_match.group(1).strip()
                    current["end_date"] = present_match.group(2)
                    continue
                if current.get("start_date") and not current.get("end_date") and re.fullmatch(r"[a-zA-Z]+", current["start_date"]) and re.match(r"^\d{4}", date_text):
                    year_match = re.match(r"^(\d{4})\s*(.*)", date_text)
                    if year_match:
                        current["start_date"] = f"{current['start_date']} {year_match.group(1)}"
                        if year_match.group(2):
                            current["end_date"] = year_match.group(2).strip()
                        continue
                if current.get("start_date") and current.get("end_date"):
                    if current.get("title") or current.get("company"):
                        experiences.append(current)
                        current = {}
                if not current.get("start_date"):
                    current["start_date"] = date_text
                elif not current.get("end_date"):
                    current["end_date"] = date_text
        if current.get("title") or current.get("company"):
            experiences.append(current)
        experiences = self.clean_experiences(experiences)

        edu_spans = sorted([span for span in spans if span.label in {"DEGREE", "FIELD", "INSTITUTION"}], key=lambda span: span.start)
        education = []
        current_edu = {}
        for span in edu_spans:
            if span.label == "DEGREE":
                if current_edu.get("degree"):
                    education.append(current_edu)
                    current_edu = {}
                current_edu["degree"] = self.clean_spaces(span.text)
            elif span.label == "FIELD":
                current_edu["field"] = self.clean_spaces(span.text)
            elif span.label == "INSTITUTION":
                current_edu["institution"] = re.sub(r",?\s*\d{4}\s*$", "", self.clean_spaces(span.text))
                education.append(current_edu)
                current_edu = {}
        if current_edu.get("degree") or current_edu.get("institution"):
            education.append(current_edu)
        education = self.clean_education(education)

        skill_rules = self.entity_rules.get("SKILL", {})
        skill_min = skill_rules.get("min_length", 2)
        skills = []
        seen = set()
        for span in spans:
            if span.label != "SKILL":
                continue
            for part in re.split(r",\s*", span.text):
                clean = self.normalize_skill(part)
                if not clean or clean.lower() in seen:
                    continue
                if len(clean) < skill_min and not clean.isupper() and clean.lower() not in {e.lower() for e in skill_rules.get("exceptions", [])}:
                    continue
                seen.add(clean.lower())
                skills.append(clean)

        certifications = []
        cert_seen = set()
        for span in spans:
            if span.label != "CERT":
                continue
            clean = self.normalize_certification(span.text)
            if len(clean) > 1 and clean.lower() not in cert_seen:
                cert_seen.add(clean.lower())
                certifications.append(clean)
        return {"personal": personal, "experience": experiences, "education": education, "skills": skills, "certifications": certifications}

    def infer_seniority(self, experiences: list[dict], years: int | None) -> str:
        keywords = self.config["seniority_keywords"]
        titles = [(exp.get("title") or "").lower() for exp in experiences if exp.get("title")]
        for level, level_keywords in keywords.items():
            for title in titles:
                for keyword in level_keywords:
                    if keyword in title:
                        return level
        if years is not None:
            bounds = self.config["seniority_by_years"]
            if years >= bounds["Staff"]:
                return "Staff"
            if years >= bounds["Senior"]:
                return "Senior"
            if years >= bounds["Mid"]:
                return "Mid"
            return "Junior"
        for level, min_count in sorted(self.seniority_by_exp_count.items(), key=lambda x: -x[1]):
            if len(experiences) >= min_count:
                return level
        return "Junior"

    def infer_country(self, location: str | None, phone: str | None) -> str | None:
        if phone:
            clean = re.sub(r"[\s\-()]", "", phone)
            for prefix, country in self.config["phone_country_prefixes"].items():
                if clean.startswith(prefix):
                    return country
        if location:
            loc = location.lower()
            for alias, country in self.country_name_aliases.items():
                if alias in loc:
                    return country
            for city, country in self.city_country_map.items():
                if city in loc:
                    return country
            for part in loc.replace(",", " ").split():
                if part.upper() in self.config["us_states"]:
                    return "United States"
        return None

    def compute_years(self, experiences: list[dict]) -> int | None:
        total_months = 0
        now = datetime.now()
        present_re = "|".join(re.escape(w) for w in self.present_words)
        for exp in experiences:
            if not exp.get("start_date"):
                continue
            start = self.parse_date(exp["start_date"])
            if not start:
                continue
            if not exp.get("end_date") or re.search(present_re, exp["end_date"], flags=re.I):
                end = now
            else:
                end = self.parse_date(exp["end_date"])
                if not end:
                    continue
            months = (end.year - start.year) * 12 + (end.month - start.month)
            if 0 < months < self.max_experience_months:
                total_months += months
        return total_months // 12 if total_months > 0 else None

    @staticmethod
    def parse_date(text: str) -> datetime | None:
        months = {
            "january": 1, "february": 2, "march": 3, "april": 4, "may": 5, "june": 6, "july": 7, "august": 8,
            "september": 9, "october": 10, "november": 11, "december": 12, "jan": 1, "feb": 2, "mar": 3,
            "apr": 4, "jun": 6, "jul": 7, "aug": 8, "sep": 9, "oct": 10, "nov": 11, "dec": 12,
        }
        lower = text.lower().strip()
        for name, month in months.items():
            match = re.search(rf"{name}\s+(\d{{4}})", lower)
            if match:
                return datetime(int(match.group(1)), month, 1)
        year_match = re.search(r"\b(19|20)\d{2}\b", text)
        if year_match:
            return datetime(int(year_match.group(0)), 6, 1)
        return None

    def clean_spaces(self, text: str) -> str:
        result = text
        for old, new in self.space_collapse_pairs:
            result = result.replace(old, new)
        return result.rstrip(",").strip()

    @staticmethod
    def clean_phone(phone: str) -> str:
        return re.sub(r"\s+", " ", re.sub(r"\+\s+", "+", re.sub(r"\s+-\s+", "-", re.sub(r"\s+\)", ")", re.sub(r"\(\s+", "(", phone))))).strip()


def build_text_and_spans(tokens: list[str], ner_tags: list[int], id2label: dict[int, str]) -> tuple[str, list[Span]]:
    text = ""
    positions = []
    for token in tokens:
        start = len(text)
        text += (" " if text else "") + token
        real_start = start + (1 if start else 0)
        positions.append((real_start, real_start + len(token)))
    spans = []
    current = None
    for i, token in enumerate(tokens):
        label = id2label[ner_tags[i]]
        if label == "O":
            if current:
                spans.append(current)
                current = None
            continue
        bio, base = label.split("-", 1)
        start, end = positions[i]
        if current is None or bio == "B" or current.label != base:
            if current:
                spans.append(current)
            current = Span(label=base, text=token, start=start, end=end, bio=bio, score=1.0)
        else:
            current.text += f" {token}"
            current.end = end
    if current:
        spans.append(current)
    return text, spans