"""Detect resume sections and extract entities from untagged regions. Rule-based section header detection + heuristic entity extraction for sections where NER model has gaps (especially SKILLS, CERTIFICATIONS, LANGUAGES, and EDUCATION). Runs AFTER NER inference and BEFORE structured post-processing. Fills in entities the model missed by using section context. """ from __future__ import annotations import re from dataclasses import dataclass from training.structured_postprocess import Span SECTION_PATTERNS: dict[str, list[str]] = { "skills": [ "skills", "technical skills", "core competencies", "competencies", "areas of expertise", "areas of excellence", "proficiencies", "technical proficiencies", "key skills", "professional skills", "summary of qualifications", "qualifications", "tools & technologies", "tools and technologies", "technologies", "tech stack", "devops tools & technologies", "devops tools", ], "experience": [ "experience", "work experience", "professional experience", "employment history", "work history", "career history", "professional background", "clinical experience", "teaching experience", ], "education": [ "education", "academic background", "academic qualifications", "educational background", "academic history", ], "certifications": [ "certifications", "licenses & certifications", "licenses", "professional certifications", "credentials", "certifications & licenses", "awards & certifications", ], "languages": [ "languages", "language skills", "linguistic skills", ], "projects": [ "projects", "personal projects", "key projects", "selected projects", ], } @dataclass class Section: name: str start: int end: int text: str def detect_sections(text: str) -> list[Section]: """Find section boundaries using header keywords.""" lines = text.split("\n") sections: list[Section] = [] char_pos = 0 line_positions = [] for line in lines: line_positions.append(char_pos) char_pos += len(line) + 1 header_lines: list[tuple[int, str]] = [] for i, line in enumerate(lines): stripped = line.strip().rstrip(":").lower() stripped = re.sub(r"[^a-z\s&]", "", stripped).strip() if not stripped or len(stripped) > 60: continue for section_name, patterns in SECTION_PATTERNS.items(): if stripped in patterns: header_lines.append((i, section_name)) break for idx, (line_idx, section_name) in enumerate(header_lines): start = line_positions[line_idx] if idx + 1 < len(header_lines): end = line_positions[header_lines[idx + 1][0]] else: end = len(text) section_text = text[start:end] sections.append(Section(name=section_name, start=start, end=end, text=section_text)) return sections def _extract_list_items(text: str) -> list[str]: """Extract items from bullet lists, comma/dash/pipe-separated text, or Category: items format.""" items = [] for line in text.split("\n"): line = line.strip() line = re.sub(r"^[-●•▪■▸►‣⁃]\s*", "", line) if not line or len(line) > 120: continue # Strip "Category:" prefix if present colon_match = re.match(r"^[A-Za-z\s&/()-]+:\s*(.+)$", line) if colon_match: line = colon_match.group(1) # Split by comma, pipe, dash (but not inside words like "C++") parts = re.split(r"\s*[,|]\s*|\s+-\s+|\s+\+\s+", line) for part in parts: part = part.strip().rstrip(".,;:") if 2 < len(part) < 50 and not part[0].islower(): items.append(part) elif 2 < len(part) < 50: items.append(part) # Also handle single bullet items if len(parts) == 1 and len(line) < 50 and not line.endswith("."): clean = line.strip().rstrip(".,;:") if 2 < len(clean) < 50 and clean not in items: items.append(clean) return items def _is_tagged(start: int, end: int, existing_spans: list[Span]) -> bool: """Check if a character range overlaps any existing span.""" for span in existing_spans: if span.start < end and span.end > start: return True return False def fill_missing_entities( text: str, spans: list[Span], sections: list[Section] | None = None, ) -> list[Span]: """Add entities from detected sections that NER model missed. Runs after NER inference. For each detected section, extracts candidate entities using heuristics and adds them if the model didn't tag that text region. """ if sections is None: sections = detect_sections(text) added: list[Span] = [] for section in sections: if section.name == "skills": items = _extract_list_items(section.text) first_line = section.text.split("\n")[0] for item in items: if item.lower() in first_line.lower(): continue idx = text.find(item, section.start) if idx == -1: continue if not _is_tagged(idx, idx + len(item), spans): added.append(Span( label="SKILL", text=item, start=idx, end=idx + len(item), bio="B", score=0.8, )) elif section.name == "certifications": for line in section.text.split("\n"): line = line.strip() line = re.sub(r"^[-●•▪■]\s*", "", line) if not line or len(line) < 5 or len(line) > 100: continue stripped_lower = re.sub(r"[^a-z\s&]", "", line.lower()).strip() is_header = any(stripped_lower == p for p in SECTION_PATTERNS["certifications"]) if is_header: continue idx = text.find(line, section.start) if idx == -1: continue if not _is_tagged(idx, idx + len(line), spans): added.append(Span( label="CERT", text=line, start=idx, end=idx + len(line), bio="B", score=0.8, )) elif section.name == "languages": for line in section.text.split("\n"): line = line.strip() line = re.sub(r"^[-●•▪■]\s*", "", line) if not line or len(line) < 3 or len(line) > 60: continue stripped_lower = re.sub(r"[^a-z\s]", "", line.lower()).strip() if stripped_lower in ("languages", "language skills", "linguistic skills"): continue lang_match = re.match(r"^([A-Z][a-z]+(?:\s[A-Z][a-z]+)?)", line) if lang_match: lang = lang_match.group(1) idx = text.find(lang, section.start) if idx != -1 and not _is_tagged(idx, idx + len(lang), spans): added.append(Span( label="LANGUAGE", text=lang, start=idx, end=idx + len(lang), bio="B", score=0.8, )) all_spans = spans + added all_spans.sort(key=lambda s: s.start) return all_spans