# resume-ner / training / section_detector.py
# Somasundaram Ayyappan
# Add section detection for hybrid NER entity extraction
# 613cc9b
"""Detect resume sections and extract entities from untagged regions.

Rule-based section header detection plus heuristic entity extraction for
sections where the NER model has gaps (especially SKILLS, CERTIFICATIONS,
LANGUAGES, and EDUCATION).

Runs AFTER NER inference and BEFORE structured post-processing.
Fills in entities the model missed by using section context.
"""
from __future__ import annotations
import re
from dataclasses import dataclass
from training.structured_postprocess import Span
# Canonical section name -> header phrases that introduce that section.
# Phrases are stored lower-case because header lines are lower-cased and
# stripped of punctuation (except "&") before being compared against them.
SECTION_PATTERNS: dict[str, list[str]] = {
    "skills": [
        "skills",
        "technical skills",
        "core competencies",
        "competencies",
        "areas of expertise",
        "areas of excellence",
        "proficiencies",
        "technical proficiencies",
        "key skills",
        "professional skills",
        "summary of qualifications",
        "qualifications",
        "tools & technologies",
        "tools and technologies",
        "technologies",
        "tech stack",
        "devops tools & technologies",
        "devops tools",
    ],
    "experience": [
        "experience",
        "work experience",
        "professional experience",
        "employment history",
        "work history",
        "career history",
        "professional background",
        "clinical experience",
        "teaching experience",
    ],
    "education": [
        "education",
        "academic background",
        "academic qualifications",
        "educational background",
        "academic history",
    ],
    "certifications": [
        "certifications",
        "licenses & certifications",
        "licenses",
        "professional certifications",
        "credentials",
        "certifications & licenses",
        "awards & certifications",
    ],
    "languages": [
        "languages",
        "language skills",
        "linguistic skills",
    ],
    "projects": [
        "projects",
        "personal projects",
        "key projects",
        "selected projects",
    ],
}
@dataclass
class Section:
    """One resume section, as produced by ``detect_sections``."""

    # Canonical section key, i.e. one of the keys of SECTION_PATTERNS.
    name: str
    # Character offset in the full text where the section's header line begins.
    start: int
    # Character offset where the next section's header line begins, or
    # len(text) for the last section (exclusive bound).
    end: int
    # The slice text[start:end], header line included.
    text: str
def detect_sections(text: str) -> list[Section]:
    """Find section boundaries using header keywords.

    A line is treated as a section header when, after lower-casing, removing
    every character other than ``a-z``, whitespace and ``&``, and collapsing
    runs of whitespace to a single space, it is exactly one of the phrases
    listed in ``SECTION_PATTERNS``. Collapsing whitespace lets headers such
    as ``"WORK   EXPERIENCE:"`` or tab-separated headers match.

    Each section starts at the first character of its header line and ends
    where the next header line starts (or at the end of ``text`` for the
    last section).

    Args:
        text: Full resume text, with lines separated by newline characters.

    Returns:
        The detected sections in document order; an empty list when no
        header line is recognised.
    """
    # Phrase -> section name, built once so each line needs a single lookup.
    # ``setdefault`` keeps the first section that declares a phrase, which is
    # the section a scan in dict order would have picked.
    header_lookup: dict[str, str] = {}
    for section_name, phrases in SECTION_PATTERNS.items():
        for phrase in phrases:
            header_lookup.setdefault(phrase, section_name)

    # (character offset of the header line, section name), in document order.
    headers: list[tuple[int, str]] = []
    offset = 0
    for line in text.split("\n"):
        line_start = offset
        offset += len(line) + 1  # +1 accounts for the newline removed by split

        letters_only = re.sub(r"[^a-z\s&]", "", line.lower())
        normalized = " ".join(letters_only.split())
        if not normalized or len(normalized) > 60:
            continue

        matched_name = header_lookup.get(normalized)
        if matched_name is not None:
            headers.append((line_start, matched_name))

    # A section runs up to the start of the following header, or to EOF.
    section_ends = [start for start, _ in headers[1:]] + [len(text)]
    return [
        Section(name=name, start=start, end=end, text=text[start:end])
        for (start, name), end in zip(headers, section_ends)
    ]
# Leading bullet glyph (or dash) plus any whitespace that follows it.
_BULLET_PREFIX_RE = re.compile(r"^[-●•▪■▸►‣⁃]\s*")
# "Category: item, item" -> captures everything after the first colon.
_CATEGORY_PREFIX_RE = re.compile(r"^[A-Za-z\s&/()-]+:\s*(.+)$")
# Commas and pipes always separate; "-" and "+" only when surrounded by
# whitespace, so tokens such as "C++" or "CI-CD" stay intact.
_ITEM_SEPARATOR_RE = re.compile(r"\s*[,|]\s*|\s+-\s+|\s+\+\s+")


def _extract_list_items(text: str) -> list[str]:
    """Extract items from bullet lists, separated lists, or "Category: items" lines.

    Every line is stripped, has a leading bullet removed, and is skipped when
    it is empty or longer than 120 characters. A ``Category:`` prefix is
    dropped, the remainder is split on commas, pipes, and whitespace-delimited
    ``-`` / ``+``, and trailing ``.,;:`` is trimmed from each piece.

    Args:
        text: Raw text of a section; may span several lines.

    Returns:
        The pieces whose length is between 3 and 49 characters inclusive, in
        the order they appear. Duplicates are kept; shorter tokens such as
        "Go" or "C#" are dropped by the length filter.
    """
    items: list[str] = []
    for raw_line in text.split("\n"):
        line = _BULLET_PREFIX_RE.sub("", raw_line.strip())
        if not line or len(line) > 120:
            continue

        category = _CATEGORY_PREFIX_RE.match(line)
        if category:
            line = category.group(1)

        for piece in _ITEM_SEPARATOR_RE.split(line):
            piece = piece.strip().rstrip(".,;:")
            # Capitalisation is deliberately not checked: lower-case items
            # are accepted exactly like capitalised ones.
            if 2 < len(piece) < 50:
                items.append(piece)
    return items
def _is_tagged(start: int, end: int, existing_spans: list[Span]) -> bool:
    """Report whether the range ``start``..``end`` intersects any existing span.

    Two ranges intersect when each one begins before the other one ends, so
    ranges that merely touch (one ends exactly where the other starts) are
    not considered overlapping. An empty span list yields ``False``.
    """
    return any(
        candidate.start < end and start < candidate.end
        for candidate in existing_spans
    )
def _section_entry_lines(section: Section, min_len: int, max_len: int) -> list[str]:
    """Return bullet-stripped lines of ``section`` within the length bounds, minus headers."""
    header_phrases = SECTION_PATTERNS[section.name]
    entries: list[str] = []
    for raw_line in section.text.split("\n"):
        line = re.sub(r"^[-●•▪■]\s*", "", raw_line.strip())
        if not min_len <= len(line) <= max_len:
            continue
        # Same normalisation as header detection, so the section's own header
        # line is never emitted as an entity.
        normalized = re.sub(r"[^a-z\s&]", "", line.lower()).strip()
        if normalized in header_phrases:
            continue
        entries.append(line)
    return entries


def _section_candidates(section: Section) -> tuple[str, list[str]] | None:
    """Return ``(label, candidate strings in document order)`` or ``None`` if unsupported."""
    if section.name == "skills":
        header_line = section.text.split("\n")[0].lower()
        # Drop anything that is just (part of) the header text itself.
        skills = [
            item
            for item in _extract_list_items(section.text)
            if item.lower() not in header_line
        ]
        return "SKILL", skills

    if section.name == "certifications":
        # Each remaining line is taken as one certification.
        return "CERT", _section_entry_lines(section, 5, 100)

    if section.name == "languages":
        languages: list[str] = []
        for line in _section_entry_lines(section, 3, 60):
            # One or two leading capitalised words, e.g. "Spanish" in
            # "Spanish - Fluent".
            lang_match = re.match(r"^([A-Z][a-z]+(?:\s[A-Z][a-z]+)?)", line)
            if lang_match:
                languages.append(lang_match.group(1))
        return "LANGUAGE", languages

    return None


def fill_missing_entities(
    text: str,
    spans: list[Span],
    sections: list[Section] | None = None,
) -> list[Span]:
    """Add entities from detected sections that the NER model missed.

    Runs after NER inference. For each ``skills``, ``certifications`` and
    ``languages`` section, candidate entity strings are extracted with
    heuristics, located in ``text`` and added as new spans (``bio="B"``,
    ``score=0.8``) when the located range overlaps neither a model span nor
    a span already added by this function. Other section names are ignored.

    Candidates are located left to right inside their own section: each
    search starts where the previous match ended and stops at
    ``section.end``. Repeated entries therefore resolve to distinct
    occurrences instead of all mapping to the first one, and a match can
    never fall in a later section.

    Args:
        text: Full resume text that the span offsets refer to.
        spans: Spans produced by the NER model; the list is not modified.
        sections: Pre-computed sections; when ``None`` they are computed
            with ``detect_sections(text)``.

    Returns:
        A new list holding the original and the added spans, sorted by
        ``start``.
    """
    if sections is None:
        sections = detect_sections(text)

    added: list[Span] = []
    for section in sections:
        extracted = _section_candidates(section)
        if extracted is None:
            continue
        label, candidates = extracted

        cursor = section.start
        for candidate in candidates:
            idx = text.find(candidate, cursor, section.end)
            if idx == -1:
                continue
            stop = idx + len(candidate)
            # Advance even when the range is already tagged, so the next
            # candidate is searched after this occurrence.
            cursor = stop
            if _is_tagged(idx, stop, spans) or _is_tagged(idx, stop, added):
                continue
            added.append(Span(
                label=label, text=candidate,
                start=idx, end=stop,
                bio="B", score=0.8,
            ))

    return sorted(spans + added, key=lambda s: s.start)