resume-ner / training /section_detector.py

Somasundaram Ayyappan

Add section detection for hybrid NER entity extraction

613cc9b 26 days ago

7.64 kB

	"""Detect resume sections and extract entities from untagged regions.

	Rule-based section header detection plus heuristic entity extraction for
	sections where the NER model has gaps (especially SKILLS, CERTIFICATIONS,
	LANGUAGES, and EDUCATION).

	Runs AFTER NER inference and BEFORE structured post-processing.
	Fills in entities the model missed by using section context.
	"""

	from __future__ import annotations

	import re
	from dataclasses import dataclass

	from training.structured_postprocess import Span

	# Canonical section name -> header phrases that introduce that section.
	# Phrases are stored lower-case; "&" is the only punctuation they contain.
	SECTION_PATTERNS: dict[str, list[str]] = {
	    "skills": [
	        # generic skill headings
	        "skills",
	        "technical skills",
	        "core competencies",
	        "competencies",
	        "areas of expertise",
	        "areas of excellence",
	        "proficiencies",
	        "technical proficiencies",
	        "key skills",
	        "professional skills",
	        "summary of qualifications",
	        "qualifications",
	        # tooling / technology headings
	        "tools & technologies",
	        "tools and technologies",
	        "technologies",
	        "tech stack",
	        "devops tools & technologies",
	        "devops tools",
	    ],
	    "experience": [
	        "experience",
	        "work experience",
	        "professional experience",
	        "employment history",
	        "work history",
	        "career history",
	        "professional background",
	        "clinical experience",
	        "teaching experience",
	    ],
	    "education": [
	        "education",
	        "academic background",
	        "academic qualifications",
	        "educational background",
	        "academic history",
	    ],
	    "certifications": [
	        "certifications",
	        "licenses & certifications",
	        "licenses",
	        "professional certifications",
	        "credentials",
	        "certifications & licenses",
	        "awards & certifications",
	    ],
	    "languages": [
	        "languages",
	        "language skills",
	        "linguistic skills",
	    ],
	    "projects": [
	        "projects",
	        "personal projects",
	        "key projects",
	        "selected projects",
	    ],
	}


	@dataclass
	class Section:
	    """One detected resume section, addressed by character offsets.

	    As populated by ``detect_sections``, ``start`` is the offset of the
	    header line, ``end`` is the offset of the next header (or the document
	    length for the last section), and ``text`` is ``document[start:end]``.
	    """

	    name: str  # canonical section key, e.g. "skills"
	    start: int  # inclusive start offset in the source document
	    end: int  # exclusive end offset in the source document
	    text: str  # the document slice covered by this section


	def detect_sections(text: str) -> list[Section]:
	    """Split *text* into sections at lines that look like known headers.

	    A line counts as a header when, after trimming, dropping trailing colons,
	    lower-casing and removing every character other than ``a-z``, whitespace
	    and ``&``, it equals one of the phrases in ``SECTION_PATTERNS``. Lines
	    that normalise to nothing or to more than 60 characters are ignored.

	    Each section runs from the first character of its header line up to the
	    first character of the next header line (or the end of *text*). Text that
	    precedes the first header is not part of any section.
	    """
	    raw_lines = text.split("\n")

	    # Offset of the first character of every line; the +1 is the "\n" that
	    # split() consumed.
	    line_starts: list[int] = []
	    offset = 0
	    for raw in raw_lines:
	        line_starts.append(offset)
	        offset += len(raw) + 1

	    # (line index, section name) for every line recognised as a header.
	    headers: list[tuple[int, str]] = []
	    for line_no, raw in enumerate(raw_lines):
	        key = raw.strip().rstrip(":").lower()
	        key = re.sub(r"[^a-z\s&]", "", key).strip()
	        if not key or len(key) > 60:
	            continue
	        # First section (in dict order) whose phrase list contains the key.
	        matched = next(
	            (name for name, phrases in SECTION_PATTERNS.items() if key in phrases),
	            None,
	        )
	        if matched is not None:
	            headers.append((line_no, matched))

	    # A section ends where the following one starts; the last ends at EOF.
	    starts = [line_starts[line_no] for line_no, _ in headers]
	    ends = starts[1:] + [len(text)]
	    return [
	        Section(name=name, start=begin, end=stop, text=text[begin:stop])
	        for (_, name), begin, stop in zip(headers, starts, ends)
	    ]


	def _extract_list_items(text: str) -> list[str]:
	"""Extract items from bullet lists, comma/dash/pipe-separated text, or Category: items format."""
	items = []
	for line in text.split("\n"):
	line = line.strip()
	line = re.sub(r"^[-●•▪■▸►‣⁃]\s*", "", line)
	if not line or len(line) > 120:
	continue
	# Strip "Category:" prefix if present
	colon_match = re.match(r"^[A-Za-z\s&/()-]+:\s*(.+)$", line)
	if colon_match:
	line = colon_match.group(1)
	# Split by comma, pipe, dash (but not inside words like "C++")
	parts = re.split(r"\s[,\|]\s\|\s+-\s+\|\s+\+\s+", line)
	for part in parts:
	part = part.strip().rstrip(".,;:")
	if 2 < len(part) < 50 and not part[0].islower():
	items.append(part)
	elif 2 < len(part) < 50:
	items.append(part)
	# Also handle single bullet items
	if len(parts) == 1 and len(line) < 50 and not line.endswith("."):
	clean = line.strip().rstrip(".,;:")
	if 2 < len(clean) < 50 and clean not in items:
	items.append(clean)
	return items


	def _is_tagged(start: int, end: int, existing_spans: list[Span]) -> bool:
	"""Check if a character range overlaps any existing span."""
	for span in existing_spans:
	if span.start < end and span.end > start:
	return True
	return False


	# One leading bullet glyph (plus following whitespace). Same glyph set that
	# _extract_list_items strips, so all section types treat bullets alike.
	_BULLET_PREFIX = re.compile(r"^[-●•▪■▸►‣⁃]\s*")


	def _record_if_untagged(
	    text: str,
	    needle: str,
	    label: str,
	    search_from: int,
	    search_to: int,
	    spans: list[Span],
	    added: list[Span],
	) -> int:
	    """Locate *needle* in ``text[search_from:search_to]`` and record it if free.

	    When the needle is found and its range does not overlap any span in
	    *spans*, a new ``Span`` with the given *label* is appended to *added*.

	    Returns:
	        The offset just past the match, or *search_from* unchanged when the
	        needle is not found. Callers feed this back in as the next
	        ``search_from`` so later needles resolve to later occurrences.
	    """
	    idx = text.find(needle, search_from, search_to)
	    if idx == -1:
	        return search_from
	    stop = idx + len(needle)
	    if not _is_tagged(idx, stop, spans):
	        added.append(Span(
	            label=label, text=needle,
	            start=idx, end=stop,
	            bio="B", score=0.8,
	        ))
	    return stop


	def fill_missing_entities(
	    text: str,
	    spans: list[Span],
	    sections: list[Section] | None = None,
	) -> list[Span]:
	    """Add entities from detected sections that the NER model missed.

	    Runs after NER inference. For each skills, certifications or languages
	    section, candidate entities are extracted heuristically and added when
	    the model did not already tag that character range. Other section types
	    are left untouched.

	    * skills: every item from ``_extract_list_items`` that is not a
	      case-insensitive substring of the section's first (header) line
	      becomes a ``SKILL``.
	    * certifications: every bullet-stripped line of 5 to 100 characters that
	      is not itself a certifications header becomes a ``CERT``.
	    * languages: for every bullet-stripped line of 3 to 60 characters that is
	      not a languages header, the leading one or two capitalised words
	      become a ``LANGUAGE``.

	    Candidates are located with a cursor that only moves forward and never
	    leaves the section, so a repeated item, or an item that is a prefix of an
	    earlier one ("Java" after "JavaScript"), maps to its own occurrence.

	    Args:
	        text: The full document the spans and sections index into.
	        spans: Spans already produced by the model. Not modified.
	        sections: Pre-computed sections; detected from *text* when ``None``.

	    Returns:
	        A new list holding the original and the added spans, sorted by start
	        offset.
	    """
	    if sections is None:
	        sections = detect_sections(text)

	    added: list[Span] = []

	    for section in sections:
	        # Restart the search window for every section.
	        cursor = section.start

	        if section.name == "skills":
	            header_lower = section.text.split("\n")[0].lower()
	            for item in _extract_list_items(section.text):
	                # The header line is itself returned as an item; skip it.
	                if item.lower() in header_lower:
	                    continue
	                cursor = _record_if_untagged(
	                    text, item, "SKILL", cursor, section.end, spans, added,
	                )

	        elif section.name == "certifications":
	            for raw in section.text.split("\n"):
	                line = _BULLET_PREFIX.sub("", raw.strip())
	                if not 5 <= len(line) <= 100:
	                    continue
	                normalized = re.sub(r"[^a-z\s&]", "", line.lower()).strip()
	                if normalized in SECTION_PATTERNS["certifications"]:
	                    continue
	                cursor = _record_if_untagged(
	                    text, line, "CERT", cursor, section.end, spans, added,
	                )

	        elif section.name == "languages":
	            for raw in section.text.split("\n"):
	                line = _BULLET_PREFIX.sub("", raw.strip())
	                if not 3 <= len(line) <= 60:
	                    continue
	                normalized = re.sub(r"[^a-z\s]", "", line.lower()).strip()
	                if normalized in SECTION_PATTERNS["languages"]:
	                    continue
	                # Only the leading capitalised word(s) name the language;
	                # anything after (e.g. a proficiency level) is ignored.
	                lang_match = re.match(r"^([A-Z][a-z]+(?:\s[A-Z][a-z]+)?)", line)
	                if lang_match:
	                    cursor = _record_if_untagged(
	                        text, lang_match.group(1), "LANGUAGE",
	                        cursor, section.end, spans, added,
	                    )

	    return sorted(spans + added, key=lambda s: s.start)