Add section detection for hybrid NER entity extraction

Rule-based section header detection identifies SKILLS, EXPERIENCE,
EDUCATION, CERTIFICATIONS, LANGUAGES, and PROJECTS sections.
Fills entities the NER model missed using section context:
- Skills: extracts from bullet/comma/dash/pipe-separated lists
- Certifications: extracts from cert section lines
- Languages: extracts language names from language section

Tested results:
- Muthu resume: 23 → 38 skills (added Docker, Kubernetes, Jenkins, etc.)
- Accounting resume: 0 → 14 skills (was completely missing)

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

Files changed (1) hide show

training/section_detector.py +202 -0

training/section_detector.py ADDED Viewed

	@@ -0,0 +1,202 @@

+"""Detect resume sections and extract entities from untagged regions.
+
+Rule-based section header detection (SKILLS, EXPERIENCE, EDUCATION,
+CERTIFICATIONS, LANGUAGES, PROJECTS) plus heuristic entity extraction for
+sections where the NER model has gaps. Extraction is implemented for
+SKILLS, CERTIFICATIONS, and LANGUAGES; the remaining sections are detected
+but no entities are extracted from them here.
+
+Runs AFTER NER inference and BEFORE structured post-processing.
+Fills in entities the model missed by using section context.
+"""
+from __future__ import annotations
+import re
+from dataclasses import dataclass
+from training.structured_postprocess import Span
+# Header phrases recognised for each resume section, keyed by section name.
+# detect_sections() lower-cases a candidate line and drops every character
+# other than a-z, whitespace and "&" before testing membership, so entries
+# must be written in that normalised form.
+SECTION_PATTERNS: dict[str, list[str]] = {
+    "skills": [
+        "skills",
+        "technical skills",
+        "core competencies",
+        "competencies",
+        "areas of expertise",
+        "areas of excellence",
+        "proficiencies",
+        "technical proficiencies",
+        "key skills",
+        "professional skills",
+        "summary of qualifications",
+        "qualifications",
+        "tools & technologies",
+        "tools and technologies",
+        "technologies",
+        "tech stack",
+        "devops tools & technologies",
+        "devops tools",
+    ],
+    "experience": [
+        "experience",
+        "work experience",
+        "professional experience",
+        "employment history",
+        "work history",
+        "career history",
+        "professional background",
+        "clinical experience",
+        "teaching experience",
+    ],
+    "education": [
+        "education",
+        "academic background",
+        "academic qualifications",
+        "educational background",
+        "academic history",
+    ],
+    "certifications": [
+        "certifications",
+        "licenses & certifications",
+        "licenses",
+        "professional certifications",
+        "credentials",
+        "certifications & licenses",
+        "awards & certifications",
+    ],
+    "languages": [
+        "languages",
+        "language skills",
+        "linguistic skills",
+    ],
+    "projects": [
+        "projects",
+        "personal projects",
+        "key projects",
+        "selected projects",
+    ],
+}
+@dataclass
+class Section:
+    """One detected resume section.
+
+    Instances built by ``detect_sections`` satisfy
+    ``text == source[start:end]``, where ``source`` is the string that was
+    scanned: the slice begins at the header line and stops where the next
+    detected header starts (or at the end of the input).
+    """
+
+    # Section key, e.g. "skills" (a SECTION_PATTERNS key when produced by
+    # detect_sections).
+    name: str
+    # Character offset of the first character of the header line.
+    start: int
+    # Exclusive end offset of the section.
+    end: int
+    # Raw text of the section, header line included.
+    text: str
+def detect_sections(text: str) -> list[Section]:
+    """Find section boundaries using header keywords.
+
+    A line counts as a section header when, after normalisation, it is
+    exactly one of the phrases in ``SECTION_PATTERNS``. Normalisation
+    lower-cases the line, removes every character other than a-z,
+    whitespace and "&", and collapses runs of whitespace to single spaces,
+    so "SKILLS:", "1. Skills" and "Technical   Skills" are all recognised.
+
+    Args:
+        text: Full resume text; lines are separated by "\n".
+
+    Returns:
+        Sections in document order. Each section starts at the first
+        character of its header line and runs up to the next detected
+        header (or the end of ``text``). Text before the first header
+        belongs to no section; the list is empty if no header is found.
+    """
+    lines = text.split("\n")
+    # Offset of the first character of every line within ``text``.
+    line_positions: list[int] = []
+    char_pos = 0
+    for line in lines:
+        line_positions.append(char_pos)
+        char_pos += len(line) + 1  # +1 for the "\n" consumed by split()
+    header_lines: list[tuple[int, str]] = []
+    for i, line in enumerate(lines):
+        stripped = re.sub(r"[^a-z\s&]", "", line.strip().rstrip(":").lower())
+        # Collapse tabs / repeated spaces (common in PDF-extracted text, and
+        # also left behind when punctuation between words is removed) so the
+        # comparison against the single-spaced phrases can succeed.
+        stripped = " ".join(stripped.split())
+        if not stripped or len(stripped) > 60:
+            continue
+        for section_name, patterns in SECTION_PATTERNS.items():
+            if stripped in patterns:
+                header_lines.append((i, section_name))
+                break
+    sections: list[Section] = []
+    for idx, (line_idx, section_name) in enumerate(header_lines):
+        start = line_positions[line_idx]
+        if idx + 1 < len(header_lines):
+            # The section ends where the next header line begins.
+            end = line_positions[header_lines[idx + 1][0]]
+        else:
+            end = len(text)
+        sections.append(
+            Section(name=section_name, start=start, end=end, text=text[start:end])
+        )
+    return sections
+def _extract_list_items(text: str) -> list[str]:
+    """Split list-like text into candidate items.
+
+    Each line is processed independently:
+
+    1. Surrounding whitespace and one leading bullet glyph are removed.
+    2. Empty lines and lines longer than 120 characters are ignored.
+    3. A leading ``Category:`` label is dropped, keeping what follows it.
+    4. The remainder is split on commas, pipes, and on ``-`` / ``+`` that
+       have whitespace on both sides (so "C++" and "scikit-learn" stay
+       in one piece).
+    5. Trailing ``.,;:`` is trimmed from each piece; pieces of 3 to 49
+       characters are kept.
+
+    Args:
+        text: Raw text of a section (may span several lines).
+
+    Returns:
+        Items in order of appearance. Duplicates are not removed and no
+        capitalisation filter is applied.
+    """
+    items: list[str] = []
+    for raw_line in text.split("\n"):
+        line = re.sub(r"^[-●•▪■▸►‣⁃]\s*", "", raw_line.strip())
+        if not line or len(line) > 120:
+            continue
+        # Strip "Category:" prefix if present
+        colon_match = re.match(r"^[A-Za-z\s&/()-]+:\s*(.+)$", line)
+        if colon_match:
+            line = colon_match.group(1)
+        # A line without any separator yields a single part, so plain
+        # one-item bullets need no special handling.
+        for part in re.split(r"\s*[,|]\s*|\s+-\s+|\s+\+\s+", line):
+            part = part.strip().rstrip(".,;:")
+            if 2 < len(part) < 50:
+                items.append(part)
+    return items
+def _is_tagged(start: int, end: int, existing_spans: list[Span]) -> bool:
+    """Return True if ``[start, end)`` overlaps at least one existing span.
+
+    Both the queried range and the spans are treated as half-open
+    intervals, so ranges that merely touch end-to-start do not overlap.
+    """
+    return any(
+        other.start < end and other.end > start for other in existing_spans
+    )
+# Entity label emitted for each section that supports gap filling.
+_FILL_LABELS = {
+    "skills": "SKILL",
+    "certifications": "CERT",
+    "languages": "LANGUAGE",
+}
+_HEADER_NOISE_RE = re.compile(r"[^a-z\s&]")
+_FILL_BULLET_RE = re.compile(r"^[-●•▪■]\s*")
+_LANGUAGE_NAME_RE = re.compile(r"^([A-Z][a-z]+(?:\s[A-Z][a-z]+)?)")
+
+
+def _is_header_line(line: str, section_name: str) -> bool:
+    """Return True if *line* normalises to a header phrase of *section_name*."""
+    normalised = " ".join(_HEADER_NOISE_RE.sub("", line.lower()).split())
+    return normalised in SECTION_PATTERNS.get(section_name, ())
+
+
+def _section_body(section: Section) -> tuple[str, int]:
+    """Return the section text without its header line, plus the body's offset."""
+    first_line, newline, rest = section.text.partition("\n")
+    if _is_header_line(first_line, section.name):
+        return rest, section.start + len(first_line) + len(newline)
+    # Caller-supplied sections may not begin with a header line.
+    return section.text, section.start
+
+
+def _content_lines(
+    body: str, section_name: str, min_len: int, max_len: int
+) -> list[str]:
+    """Return de-bulleted, non-header lines of *body* within the length bounds."""
+    kept: list[str] = []
+    for raw_line in body.split("\n"):
+        line = _FILL_BULLET_RE.sub("", raw_line.strip())
+        if not min_len <= len(line) <= max_len:
+            continue
+        if _is_header_line(line, section_name):
+            continue
+        kept.append(line)
+    return kept
+
+
+def fill_missing_entities(
+    text: str,
+    spans: list[Span],
+    sections: list[Section] | None = None,
+) -> list[Span]:
+    """Add entities from detected sections that the NER model missed.
+
+    Runs after NER inference. Candidates are extracted heuristically from
+    the skills, certifications and languages sections and added when their
+    character range overlaps neither an existing span nor a span added
+    earlier in this call.
+
+    * skills: every item returned by ``_extract_list_items``.
+    * certifications: every line of 5 to 100 characters.
+    * languages: the leading capitalised name (one or two words) of every
+      line of 3 to 60 characters.
+
+    Candidates are located with a cursor that only moves forward through
+    the section, so a repeated item maps to its own occurrence and a short
+    item ("Java") is not anchored inside an earlier, longer one
+    ("JavaScript").
+
+    Args:
+        text: Full resume text that the span offsets refer to.
+        spans: Spans produced by the NER model; the list is not modified.
+        sections: Pre-computed sections; detected from ``text`` when None.
+
+    Returns:
+        A new list holding the original and the added spans, sorted by
+        start offset. Added spans carry ``bio="B"`` and ``score=0.8``.
+    """
+    if sections is None:
+        sections = detect_sections(text)
+    added: list[Span] = []
+    for section in sections:
+        label = _FILL_LABELS.get(section.name)
+        if label is None:
+            continue
+        # Drop the header line itself rather than filtering items that
+        # happen to be substrings of it, which would discard real entries
+        # such as "DevOps" under a "DevOps Tools" header.
+        body, cursor = _section_body(section)
+        if section.name == "skills":
+            candidates = _extract_list_items(body)
+        elif section.name == "certifications":
+            candidates = _content_lines(body, section.name, 5, 100)
+        else:
+            candidates = []
+            for line in _content_lines(body, section.name, 3, 60):
+                lang_match = _LANGUAGE_NAME_RE.match(line)
+                if lang_match:
+                    candidates.append(lang_match.group(1))
+        for candidate in candidates:
+            idx = text.find(candidate, cursor)
+            if idx == -1:
+                continue
+            end = idx + len(candidate)
+            # Candidates arrive in document order, so the next one cannot
+            # start before the end of this one.
+            cursor = end
+            if _is_tagged(idx, end, spans) or _is_tagged(idx, end, added):
+                continue
+            added.append(Span(
+                label=label, text=candidate,
+                start=idx, end=end,
+                bio="B", score=0.8,
+            ))
+    all_spans = spans + added
+    all_spans.sort(key=lambda s: s.start)
+    return all_spans