Somasundaram Ayyappan Claude Opus 4.6 (1M context) commited on
Commit
613cc9b
·
1 Parent(s): b5f1c8a

Add section detection for hybrid NER entity extraction

Browse files

Rule-based section header detection identifies SKILLS, EXPERIENCE,
EDUCATION, CERTIFICATIONS, LANGUAGES, and PROJECTS sections.
Fills entities the NER model missed using section context:
- Skills: extracts from bullet/comma/dash/pipe-separated lists
- Certifications: extracts from cert section lines
- Languages: extracts language names from language section

Tested results:
- Muthu resume: 23 → 38 skills (added Docker, Kubernetes, Jenkins, etc.)
- Accounting resume: 0 → 14 skills (was completely missing)

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

Files changed (1) hide show
  1. training/section_detector.py +202 -0
training/section_detector.py ADDED
@@ -0,0 +1,202 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Detect resume sections and extract entities from untagged regions.
2
+
3
+ Rule-based section header detection + heuristic entity extraction for
4
+ sections where NER model has gaps (especially SKILLS, CERTIFICATIONS,
5
+ LANGUAGES, and EDUCATION).
6
+
7
+ Runs AFTER NER inference and BEFORE structured post-processing.
8
+ Fills in entities the model missed by using section context.
9
+ """
10
+
11
+ from __future__ import annotations
12
+
13
+ import re
14
+ from dataclasses import dataclass
15
+
16
+ from training.structured_postprocess import Span
17
+
18
+ SECTION_PATTERNS: dict[str, list[str]] = {
19
+ "skills": [
20
+ "skills", "technical skills", "core competencies", "competencies",
21
+ "areas of expertise", "areas of excellence", "proficiencies",
22
+ "technical proficiencies", "key skills", "professional skills",
23
+ "summary of qualifications", "qualifications", "tools & technologies",
24
+ "tools and technologies", "technologies", "tech stack",
25
+ "devops tools & technologies", "devops tools",
26
+ ],
27
+ "experience": [
28
+ "experience", "work experience", "professional experience",
29
+ "employment history", "work history", "career history",
30
+ "professional background", "clinical experience", "teaching experience",
31
+ ],
32
+ "education": [
33
+ "education", "academic background", "academic qualifications",
34
+ "educational background", "academic history",
35
+ ],
36
+ "certifications": [
37
+ "certifications", "licenses & certifications", "licenses",
38
+ "professional certifications", "credentials",
39
+ "certifications & licenses", "awards & certifications",
40
+ ],
41
+ "languages": [
42
+ "languages", "language skills", "linguistic skills",
43
+ ],
44
+ "projects": [
45
+ "projects", "personal projects", "key projects", "selected projects",
46
+ ],
47
+ }
48
+
49
+
50
+ @dataclass
51
+ class Section:
52
+ name: str
53
+ start: int
54
+ end: int
55
+ text: str
56
+
57
+
58
+ def detect_sections(text: str) -> list[Section]:
59
+ """Find section boundaries using header keywords."""
60
+ lines = text.split("\n")
61
+ sections: list[Section] = []
62
+ char_pos = 0
63
+ line_positions = []
64
+ for line in lines:
65
+ line_positions.append(char_pos)
66
+ char_pos += len(line) + 1
67
+
68
+ header_lines: list[tuple[int, str]] = []
69
+ for i, line in enumerate(lines):
70
+ stripped = line.strip().rstrip(":").lower()
71
+ stripped = re.sub(r"[^a-z\s&]", "", stripped).strip()
72
+ if not stripped or len(stripped) > 60:
73
+ continue
74
+ for section_name, patterns in SECTION_PATTERNS.items():
75
+ if stripped in patterns:
76
+ header_lines.append((i, section_name))
77
+ break
78
+
79
+ for idx, (line_idx, section_name) in enumerate(header_lines):
80
+ start = line_positions[line_idx]
81
+ if idx + 1 < len(header_lines):
82
+ end = line_positions[header_lines[idx + 1][0]]
83
+ else:
84
+ end = len(text)
85
+ section_text = text[start:end]
86
+ sections.append(Section(name=section_name, start=start, end=end, text=section_text))
87
+
88
+ return sections
89
+
90
+
91
+ def _extract_list_items(text: str) -> list[str]:
92
+ """Extract items from bullet lists, comma/dash/pipe-separated text, or Category: items format."""
93
+ items = []
94
+ for line in text.split("\n"):
95
+ line = line.strip()
96
+ line = re.sub(r"^[-●•▪■▸►‣⁃]\s*", "", line)
97
+ if not line or len(line) > 120:
98
+ continue
99
+ # Strip "Category:" prefix if present
100
+ colon_match = re.match(r"^[A-Za-z\s&/()-]+:\s*(.+)$", line)
101
+ if colon_match:
102
+ line = colon_match.group(1)
103
+ # Split by comma, pipe, dash (but not inside words like "C++")
104
+ parts = re.split(r"\s*[,|]\s*|\s+-\s+|\s+\+\s+", line)
105
+ for part in parts:
106
+ part = part.strip().rstrip(".,;:")
107
+ if 2 < len(part) < 50 and not part[0].islower():
108
+ items.append(part)
109
+ elif 2 < len(part) < 50:
110
+ items.append(part)
111
+ # Also handle single bullet items
112
+ if len(parts) == 1 and len(line) < 50 and not line.endswith("."):
113
+ clean = line.strip().rstrip(".,;:")
114
+ if 2 < len(clean) < 50 and clean not in items:
115
+ items.append(clean)
116
+ return items
117
+
118
+
119
+ def _is_tagged(start: int, end: int, existing_spans: list[Span]) -> bool:
120
+ """Check if a character range overlaps any existing span."""
121
+ for span in existing_spans:
122
+ if span.start < end and span.end > start:
123
+ return True
124
+ return False
125
+
126
+
127
+ def fill_missing_entities(
128
+ text: str,
129
+ spans: list[Span],
130
+ sections: list[Section] | None = None,
131
+ ) -> list[Span]:
132
+ """Add entities from detected sections that NER model missed.
133
+
134
+ Runs after NER inference. For each detected section, extracts
135
+ candidate entities using heuristics and adds them if the model
136
+ didn't tag that text region.
137
+ """
138
+ if sections is None:
139
+ sections = detect_sections(text)
140
+
141
+ added: list[Span] = []
142
+
143
+ for section in sections:
144
+ if section.name == "skills":
145
+ items = _extract_list_items(section.text)
146
+ first_line = section.text.split("\n")[0]
147
+ for item in items:
148
+ if item.lower() in first_line.lower():
149
+ continue
150
+ idx = text.find(item, section.start)
151
+ if idx == -1:
152
+ continue
153
+ if not _is_tagged(idx, idx + len(item), spans):
154
+ added.append(Span(
155
+ label="SKILL", text=item,
156
+ start=idx, end=idx + len(item),
157
+ bio="B", score=0.8,
158
+ ))
159
+
160
+ elif section.name == "certifications":
161
+ for line in section.text.split("\n"):
162
+ line = line.strip()
163
+ line = re.sub(r"^[-●•▪■]\s*", "", line)
164
+ if not line or len(line) < 5 or len(line) > 100:
165
+ continue
166
+ stripped_lower = re.sub(r"[^a-z\s&]", "", line.lower()).strip()
167
+ is_header = any(stripped_lower == p for p in SECTION_PATTERNS["certifications"])
168
+ if is_header:
169
+ continue
170
+ idx = text.find(line, section.start)
171
+ if idx == -1:
172
+ continue
173
+ if not _is_tagged(idx, idx + len(line), spans):
174
+ added.append(Span(
175
+ label="CERT", text=line,
176
+ start=idx, end=idx + len(line),
177
+ bio="B", score=0.8,
178
+ ))
179
+
180
+ elif section.name == "languages":
181
+ for line in section.text.split("\n"):
182
+ line = line.strip()
183
+ line = re.sub(r"^[-●•▪■]\s*", "", line)
184
+ if not line or len(line) < 3 or len(line) > 60:
185
+ continue
186
+ stripped_lower = re.sub(r"[^a-z\s]", "", line.lower()).strip()
187
+ if stripped_lower in ("languages", "language skills", "linguistic skills"):
188
+ continue
189
+ lang_match = re.match(r"^([A-Z][a-z]+(?:\s[A-Z][a-z]+)?)", line)
190
+ if lang_match:
191
+ lang = lang_match.group(1)
192
+ idx = text.find(lang, section.start)
193
+ if idx != -1 and not _is_tagged(idx, idx + len(lang), spans):
194
+ added.append(Span(
195
+ label="LANGUAGE", text=lang,
196
+ start=idx, end=idx + len(lang),
197
+ bio="B", score=0.8,
198
+ ))
199
+
200
+ all_spans = spans + added
201
+ all_spans.sort(key=lambda s: s.start)
202
+ return all_spans