Add section-aware chunked inference for resumes exceeding 512 tokens

Splits text at paragraph boundaries (double newlines) and packs sections
into chunks that fit within the model's context window. Character offsets
are mapped back to the original text. Falls back to single-pass inference for
short inputs. The benchmark now uses chunked inference; no regression on the validation set.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

Files changed (1) hide show

training/benchmark_structured.py +78 -7

training/benchmark_structured.py CHANGED Viewed

@@ -2,6 +2,7 @@ from __future__ import annotations
 import argparse
 import json
 from collections import Counter, defaultdict
 import torch
@@ -43,6 +44,82 @@ def predicted_spans_from_text(text: str, offset_mapping: list[tuple[int, int]],
     return text, spans
 def normalize_value(field: str, value: str | None) -> str | None:
     if not value:
         return None
@@ -153,13 +230,7 @@ def main() -> None:
         bucket = str(bucket_info["bucket"])
         bucket_totals[bucket]["examples"] += 1
-        tokenized = tokenizer(gold_text, return_tensors="pt", return_offsets_mapping=True, truncation=True, max_length=512)
-        encoded = {k: v for k, v in tokenized.items() if k in ALLOWED_INPUTS}
-        with torch.no_grad():
-            pred_ids = model(**encoded).logits.argmax(dim=-1).squeeze(0).cpu().tolist()
-        offsets = [tuple(pair) for pair in tokenized["offset_mapping"].squeeze(0).cpu().tolist()][1:-1]
-        pred_text, pred_spans = predicted_spans_from_text(gold_text, offsets, pred_ids[1:-1])
         pred_structured = postprocessor.build_structured_resume_from_spans(pred_spans, pred_text)
         gold_flat = flatten_resume(gold_structured)

 import argparse
 import json
+import re
 from collections import Counter, defaultdict
 import torch
     return text, spans
+def _split_into_sections(text: str) -> list[str]:
+    """Break resume text into paragraph blocks.
+    A run of two or more newlines counts as a single separator. Pieces that
+    are empty or contain only whitespace are dropped; kept pieces are
+    returned unmodified and in their original order.
+    """
+    sections: list[str] = []
+    for piece in re.split(r"\n{2,}", text):
+        # Skip the empty/whitespace-only fragments that leading, trailing or
+        # back-to-back separators leave behind.
+        if not piece.strip():
+            continue
+        sections.append(piece)
+    return sections
+def _section_char_spans(text: str) -> list[tuple[int, int]]:
+    """Return (start, end) character spans of the non-blank paragraph blocks in *text*.
+    Uses the same rule as `_split_into_sections` (separator is two or more
+    newlines, whitespace-only blocks are dropped) but keeps each block's
+    position in the original string instead of its text.
+    """
+    spans: list[tuple[int, int]] = []
+    cursor = 0
+    for separator in re.finditer(r"\n{2,}", text):
+        spans.append((cursor, separator.start()))
+        cursor = separator.end()
+    spans.append((cursor, len(text)))
+    return [(start, end) for start, end in spans if text[start:end].strip()]
+def _predict_spans_single_pass(text: str, model, tokenizer, max_length: int) -> tuple[str, list]:
+    """Tokenize *text* once (truncating at *max_length*) and decode the predicted spans."""
+    tokenized = tokenizer(text, return_tensors="pt", return_offsets_mapping=True, truncation=True, max_length=max_length)
+    encoded = {k: v for k, v in tokenized.items() if k in ALLOWED_INPUTS}
+    with torch.no_grad():
+        pred_ids = model(**encoded).logits.argmax(dim=-1).squeeze(0).cpu().tolist()
+    # The first and last positions are dropped from both offsets and predictions;
+    # presumably these are the special tokens the tokenizer adds - TODO confirm.
+    offsets = [tuple(pair) for pair in tokenized["offset_mapping"].squeeze(0).cpu().tolist()][1:-1]
+    return predicted_spans_from_text(text, offsets, pred_ids[1:-1])
+def chunked_predicted_spans(
+    text: str,
+    model,
+    tokenizer,
+    max_length: int = 512,
+) -> tuple[str, list]:
+    """Run inference with section-aware chunking for texts exceeding max_length.
+    Texts whose token count is at most *max_length* are handled in a single
+    pass. Longer texts are split at paragraph boundaries (two or more
+    newlines) and consecutive paragraphs are greedily packed into chunks whose
+    token count stays within *max_length*.
+    Every chunk is a verbatim slice of *text*, so a span's position inside a
+    chunk plus the chunk's start index is its exact position in *text*, no
+    matter how many newlines separate the paragraphs or whether a paragraph
+    occurs more than once.
+    NOTE: a single paragraph that by itself exceeds *max_length* still forms
+    its own chunk and is truncated by the tokenizer, so predictions for its
+    tail are lost.
+    Args:
+        text: Full document text.
+        model: Token-classification model; called with the tokenizer outputs
+            whose keys are in ALLOWED_INPUTS and read via ``.logits``.
+        tokenizer: Callable tokenizer supporting ``return_offsets_mapping``.
+        max_length: Maximum number of tokens per forward pass.
+    Returns:
+        A ``(text, spans)`` tuple; span offsets index into the original *text*.
+    """
+    num_tokens = len(tokenizer(text, truncation=False)["input_ids"])
+    if num_tokens <= max_length:
+        return _predict_spans_single_pass(text, model, tokenizer, max_length)
+    # Imported here (once, not per span) because only the chunked path rebuilds spans.
+    from training.structured_postprocess import Span
+    # Greedily pack consecutive sections; each chunk is recorded as a
+    # (start, end) slice of the original text.
+    chunk_bounds: list[tuple[int, int]] = []
+    chunk_start: int | None = None
+    chunk_end = 0
+    for start, end in _section_char_spans(text):
+        if chunk_start is None:
+            # The first section always opens a chunk, even if it is oversized.
+            chunk_start, chunk_end = start, end
+            continue
+        candidate = text[chunk_start:end]
+        if len(tokenizer(candidate, truncation=False)["input_ids"]) > max_length:
+            # Adding this section would overflow: close the chunk and start a new one.
+            chunk_bounds.append((chunk_start, chunk_end))
+            chunk_start = start
+        chunk_end = end
+    if chunk_start is not None:
+        chunk_bounds.append((chunk_start, chunk_end))
+    all_spans = []
+    for chunk_start, chunk_end in chunk_bounds:
+        _, spans = _predict_spans_single_pass(text[chunk_start:chunk_end], model, tokenizer, max_length)
+        # Shift chunk-relative offsets back into the coordinates of the full text.
+        all_spans.extend(
+            Span(
+                label=span.label,
+                text=span.text,
+                start=span.start + chunk_start,
+                end=span.end + chunk_start,
+                bio=span.bio,
+                score=span.score,
+            )
+            for span in spans
+        )
+    return text, all_spans
 def normalize_value(field: str, value: str | None) -> str | None:
     if not value:
         return None
         bucket = str(bucket_info["bucket"])
         bucket_totals[bucket]["examples"] += 1
+        pred_text, pred_spans = chunked_predicted_spans(gold_text, model, tokenizer)
         pred_structured = postprocessor.build_structured_resume_from_spans(pred_spans, pred_text)
         gold_flat = flatten_resume(gold_structured)