# lexguard-backend/app/services/segmenter.py - LexGuard backend (revision c34b339)
"""Rule-based clause segmentation.
Strategy:
1. Detect heading lines via a battery of regexes (Section / ARTICLE / numbered / all-caps)
2. Split the document at heading boundaries to form candidate clauses
3. For oversized candidates, sub-split by paragraph
4. Trim and drop boilerplate (signature blocks, page headers)
Returns `CandidateClause` records with character spans into the original text so the UI
can highlight them in the source document.
"""
from __future__ import annotations
import re
from dataclasses import dataclass
# Heading-delimited bodies longer than this are handed to paragraph splitting.
MAX_CLAUSE_CHARS = 2500
# Bodies and paragraphs shorter than this are discarded.
MIN_CLAUSE_CHARS = 40

# Heading detectors. Each pattern is applied to one stripped line; a line is
# a heading when any of them matches.
_HEADING_PATTERNS = [
    re.compile(source)
    for source in (
        # "Section 4", "ARTICLE IV: Term" - keyword, arabic/roman number, short title.
        r"^\s*(?:SECTION|Section|ARTICLE|Article)\s+[\divxlcIVXLC]+[\.:]?\s*.{0,80}$",
        # "1. Definitions", "2.3) Payment", "10.1.2: Notices" - dotted numbering.
        r"^\s*\d{1,3}(?:\.\d{1,3}){0,3}[\.\):]\s+\S.{0,120}$",
        # "(a) the Supplier shall ..." - lettered sub-clause.
        r"^\s*\([a-z]\)\s+\S.{0,120}$",
        # "CONFIDENTIALITY", "TERM & TERMINATION" - all-caps line.
        r"^\s*[A-Z][A-Z &/\-]{4,80}$",
        # "Governing Law:" - capitalised label ending in a colon.
        r"^\s*[A-Z][A-Za-z &/\-]{4,80}:\s*$",
    )
]

# Lower-case prefixes; a candidate whose lower-cased text starts with one of
# these is treated as boilerplate by ``_is_substantive``.
_BOILERPLATE_PREFIXES = (
    "page ",
    "this page intentionally left blank",
    "exhibit ",
    "appendix ",
)
@dataclass
class CandidateClause:
    """A candidate clause produced by the segmenter.

    Instances are plain mutable records; ``segment`` assigns ``heading``
    after construction for clauses produced by paragraph sub-splitting.
    """

    # Clause text with surrounding whitespace trimmed.
    text: str
    # (start, end) character offsets of the clause in the source text.
    span: tuple[int, int]
    # Heading line the clause was found under; None when there is none.
    heading: str | None = None
def is_heading(line: str) -> bool:
    """Return True when *line* looks like a clause heading.

    The line is stripped first. Blank lines and lines longer than 140
    characters are never headings; any other line is a heading when at
    least one pattern in ``_HEADING_PATTERNS`` matches it.
    """
    candidate = line.strip()
    if candidate and len(candidate) <= 140:
        for pattern in _HEADING_PATTERNS:
            if pattern.match(candidate):
                return True
    return False
def segment(text: str) -> list[CandidateClause]:
    """Split *text* into candidate clauses at heading boundaries.

    Each heading line (see ``is_heading``) starts a section that runs up to
    the next heading or the end of the text. Text before the first heading
    is split by paragraph. Sections shorter than ``MIN_CLAUSE_CHARS`` are
    dropped, sections longer than ``MAX_CLAUSE_CHARS`` are sub-split by
    paragraph, and the result is filtered through ``_is_substantive``.

    Args:
        text: Full document text.

    Returns:
        Candidate clauses in document order. Every ``span`` is a
        ``(start, end)`` pair of character offsets into *text*. Returns an
        empty list for empty or whitespace-only input.
    """
    if not text.strip():
        return []

    # keepends=True keeps the line terminators, so the line lengths add up
    # to len(text) and the running total is each line's absolute offset.
    lines = text.splitlines(keepends=True)
    line_starts: list[int] = []
    offset = 0
    for line in lines:
        line_starts.append(offset)
        offset += len(line)

    heading_indices = [i for i, line in enumerate(lines) if is_heading(line)]
    if not heading_indices:
        # NOTE(review): this fallback is not passed through _is_substantive,
        # unlike the heading path below - confirm that is intentional.
        return _paragraph_split(text, base_offset=0)

    candidates: list[CandidateClause] = []

    # Preamble: everything before the first heading. The unstripped slice is
    # passed on so that leading whitespace still counts towards the spans;
    # stripping it first would shift every preamble span to the left.
    preamble = text[: line_starts[heading_indices[0]]]
    if len(preamble.strip()) >= MIN_CLAUSE_CHARS:
        candidates.extend(_paragraph_split(preamble, base_offset=0))

    section_starts = [line_starts[i] for i in heading_indices]
    section_ends = section_starts[1:] + [len(text)]
    for h_idx, h_start, end in zip(heading_indices, section_starts, section_ends):
        h_line = lines[h_idx].strip().rstrip(":")
        # Only trailing whitespace is removed, so the body still begins at h_start.
        body = text[h_start:end].rstrip()
        if len(body) < MIN_CLAUSE_CHARS:
            continue
        if len(body) <= MAX_CLAUSE_CHARS:
            candidates.append(
                CandidateClause(text=body, span=(h_start, h_start + len(body)), heading=h_line)
            )
            continue
        # Oversized section: fall back to paragraphs, all tagged with the heading.
        for sub in _paragraph_split(body, base_offset=h_start):
            sub.heading = h_line
            candidates.append(sub)

    return [c for c in candidates if _is_substantive(c.text)]
def _paragraph_split(text: str, *, base_offset: int) -> list[CandidateClause]:
    """Split *text* into paragraphs separated by blank lines.

    A separator is a newline, optional whitespace, then another newline.
    Each paragraph is stripped, and paragraphs shorter than
    ``MIN_CLAUSE_CHARS`` are dropped.

    Args:
        text: Text to split.
        base_offset: Offset of ``text[0]`` in the enclosing document; it is
            added to every span.

    Returns:
        One ``CandidateClause`` per kept paragraph, in order, with
        ``heading`` left as None. ``span`` is ``(start, end)`` such that
        ``text[start - base_offset:end - base_offset]`` equals the clause
        text.
    """
    out: list[CandidateClause] = []

    # Offsets come from the real separator matches. The separator can be
    # longer than two characters ("\n\n\n", "\n  \n", "\n\r\n"), so assuming
    # a fixed width would make later spans drift.
    boundaries = [(m.start(), m.end()) for m in re.finditer(r"\n\s*\n", text)]
    boundaries.append((len(text), len(text)))  # sentinel closing the last paragraph

    part_start = 0
    for sep_start, sep_end in boundaries:
        part = text[part_start:sep_start]
        body = part.strip()
        if len(body) >= MIN_CLAUSE_CHARS:
            leading_ws = len(part) - len(part.lstrip())
            start = base_offset + part_start + leading_ws
            out.append(CandidateClause(text=body, span=(start, start + len(body))))
        part_start = sep_end
    return out
def _is_substantive(text: str) -> bool:
    """Return True when *text* looks like real clause content.

    The text is stripped and lower-cased, then rejected when it is empty,
    starts with one of ``_BOILERPLATE_PREFIXES``, is shorter than
    ``MIN_CLAUSE_CHARS``, or contains no period while being shorter than
    120 characters.
    """
    normalized = text.strip().lower()
    if not normalized or normalized.startswith(_BOILERPLATE_PREFIXES):
        return False
    size = len(normalized)
    if size < MIN_CLAUSE_CHARS:
        return False
    # Short text with no period at all is treated as a title or label.
    return "." in normalized or size >= 120