"""Rule-based clause segmentation. Strategy: 1. Detect heading lines via a battery of regexes (Section / ARTICLE / numbered / all-caps) 2. Split the document at heading boundaries to form candidate clauses 3. For oversized candidates, sub-split by paragraph 4. Trim and drop boilerplate (signature blocks, page headers) Returns `CandidateClause` records with character spans into the original text so the UI can highlight them in the source document. """ from __future__ import annotations import re from dataclasses import dataclass MAX_CLAUSE_CHARS = 2500 MIN_CLAUSE_CHARS = 40 _HEADING_PATTERNS = [ re.compile(r"^\s*(?:SECTION|Section|ARTICLE|Article)\s+[\divxlcIVXLC]+[\.:]?\s*.{0,80}$"), re.compile(r"^\s*\d{1,3}(?:\.\d{1,3}){0,3}[\.\):]\s+\S.{0,120}$"), re.compile(r"^\s*\([a-z]\)\s+\S.{0,120}$"), re.compile(r"^\s*[A-Z][A-Z &/\-]{4,80}$"), re.compile(r"^\s*[A-Z][A-Za-z &/\-]{4,80}:\s*$"), ] _BOILERPLATE_PREFIXES = ( "page ", "this page intentionally left blank", "exhibit ", "appendix ", ) @dataclass class CandidateClause: text: str span: tuple[int, int] heading: str | None = None def is_heading(line: str) -> bool: stripped = line.strip() if not stripped or len(stripped) > 140: return False return any(p.match(stripped) for p in _HEADING_PATTERNS) def segment(text: str) -> list[CandidateClause]: if not text.strip(): return [] lines = text.splitlines(keepends=True) cursor = 0 line_spans: list[tuple[int, int, str]] = [] for line in lines: line_spans.append((cursor, cursor + len(line), line)) cursor += len(line) heading_indices = [i for i, (_, _, line) in enumerate(line_spans) if is_heading(line)] if not heading_indices: return _paragraph_split(text, base_offset=0) candidates: list[CandidateClause] = [] if heading_indices[0] > 0: preamble_end = line_spans[heading_indices[0]][0] preamble = text[0:preamble_end].strip() if len(preamble) >= MIN_CLAUSE_CHARS: candidates.extend(_paragraph_split(preamble, base_offset=0)) for i, h_idx in enumerate(heading_indices): h_start = line_spans[h_idx][0] h_line = line_spans[h_idx][2].strip().rstrip(":") end = line_spans[heading_indices[i + 1]][0] if i + 1 < len(heading_indices) else len(text) body = text[h_start:end].rstrip() if len(body) < MIN_CLAUSE_CHARS: continue if len(body) <= MAX_CLAUSE_CHARS: candidates.append(CandidateClause(text=body, span=(h_start, h_start + len(body)), heading=h_line)) else: for sub in _paragraph_split(body, base_offset=h_start): sub.heading = h_line candidates.append(sub) return [c for c in candidates if _is_substantive(c.text)] def _paragraph_split(text: str, *, base_offset: int) -> list[CandidateClause]: out: list[CandidateClause] = [] parts = re.split(r"\n\s*\n", text) cursor = base_offset for part in parts: leading_ws = len(part) - len(part.lstrip()) body = part.strip() if not body: cursor += len(part) + 2 continue start = cursor + leading_ws end = start + len(body) if len(body) >= MIN_CLAUSE_CHARS: out.append(CandidateClause(text=body, span=(start, end))) cursor = end + 2 return out def _is_substantive(text: str) -> bool: lowered = text.strip().lower() if not lowered: return False if any(lowered.startswith(prefix) for prefix in _BOILERPLATE_PREFIXES): return False if len(lowered) < MIN_CLAUSE_CHARS: return False if lowered.count(".") < 1 and len(lowered) < 120: return False return True