Spaces:
Sleeping
Sleeping
| """Rule-based clause segmentation. | |
| Strategy: | |
| 1. Detect heading lines via a battery of regexes (Section / ARTICLE / numbered / all-caps) | |
| 2. Split the document at heading boundaries to form candidate clauses | |
| 3. For oversized candidates, sub-split by paragraph | |
| 4. Trim and drop boilerplate (signature blocks, page headers) | |
| Returns `CandidateClause` records with character spans into the original text so the UI | |
| can highlight them in the source document. | |
| """ | |
| from __future__ import annotations | |
| import re | |
| from dataclasses import dataclass | |
# Heading-delimited clauses longer than this many characters are sub-split by
# paragraph in `segment`.
MAX_CLAUSE_CHARS = 2500
# Candidate clauses shorter than this many characters (after trimming) are
# dropped.
MIN_CLAUSE_CHARS = 40
# Heading detectors used by `is_heading`: a line counts as a heading when any
# one of these patterns matches it.
_HEADING_PATTERNS = [
    # "Section"/"ARTICLE" keyword followed by digits or roman-numeral letters,
    # an optional "." or ":", and up to 80 further characters.
    re.compile(r"^\s*(?:SECTION|Section|ARTICLE|Article)\s+[\divxlcIVXLC]+[\.:]?\s*.{0,80}$"),
    # Numbered heading with up to three dotted sub-levels, terminated by
    # ".", ")" or ":" and followed by text (e.g. "2.1. Term").
    re.compile(r"^\s*\d{1,3}(?:\.\d{1,3}){0,3}[\.\):]\s+\S.{0,120}$"),
    # Lettered item with a single lower-case letter in parentheses: "(a) ...".
    re.compile(r"^\s*\([a-z]\)\s+\S.{0,120}$"),
    # Upper-case line: a capital letter followed by 4-80 characters drawn from
    # capitals, space, "&", "/" and "-".
    re.compile(r"^\s*[A-Z][A-Z &/\-]{4,80}$"),
    # Mixed-case label that starts with a capital and ends with a colon.
    re.compile(r"^\s*[A-Z][A-Za-z &/\-]{4,80}:\s*$"),
]
# Lower-case prefixes; `_is_substantive` rejects any clause whose trimmed,
# lower-cased text starts with one of them.
_BOILERPLATE_PREFIXES = (
    "page ",
    "this page intentionally left blank",
    "exhibit ",
    "appendix ",
)
| class CandidateClause: | |
| text: str | |
| span: tuple[int, int] | |
| heading: str | None = None | |
def is_heading(line: str) -> bool:
    """Return True when *line* looks like a clause heading.

    The line is trimmed first. Blank lines and lines longer than 140
    characters are never headings; otherwise the line is a heading if any
    pattern in ``_HEADING_PATTERNS`` matches it.
    """
    candidate = line.strip()
    if candidate and len(candidate) <= 140:
        for pattern in _HEADING_PATTERNS:
            if pattern.match(candidate):
                return True
    return False
def segment(text: str) -> list[CandidateClause]:
    """Split *text* into candidate clauses at heading boundaries.

    Each line recognised by ``is_heading`` starts a clause that runs up to the
    next heading (or the end of the text). Text before the first heading is
    split by paragraph, as is any clause longer than ``MAX_CLAUSE_CHARS``.
    Clauses shorter than ``MIN_CLAUSE_CHARS`` are skipped. When at least one
    heading is found, the result is filtered through ``_is_substantive``.

    Args:
        text: Full document text.

    Returns:
        Candidate clauses in document order, with spans expressed as
        character offsets into *text*. Blank input yields an empty list.
    """
    if not text.strip():
        return []

    # Start offset of every line. keepends=True keeps the running offset
    # aligned with ``text`` whatever line terminator the document uses.
    lines = text.splitlines(keepends=True)
    line_starts: list[int] = []
    cursor = 0
    for line in lines:
        line_starts.append(cursor)
        cursor += len(line)

    heading_indices = [i for i, line in enumerate(lines) if is_heading(line)]
    if not heading_indices:
        # NOTE(review): this path returns paragraphs without applying the
        # _is_substantive filter used below — confirm that is intended.
        return _paragraph_split(text, base_offset=0)

    candidates: list[CandidateClause] = []

    if heading_indices[0] > 0:
        preamble = text[: line_starts[heading_indices[0]]]
        if len(preamble.strip()) >= MIN_CLAUSE_CHARS:
            # Pass the unstripped slice: stripping it first would discard the
            # document's leading whitespace and shift every preamble span.
            candidates.extend(_paragraph_split(preamble, base_offset=0))

    # Each heading's clause ends where the next heading starts; the last one
    # runs to the end of the text.
    boundaries = [line_starts[i] for i in heading_indices] + [len(text)]
    for h_idx, h_start, end in zip(heading_indices, boundaries, boundaries[1:]):
        h_line = lines[h_idx].strip().rstrip(":")
        body = text[h_start:end].rstrip()
        if len(body) < MIN_CLAUSE_CHARS:
            continue
        if len(body) <= MAX_CLAUSE_CHARS:
            candidates.append(
                CandidateClause(text=body, span=(h_start, h_start + len(body)), heading=h_line)
            )
        else:
            # Oversized clause: fall back to paragraphs, each tagged with the
            # heading it was found under.
            for sub in _paragraph_split(body, base_offset=h_start):
                sub.heading = h_line
                candidates.append(sub)

    return [c for c in candidates if _is_substantive(c.text)]
def _paragraph_split(text: str, *, base_offset: int) -> list[CandidateClause]:
    """Split *text* on blank lines into paragraph-sized candidate clauses.

    Args:
        text: Text to split.
        base_offset: Offset of ``text[0]`` in the enclosing document; added to
            every span so spans index into that document.

    Returns:
        One ``CandidateClause`` per paragraph whose trimmed length is at least
        ``MIN_CLAUSE_CHARS``, in order. ``heading`` is left at its default.
    """
    out: list[CandidateClause] = []
    # Separators can be any length ("\n\n", "\n   \n", "\n\n\n", ...), so take
    # offsets from the actual match positions instead of assuming two
    # characters per separator.
    separators = [m.span() for m in re.finditer(r"\n\s*\n", text)]
    separators.append((len(text), len(text)))  # sentinel closing the last part

    part_start = 0
    for sep_start, sep_end in separators:
        part = text[part_start:sep_start]
        body = part.strip()
        if len(body) >= MIN_CLAUSE_CHARS:
            leading_ws = len(part) - len(part.lstrip())
            start = base_offset + part_start + leading_ws
            out.append(CandidateClause(text=body, span=(start, start + len(body))))
        part_start = sep_end
    return out
def _is_substantive(text: str) -> bool:
    """Return True when *text* looks like real clause content.

    The text is trimmed and lower-cased, then rejected if it is empty, shorter
    than ``MIN_CLAUSE_CHARS``, starts with one of ``_BOILERPLATE_PREFIXES``,
    or is under 120 characters and contains no period.
    """
    normalized = text.strip().lower()
    if not normalized or len(normalized) < MIN_CLAUSE_CHARS:
        return False
    if normalized.startswith(_BOILERPLATE_PREFIXES):
        return False
    # Short fragments without a single period are rejected.
    return "." in normalized or len(normalized) >= 120