"""Rule-based clause segmentation.

Strategy:
  1. Detect heading lines via a battery of regexes (Section / ARTICLE / numbered / all-caps)
  2. Split the document at heading boundaries to form candidate clauses
  3. For oversized candidates, sub-split by paragraph
  4. Trim and drop boilerplate (signature blocks, page headers)

Returns `CandidateClause` records with character spans into the original text so the UI
can highlight them in the source document.
"""

from __future__ import annotations

import re
from dataclasses import dataclass

# Heading-delimited candidates longer than this many characters are sub-split by paragraph.
MAX_CLAUSE_CHARS = 2500
# Candidates whose trimmed text is shorter than this many characters are dropped.
MIN_CLAUSE_CHARS = 40

# Regexes tried in order by `is_heading` against a single stripped line; any match marks
# the line as a heading.
_HEADING_PATTERNS = [
    # Keyword headings: "Section 3", "ARTICLE IV: Term" -- keyword, a number made of digits
    # and/or roman-numeral letters, optional "." or ":", then up to 80 further characters.
    re.compile(r"^\s*(?:SECTION|Section|ARTICLE|Article)\s+[\divxlcIVXLC]+[\.:]?\s*.{0,80}$"),
    # Numbered outline headings: "1. Scope", "2.3) Fees", "10.1.2: Notices" -- up to four
    # dot-separated groups of 1-3 digits, a closing ".", ")" or ":", then a title.
    re.compile(r"^\s*\d{1,3}(?:\.\d{1,3}){0,3}[\.\):]\s+\S.{0,120}$"),
    # Lettered items: "(a) Payment terms" -- a single lowercase letter in parentheses.
    re.compile(r"^\s*\([a-z]\)\s+\S.{0,120}$"),
    # All-caps lines: an uppercase letter followed by 4-80 uppercase letters, spaces,
    # "&", "/" or "-".
    re.compile(r"^\s*[A-Z][A-Z &/\-]{4,80}$"),
    # Label lines ending in a colon: "Governing Law:" -- letters, spaces, "&", "/" or "-"
    # only, starting with an uppercase letter.
    re.compile(r"^\s*[A-Z][A-Za-z &/\-]{4,80}:\s*$"),
]

# Lowercase prefixes that mark a candidate as boilerplate. `_is_substantive` compares them
# against the lowercased, stripped clause text, so matching is case-insensitive.
_BOILERPLATE_PREFIXES = (
    "page ",
    "this page intentionally left blank",
    "exhibit ",
    "appendix ",
)


@dataclass
class CandidateClause:
    """A candidate clause produced by segmentation.

    Instances are mutable: `segment` assigns `heading` on paragraph-level
    sub-clauses after `_paragraph_split` has created them.
    """

    # Clause text with surrounding whitespace trimmed.
    text: str
    # (start, end) character offsets, end exclusive, intended to index into the
    # document text passed to `segment`.
    span: tuple[int, int]
    # Stripped heading line (trailing ":" removed) the clause belongs to, or None
    # when the clause was not found under a heading.
    heading: str | None = None


def is_heading(line: str) -> bool:
    """Return True when *line* looks like a section heading.

    The line is stripped of surrounding whitespace first. Blank lines and lines
    longer than 140 characters are never headings; otherwise the line qualifies
    as soon as one regex in ``_HEADING_PATTERNS`` matches it from the start.
    """
    candidate = line.strip()
    if candidate and len(candidate) <= 140:
        for pattern in _HEADING_PATTERNS:
            if pattern.match(candidate):
                return True
    return False


def segment(text: str) -> list[CandidateClause]:
    """Split *text* into candidate clauses at heading boundaries.

    Heading lines are detected with `is_heading`. Each heading starts a section
    that runs up to the next heading (or the end of the text). A section no
    longer than ``MAX_CLAUSE_CHARS`` becomes a single clause; a longer one is
    sub-split by paragraph, with every resulting paragraph tagged with the
    section heading. Text before the first heading is split by paragraph, as
    is the whole document when it contains no headings at all.

    Args:
        text: The full document text.

    Returns:
        Clauses in document order. Sections shorter than ``MIN_CLAUSE_CHARS``
        and clauses rejected by `_is_substantive` are omitted. Spans are
        computed as character offsets into *text*.
    """
    if not text.strip():
        return []

    # Record (start offset, raw line) for every heading line. keepends=True keeps the
    # line terminators so the running offset stays aligned with `text`.
    headings: list[tuple[int, str]] = []
    offset = 0
    for line in text.splitlines(keepends=True):
        if is_heading(line):
            headings.append((offset, line))
        offset += len(line)

    candidates: list[CandidateClause] = []

    if not headings:
        # No structure to split on: fall back to paragraphs. These go through the same
        # boilerplate filter below as every other candidate.
        candidates.extend(_paragraph_split(text, base_offset=0))
    else:
        # Text before the first heading. Pass the *unstripped* slice so paragraph
        # offsets stay relative to position 0 of `text`; stripping first would shift
        # every preamble span left by the amount of leading whitespace.
        preamble = text[: headings[0][0]]
        if len(preamble.strip()) >= MIN_CLAUSE_CHARS:
            candidates.extend(_paragraph_split(preamble, base_offset=0))

        # Each section ends where the next heading starts; the last one ends at EOF.
        section_ends = [start for start, _ in headings[1:]] + [len(text)]
        for (h_start, raw_heading), end in zip(headings, section_ends):
            heading = raw_heading.strip().rstrip(":")
            body = text[h_start:end].rstrip()
            if len(body) < MIN_CLAUSE_CHARS:
                continue

            if len(body) <= MAX_CLAUSE_CHARS:
                # Only trailing whitespace was trimmed, so the span starts at h_start.
                candidates.append(
                    CandidateClause(text=body, span=(h_start, h_start + len(body)), heading=heading)
                )
            else:
                for sub in _paragraph_split(body, base_offset=h_start):
                    sub.heading = heading
                    candidates.append(sub)

    return [c for c in candidates if _is_substantive(c.text)]


def _paragraph_split(text: str, *, base_offset: int) -> list[CandidateClause]:
    """Split *text* into paragraph-level clauses at blank-line boundaries.

    Paragraphs are separated by a newline, optional whitespace, and another
    newline. Each paragraph is stripped; those shorter than
    ``MIN_CLAUSE_CHARS`` are dropped.

    Args:
        text: The text to split.
        base_offset: Offset of ``text[0]`` within the enclosing document; it is
            added to every span so spans index into that document.

    Returns:
        One `CandidateClause` per kept paragraph, in order, with ``heading``
        left as None. For each clause, ``span`` satisfies
        ``text[start - base_offset:end - base_offset] == clause.text``.
    """
    out: list[CandidateClause] = []

    # Take offsets from the actual separator matches. A separator may be longer than
    # two characters (several blank lines, "\r\n\r\n") and a paragraph may carry
    # trailing whitespace, so a fixed "+2" cursor step would let spans drift.
    boundaries = [(m.start(), m.end()) for m in re.finditer(r"\n\s*\n", text)]
    boundaries.append((len(text), len(text)))  # sentinel closing the final paragraph

    part_start = 0
    for sep_start, sep_end in boundaries:
        part = text[part_start:sep_start]
        body = part.strip()
        if body and len(body) >= MIN_CLAUSE_CHARS:
            leading_ws = len(part) - len(part.lstrip())
            start = base_offset + part_start + leading_ws
            out.append(CandidateClause(text=body, span=(start, start + len(body))))
        part_start = sep_end
    return out


def _is_substantive(text: str) -> bool:
    """Return True when *text* looks like real clause content.

    The text is stripped and lowercased, then rejected when it is empty,
    shorter than ``MIN_CLAUSE_CHARS``, starts with one of
    ``_BOILERPLATE_PREFIXES``, or is under 120 characters and contains no
    period.
    """
    normalized = text.strip().lower()
    if not normalized or len(normalized) < MIN_CLAUSE_CHARS:
        return False
    # str.startswith accepts a tuple of alternatives.
    if normalized.startswith(_BOILERPLATE_PREFIXES):
        return False
    # Short text without a single period is rejected.
    return "." in normalized or len(normalized) >= 120