Spaces:

Dar3devil
/

lexguard-backend

Sleeping

App Files Files Community

lexguard-backend / app /services /segmenter.py

Dar4devil

LexGuard backend

c34b339 about 2 months ago

Raw

History Blame Contribute Delete

3.81 kB

	"""Rule-based clause segmentation.

	Strategy:
	1. Detect heading lines via a battery of regexes (Section / ARTICLE / numbered / all-caps)
	2. Split the document at heading boundaries to form candidate clauses
	3. For oversized candidates, sub-split by paragraph
	4. Trim and drop boilerplate (signature blocks, page headers)

	Returns `CandidateClause` records with character spans into the original text so the UI
	can highlight them in the source document.
	"""

	from __future__ import annotations

	import re
	from dataclasses import dataclass

	# Heading sections longer than this many characters are sub-split by paragraph in `segment`.
	MAX_CLAUSE_CHARS = 2500
	# Candidates shorter than this many characters (after trimming) are discarded.
	MIN_CLAUSE_CHARS = 40

	# Heading detectors, tried in order against a single stripped line by `is_heading`.
	# Leading/trailing whitespace is optional (`\s*`) because callers strip the line first;
	# a mandatory `\s` there would make the pattern unmatchable.
	_HEADING_PATTERNS = [
	    # "Section 4", "ARTICLE IV.", "Article 2: Term"
	    re.compile(r"^\s*(?:SECTION|Section|ARTICLE|Article)\s+[\divxlcIVXLC]+[\.:]?\s*.{0,80}$"),
	    # "1. Definitions", "2.3) Payment", "10.1.2: Notices"
	    re.compile(r"^\s*\d{1,3}(?:\.\d{1,3}){0,3}[\.\):]\s+\S.{0,120}$"),
	    # "(a) Each party shall ..."
	    re.compile(r"^\s*\([a-z]\)\s+\S.{0,120}$"),
	    # "GOVERNING LAW", "TERM & TERMINATION"
	    re.compile(r"^\s*[A-Z][A-Z &/\-]{4,80}$"),
	    # "Governing Law:"
	    re.compile(r"^\s*[A-Z][A-Za-z &/\-]{4,80}:\s*$"),
	]

	# Lower-case prefixes that mark a candidate as boilerplate: `_is_substantive` rejects any
	# candidate whose stripped, lower-cased text starts with one of these. The trailing spaces
	# are part of the match (e.g. "page " matches "Page 3" but not "pages").
	_BOILERPLATE_PREFIXES = (
	"page ",
	"this page intentionally left blank",
	"exhibit ",
	"appendix ",
	)


	@dataclass
	class CandidateClause:
	    """One candidate clause extracted from a document.

	    Instances are mutable: `segment` assigns `heading` after construction for
	    clauses produced by paragraph sub-splitting.

	    Attributes:
	        text: The trimmed clause text.
	        span: `(start, end)` character offsets of `text`, end-exclusive.
	        heading: Label of the heading the clause belongs to, or None when the
	            clause was not produced under a heading.
	    """

	    text: str
	    span: tuple[int, int]
	    heading: str | None = None


	def is_heading(line: str) -> bool:
	    """Return True when *line* matches one of the module's heading patterns.

	    The line is stripped before matching. Blank lines and lines longer than
	    140 characters (after stripping) are never treated as headings.
	    """
	    candidate = line.strip()
	    if candidate and len(candidate) <= 140:
	        for pattern in _HEADING_PATTERNS:
	            if pattern.match(candidate):
	                return True
	    return False


	def segment(text: str) -> list[CandidateClause]:
	    """Split a document into candidate clauses at heading boundaries.

	    Headings are detected line by line with `is_heading`. Text before the first
	    heading (the preamble) and any heading section longer than `MAX_CLAUSE_CHARS`
	    are sub-split by paragraph; every other section becomes a single clause that
	    starts at its heading line. Sections shorter than `MIN_CLAUSE_CHARS` are
	    skipped, and the collected candidates are filtered through `_is_substantive`.

	    When no heading is found, the whole text is split by paragraph and returned
	    as-is (without the `_is_substantive` filter).

	    Args:
	        text: Full document text.

	    Returns:
	        Candidate clauses in document order. Each `span` is meant to be the
	        `(start, end)` character range of the clause within `text`.
	    """
	    if not text.strip():
	        return []

	    # (character offset of the heading line in `text`, heading label) per heading line.
	    headings: list[tuple[int, str]] = []
	    offset = 0
	    for line in text.splitlines(keepends=True):
	        if is_heading(line):
	            headings.append((offset, line.strip().rstrip(":")))
	        offset += len(line)

	    if not headings:
	        # NOTE(review): this path skips the _is_substantive filter applied below —
	        # confirm whether that asymmetry is intentional.
	        return _paragraph_split(text, base_offset=0)

	    candidates: list[CandidateClause] = []

	    first_heading_start = headings[0][0]
	    if first_heading_start > 0:
	        raw_preamble = text[:first_heading_start]
	        preamble = raw_preamble.strip()
	        if len(preamble) >= MIN_CLAUSE_CHARS:
	            # The preamble is stripped before splitting, so shift spans by the amount
	            # of leading whitespace removed; otherwise they point too early in `text`.
	            leading_ws = len(raw_preamble) - len(raw_preamble.lstrip())
	            candidates.extend(_paragraph_split(preamble, base_offset=leading_ws))

	    # Each section runs from its heading line up to the next heading (or end of text).
	    section_ends = [start for start, _ in headings[1:]] + [len(text)]
	    for (h_start, h_line), end in zip(headings, section_ends):
	        # Only trailing whitespace is trimmed, so the section still begins at h_start.
	        body = text[h_start:end].rstrip()
	        if len(body) < MIN_CLAUSE_CHARS:
	            continue

	        if len(body) <= MAX_CLAUSE_CHARS:
	            candidates.append(
	                CandidateClause(text=body, span=(h_start, h_start + len(body)), heading=h_line)
	            )
	            continue

	        # Oversized section: fall back to paragraphs, each tagged with the section heading.
	        for sub in _paragraph_split(body, base_offset=h_start):
	            sub.heading = h_line
	            candidates.append(sub)

	    return [c for c in candidates if _is_substantive(c.text)]


	def _paragraph_split(text: str, *, base_offset: int) -> list[CandidateClause]:
	    """Split *text* on blank lines into paragraph-level candidate clauses.

	    Paragraphs are separated by `\\n\\s*\\n`. Each paragraph is stripped, and
	    paragraphs that are empty or shorter than `MIN_CLAUSE_CHARS` are dropped.

	    Args:
	        text: Text to split.
	        base_offset: Offset of `text[0]` in the enclosing document; added to every
	            returned span.

	    Returns:
	        Clauses in order of appearance, with `span = (start, end)` such that
	        `text[start - base_offset:end - base_offset]` equals the clause text.
	    """
	    out: list[CandidateClause] = []

	    # Take positions from the separator matches themselves: separators vary in length
	    # ("\n\n", "\n  \n", "\n\n\n", ...), so assuming a fixed width makes spans drift.
	    breaks = [m.span() for m in re.finditer(r"\n\s*\n", text)]
	    # Zero-width sentinel at the end so the final paragraph goes through the same loop.
	    breaks.append((len(text), len(text)))

	    part_start = 0
	    for sep_start, sep_end in breaks:
	        part = text[part_start:sep_start]
	        body = part.strip()
	        if body and len(body) >= MIN_CLAUSE_CHARS:
	            leading_ws = len(part) - len(part.lstrip())
	            start = base_offset + part_start + leading_ws
	            out.append(CandidateClause(text=body, span=(start, start + len(body))))
	        part_start = sep_end
	    return out


	def _is_substantive(text: str) -> bool:
	    """Return True when *text* looks like real clause content rather than boilerplate.

	    The text is stripped and lower-cased, then rejected when it is empty, starts
	    with one of `_BOILERPLATE_PREFIXES`, is shorter than `MIN_CLAUSE_CHARS`, or
	    contains no period while being under 120 characters long.
	    """
	    normalized = text.strip().lower()
	    if not normalized or normalized.startswith(_BOILERPLATE_PREFIXES):
	        return False
	    size = len(normalized)
	    if size < MIN_CLAUSE_CHARS:
	        return False
	    # Keep text that has at least one period, or is long enough to stand without one.
	    return "." in normalized or size >= 120