Deploy harness v2 to root for HuggingFace Space

19d2058 24 days ago

7.94 kB

	"""
	extraction.py — Modal-Pattern Sieve for Commitment Extraction

	Implements the commitment extractor per paper Definition 2.4 and Figure 4.
	A commitment is a clause containing a deontic or alethic modal operator
	that creates a testable obligation, prohibition, or constraint.

	Three-stage sieve:
	1. Sentence segmentation (regex — deterministic, no model)
	2. Modal operator detection with type classification
	3. Commitment normalization (canonical form for comparison)

	Design principle: this is the MEASUREMENT INSTRUMENT.
	It must be deterministic and precise. No ML models here.
	False positives inflate scores. False negatives hide drift.
	"""

	import re
	from dataclasses import dataclass, field
	from typing import List, Set, Optional, Tuple


	# ---------------------------------------------------------------------------
	# Modal operator patterns — ordered longest-first to match multi-word first
	# ---------------------------------------------------------------------------

	# Prohibitions (check BEFORE obligations — "must not" before "must").
	# Each entry is (compiled pattern, operator label). All patterns are
	# case-insensitive and anchored on word boundaries.
	PROHIBITION_PATTERNS = [
	    (re.compile(r'\bmust\s+not\b', re.I), 'must not'),
	    (re.compile(r'\bshall\s+not\b', re.I), 'shall not'),
	    (re.compile(r'\bwill\s+not\b', re.I), 'will not'),
	    # `\s*` so both "cannot" and "can not" are recognised.
	    (re.compile(r'\bcan\s*not\b', re.I), 'cannot'),
	    (re.compile(r'\bmay\s+not\b', re.I), 'may not'),
	    (re.compile(r'\bmust\s+never\b', re.I), 'must never'),
	    (re.compile(r'\bshall\s+never\b', re.I), 'shall never'),
	    (re.compile(r'\bis\s+prohibited\s+from\b', re.I), 'is prohibited from'),
	    (re.compile(r'\bare\s+prohibited\s+from\b', re.I), 'are prohibited from'),
	    (re.compile(r'\bis\s+forbidden\s+to\b', re.I), 'is forbidden to'),
	    (re.compile(r'\bare\s+forbidden\s+to\b', re.I), 'are forbidden to'),
	    (re.compile(r'\bdo\s+not\b', re.I), 'do not'),
	    (re.compile(r'\bdoes\s+not\b', re.I), 'does not'),
	    # "No food or drink" / "no smoking nor vaping". The alternation must use
	    # a bare `|`: an escaped `\|` matches only a literal pipe character, so
	    # the pattern would never fire on ordinary prose.
	    (re.compile(r'\bno\s+\w+\s+(?:or|nor)\s+\w+\b', re.I), 'no X or Y'),
	]

	# Obligations (deontic necessity).
	# Built from (regex body, operator label) pairs; every body is wrapped in
	# word boundaries and compiled case-insensitively, giving a list of
	# (compiled pattern, label) tuples in the order written below.
	OBLIGATION_PATTERNS = [
	    (re.compile(rf'\b{body}\b', re.I), label)
	    for body, label in (
	        (r'must', 'must'),
	        (r'shall', 'shall'),
	        (r'is\s+required\s+to', 'is required to'),
	        (r'are\s+required\s+to', 'are required to'),
	        (r'is\s+obligated\s+to', 'is obligated to'),
	        (r'are\s+obligated\s+to', 'are obligated to'),
	        (r'has\s+to', 'has to'),
	        (r'have\s+to', 'have to'),
	        # Optional "s": "need to" and "needs to" share the label 'needs to'.
	        (r'needs?\s+to', 'needs to'),
	        (r'is\s+bound\s+to', 'is bound to'),
	    )
	]

	# Constraints (alethic / universal quantification).
	# Each entry is (compiled pattern, operator label); all patterns are
	# case-insensitive and anchored on word boundaries.
	CONSTRAINT_PATTERNS = [
	    (re.compile(r'\balways\b', re.I), 'always'),
	    (re.compile(r'\bnever\b', re.I), 'never'),
	    (re.compile(r'\bunder\s+no\s+circumstances?\b', re.I), 'under no circumstances'),
	    (re.compile(r'\bwithout\s+exception\b', re.I), 'without exception'),
	    (re.compile(r'\bat\s+all\s+times?\b', re.I), 'at all times'),
	    # "in all cases" / "in every case". The alternation must use a bare `|`:
	    # an escaped `\|` matches only a literal pipe character, so neither
	    # phrasing would ever be detected.
	    (re.compile(r'\bin\s+(?:all|every)\s+cases?\b', re.I), 'in all cases'),
	    (re.compile(r'\bis\s+defined\s+as\b', re.I), 'is defined as'),
	]

	# Conditional markers: matches any one of the listed keywords/phrases as a
	# whole word, case-insensitively; group 1 holds the matched marker.
	# The alternatives are separated by a bare `|` — with an escaped `\|` the
	# whole group collapses into one long literal string containing pipe
	# characters and never matches real text.
	CONDITIONAL_RE = re.compile(
	    r'\b(if|when|unless|provided\s+that|in\s+the\s+event\s+that|where|before|after|prior\s+to)\b',
	    re.I,
	)


	# ---------------------------------------------------------------------------
	# Data structures
	# ---------------------------------------------------------------------------

	@dataclass(frozen=True)
	class Commitment:
	    """A single extracted commitment. Frozen for use in sets.

	    Equality and hashing are based solely on ``canonical`` (the normalized
	    clause text). Two commitments whose texts normalize to the same string
	    compare equal and collapse to one set element even when their
	    ``modal_type``, ``modal_operator``, ``source_sentence`` or
	    ``is_conditional`` differ.
	    """

	    text: str  # The clause text
	    modal_type: str  # 'obligation' | 'prohibition' | 'constraint'
	    modal_operator: str  # The matched operator
	    source_sentence: str  # Original sentence
	    is_conditional: bool = False

	    @property
	    def canonical(self) -> str:
	        """Normalized form for comparison.

	        Lower-cases the text, collapses every whitespace run to a single
	        space and removes trailing ``. ; , ! ?`` characters.
	        """
	        t = self.text.strip().lower()
	        t = re.sub(r'\s+', ' ', t)  # collapse whitespace
	        t = re.sub(r'[.;,!?]+$', '', t)  # strip trailing punct
	        # Final strip: removing the punctuation may expose a trailing space
	        # (e.g. "foo ." -> "foo ").
	        return t.strip()

	    def __eq__(self, other):
	        """Compare by canonical text; defer to the other operand otherwise."""
	        if not isinstance(other, Commitment):
	            # NotImplemented (rather than False) lets Python try the reflected
	            # comparison; ``commitment == unrelated`` still evaluates to False.
	            return NotImplemented
	        return self.canonical == other.canonical

	    def __hash__(self):
	        """Hash of the canonical text, consistent with ``__eq__``."""
	        return hash(self.canonical)


	# ---------------------------------------------------------------------------
	# Sentence segmentation — deterministic regex, no model dependency
	# ---------------------------------------------------------------------------

	def segment_sentences(text: str) -> List[str]:
	    """Break *text* into sentences, then into semicolon-separated clauses.

	    A sentence boundary is whitespace that follows ``.``, ``!`` or ``?`` and
	    precedes an uppercase ASCII letter. Each sentence is further cut at every
	    ``;``. Fragments are stripped and empty ones dropped; blank input yields
	    an empty list.
	    """
	    stripped = text.strip()
	    if not stripped:
	        return []

	    # The lookbehind/lookahead keep the punctuation and the capital letter in
	    # their respective sentences; only the whitespace between is consumed.
	    sentences = re.split(r'(?<=[.!?])\s+(?=[A-Z])', stripped)

	    return [
	        fragment.strip()
	        for sentence in sentences
	        for fragment in sentence.split(';')
	        if fragment.strip()
	    ]


	# ---------------------------------------------------------------------------
	# Core extraction
	# ---------------------------------------------------------------------------

	def classify_clause(clause: str) -> Optional[Tuple[str, str]]:
	    """
	    Determine which modal operator, if any, a clause contains.

	    Returns a ``(modal_type, operator_text)`` pair for the first pattern
	    that matches, or ``None`` when nothing matches.

	    The pattern tables are consulted in a fixed order — prohibitions, then
	    obligations, then constraints — and within each table in list order, so
	    that "must not" is reported as a prohibition instead of the obligation
	    "must".
	    """
	    sieve = (
	        ('prohibition', PROHIBITION_PATTERNS),
	        ('obligation', OBLIGATION_PATTERNS),
	        ('constraint', CONSTRAINT_PATTERNS),
	    )
	    for modal_type, table in sieve:
	        for regex, label in table:
	            if regex.search(clause):
	                return (modal_type, label)
	    return None


	def has_conditional(clause: str) -> bool:
	    """Report whether CONDITIONAL_RE finds a match anywhere in *clause*."""
	    return CONDITIONAL_RE.search(clause) is not None


	def extract_commitments(text: str) -> List[Commitment]:
	    """
	    Run the modal-pattern sieve (Figure 4) over a text signal.

	    Steps:
	    1. Segment the text into sentences/clauses
	    2. Classify each clause by modal operator
	    3. Wrap every classified clause in a Commitment

	    Returns the commitments in the order their clauses appear; clauses
	    without a recognised modal operator are skipped.
	    """
	    found = []

	    for clause in segment_sentences(text):
	        verdict = classify_clause(clause)
	        if verdict is None:
	            continue  # no modal operator matched: not a commitment

	        kind, matched_operator = verdict
	        cleaned = clause.strip()
	        found.append(
	            Commitment(
	                text=cleaned,
	                modal_type=kind,
	                modal_operator=matched_operator,
	                # The clause serves as both the text and its source sentence.
	                source_sentence=cleaned,
	                is_conditional=has_conditional(clause),
	            )
	        )

	    return found


	def extract_commitment_set(text: str) -> Set[Commitment]:
	    """Return the commitments found in *text* as a set.

	    Commitment equality and hashing use the canonical form, so clauses that
	    normalize to the same text collapse into one element.
	    """
	    unique = set()
	    unique.update(extract_commitments(text))
	    return unique


	def extract_commitment_texts(text: str) -> Set[str]:
	    """
	    Return the canonical text of every commitment in *text* as a set of
	    strings. This is the primary interface for fidelity scoring.
	    """
	    canonical_texts = set()
	    for commitment in extract_commitments(text):
	        canonical_texts.add(commitment.canonical)
	    return canonical_texts


	# ---------------------------------------------------------------------------
	# Backward-compatible interface
	# ---------------------------------------------------------------------------

	def extract_hard_commitments(text: str, nlp=None) -> Set[str]:
	    """
	    Backward-compatible entry point.

	    ``nlp`` is accepted only so existing call sites keep working; it is never
	    used. Returns the set of canonical commitment strings for *text*.
	    """
	    canonical_texts = extract_commitment_texts(text)
	    return canonical_texts