""" extraction.py — Modal-Pattern Sieve for Commitment Extraction Implements the commitment extractor per paper Definition 2.4 and Figure 4. A commitment is a clause containing a deontic or alethic modal operator that creates a testable obligation, prohibition, or constraint. Three-stage sieve: 1. Sentence segmentation (regex — deterministic, no model) 2. Modal operator detection with type classification 3. Commitment normalization (canonical form for comparison) Design principle: this is the MEASUREMENT INSTRUMENT. It must be deterministic and precise. No ML models here. False positives inflate scores. False negatives hide drift. """ import re from dataclasses import dataclass, field from typing import List, Set, Optional, Tuple # --------------------------------------------------------------------------- # Modal operator patterns — ordered longest-first to match multi-word first # --------------------------------------------------------------------------- # Prohibitions (check BEFORE obligations — "must not" before "must") PROHIBITION_PATTERNS = [ (re.compile(r'\bmust\s+not\b', re.I), 'must not'), (re.compile(r'\bshall\s+not\b', re.I), 'shall not'), (re.compile(r'\bwill\s+not\b', re.I), 'will not'), (re.compile(r'\bcan\s*not\b', re.I), 'cannot'), (re.compile(r'\bmay\s+not\b', re.I), 'may not'), (re.compile(r'\bmust\s+never\b', re.I), 'must never'), (re.compile(r'\bshall\s+never\b', re.I), 'shall never'), (re.compile(r'\bis\s+prohibited\s+from\b', re.I), 'is prohibited from'), (re.compile(r'\bare\s+prohibited\s+from\b', re.I), 'are prohibited from'), (re.compile(r'\bis\s+forbidden\s+to\b', re.I), 'is forbidden to'), (re.compile(r'\bare\s+forbidden\s+to\b', re.I), 'are forbidden to'), (re.compile(r'\bdo\s+not\b', re.I), 'do not'), (re.compile(r'\bdoes\s+not\b', re.I), 'does not'), (re.compile(r'\bno\s+\w+\s+(?:or|nor)\s+\w+\b', re.I), 'no X or Y'), # "No food or drink" ] # Obligations (deontic necessity) OBLIGATION_PATTERNS = [ (re.compile(r'\bmust\b', re.I), 'must'), (re.compile(r'\bshall\b', re.I), 'shall'), (re.compile(r'\bis\s+required\s+to\b', re.I), 'is required to'), (re.compile(r'\bare\s+required\s+to\b', re.I), 'are required to'), (re.compile(r'\bis\s+obligated\s+to\b', re.I), 'is obligated to'), (re.compile(r'\bare\s+obligated\s+to\b', re.I), 'are obligated to'), (re.compile(r'\bhas\s+to\b', re.I), 'has to'), (re.compile(r'\bhave\s+to\b', re.I), 'have to'), (re.compile(r'\bneeds?\s+to\b', re.I), 'needs to'), (re.compile(r'\bis\s+bound\s+to\b', re.I), 'is bound to'), ] # Constraints (alethic / universal quantification) CONSTRAINT_PATTERNS = [ (re.compile(r'\balways\b', re.I), 'always'), (re.compile(r'\bnever\b', re.I), 'never'), (re.compile(r'\bunder\s+no\s+circumstances?\b', re.I), 'under no circumstances'), (re.compile(r'\bwithout\s+exception\b', re.I), 'without exception'), (re.compile(r'\bat\s+all\s+times?\b', re.I), 'at all times'), (re.compile(r'\bin\s+(?:all|every)\s+cases?\b', re.I), 'in all cases'), (re.compile(r'\bis\s+defined\s+as\b', re.I), 'is defined as'), ] # Conditional prefixes CONDITIONAL_RE = re.compile( r'\b(if|when|unless|provided\s+that|in\s+the\s+event\s+that|where|before|after|prior\s+to)\b', re.I ) # --------------------------------------------------------------------------- # Data structures # --------------------------------------------------------------------------- @dataclass(frozen=True) class Commitment: """A single extracted commitment. Frozen for use in sets.""" text: str # The clause text modal_type: str # 'obligation' | 'prohibition' | 'constraint' modal_operator: str # The matched operator source_sentence: str # Original sentence is_conditional: bool = False @property def canonical(self) -> str: """Normalized form for comparison.""" t = self.text.strip().lower() t = re.sub(r'\s+', ' ', t) # collapse whitespace t = re.sub(r'[.;,!?]+$', '', t) # strip trailing punct return t.strip() def __eq__(self, other): if not isinstance(other, Commitment): return False return self.canonical == other.canonical def __hash__(self): return hash(self.canonical) # --------------------------------------------------------------------------- # Sentence segmentation — deterministic regex, no model dependency # --------------------------------------------------------------------------- def segment_sentences(text: str) -> List[str]: """Split text into sentences and sub-clauses (semicolons).""" text = text.strip() if not text: return [] # First split on sentence boundaries # Match period/excl/question followed by space and uppercase raw_sents = re.split(r'(?<=[.!?])\s+(?=[A-Z])', text) # Then split each sentence on semicolons result = [] for sent in raw_sents: clauses = [c.strip() for c in sent.split(';') if c.strip()] result.extend(clauses) return result # --------------------------------------------------------------------------- # Core extraction # --------------------------------------------------------------------------- def classify_clause(clause: str) -> Optional[Tuple[str, str]]: """ Classify a clause by its modal operator. Returns (modal_type, operator_text) or None. Checks prohibitions FIRST (longest match) to avoid "must not" matching as obligation "must". """ # Check prohibitions first for pattern, operator in PROHIBITION_PATTERNS: if pattern.search(clause): return ('prohibition', operator) # Then obligations for pattern, operator in OBLIGATION_PATTERNS: if pattern.search(clause): return ('obligation', operator) # Then constraints for pattern, operator in CONSTRAINT_PATTERNS: if pattern.search(clause): return ('constraint', operator) return None def has_conditional(clause: str) -> bool: """Check if a clause contains a conditional prefix.""" return bool(CONDITIONAL_RE.search(clause)) def extract_commitments(text: str) -> List[Commitment]: """ Extract all commitments from a text signal. This is the modal-pattern sieve (Figure 4): 1. Segment into sentences/clauses 2. Classify each by modal operator 3. Return structured Commitment objects """ sentences = segment_sentences(text) commitments = [] for sent in sentences: result = classify_clause(sent) if result is not None: modal_type, operator = result commitments.append(Commitment( text=sent.strip(), modal_type=modal_type, modal_operator=operator, source_sentence=sent.strip(), is_conditional=has_conditional(sent), )) return commitments def extract_commitment_set(text: str) -> Set[Commitment]: """Extract commitments as a set (deduped by canonical form).""" return set(extract_commitments(text)) def extract_commitment_texts(text: str) -> Set[str]: """ Extract commitment canonical texts as a set of strings. This is the primary interface for fidelity scoring. """ return {c.canonical for c in extract_commitments(text)} # --------------------------------------------------------------------------- # Backward-compatible interface # --------------------------------------------------------------------------- def extract_hard_commitments(text: str, nlp=None) -> Set[str]: """ Backward-compatible interface. nlp parameter ignored. Returns set of canonical commitment strings. """ return extract_commitment_texts(text)