| """ |
| extraction.py — Modal-Pattern Sieve for Commitment Extraction |
| |
| Implements the commitment extractor per paper Definition 2.4 and Figure 4. |
| A commitment is a clause containing a deontic or alethic modal operator |
| that creates a testable obligation, prohibition, or constraint. |
| |
| Three-stage sieve: |
| 1. Sentence segmentation (regex — deterministic, no model) |
| 2. Modal operator detection with type classification |
| 3. Commitment normalization (canonical form for comparison) |
| |
| Design principle: this is the MEASUREMENT INSTRUMENT. |
| It must be deterministic and precise. No ML models here. |
| False positives inflate scores. False negatives hide drift. |
| """ |
|
|
| import re |
| from dataclasses import dataclass, field |
| from typing import List, Set, Optional, Tuple |
|
|
|
|
| |
| |
| |
|
|
| |
# Prohibition operators: negated modals and explicit bans.
# Each entry is (compiled case-insensitive regex, operator label).
# classify_clause consults this table before the obligation table so that
# "must not" is reported as a prohibition rather than as the obligation "must".
PROHIBITION_PATTERNS = [
    (re.compile(regex, re.I), label)
    for regex, label in (
        (r'\bmust\s+not\b', 'must not'),
        (r'\bshall\s+not\b', 'shall not'),
        (r'\bwill\s+not\b', 'will not'),
        (r'\bcan\s*not\b', 'cannot'),  # \s* covers both "cannot" and "can not"
        (r'\bmay\s+not\b', 'may not'),
        (r'\bmust\s+never\b', 'must never'),
        (r'\bshall\s+never\b', 'shall never'),
        (r'\bis\s+prohibited\s+from\b', 'is prohibited from'),
        (r'\bare\s+prohibited\s+from\b', 'are prohibited from'),
        (r'\bis\s+forbidden\s+to\b', 'is forbidden to'),
        (r'\bare\s+forbidden\s+to\b', 'are forbidden to'),
        (r'\bdo\s+not\b', 'do not'),
        (r'\bdoes\s+not\b', 'does not'),
        (r'\bno\s+\w+\s+(?:or|nor)\s+\w+\b', 'no X or Y'),
    )
]
|
|
| |
# Obligation operators: positive modals and "required/obligated to" phrasings.
# Each entry is (compiled case-insensitive regex, operator label).
OBLIGATION_PATTERNS = [
    (re.compile(regex, re.I), label)
    for regex, label in (
        (r'\bmust\b', 'must'),
        (r'\bshall\b', 'shall'),
        (r'\bis\s+required\s+to\b', 'is required to'),
        (r'\bare\s+required\s+to\b', 'are required to'),
        (r'\bis\s+obligated\s+to\b', 'is obligated to'),
        (r'\bare\s+obligated\s+to\b', 'are obligated to'),
        (r'\bhas\s+to\b', 'has to'),
        (r'\bhave\s+to\b', 'have to'),
        (r'\bneeds?\s+to\b', 'needs to'),  # also matches "need to"
        (r'\bis\s+bound\s+to\b', 'is bound to'),
    )
]
|
|
| |
# Constraint operators: universal quantifiers over time/cases and definitions.
# Each entry is (compiled case-insensitive regex, operator label).
CONSTRAINT_PATTERNS = [
    (re.compile(regex, re.I), label)
    for regex, label in (
        (r'\balways\b', 'always'),
        (r'\bnever\b', 'never'),
        (r'\bunder\s+no\s+circumstances?\b', 'under no circumstances'),
        (r'\bwithout\s+exception\b', 'without exception'),
        (r'\bat\s+all\s+times?\b', 'at all times'),
        (r'\bin\s+(?:all|every)\s+cases?\b', 'in all cases'),
        (r'\bis\s+defined\s+as\b', 'is defined as'),
    )
]
|
|
| |
# Conditional markers (if / when / unless / ...), matched case-insensitively
# as whole words. Group 1 captures the marker that was found.
CONDITIONAL_RE = re.compile(
    r'\b(if|when|unless|provided\s+that|in\s+the\s+event\s+that|where|before|after|prior\s+to)\b',
    flags=re.IGNORECASE,
)
|
|
|
|
| |
| |
| |
|
|
@dataclass(frozen=True)
class Commitment:
    """A single extracted commitment.

    Instances are immutable (frozen) and hashable. Equality and hashing are
    both defined on the canonical form of ``text`` only, so two commitments
    whose text differs just in case, whitespace, or trailing punctuation
    collapse into one element when placed in a set.
    """

    text: str               # clause text as extracted
    modal_type: str         # classification label for the modal operator
    modal_operator: str     # label of the operator pattern that matched
    source_sentence: str    # text the commitment was taken from
    is_conditional: bool = False  # whether a conditional marker was detected

    @property
    def canonical(self) -> str:
        """Return the normalized text used for comparison.

        Normalization: trim, lowercase, collapse whitespace runs to a single
        space, drop trailing ``. ; , ! ?`` characters, then trim again.
        """
        normalized = self.text.strip().lower()
        normalized = re.sub(r'\s+', ' ', normalized)
        normalized = re.sub(r'[.;,!?]+$', '', normalized)
        return normalized.strip()

    def __eq__(self, other):
        """Compare by canonical text; defer to ``other`` for foreign types."""
        if not isinstance(other, Commitment):
            # Returning NotImplemented (rather than False) follows the
            # comparison protocol: Python then tries the reflected comparison
            # and finally falls back to identity, so `commitment == "x"` is
            # still False and `commitment != "x"` is still True.
            return NotImplemented
        return self.canonical == other.canonical

    def __hash__(self):
        """Hash by canonical text so that equal commitments hash equally."""
        return hash(self.canonical)
|
|
|
|
| |
| |
| |
|
|
def segment_sentences(text: str) -> List[str]:
    """Break text into sentences, then into semicolon-separated clauses.

    A sentence boundary is whitespace that follows ``.``, ``!`` or ``?`` and
    precedes an uppercase ASCII letter. Each sentence is further split on
    ``;``. Returned pieces are stripped, and empty pieces are dropped.
    Blank or whitespace-only input yields an empty list.
    """
    stripped = text.strip()
    if not stripped:
        return []

    # Lookbehind/lookahead keep the punctuation and the capital letter
    # attached to their respective sentences.
    sentences = re.split(r'(?<=[.!?])\s+(?=[A-Z])', stripped)

    return [
        piece.strip()
        for sentence in sentences
        for piece in sentence.split(';')
        if piece.strip()
    ]
|
|
|
|
| |
| |
| |
|
|
def classify_clause(clause: str) -> Optional[Tuple[str, str]]:
    """
    Determine the modal type of a clause from its first matching operator.

    Pattern tables are consulted in priority order: prohibitions, then
    obligations, then constraints. Within a table, entries are tried in
    list order. Prohibitions come first so that "must not" is not
    reported as the obligation "must".

    Returns (modal_type, operator_text), or None when no pattern matches.
    """
    priority_tables = (
        ('prohibition', PROHIBITION_PATTERNS),
        ('obligation', OBLIGATION_PATTERNS),
        ('constraint', CONSTRAINT_PATTERNS),
    )

    for modal_type, table in priority_tables:
        for regex, label in table:
            if regex.search(clause) is not None:
                return (modal_type, label)

    return None
|
|
|
|
def has_conditional(clause: str) -> bool:
    """Return True if a conditional marker occurs anywhere in the clause."""
    marker = CONDITIONAL_RE.search(clause)
    return marker is not None
|
|
|
|
def extract_commitments(text: str) -> List[Commitment]:
    """
    Run the modal-pattern sieve (Figure 4) over a text signal.

    Steps:
    1. Segment the text into sentences/clauses
    2. Classify each clause by modal operator
    3. Wrap every classified clause in a Commitment

    Clauses with no modal operator are skipped. The result preserves
    the order in which clauses appear in the text.
    """
    found: List[Commitment] = []

    for clause in segment_sentences(text):
        classification = classify_clause(clause)
        if classification is None:
            continue  # no modal operator: not a commitment

        kind, label = classification
        trimmed = clause.strip()
        found.append(
            Commitment(
                text=trimmed,
                modal_type=kind,
                modal_operator=label,
                source_sentence=trimmed,
                is_conditional=has_conditional(clause),
            )
        )

    return found
|
|
|
|
def extract_commitment_set(text: str) -> Set[Commitment]:
    """Return the commitments found in text as a set.

    Duplicates collapse according to Commitment equality, which compares
    canonical forms.
    """
    extracted = extract_commitments(text)
    return set(extracted)
|
|
|
|
def extract_commitment_texts(text: str) -> Set[str]:
    """
    Return the canonical strings of all commitments found in text.
    This is the primary interface for fidelity scoring.
    """
    canonical_texts: Set[str] = set()
    for commitment in extract_commitments(text):
        canonical_texts.add(commitment.canonical)
    return canonical_texts
|
|
|
|
| |
| |
| |
|
|
def extract_hard_commitments(text: str, nlp=None) -> Set[str]:
    """
    Backward-compatible entry point.

    The nlp argument is accepted for older call sites and is not used.
    Returns the set of canonical commitment strings for text.
    """
    canonical_texts = extract_commitment_texts(text)
    return canonical_texts
|
|