"""
extraction.py — Modal-Pattern Sieve for Commitment Extraction
Implements the commitment extractor per paper Definition 2.4 and Figure 4.
A commitment is a clause containing a deontic or alethic modal operator
that creates a testable obligation, prohibition, or constraint.
Three-stage sieve:
1. Sentence segmentation (regex — deterministic, no model)
2. Modal operator detection with type classification
3. Commitment normalization (canonical form for comparison)
Design principle: this is the MEASUREMENT INSTRUMENT.
It must be deterministic and precise. No ML models here.
False positives inflate scores. False negatives hide drift.
"""
import re
from dataclasses import dataclass, field
from typing import List, Set, Optional, Tuple
# ---------------------------------------------------------------------------
# Modal operator patterns — grouped by modal type; within each list the first matching entry wins
# ---------------------------------------------------------------------------
# Prohibitions (check BEFORE obligations — "must not" before "must")
# Each entry is (compiled case-insensitive regex, canonical operator label).
# The table is built from plain (regex source, label) pairs so the pattern
# text and its label sit side by side without repeated compile boilerplate.
PROHIBITION_PATTERNS = [
    (re.compile(regex_source, re.IGNORECASE), operator_label)
    for regex_source, operator_label in (
        # Negated modals
        (r'\bmust\s+not\b', 'must not'),
        (r'\bshall\s+not\b', 'shall not'),
        (r'\bwill\s+not\b', 'will not'),
        (r'\bcan\s*not\b', 'cannot'),  # covers "cannot" and "can not"
        (r'\bmay\s+not\b', 'may not'),
        (r'\bmust\s+never\b', 'must never'),
        (r'\bshall\s+never\b', 'shall never'),
        # Explicit prohibition / forbidding phrases
        (r'\bis\s+prohibited\s+from\b', 'is prohibited from'),
        (r'\bare\s+prohibited\s+from\b', 'are prohibited from'),
        (r'\bis\s+forbidden\s+to\b', 'is forbidden to'),
        (r'\bare\s+forbidden\s+to\b', 'are forbidden to'),
        # Negated auxiliaries
        (r'\bdo\s+not\b', 'do not'),
        (r'\bdoes\s+not\b', 'does not'),
        # "No food or drink"
        (r'\bno\s+\w+\s+(?:or|nor)\s+\w+\b', 'no X or Y'),
    )
]
# Obligations (deontic necessity)
# Each entry is (compiled case-insensitive regex, canonical operator label).
# Copula-based phrases are listed in both singular ("is ...") and plural
# ("are ...") form so that plural subjects are not silently missed.
OBLIGATION_PATTERNS = [
    (re.compile(r'\bmust\b', re.I), 'must'),
    (re.compile(r'\bshall\b', re.I), 'shall'),
    (re.compile(r'\bis\s+required\s+to\b', re.I), 'is required to'),
    (re.compile(r'\bare\s+required\s+to\b', re.I), 'are required to'),
    (re.compile(r'\bis\s+obligated\s+to\b', re.I), 'is obligated to'),
    (re.compile(r'\bare\s+obligated\s+to\b', re.I), 'are obligated to'),
    (re.compile(r'\bhas\s+to\b', re.I), 'has to'),
    (re.compile(r'\bhave\s+to\b', re.I), 'have to'),
    # The single label 'needs to' is reported for both "need to" and "needs to".
    (re.compile(r'\bneeds?\s+to\b', re.I), 'needs to'),
    (re.compile(r'\bis\s+bound\s+to\b', re.I), 'is bound to'),
    # Plural counterpart of "is bound to", matching the other is/are pairs.
    (re.compile(r'\bare\s+bound\s+to\b', re.I), 'are bound to'),
]
# Constraints (alethic / universal quantification)
# Each entry is (compiled case-insensitive regex, canonical operator label),
# built from plain (regex source, label) pairs.
CONSTRAINT_PATTERNS = [
    (re.compile(regex_source, re.IGNORECASE), operator_label)
    for regex_source, operator_label in (
        # Universal temporal quantifiers
        (r'\balways\b', 'always'),
        (r'\bnever\b', 'never'),
        # Fixed "no exceptions" phrases (optional plural "s" where written)
        (r'\bunder\s+no\s+circumstances?\b', 'under no circumstances'),
        (r'\bwithout\s+exception\b', 'without exception'),
        (r'\bat\s+all\s+times?\b', 'at all times'),
        (r'\bin\s+(?:all|every)\s+cases?\b', 'in all cases'),
        # Definitional statements
        (r'\bis\s+defined\s+as\b', 'is defined as'),
    )
]
# Conditional prefixes
# One capturing alternation of conditional / temporal connectives, bounded
# by word boundaries. Adjacent raw-string pieces are concatenated into a
# single pattern; matching is case-insensitive.
CONDITIONAL_RE = re.compile(
    r'\b('
    r'if|when|unless'
    r'|provided\s+that'
    r'|in\s+the\s+event\s+that'
    r'|where|before|after'
    r'|prior\s+to'
    r')\b',
    flags=re.IGNORECASE,
)
# ---------------------------------------------------------------------------
# Data structures
# ---------------------------------------------------------------------------
@dataclass(frozen=True)
class Commitment:
    """A single extracted commitment.

    Instances are immutable (frozen dataclass) and hashable, so they can be
    placed in sets. Equality and hashing are based solely on ``canonical``:
    two commitments whose normalized text is the same compare equal even if
    their modal type, operator, source sentence or conditional flag differ.
    The explicit ``__eq__`` / ``__hash__`` below are the ones in effect; the
    dataclass decorator does not replace methods defined in the class body.
    """

    text: str              # The clause text
    modal_type: str        # 'obligation' | 'prohibition' | 'constraint'
    modal_operator: str    # The matched operator
    source_sentence: str   # Original sentence
    is_conditional: bool = False

    @property
    def canonical(self) -> str:
        """Return the normalized form of ``text`` used for comparison.

        The text is stripped and lower-cased, every whitespace run is
        collapsed to one space, and any trailing run of ``. ; , ! ?`` is
        removed.
        """
        t = self.text.strip().lower()
        t = re.sub(r'\s+', ' ', t)        # collapse whitespace
        t = re.sub(r'[.;,!?]+$', '', t)   # strip trailing punct
        return t.strip()

    def __eq__(self, other: object) -> bool:
        """Return True when both commitments share the same canonical text.

        For operands that are not ``Commitment`` instances, ``NotImplemented``
        is returned so Python can try the reflected comparison; ``==`` still
        evaluates to ``False`` when the other side does not handle it either.
        """
        if not isinstance(other, Commitment):
            return NotImplemented
        return self.canonical == other.canonical

    def __hash__(self) -> int:
        """Hash the canonical text so equal commitments hash identically."""
        return hash(self.canonical)
# ---------------------------------------------------------------------------
# Sentence segmentation — deterministic regex, no model dependency
# ---------------------------------------------------------------------------
def segment_sentences(text: str) -> List[str]:
    """Break *text* into sentence-level pieces, then into semicolon clauses.

    A sentence boundary is whitespace that follows ``.``, ``!`` or ``?`` and
    precedes an uppercase ASCII letter. Each resulting sentence is further
    split on ``;``. Every returned piece is stripped of surrounding
    whitespace, and empty pieces are dropped. Blank input yields ``[]``.
    """
    trimmed = text.strip()
    if not trimmed:
        return []

    # The lookbehind/lookahead keep the punctuation and the capital letter
    # in their respective sentences; only the whitespace between is consumed.
    sentence_boundary = r'(?<=[.!?])\s+(?=[A-Z])'

    return [
        piece.strip()
        for sentence in re.split(sentence_boundary, trimmed)
        for piece in sentence.split(';')
        if piece.strip()
    ]
# ---------------------------------------------------------------------------
# Core extraction
# ---------------------------------------------------------------------------
def classify_clause(clause: str) -> Optional[Tuple[str, str]]:
    """Determine which modal category, if any, a clause belongs to.

    The pattern tables are consulted in a fixed priority order:
    prohibitions, then obligations, then constraints. Prohibitions come
    first so that "must not" is reported as a prohibition rather than
    matching the obligation "must". Within a table, the first entry whose
    regex is found anywhere in the clause wins.

    Returns:
        ``(modal_type, operator_text)`` for the winning entry, or ``None``
        when no pattern in any table matches.
    """
    priority_order = (
        ('prohibition', PROHIBITION_PATTERNS),
        ('obligation', OBLIGATION_PATTERNS),
        ('constraint', CONSTRAINT_PATTERNS),
    )
    for modal_type, table in priority_order:
        for regex, operator_text in table:
            if regex.search(clause) is not None:
                return (modal_type, operator_text)
    return None
def has_conditional(clause: str) -> bool:
    """Report whether a conditional connective occurs in the clause.

    The search covers the whole clause, so a connective matched by
    ``CONDITIONAL_RE`` counts wherever it appears, not only at the start.
    """
    found = CONDITIONAL_RE.search(clause)
    return found is not None
def extract_commitments(text: str) -> List[Commitment]:
    """Run the modal-pattern sieve (Figure 4) over a text signal.

    Steps:
        1. Segment the text into sentences / semicolon clauses.
        2. Classify each clause by modal operator; unclassified clauses
           are skipped.
        3. Wrap each classified clause in a ``Commitment``.

    Both ``text`` and ``source_sentence`` of each commitment are set to the
    stripped clause, i.e. the piece produced by segmentation rather than
    the enclosing full sentence.

    Returns:
        Commitments in the order their clauses occur in *text*; duplicates
        are not removed.
    """
    found = []
    for clause in segment_sentences(text):
        verdict = classify_clause(clause)
        if verdict is None:
            continue  # no modal operator: not a commitment

        kind, operator = verdict
        trimmed = clause.strip()
        found.append(
            Commitment(
                text=trimmed,
                modal_type=kind,
                modal_operator=operator,
                source_sentence=trimmed,
                is_conditional=has_conditional(clause),
            )
        )
    return found
def extract_commitment_set(text: str) -> Set[Commitment]:
    """Return the commitments found in *text* as a set.

    Duplicates collapse according to ``Commitment`` equality, which compares
    canonical text.
    """
    unique = set()
    unique.update(extract_commitments(text))
    return unique
def extract_commitment_texts(text: str) -> Set[str]:
    """Return the canonical strings of all commitments found in *text*.

    This is the primary interface for fidelity scoring.
    """
    canonical_texts = set()
    for commitment in extract_commitments(text):
        canonical_texts.add(commitment.canonical)
    return canonical_texts
# ---------------------------------------------------------------------------
# Backward-compatible interface
# ---------------------------------------------------------------------------
def extract_hard_commitments(text: str, nlp=None) -> Set[str]:
    """Legacy entry point kept for backward compatibility.

    Args:
        text: The text to extract commitments from.
        nlp: Accepted so existing call sites keep working; never used.

    Returns:
        The set of canonical commitment strings, exactly as produced by
        ``extract_commitment_texts``.
    """
    canonical_texts = extract_commitment_texts(text)
    return canonical_texts
|