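# File size: 7,942 Bytes  (file-viewer banner captured along with the source; not part of the module)
# 19d2058  (presumably the revision hash shown in the same banner)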
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
"""
extraction.py — Modal-Pattern Sieve for Commitment Extraction

Implements the commitment extractor per paper Definition 2.4 and Figure 4.
A commitment is a clause containing a deontic or alethic modal operator
that creates a testable obligation, prohibition, or constraint.

Three-stage sieve:
  1. Sentence segmentation (regex — deterministic, no model)
  2. Modal operator detection with type classification
  3. Commitment normalization (canonical form for comparison)

Design principle: this is the MEASUREMENT INSTRUMENT.
It must be deterministic and precise. No ML models here.
False positives inflate scores. False negatives hide drift.
"""

import re
from dataclasses import dataclass, field
from typing import List, Set, Optional, Tuple


# ---------------------------------------------------------------------------
# Modal operator patterns — ordered longest-first to match multi-word first
# ---------------------------------------------------------------------------

# Prohibitions (check BEFORE obligations — "must not" before "must")
PROHIBITION_PATTERNS = [
    # Each entry pairs a case-insensitive compiled regex with the label that
    # is reported as the matched operator.
    (re.compile(regex, re.IGNORECASE), label)
    for regex, label in (
        # Negated modals
        (r'\bmust\s+not\b', 'must not'),
        (r'\bshall\s+not\b', 'shall not'),
        (r'\bwill\s+not\b', 'will not'),
        (r'\bcan\s*not\b', 'cannot'),  # covers both "cannot" and "can not"
        (r'\bmay\s+not\b', 'may not'),
        (r'\bmust\s+never\b', 'must never'),
        (r'\bshall\s+never\b', 'shall never'),
        # Passive prohibition phrases
        (r'\bis\s+prohibited\s+from\b', 'is prohibited from'),
        (r'\bare\s+prohibited\s+from\b', 'are prohibited from'),
        (r'\bis\s+forbidden\s+to\b', 'is forbidden to'),
        (r'\bare\s+forbidden\s+to\b', 'are forbidden to'),
        # Plain negation
        (r'\bdo\s+not\b', 'do not'),
        (r'\bdoes\s+not\b', 'does not'),
        (r'\bno\s+\w+\s+(?:or|nor)\s+\w+\b', 'no X or Y'),  # "No food or drink"
    )
]

# Obligations (deontic necessity)
OBLIGATION_PATTERNS = [
    # Each entry is (compiled case-insensitive regex, operator label).
    # Bare modals
    (re.compile(r'\bmust\b', re.I), 'must'),
    (re.compile(r'\bshall\b', re.I), 'shall'),
    # Singular and plural forms are listed separately so the reported label
    # matches the wording that was found.
    (re.compile(r'\bis\s+required\s+to\b', re.I), 'is required to'),
    (re.compile(r'\bare\s+required\s+to\b', re.I), 'are required to'),
    (re.compile(r'\bis\s+obligated\s+to\b', re.I), 'is obligated to'),
    (re.compile(r'\bare\s+obligated\s+to\b', re.I), 'are obligated to'),
    (re.compile(r'\bhas\s+to\b', re.I), 'has to'),
    (re.compile(r'\bhave\s+to\b', re.I), 'have to'),
    # "need to" and "needs to" share one pattern and are both labelled 'needs to'.
    (re.compile(r'\bneeds?\s+to\b', re.I), 'needs to'),
    (re.compile(r'\bis\s+bound\s+to\b', re.I), 'is bound to'),
    # Plural counterpart of "is bound to"; without it, clauses with a plural
    # subject ("Vendors are bound to ...") would go undetected.
    (re.compile(r'\bare\s+bound\s+to\b', re.I), 'are bound to'),
]

# Constraints (alethic / universal quantification)
CONSTRAINT_PATTERNS = [
    # (case-insensitive compiled regex, operator label) pairs, in lookup order.
    (re.compile(regex, re.IGNORECASE), label)
    for regex, label in (
        (r'\balways\b', 'always'),
        (r'\bnever\b', 'never'),
        (r'\bunder\s+no\s+circumstances?\b', 'under no circumstances'),
        (r'\bwithout\s+exception\b', 'without exception'),
        (r'\bat\s+all\s+times?\b', 'at all times'),
        # "in all cases" / "in every case" are reported under one label
        (r'\bin\s+(?:all|every)\s+cases?\b', 'in all cases'),
        (r'\bis\s+defined\s+as\b', 'is defined as'),
    )
]

# Conditional prefixes
# One alternation of conditional/temporal markers, matched case-insensitively
# as whole words or phrases; group 1 holds the marker that was found.
CONDITIONAL_RE = re.compile(
    r'\b(if|when|unless|provided\s+that|in\s+the\s+event\s+that'
    r'|where|before|after|prior\s+to)\b',
    flags=re.IGNORECASE,
)


# ---------------------------------------------------------------------------
# Data structures
# ---------------------------------------------------------------------------

@dataclass(frozen=True)
class Commitment:
    """A single extracted commitment.

    Instances are immutable. Equality and hashing are based solely on
    :attr:`canonical`, so two commitments whose clause text differs only in
    letter case, whitespace runs, or trailing punctuation are treated as the
    same element of a set. ``modal_type``, ``modal_operator``,
    ``source_sentence`` and ``is_conditional`` do not take part in comparison.
    """
    text: str                   # The clause text
    modal_type: str             # 'obligation' | 'prohibition' | 'constraint'
    modal_operator: str         # Label of the matched operator
    source_sentence: str        # Sentence the clause was taken from
    is_conditional: bool = False

    @property
    def canonical(self) -> str:
        """Return the normalized clause text used for comparison.

        The text is stripped, lower-cased, has whitespace runs collapsed to a
        single space, and loses any trailing run of ``. ; , ! ?``.
        """
        t = self.text.strip().lower()
        t = re.sub(r'\s+', ' ', t)             # collapse whitespace
        t = re.sub(r'[.;,!?]+$', '', t)        # strip trailing punct
        # Removing the punctuation can expose a space in front of it.
        return t.strip()

    def __eq__(self, other):
        """Compare by canonical text; defer to the other operand for foreign types."""
        if not isinstance(other, Commitment):
            # NotImplemented lets Python try the reflected comparison and then
            # fall back to identity, so ``commitment == "str"`` is still False.
            return NotImplemented
        return self.canonical == other.canonical

    def __hash__(self):
        """Hash the canonical text so equal commitments share a hash."""
        return hash(self.canonical)


# ---------------------------------------------------------------------------
# Sentence segmentation — deterministic regex, no model dependency
# ---------------------------------------------------------------------------

def segment_sentences(text: str) -> List[str]:
    """Break *text* into sentences, then into semicolon-separated clauses.

    The text is first cut wherever ``.``, ``!`` or ``?`` is followed by
    whitespace and an ASCII capital letter. Every resulting piece is then cut
    at each semicolon. Pieces are stripped, and empty ones are discarded.

    Returns an empty list for empty or whitespace-only input.
    """
    stripped = text.strip()
    if not stripped:
        return []

    # The lookbehind keeps the terminator with its sentence; the lookahead
    # leaves the capital letter at the start of the following one.
    sentences = re.split(r'(?<=[.!?])\s+(?=[A-Z])', stripped)

    pieces = (
        part.strip()
        for sentence in sentences
        for part in sentence.split(';')
    )
    return [part for part in pieces if part]


# ---------------------------------------------------------------------------
# Core extraction
# ---------------------------------------------------------------------------

def classify_clause(clause: str) -> Optional[Tuple[str, str]]:
    """
    Determine which modal category, if any, a clause belongs to.

    The pattern tables are consulted in a fixed order: prohibitions,
    then obligations, then constraints. Prohibitions go first so that
    "must not" is never reported as the obligation "must". Within a
    table the first listed pattern that matches wins.

    Returns (modal_type, operator_label), or None when nothing matches.
    """
    stages = (
        ('prohibition', PROHIBITION_PATTERNS),
        ('obligation', OBLIGATION_PATTERNS),
        ('constraint', CONSTRAINT_PATTERNS),
    )
    for modal_type, table in stages:
        for regex, label in table:
            if regex.search(clause) is not None:
                return (modal_type, label)
    return None


def has_conditional(clause: str) -> bool:
    """Return True when CONDITIONAL_RE matches anywhere in the clause."""
    found = CONDITIONAL_RE.search(clause)
    return found is not None


def extract_commitments(text: str) -> List[Commitment]:
    """
    Run the modal-pattern sieve (Figure 4) over a text signal.

    Steps:
    1. Segment the text into sentences/clauses
    2. Classify each clause by modal operator
    3. Wrap every classified clause in a Commitment

    Clauses with no modal operator are skipped. The result keeps the
    order in which clauses appear; duplicates are not removed here.
    """
    found = []

    for clause in segment_sentences(text):
        verdict = classify_clause(clause)
        if verdict is None:
            continue  # no modal operator, so not a commitment

        kind, label = verdict
        cleaned = clause.strip()
        found.append(Commitment(
            text=cleaned,
            modal_type=kind,
            modal_operator=label,
            # The clause itself is recorded as the source sentence.
            source_sentence=cleaned,
            is_conditional=has_conditional(clause),
        ))

    return found


def extract_commitment_set(text: str) -> Set[Commitment]:
    """Return the commitments found in *text* as a set.

    Commitment objects compare and hash by canonical form, so clauses that
    normalize to the same text collapse into a single element.
    """
    unique: Set[Commitment] = set()
    unique.update(extract_commitments(text))
    return unique


def extract_commitment_texts(text: str) -> Set[str]:
    """
    Return the canonical text of every commitment in *text* as a set of strings.
    This is the primary interface for fidelity scoring.
    """
    canonical_forms: Set[str] = set()
    for commitment in extract_commitments(text):
        canonical_forms.add(commitment.canonical)
    return canonical_forms


# ---------------------------------------------------------------------------
# Backward-compatible interface
# ---------------------------------------------------------------------------

def extract_hard_commitments(text: str, nlp=None) -> Set[str]:
    """
    Legacy entry point kept for existing callers.

    The ``nlp`` argument is accepted but never used. The result is the
    set of canonical commitment strings from extract_commitment_texts().
    """
    canonical_texts = extract_commitment_texts(text)
    return canonical_texts