File size: 7,942 Bytes

19d2058

"""
extraction.py — Modal-Pattern Sieve for Commitment Extraction

Implements the commitment extractor per paper Definition 2.4 and Figure 4.
A commitment is a clause containing a deontic or alethic modal operator
that creates a testable obligation, prohibition, or constraint.

Three-stage sieve:
  1. Sentence segmentation (regex — deterministic, no model)
  2. Modal operator detection with type classification
  3. Commitment normalization (canonical form for comparison)

Design principle: this is the MEASUREMENT INSTRUMENT.
It must be deterministic and precise. No ML models here.
False positives inflate scores. False negatives hide drift.
"""

import re
from dataclasses import dataclass, field
from typing import List, Set, Optional, Tuple


# ---------------------------------------------------------------------------
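# Modal operator patterns — grouped by modal type. Multi-word negations
# ("must not") live in the prohibition group, which is checked before the
# bare modals ("must"), so the longer phrase wins.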
# ---------------------------------------------------------------------------

# Prohibitions (deontic negation). This table is consulted BEFORE the
# obligation table so that "must not" is never reported as a bare "must".
# Each entry is (case-insensitive compiled regex, operator label).
PROHIBITION_PATTERNS = [
    (re.compile(regex, re.I), label)
    for regex, label in (
        (r'\bmust\s+not\b', 'must not'),
        (r'\bshall\s+not\b', 'shall not'),
        (r'\bwill\s+not\b', 'will not'),
        (r'\bcan\s*not\b', 'cannot'),  # matches both "cannot" and "can not"
        (r'\bmay\s+not\b', 'may not'),
        (r'\bmust\s+never\b', 'must never'),
        (r'\bshall\s+never\b', 'shall never'),
        (r'\bis\s+prohibited\s+from\b', 'is prohibited from'),
        (r'\bare\s+prohibited\s+from\b', 'are prohibited from'),
        (r'\bis\s+forbidden\s+to\b', 'is forbidden to'),
        (r'\bare\s+forbidden\s+to\b', 'are forbidden to'),
        (r'\bdo\s+not\b', 'do not'),
        (r'\bdoes\s+not\b', 'does not'),
        (r'\bno\s+\w+\s+(?:or|nor)\s+\w+\b', 'no X or Y'),  # "No food or drink"
    )
]

# Obligations (deontic necessity).
# Each entry is (case-insensitive compiled regex, operator label).
OBLIGATION_PATTERNS = [
    (re.compile(regex, re.I), label)
    for regex, label in (
        (r'\bmust\b', 'must'),
        (r'\bshall\b', 'shall'),
        (r'\bis\s+required\s+to\b', 'is required to'),
        (r'\bare\s+required\s+to\b', 'are required to'),
        (r'\bis\s+obligated\s+to\b', 'is obligated to'),
        (r'\bare\s+obligated\s+to\b', 'are obligated to'),
        (r'\bhas\s+to\b', 'has to'),
        (r'\bhave\s+to\b', 'have to'),
        (r'\bneeds?\s+to\b', 'needs to'),  # "need to" is also labelled 'needs to'
        (r'\bis\s+bound\s+to\b', 'is bound to'),
    )
]

# Constraints (alethic / universal quantification).
# Each entry is (case-insensitive compiled regex, operator label).
CONSTRAINT_PATTERNS = [
    (re.compile(regex, re.I), label)
    for regex, label in (
        (r'\balways\b', 'always'),
        (r'\bnever\b', 'never'),
        (r'\bunder\s+no\s+circumstances?\b', 'under no circumstances'),
        (r'\bwithout\s+exception\b', 'without exception'),
        (r'\bat\s+all\s+times?\b', 'at all times'),
        (r'\bin\s+(?:all|every)\s+cases?\b', 'in all cases'),  # also "in every case"
        (r'\bis\s+defined\s+as\b', 'is defined as'),
    )
]

# Conditional markers (if / when / unless / ...). The pattern is unanchored,
# so it matches a marker anywhere in the text it is searched against; the
# matched marker is available as group 1.
CONDITIONAL_RE = re.compile(
    r'\b(if|when|unless|provided\s+that|in\s+the\s+event\s+that|where|before|after|prior\s+to)\b',
    flags=re.IGNORECASE,
)


# ---------------------------------------------------------------------------
# Data structures
# ---------------------------------------------------------------------------

# eq=False: the hand-written __eq__/__hash__ below (canonical-based) are the
# ones in force; no field-by-field comparison is generated.
@dataclass(frozen=True, eq=False)
class Commitment:
    """A single extracted commitment.

    Instances are immutable (frozen) and hashable so they can live in sets.
    Equality and hashing are based solely on ``canonical``: two commitments
    whose ``text`` differs only in case, whitespace runs or trailing
    punctuation compare equal. ``modal_type``, ``modal_operator``,
    ``source_sentence`` and ``is_conditional`` do not take part in the
    comparison.
    """
    text: str                   # The clause text
    modal_type: str             # 'obligation' | 'prohibition' | 'constraint'
    modal_operator: str         # The matched operator
    source_sentence: str        # Original sentence
    is_conditional: bool = False

    @property
    def canonical(self) -> str:
        """Return ``text`` normalized for comparison.

        Lower-cased, whitespace runs collapsed to a single space, and any
        trailing run of ``. ; , ! ?`` removed.
        """
        t = self.text.strip().lower()
        t = re.sub(r'\s+', ' ', t)             # collapse whitespace
        t = re.sub(r'[.;,!?]+$', '', t)        # strip trailing punct
        return t.strip()                       # drop a space left before the punct

    def __eq__(self, other):
        """Compare by canonical text; defer to the other operand for foreign types."""
        if not isinstance(other, Commitment):
            # NotImplemented (rather than False) lets Python try the reflected
            # comparison before falling back to "not equal".
            return NotImplemented
        return self.canonical == other.canonical

    def __hash__(self):
        """Hash consistently with ``__eq__`` (canonical text only)."""
        return hash(self.canonical)


# ---------------------------------------------------------------------------
# Sentence segmentation — deterministic regex, no model dependency
# ---------------------------------------------------------------------------

def segment_sentences(text: str) -> List[str]:
    """Break ``text`` into sentence-level clauses.

    Two passes: first split wherever ``.``, ``!`` or ``?`` is followed by
    whitespace and an uppercase ASCII letter, then split every resulting
    sentence on semicolons. Each clause is whitespace-stripped and empty
    clauses are dropped. Blank input yields an empty list.
    """
    stripped = text.strip()
    if not stripped:
        return []

    # Sentence boundary = terminal punctuation + whitespace + capital letter.
    sentences = re.split(r'(?<=[.!?])\s+(?=[A-Z])', stripped)

    # Flatten the semicolon-separated pieces of every sentence, in order.
    return [
        piece.strip()
        for sentence in sentences
        for piece in sentence.split(';')
        if piece.strip()
    ]


# ---------------------------------------------------------------------------
# Core extraction
# ---------------------------------------------------------------------------

def classify_clause(clause: str) -> Optional[Tuple[str, str]]:
    """
    Determine the modal type of a clause.

    The pattern tables are tried in a fixed priority: prohibitions, then
    obligations, then constraints. Within a table the first pattern (in
    table order) that matches anywhere in the clause wins, regardless of
    where in the clause it occurs. Trying prohibitions first keeps
    "must not" from being reported as the obligation "must".

    Returns (modal_type, operator_text), or None when no pattern matches.
    """
    tiers = (
        ('prohibition', PROHIBITION_PATTERNS),
        ('obligation', OBLIGATION_PATTERNS),
        ('constraint', CONSTRAINT_PATTERNS),
    )
    for modal_type, table in tiers:
        for regex, label in table:
            if regex.search(clause):
                return (modal_type, label)
    return None


def has_conditional(clause: str) -> bool:
    """Return True when CONDITIONAL_RE matches anywhere in ``clause``."""
    return CONDITIONAL_RE.search(clause) is not None


def extract_commitments(text: str) -> List[Commitment]:
    """
    Run the modal-pattern sieve (Figure 4) over a text signal.

    Steps:
    1. Segment the text into sentences/clauses
    2. Classify each clause by modal operator
    3. Wrap every classified clause in a Commitment

    Clauses with no modal operator are skipped. The result keeps document
    order and may contain duplicates.
    """
    found = []

    for clause in segment_sentences(text):
        verdict = classify_clause(clause)
        if verdict is None:
            continue  # no modal operator -> not a commitment

        kind, matched_operator = verdict
        cleaned = clause.strip()
        # NOTE(review): source_sentence receives the same clause text as
        # `text` (i.e. after semicolon splitting) — confirm this is intended.
        found.append(Commitment(
            text=cleaned,
            modal_type=kind,
            modal_operator=matched_operator,
            source_sentence=cleaned,
            is_conditional=has_conditional(clause),
        ))

    return found


def extract_commitment_set(text: str) -> Set[Commitment]:
    """Return the commitments found in ``text`` as a set.

    Commitment compares and hashes by canonical form, so clauses that
    normalize to the same text collapse to a single member.
    """
    unique: Set[Commitment] = set()
    unique.update(extract_commitments(text))
    return unique


def extract_commitment_texts(text: str) -> Set[str]:
    """
    Return the canonical text of every commitment in ``text`` as a set of
    strings. This is the primary interface for fidelity scoring.
    """
    canonical_texts: Set[str] = set()
    for commitment in extract_commitments(text):
        canonical_texts.add(commitment.canonical)
    return canonical_texts


# ---------------------------------------------------------------------------
# Backward-compatible interface
# ---------------------------------------------------------------------------

def extract_hard_commitments(text: str, nlp=None) -> Set[str]:
    """
    Backward-compatible entry point.

    ``nlp`` is accepted but never used. The result is the set of canonical
    commitment strings produced by extract_commitment_texts.
    """
    canonical_texts = extract_commitment_texts(text)
    return canonical_texts