File size: 1,568 Bytes
2a64ad4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
from spacy import load
import re

def load_spacy_model(model_name='en_core_web_sm'):
    """Load and return the spaCy pipeline named *model_name*."""
    return load(model_name)

def normalize_text(text):
    """Normalize *text* for comparison: trim, lowercase, drop punctuation.

    Keeps word characters and whitespace only, so "Hello, World!"
    becomes "hello world".
    """
    lowered = text.strip().lower()
    return re.sub(r'[^\w\s]', '', lowered)

def extract_hard_commitments(text, nlp=None):
    """Extract commitment sentences via modal keyword detection.

    A sentence is collected when it contains any hard or soft modal as a
    whole word. Matching uses word boundaries rather than bare substring
    membership, which previously yielded false positives ("willow" for
    'will', "behave" for 'have', "mayor" for 'may', "shallow" for 'shall').

    Args:
        text: Raw input text; segmented into sentences by the pipeline.
        nlp: Optional pre-loaded spaCy pipeline; loaded lazily when None.

    Returns:
        set of stripped sentence strings containing at least one modal.
    """
    if nlp is None:
        nlp = load_spacy_model()

    doc = nlp(text)
    commitments = set()

    # Hard and soft modals were handled identically by the original
    # if/elif branches, so a single union set preserves behavior.
    hard_modals = {'must', 'shall', 'will', 'have', 'need', 'required', 'ought', 'cannot', 'should'}
    soft_modals = {'might', 'could', 'may', 'perhaps', 'maybe', 'tend'}
    # \b-anchored alternation: whole-word matches only. Regex (not a spaCy
    # token-set check) so "cannot" still matches even though spaCy
    # tokenizes it as "can" + "not".
    modal_pattern = re.compile(r'\b(?:' + '|'.join(hard_modals | soft_modals) + r')\b')

    for sent in doc.sents:
        if modal_pattern.search(sent.text.lower()):
            commitments.add(sent.text.strip())

    return commitments

def extract_from_texts(texts, model_name='en_core_web_sm'):
    """Run commitment extraction over *texts* with one shared pipeline.

    Args:
        texts: Iterable of input strings.
        model_name: spaCy model to load once and reuse for every text.

    Returns:
        dict mapping each input text to its set of extracted commitments.
    """
    pipeline = load_spacy_model(model_name)
    return {item: extract_hard_commitments(item, pipeline) for item in texts}

def extract_hard(text: str, nlp=None) -> set:
    """Shorthand for extract_hard_commitments."""
    return extract_hard_commitments(text, nlp=nlp)