# Commitment-extraction utilities built on spaCy.
from spacy import load
import re
def load_spacy_model(model_name='en_core_web_sm'):
    """Load and return the spaCy pipeline named *model_name*."""
    return load(model_name)
def normalize_text(text):
    """Normalize text for comparison: lowercase, trim, drop punctuation.

    Removes every character that is neither a word character nor
    whitespace after lowercasing and stripping the ends.
    """
    lowered = text.lower().strip()
    return re.sub(r'[^\w\s]', '', lowered)
def extract_hard_commitments(text, nlp=None):
    """Extract sentences containing modal commitment keywords.

    Bug fixed: the original tested ``modal in sent_lower`` as a plain
    substring, so 'may' matched "maybe"/"mayor", 'will' matched
    "willing", and 'have' matched "behave".  Matching is now done
    against the sentence's whole lowercased tokens (spaCy has already
    tokenized the text, so this is free).

    Parameters
    ----------
    text : str
        Raw input text; sentence boundaries come from the spaCy pipeline.
    nlp : spacy.language.Language, optional
        Pre-loaded pipeline.  Loaded lazily via ``load_spacy_model()``
        when ``None``.

    Returns
    -------
    set of str
        Stripped sentence texts containing at least one modal keyword.
    """
    if nlp is None:
        nlp = load_spacy_model()
    doc = nlp(text)

    hard_modals = {'must', 'shall', 'will', 'have', 'need', 'required', 'ought', 'cannot', 'should'}
    soft_modals = {'might', 'could', 'may', 'perhaps', 'maybe', 'tend'}
    # NOTE(review): in the original both the hard- and soft-modal branches
    # performed the identical ``commitments.add`` — so the split had no
    # behavioral effect.  The union is kept to preserve that behavior;
    # if soft modals should be excluded from "hard" commitments, drop
    # soft_modals here.
    all_modals = hard_modals | soft_modals

    commitments = set()
    for sent in doc.sents:
        # Whole-token comparison avoids false substring hits.
        sent_tokens = {token.text.lower() for token in sent}
        if sent_tokens & all_modals:
            commitments.add(sent.text.strip())
    return commitments
def extract_from_texts(texts, model_name='en_core_web_sm'):
    """Run commitment extraction over an iterable of texts.

    The spaCy model is loaded once and shared across all texts.

    Returns a dict mapping each input text to the set of commitment
    sentences extracted from it.
    """
    pipeline = load_spacy_model(model_name)
    return {item: extract_hard_commitments(item, pipeline) for item in texts}
def extract_hard(text: str, nlp=None) -> set:
    """Shorthand alias for :func:`extract_hard_commitments`."""
    return extract_hard_commitments(text, nlp)