# burnmydays's picture
# Initial commit: Commitment Conservation Framework
# 2a64ad4
# ...new file...
import re
import json
import hashlib
import dateparser
import spacy
# Load the spaCy "en_core_web_sm" pipeline once, at import time. The resulting
# module-level `nlp` object is shared by sentence_candidates() and
# build_tuple_from_sentence().
nlp = spacy.load("en_core_web_sm")
# Numeric pattern, optionally prefixed with "$": one to three leading digits,
# then any run of digits/commas, then an optional decimal part.
NUM_RE = re.compile(r'\$?\d{1,3}(?:[,\d]*)?(?:\.\d+)?')

# Cue phrase -> modality label.
# Multi-word cues are listed before the shorter cues they contain, so a
# first-match scan over this dict reports "must not" / "shall not" as
# PROHIBITION instead of stopping at "must" / "shall".
MODAL_LEX = {
    "must not": "PROHIBITION", "shall not": "PROHIBITION", "cannot": "PROHIBITION",
    "is defined as": "DEFINITION",
    "must": "OBLIGATION", "shall": "OBLIGATION", "required": "OBLIGATION",
    "may": "PERMISSION", "means": "DEFINITION",
}
def normalize_text(s: str) -> str:
    """Return *s* with dashes unified and whitespace collapsed.

    Em dashes and en dashes become ASCII hyphens, every run of whitespace
    becomes a single space, and leading/trailing whitespace is removed.
    """
    dash_to_hyphen = {ord("—"): "-", ord("–"): "-"}
    words = s.strip().translate(dash_to_hyphen).split()
    return " ".join(words)
def canonicalize_number(tok: str) -> str:
    """Map a token to its canonical form for key building.

    Returns ``"#NUM"`` when ``NUM_RE`` matches anywhere in *tok*, the ISO
    date string when ``dateparser.parse`` recognises *tok*, and the
    lowercased token otherwise.
    """
    # The numeric check runs first, so a token containing any digit is
    # replaced by the placeholder and never reaches the date parser.
    if NUM_RE.search(tok) is not None:
        return "#NUM"
    parsed = dateparser.parse(tok)
    return parsed.date().isoformat() if parsed else tok.lower()
def sentence_candidates(text: str):
    """Split *text* into a list of sentence strings.

    The text is normalised with normalize_text() and run through the spaCy
    pipeline; each sentence it detects is returned with surrounding
    whitespace stripped.
    """
    parsed = nlp(normalize_text(text))
    sentences = []
    for span in parsed.sents:
        sentences.append(span.text.strip())
    return sentences
def cue_lookup(sent: str):
    """Find the modal cue phrase in *sent* and its modality label.

    Cues from ``MODAL_LEX`` are tried longest first and must match as whole
    words (case-insensitively). This way "must not" is reported as
    PROHIBITION rather than being shadowed by "must", and short cues such as
    "may" do not fire inside longer words like "mayor".

    Returns:
        ``(cue, modality)`` for the first cue that matches, or
        ``(None, None)`` when the sentence contains no cue.
    """
    s = sent.lower()
    # sorted() is stable, so cues of equal length keep their MODAL_LEX order.
    by_length = sorted(MODAL_LEX.items(), key=lambda item: len(item[0]), reverse=True)
    for cue, mod in by_length:
        # Whole-word match; any whitespace run may separate the words of a
        # multi-word cue.
        body = r"\s+".join(re.escape(word) for word in cue.split())
        if re.search(r"(?<!\w)" + body + r"(?!\w)", s):
            return cue, mod
    return None, None
# Conditional markers; the text following the first marker is the condition.
_COND_RE = re.compile(
    r'\b(if|when|provided that|unless|in the event that)\b(.+)', flags=re.I
)


def _extract_condition(sent: str):
    """Return the conditional clause of *sent*, or None when it has none."""
    m = _COND_RE.search(sent)
    if m is None:
        return None
    cond = m.group(2).strip()
    if not sent[:m.start()].strip():
        # Sentence-initial conditional ("If X, Y must Z"): the clause ends at
        # the first comma; what follows is the main clause, not the condition.
        cond = cond.split(",", 1)[0].strip()
    return cond


def build_tuple_from_sentence(sent: str):
    """Build a canonical commitment tuple and its key from one sentence.

    The modality comes from cue_lookup(). Actor, object and action are the
    first tokens in the spaCy parse with, respectively, a subject dependency
    (nsubj/nsubjpass), an object-like dependency (dobj/pobj/attr), and a
    VERB part-of-speech tag (lemma is used). The condition is the text after
    the first conditional marker (if / when / provided that / unless /
    in the event that), including a marker at the very start of the sentence.

    Returns:
        ``(tup, key, key_hash)`` where ``tup`` is a dict with the keys
        ``actor``, ``modality``, ``action``, ``object`` and ``condition``;
        ``key`` is ``tup`` serialised as compact JSON with sorted keys; and
        ``key_hash`` is the first 12 hex characters of the SHA-256 of ``key``.
    """
    _, modality = cue_lookup(sent)
    doc = nlp(sent)

    # Dependency heuristics: keep only the first token that fills each role.
    subj = obj = verb = None
    for token in doc:
        if subj is None and token.dep_ in ("nsubj", "nsubjpass"):
            subj = token.text
        if obj is None and token.dep_ in ("dobj", "pobj", "attr"):
            obj = token.text
        if verb is None and token.pos_ == "VERB":
            verb = token.lemma_
    subj = subj or "UNKNOWN"
    verb = verb or ""
    obj = obj or ""

    # Canonicalize the object token(s): numbers/dates become placeholders.
    obj_canon = " ".join(canonicalize_number(t.text) for t in nlp(obj)) if obj else ""

    cond = _extract_condition(sent)
    cond_canon = cond.lower() if cond else ""

    tup = {
        "actor": subj.lower(),
        "modality": modality or "UNMARKED",
        "action": verb,
        "object": obj_canon,
        "condition": cond_canon,
    }
    # Sorted keys and fixed separators make the JSON key deterministic.
    key = json.dumps(tup, sort_keys=True, separators=(',', ':'))
    key_hash = hashlib.sha256(key.encode("utf8")).hexdigest()[:12]
    return tup, key, key_hash
def extract_hard(text: str):
    """Return the set of canonical keys for the cue-bearing sentences of *text*.

    Each sentence from sentence_candidates() is kept only when cue_lookup()
    finds a cue in it; its key is the second element returned by
    build_tuple_from_sentence(). When no sentence qualifies the result is an
    empty set.
    """
    return {
        build_tuple_from_sentence(sent)[1]
        for sent in sentence_candidates(text)
        if cue_lookup(sent)[0]
    }