File size: 2,694 Bytes

2a64ad4

# ...new file...
import re
import json
import hashlib
import dateparser
import spacy

nlp = spacy.load("en_core_web_sm")

NUM_RE = re.compile(r'\$?\d{1,3}(?:[,\d]*)?(?:\.\d+)?')

MODAL_LEX = {
    "must": "OBLIGATION", "shall": "OBLIGATION", "required": "OBLIGATION",
    "must not": "PROHIBITION", "shall not": "PROHIBITION", "cannot": "PROHIBITION",
    "may": "PERMISSION", "is defined as": "DEFINITION", "means": "DEFINITION"
}

def normalize_text(s: str) -> str:
    s = s.strip()
    s = s.replace("—", "-").replace("–", "-")
    s = " ".join(s.split())
    return s

def canonicalize_number(tok: str) -> str:
    # convert simple money/number patterns to placeholders
    if NUM_RE.search(tok):
        return "#NUM"
    dt = dateparser.parse(tok)
    if dt:
        return dt.date().isoformat()
    return tok.lower()

def sentence_candidates(text: str):
    doc = nlp(normalize_text(text))
    return [sent.text.strip() for sent in doc.sents]

def cue_lookup(sent: str):
    s = sent.lower()
    for cue, mod in MODAL_LEX.items():
        if cue in s:
            return cue, mod
    return None, None

def build_tuple_from_sentence(sent: str):
    cue, modality = cue_lookup(sent)
    doc = nlp(sent)
    subj = None
    obj = None
    verb = None
    cond = None
    # regex conditional capture
    m = re.search(r'(.+?)\b(if|when|provided that|unless|in the event that)\b(.+)', sent, flags=re.I)
    if m:
        cond = m.group(3).strip()
    # dependency heuristics
    for token in doc:
        if token.dep_ in ("nsubj", "nsubjpass") and subj is None:
            subj = token.text
        if token.dep_ in ("dobj", "pobj", "attr") and obj is None:
            obj = token.text
        if token.pos_ == "VERB" and verb is None:
            verb = token.lemma_
    subj = subj or "UNKNOWN"
    verb = verb or ""
    obj = obj or ""
    # canonicalize object tokens
    obj_canon = " ".join(canonicalize_number(t.text) for t in nlp(obj)) if obj else ""
    cond_canon = cond.lower() if cond else ""
    tup = {
        "actor": subj.lower(),
        "modality": modality or "UNMARKED",
        "action": verb,
        "object": obj_canon,
        "condition": cond_canon
    }
    # canonical key deterministic JSON
    key = json.dumps(tup, sort_keys=True, separators=(',', ':'))
    key_hash = hashlib.sha256(key.encode("utf8")).hexdigest()[:12]
    return tup, key, key_hash

def extract_hard(text: str):
    keys = []
    for sent in sentence_candidates(text):
        cue, _ = cue_lookup(sent)
        if cue:
            tup, key, h = build_tuple_from_sentence(sent)
            keys.append(key)
    # deterministic fallback: if none, emit empty set
    return set(keys)