|
|
|
|
|
import re |
|
|
import json |
|
|
import hashlib |
|
|
import dateparser |
|
|
import spacy |
|
|
|
|
|
# Shared spaCy pipeline, loaded once when the module is imported and reused
# for sentence splitting and dependency parsing by the functions in this file.
nlp = spacy.load("en_core_web_sm")
|
|
|
|
|
# Numeric literal: optional leading "$", one to three digits, then any run of
# digits and commas, then an optional ".digits" fractional part.
NUM_RE = re.compile(
    r'\$?\d{1,3}(?:[,\d]*)?(?:\.\d+)?'
)
|
|
|
|
|
# Lexicon mapping a lowercase cue phrase to the modality it signals.
#
# Insertion order matters to any consumer that scans the cues in order and
# stops at the first substring hit: the negated forms ("must not",
# "shall not") are listed before the bare modals they contain ("must",
# "shall") so that a prohibition is not reported as an obligation.
MODAL_LEX = {
    # Prohibitions first: these contain, or compete with, the bare modals.
    "must not": "PROHIBITION",
    "shall not": "PROHIBITION",
    "cannot": "PROHIBITION",
    # Obligations.
    "must": "OBLIGATION",
    "shall": "OBLIGATION",
    "required": "OBLIGATION",
    # Permissions.
    "may": "PERMISSION",
    # Definitions.
    "is defined as": "DEFINITION",
    "means": "DEFINITION",
}
|
|
|
|
|
def normalize_text(s: str) -> str:
    """Return *s* with dashes unified and whitespace collapsed.

    Em and en dashes are replaced by an ASCII hyphen, every run of
    whitespace becomes a single space, and leading/trailing whitespace is
    removed.
    """
    cleaned = s.strip()
    for dash in ("—", "–"):
        cleaned = cleaned.replace(dash, "-")
    # split() with no argument drops all whitespace runs; re-join with spaces.
    words = cleaned.split()
    return " ".join(words)
|
|
|
|
|
def canonicalize_number(tok: str) -> str:
    """Map a single token to a canonical form for key building.

    Args:
        tok: The raw token text.

    Returns:
        ``"#NUM"`` when the whole token is a numeric literal according to
        ``NUM_RE``; otherwise the ISO ``YYYY-MM-DD`` date when ``dateparser``
        can parse the token; otherwise the lowercased token.
    """
    # fullmatch, not search: with search any token that merely contains a
    # digit (e.g. "2024-01-15") was collapsed to "#NUM", so the date branch
    # below could never be reached for numeric dates.
    if NUM_RE.fullmatch(tok):
        return "#NUM"

    # NOTE(review): dateparser may also resolve relative or partial
    # expressions against the current date, which would make the result
    # time-dependent - confirm whether that is acceptable for stable keys.
    dt = dateparser.parse(tok)
    if dt is not None:
        return dt.date().isoformat()

    return tok.lower()
|
|
|
|
|
def sentence_candidates(text: str):
    """Split *text* into sentences and return them as stripped strings.

    The text is normalized with ``normalize_text`` and then segmented by
    the module-level ``nlp`` pipeline.
    """
    parsed = nlp(normalize_text(text))
    sentences = []
    for span in parsed.sents:
        sentences.append(span.text.strip())
    return sentences
|
|
|
|
|
def cue_lookup(sent: str):
    """Find the modal cue in *sent* and the modality it signals.

    Matching is case-insensitive and anchored on word boundaries, so a cue
    is not found inside a longer word (for example "may" in "mayor").

    Args:
        sent: The sentence to scan.

    Returns:
        A ``(cue, modality)`` pair taken from ``MODAL_LEX``, or
        ``(None, None)`` when no cue occurs in the sentence.
    """
    lowered = sent.lower()

    # Every cue that occurs as a whole word/phrase, in lexicon order.
    hits = [
        cue
        for cue in MODAL_LEX
        if re.search(r"\b" + re.escape(cue) + r"\b", lowered)
    ]

    for cue in hits:
        # Skip a cue that is merely part of a longer matching cue: "must"
        # must not win over "must not", otherwise a prohibition would be
        # reported as an obligation.
        shadowed = any(cue != other and cue in other for other in hits)
        if not shadowed:
            return cue, MODAL_LEX[cue]

    return None, None
|
|
|
|
|
def build_tuple_from_sentence(sent: str):
    """Build a canonical rule tuple, its JSON key and a short hash for *sent*.

    The actor, action and object are the first subject, first verb lemma and
    first object-like token found in the dependency parse; the modality comes
    from ``cue_lookup``; the condition is the text following the first
    conditional keyword (if, when, provided that, unless, in the event that).

    Args:
        sent: A single sentence.

    Returns:
        A ``(tup, key, key_hash)`` triple where ``tup`` is a dict with the
        keys ``actor``, ``modality``, ``action``, ``object`` and
        ``condition``; ``key`` is its compact, key-sorted JSON serialization;
        and ``key_hash`` is the first 12 hex characters of the SHA-256 of
        ``key``.
    """
    _, modality = cue_lookup(sent)
    doc = nlp(sent)

    # The keyword may sit at the very start of the sentence ("If ...",
    # "Unless ..."). The earlier pattern demanded at least one character
    # before the keyword, so such leading conditions were never captured.
    cond = None
    m = re.search(
        r'\b(if|when|provided that|unless|in the event that)\b(.+)',
        sent,
        flags=re.I,
    )
    if m:
        cond = m.group(2).strip()
        if not sent[:m.start()].strip():
            # Leading conditional clause: what follows the first comma is
            # the main clause, so keep only the part before it (heuristic).
            cond = cond.split(",", 1)[0].strip()

    # First-hit heuristics over the parse; each slot is filled at most once.
    subj = None
    obj = None
    verb = None
    for token in doc:
        if subj is None and token.dep_ in ("nsubj", "nsubjpass"):
            subj = token.text
        if obj is None and token.dep_ in ("dobj", "pobj", "attr"):
            obj = token.text
        if verb is None and token.pos_ == "VERB":
            verb = token.lemma_

    subj = subj or "UNKNOWN"
    verb = verb or ""
    obj = obj or ""

    # Re-tokenize the object text so each piece is canonicalized on its own.
    obj_canon = (
        " ".join(canonicalize_number(t.text) for t in nlp(obj)) if obj else ""
    )
    cond_canon = cond.lower() if cond else ""

    tup = {
        "actor": subj.lower(),
        "modality": modality or "UNMARKED",
        "action": verb,
        "object": obj_canon,
        "condition": cond_canon,
    }

    # Sorted keys and fixed separators give one serialization per tuple, so
    # equal tuples always yield the same key and hash.
    key = json.dumps(tup, sort_keys=True, separators=(',', ':'))
    key_hash = hashlib.sha256(key.encode("utf8")).hexdigest()[:12]
    return tup, key, key_hash
|
|
|
|
|
def extract_hard(text: str):
    """Return the set of canonical keys for the cue-bearing sentences in *text*.

    Each sentence produced by ``sentence_candidates`` is kept only when
    ``cue_lookup`` finds a cue in it; the key is the second element returned
    by ``build_tuple_from_sentence``.
    """
    return {
        build_tuple_from_sentence(candidate)[1]
        for candidate in sentence_candidates(text)
        if cue_lookup(candidate)[0]
    }