Spaces:
Runtime error
Runtime error
| """ | |
| Triples | |
| ------- | |
| :mod:`textacy.extract.triples`: Extract structured triples from a document or sentence | |
| through rule-based pattern-matching of the annotated tokens. | |
| """ | |
| from __future__ import annotations | |
| import collections | |
| from operator import attrgetter | |
| from typing import Iterable, List, Tuple | |
| from spacy.symbols import ( | |
| AUX, VERB, | |
| agent, attr, aux, auxpass, csubj, csubjpass, dobj, neg, nsubj, nsubjpass, obj, pobj, xcomp, | |
| ) | |
| from spacy.tokens import Span, Token | |
| from textacy import types | |
# Dependency labels marking a nominal subject of an active or passive verb.
_NOMINAL_SUBJ_DEPS = {
    nsubj,
    nsubjpass,
}
# Dependency labels marking a clausal subject of an active or passive verb.
_CLAUSAL_SUBJ_DEPS = {
    csubj,
    csubjpass,
}
# Subject labels (clausal and nominal) for the active voice only.
_ACTIVE_SUBJ_DEPS = {
    csubj,
    nsubj,
}
# Labels of verb children that are folded into the verb by ``expand_verb``:
# auxiliaries, passive auxiliaries, and negations.
_VERB_MODIFIER_DEPS = {
    aux,
    auxpass,
    neg,
}
# Lightweight record for one extracted triple. Each field is expected to hold
# a list of tokens; the annotation describes the shape of an *instance*, even
# though the name itself is bound to the namedtuple class.
SVOTriple: Tuple[List[Token], List[Token], List[Token]] = collections.namedtuple(
    typename="SVOTriple",
    field_names="subject verb object",
)
def subject_verb_object_triples(doclike: types.DocLike) -> Iterable[SVOTriple]:
    """
    Extract an ordered sequence of subject-verb-object triples from a document
    or sentence.

    Args:
        doclike: A ``Span`` is processed as a single sentence; any other
            object is expected to expose its sentences via ``.sents``.

    Yields:
        Next SVO triple as (subject, verb, object), in approximate order of
        appearance. Each element is a list of tokens sorted by token index.
        A verb only produces a triple if it ends up with at least one subject
        token and at least one object token.
    """
    sentences = [doclike] if isinstance(doclike, Span) else doclike.sents
    by_index = attrgetter("i")

    for sentence in sentences:
        # verb token -> {"subjects": {tokens}, "objects": {tokens}}
        roles_by_verb = collections.defaultdict(
            lambda: collections.defaultdict(set)
        )

        # Pass 1: attach subjects/objects to the verb that directly governs
        # them, expanding nouns to include conjuncts and compound nouns.
        for token in sentence:
            governor = token.head
            relation = token.dep

            # Touch the mapping so every verb has an entry, even an empty
            # one; verbs linked only through conjunction are filled in later.
            if token.pos == VERB:
                _ = roles_by_verb[token]

            if relation in _NOMINAL_SUBJ_DEPS:
                # nominal subject of active or passive verb
                if governor.pos == VERB:
                    roles_by_verb[governor]["subjects"].update(expand_noun(token))
            elif relation in _CLAUSAL_SUBJ_DEPS:
                # clausal subject of active or passive verb
                if governor.pos == VERB:
                    roles_by_verb[governor]["subjects"].update(token.subtree)
            elif relation == obj:
                # nominal direct object of transitive verb
                if governor.pos == VERB:
                    roles_by_verb[governor]["objects"].update(expand_noun(token))
            elif relation == pobj:
                # prepositional object acting as agent of passive verb
                if governor.dep == agent and governor.head.pos == VERB:
                    roles_by_verb[governor.head]["objects"].update(
                        expand_noun(token)
                    )
            elif relation == xcomp:
                # open clausal complement, but not as a secondary predicate:
                # skipped when the governing verb already has a direct object
                governor_has_object = any(
                    sibling.dep == obj for sibling in governor.children
                )
                if governor.pos == VERB and not governor_has_object:
                    # TODO: just the verb, or the whole tree?
                    roles_by_verb[governor]["objects"].update(token.subtree)

        # Pass 2: fill in any indirect relationships connected via verb
        # conjuncts.
        for verb, roles in roles_by_verb.items():
            linked_verbs = verb.conjuncts

            own_subjects = roles.get("subjects")
            if own_subjects:
                # Share this verb's subjects with conjunct verbs that already
                # have a non-empty entry but no subjects of their own.
                for linked in linked_verbs:
                    linked_roles = roles_by_verb.get(linked)
                    if not linked_roles or linked_roles.get("subjects"):
                        continue
                    linked_roles["subjects"].update(own_subjects)

            if not roles.get("objects"):
                # Borrow objects from every conjunct verb that has some.
                borrowed = roles["objects"]
                for linked in linked_verbs:
                    borrowed.update(
                        roles_by_verb.get(linked, {}).get("objects", [])
                    )

        # Pass 3: expand verbs and restructure into SVO triples.
        for verb, roles in roles_by_verb.items():
            if not roles["subjects"] or not roles["objects"]:
                continue
            yield SVOTriple(
                subject=sorted(roles["subjects"], key=by_index),
                verb=sorted(expand_verb(verb), key=by_index),
                object=sorted(roles["objects"], key=by_index),
            )
def expand_noun(tok: Token) -> List[Token]:
    """
    Expand a noun token to include all associated conjunct and compound nouns.

    Args:
        tok: Token to expand.

    Returns:
        A new list holding ``tok``, then its conjuncts, then every child of
        any of those tokens whose dependency label is "compound".
    """
    heads = [tok, *tok.conjuncts]
    expanded = list(heads)
    for head in heads:
        # TODO: why doesn't compound import from spacy.symbols?
        expanded.extend(
            dependent for dependent in head.children if dependent.dep_ == "compound"
        )
    return expanded
def expand_verb(tok: Token) -> List[Token]:
    """
    Expand a verb token to include all associated auxiliary and negation tokens.

    Args:
        tok: Token to expand.

    Returns:
        A new list holding ``tok`` followed by each of its children whose
        dependency label is in ``_VERB_MODIFIER_DEPS``.
    """
    expanded = [tok]
    for dependent in tok.children:
        if dependent.dep in _VERB_MODIFIER_DEPS:
            expanded.append(dependent)
    return expanded