"""
Dependency parser module.
Extracts grammatical skeletons (subject-verb-object) from sentences
using spaCy's dependency parse trees.
"""
import spacy
from typing import Dict, List, Any
from loguru import logger
class DependencyParser:
    """Extracts dependency trees and SVO triples from text."""

    def __init__(self, model_name: str = "en_core_web_trf"):
        """Load the requested spaCy pipeline, falling back to the small model.

        Args:
            model_name: Name of an installed spaCy pipeline package.
        """
        try:
            self.nlp = spacy.load(model_name)
        except OSError:
            # Requested model isn't installed; degrade to the small pipeline.
            logger.warning(f"spaCy model '{model_name}' not found, falling back to 'en_core_web_sm'")
            self.nlp = spacy.load("en_core_web_sm")

    def parse(self, text: str) -> List[Dict[str, Any]]:
        """Extract dependency tree for each sentence.

        Returns one dict per sentence with keys "sentence", "tokens" and
        "root". Each token record carries its surface form, lemma, POS tag,
        dependency label, head text, sentence-relative head index, and the
        surface forms of its children. Empty/whitespace input yields [].
        """
        if not text or not text.strip():
            return []

        def describe(tok, offset):
            # One record per dependency arc; head_idx is relative to the
            # sentence start, so it indexes into the same token list.
            return {
                "text": tok.text,
                "lemma": tok.lemma_,
                "pos": tok.pos_,
                "dep": tok.dep_,
                "head": tok.head.text,
                "head_idx": tok.head.i - offset,
                "children": [kid.text for kid in tok.children],
            }

        return [
            {
                "sentence": sent.text,
                "tokens": [describe(tok, sent.start) for tok in sent],
                "root": sent.root.text,
            }
            for sent in self.nlp(text).sents
        ]

    def extract_svo(self, text: str) -> List[Dict[str, List[str]]]:
        """Extract subject-verb-object triples per sentence.

        Heuristic: nominal subjects contribute their VERB heads as the
        sentence's verbs; direct/prepositional objects and attributes are
        collected as objects. Empty/whitespace input yields [].
        """
        if not text or not text.strip():
            return []

        triples = []
        for sent in self.nlp(text).sents:
            svo = {"subjects": [], "verbs": [], "objects": []}
            for tok in sent:
                relation = tok.dep_
                if relation in ("nsubj", "nsubjpass"):
                    svo["subjects"].append(tok.text)
                    # The head of a nominal subject is typically the verb.
                    if tok.head.pos_ == "VERB":
                        svo["verbs"].append(tok.head.text)
                elif relation in ("dobj", "pobj", "attr"):
                    svo["objects"].append(tok.text)
            # Keep only the first occurrence of each verb, order preserved.
            svo["verbs"] = list(dict.fromkeys(svo["verbs"]))
            triples.append(svo)
        return triples