"""Dependency parser module.

Extracts grammatical skeletons (subject-verb-object) from sentences
using spaCy's dependency parse trees.
"""

import spacy
from typing import Dict, List, Any
from loguru import logger


class DependencyParser:
    """Extracts dependency trees and SVO triples from text."""

    # Dependency labels treated as subjects / objects when mining SVO triples.
    _SUBJECT_DEPS = ("nsubj", "nsubjpass")
    _OBJECT_DEPS = ("dobj", "pobj", "attr")

    def __init__(self, model_name: str = "en_core_web_trf"):
        """Load the requested spaCy pipeline, falling back to the small model.

        Args:
            model_name: Name of an installed spaCy model package.

        Raises:
            OSError: If neither the requested model nor the fallback
                'en_core_web_sm' is installed.
        """
        try:
            self.nlp = spacy.load(model_name)
        except OSError:
            logger.warning(f"spaCy model '{model_name}' not found, falling back to 'en_core_web_sm'")
            try:
                self.nlp = spacy.load("en_core_web_sm")
            except OSError as err:
                # Fail with an actionable message instead of spaCy's generic
                # "Can't find model" error when the fallback is missing too.
                raise OSError(
                    "No usable spaCy model installed; run "
                    "'python -m spacy download en_core_web_sm'"
                ) from err

    @staticmethod
    def _token_record(token, sent) -> Dict[str, Any]:
        """Serialize one spaCy token to a plain dict.

        Indices are sentence-relative (offset by ``sent.start``) so they
        index into the per-sentence token list, not the whole doc.
        """
        return {
            "text": token.text,
            "lemma": token.lemma_,
            "pos": token.pos_,
            "dep": token.dep_,
            "head": token.head.text,
            # Root tokens head themselves, so head_idx == the token's own
            # sentence-relative position for the root.
            "head_idx": token.head.i - sent.start,
            "children": [child.text for child in token.children],
        }

    def parse(self, text: str) -> List[Dict[str, Any]]:
        """Extract dependency tree for each sentence.

        Args:
            text: Raw input text.

        Returns:
            One dict per sentence with keys ``sentence`` (the sentence
            text), ``tokens`` (per-token dependency records), and
            ``root`` (the root token's text). Empty list for empty or
            whitespace-only input.
        """
        # `not text` guards None as well as ""; `strip` rejects whitespace-only.
        if not text or not text.strip():
            return []
        doc = self.nlp(text)
        return [
            {
                "sentence": sent.text,
                "tokens": [self._token_record(token, sent) for token in sent],
                "root": sent.root.text,
            }
            for sent in doc.sents
        ]

    def extract_svo(self, text: str) -> List[Dict[str, List[str]]]:
        """Extract subject-verb-object triples per sentence.

        Verbs are recovered as the VERB-tagged heads of subject tokens,
        so sentences whose subject attaches to a non-VERB head (e.g. a
        copular AUX) yield an empty ``verbs`` list.

        Args:
            text: Raw input text.

        Returns:
            One dict per sentence with ``subjects``, ``verbs`` and
            ``objects`` lists (verbs deduplicated, first-seen order
            preserved). Empty list for empty or whitespace-only input.
        """
        if not text or not text.strip():
            return []
        doc = self.nlp(text)
        results = []
        for sent in doc.sents:
            subjects: List[str] = []
            verbs: List[str] = []
            objects: List[str] = []
            for token in sent:
                if token.dep_ in self._SUBJECT_DEPS:
                    subjects.append(token.text)
                    # The head of a subject is typically the governing verb.
                    if token.head.pos_ == "VERB":
                        verbs.append(token.head.text)
                elif token.dep_ in self._OBJECT_DEPS:
                    objects.append(token.text)
            # Deduplicate verbs while preserving first-seen order.
            verbs = list(dict.fromkeys(verbs))
            results.append({
                "subjects": subjects,
                "verbs": verbs,
                "objects": objects,
            })
        return results