| """ |
| Dependency parser module. |
| Extracts grammatical skeletons (subject-verb-object) from sentences |
| using spaCy's dependency parse trees. |
| """ |
|
|
| import spacy |
| from typing import Dict, List, Any |
| from loguru import logger |
|
|
|
|


class DependencyParser:
    """Extracts dependency trees and SVO triples from text."""

    def __init__(self, model_name: str = "en_core_web_trf"):
        """Load the requested spaCy model, falling back to 'en_core_web_sm' if it is missing."""
        try:
            self.nlp = spacy.load(model_name)
        except OSError:
            logger.warning(
                f"spaCy model '{model_name}' not found, falling back to 'en_core_web_sm'"
            )
            self.nlp = spacy.load("en_core_web_sm")

    def parse(self, text: str) -> List[Dict[str, Any]]:
        """Extract the dependency tree for each sentence."""
        if not text or not text.strip():
            return []
        doc = self.nlp(text)
        trees = []
        for sent in doc.sents:
            tokens = []
            for token in sent:
                tokens.append({
                    "text": token.text,
                    "lemma": token.lemma_,
                    "pos": token.pos_,
                    "dep": token.dep_,
                    "head": token.head.text,
                    # Head position relative to the start of the sentence.
                    "head_idx": token.head.i - sent.start,
                    "children": [child.text for child in token.children],
                })
            trees.append({
                "sentence": sent.text,
                "tokens": tokens,
                "root": sent.root.text,
            })
        return trees

    def extract_svo(self, text: str) -> List[Dict[str, List[str]]]:
        """Extract subject-verb-object triples per sentence."""
        if not text or not text.strip():
            return []
        doc = self.nlp(text)
        results = []
        for sent in doc.sents:
            subjects = []
            verbs = []
            objects = []
            for token in sent:
                if token.dep_ in ("nsubj", "nsubjpass"):
                    subjects.append(token.text)
                    # Keep the verb that governs this subject.
                    if token.head.pos_ == "VERB":
                        verbs.append(token.head.text)
                elif token.dep_ in ("dobj", "pobj", "attr"):
                    objects.append(token.text)

            # Deduplicate verbs while preserving order.
            verbs = list(dict.fromkeys(verbs))
            results.append({
                "subjects": subjects,
                "verbs": verbs,
                "objects": objects,
            })
        return results
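

# A minimal usage sketch (hypothetical): the sample sentence and printed fields
# are illustrative only, and this assumes at least 'en_core_web_sm' is installed
# so that DependencyParser() can load a model.
if __name__ == "__main__":
    parser = DependencyParser()
    sample = "The committee approved the proposal."

    # Full dependency tree for each sentence.
    for tree in parser.parse(sample):
        print(tree["sentence"], "-> root:", tree["root"])
        for tok in tree["tokens"]:
            print(f"  {tok['text']:<12} {tok['dep']:<10} head={tok['head']}")

    # Flat subject / verb / object lists for each sentence.
    for triple in parser.extract_svo(sample):
        print(triple["subjects"], triple["verbs"], triple["objects"])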