"""
Dependency parser module.
Extracts grammatical skeletons (subject-verb-object) from sentences
using spaCy's dependency parse trees.
"""

from typing import Any, Dict, List

import spacy
from loguru import logger


class DependencyParser:
    """Extracts dependency trees and SVO triples from text."""

    def __init__(self, model_name: str = "en_core_web_trf"):
        try:
            self.nlp = spacy.load(model_name)
        except OSError:
            # Fall back to the small model; this assumes it has been
            # installed (e.g. via `python -m spacy download en_core_web_sm`).
            logger.warning(
                f"spaCy model '{model_name}' not found, falling back to 'en_core_web_sm'"
            )
            self.nlp = spacy.load("en_core_web_sm")

    def parse(self, text: str) -> List[Dict[str, Any]]:
        """Extract dependency tree for each sentence."""
        if not text or not text.strip():
            return []
        doc = self.nlp(text)
        trees = []
        for sent in doc.sents:
            tokens = []
            for token in sent:
                tokens.append({
                    "text": token.text,
                    "lemma": token.lemma_,
                    "pos": token.pos_,
                    "dep": token.dep_,
                    "head": token.head.text,
                    "head_idx": token.head.i - sent.start,
                    "children": [child.text for child in token.children],
                })
            trees.append({
                "sentence": sent.text,
                "tokens": tokens,
                "root": sent.root.text,
            })
        return trees

    def extract_svo(self, text: str) -> List[Dict[str, List[str]]]:
        """Extract per-sentence subject, verb, and object lists.

        Returns one dict per sentence with parallel ``subjects``,
        ``verbs``, and ``objects`` lists rather than aligned triples.
        """
        if not text or not text.strip():
            return []
        doc = self.nlp(text)
        results = []
        for sent in doc.sents:
            subjects = []
            verbs = []
            objects = []
            for token in sent:
                if token.dep_ in ("nsubj", "nsubjpass"):
                    subjects.append(token.text)
                    # The head of a subject is typically the verb; in copular
                    # sentences ("Paris is a city") the head is tagged AUX.
                    if token.head.pos_ in ("VERB", "AUX"):
                        verbs.append(token.head.text)
                # dobj = direct object, pobj = object of a preposition,
                # attr = nominal complement of a copular verb.
                elif token.dep_ in ("dobj", "pobj", "attr"):
                    objects.append(token.text)
            # Deduplicate verbs while preserving first-seen order.
            verbs = list(dict.fromkeys(verbs))
            results.append({
                "subjects": subjects,
                "verbs": verbs,
                "objects": objects,
            })
        return results
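

if __name__ == "__main__":
    # Minimal usage sketch, assuming a spaCy English model is installed.
    # The sample text below is illustrative and not part of the original module.
    parser = DependencyParser()
    sample = "The committee approved the proposal. Paris is a city."

    for tree in parser.parse(sample):
        print(tree["sentence"], "-> root:", tree["root"])

    for svo in parser.extract_svo(sample):
        print(svo)
    # Expected shape (exact tokens are model-dependent), e.g.:
    # {'subjects': ['committee'], 'verbs': ['approved'], 'objects': ['proposal']}
    # {'subjects': ['Paris'], 'verbs': ['is'], 'objects': ['city']}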