# src/preprocessing/dependency_parser.py
# (uploaded by morpheuslord via upload-large-folder tool, commit 12fd5f2)
"""
Dependency parser module.
Extracts grammatical skeletons (subject-verb-object) from sentences
using spaCy's dependency parse trees.
"""
import spacy
from typing import Dict, List, Any
from loguru import logger
class DependencyParser:
    """Extracts dependency trees and SVO triples from text.

    Wraps a spaCy pipeline: :meth:`parse` returns per-sentence dependency
    trees as plain dicts, :meth:`extract_svo` pulls out rough
    subject-verb-object triples from the parse.
    """

    def __init__(self, model_name: str = "en_core_web_trf"):
        """Load the requested spaCy pipeline.

        Args:
            model_name: Name of the spaCy model to load. If it is not
                installed, falls back to the small English model.

        Raises:
            OSError: If neither ``model_name`` nor the fallback
                ``en_core_web_sm`` is installed.
        """
        try:
            self.nlp = spacy.load(model_name)
        except OSError:
            logger.warning(f"spaCy model '{model_name}' not found, falling back to 'en_core_web_sm'")
            self.nlp = spacy.load("en_core_web_sm")

    def parse(self, text: str) -> List[Dict[str, Any]]:
        """Extract a dependency tree for each sentence in *text*.

        Args:
            text: Raw input text; may be empty/whitespace.

        Returns:
            One dict per sentence with keys ``sentence`` (the sentence
            text), ``root`` (text of the syntactic root token), and
            ``tokens`` — a list of per-token dicts carrying the token's
            text, lemma, coarse POS tag, dependency label, head text,
            head index (relative to the sentence, not the document),
            and the texts of its dependency children. Empty list for
            empty/whitespace input.
        """
        if not text or not text.strip():
            return []
        doc = self.nlp(text)
        trees: List[Dict[str, Any]] = []
        for sent in doc.sents:
            tokens = []
            for token in sent:
                tokens.append({
                    "text": token.text,
                    "lemma": token.lemma_,
                    "pos": token.pos_,
                    "dep": token.dep_,
                    "head": token.head.text,
                    # token.i / head.i are document offsets; subtract the
                    # sentence start so head_idx indexes into "tokens".
                    "head_idx": token.head.i - sent.start,
                    "children": [child.text for child in token.children],
                })
            trees.append({
                "sentence": sent.text,
                "tokens": tokens,
                "root": sent.root.text,
            })
        return trees

    def extract_svo(self, text: str) -> List[Dict[str, List[str]]]:
        """Extract rough subject-verb-object triples per sentence.

        Args:
            text: Raw input text; may be empty/whitespace.

        Returns:
            One dict per sentence with keys ``subjects``, ``verbs``,
            ``objects``, each a list of token texts. Empty list for
            empty/whitespace input. Subjects and objects may contain
            duplicates; verbs are deduplicated in first-seen order.
        """
        if not text or not text.strip():
            return []
        doc = self.nlp(text)
        results: List[Dict[str, List[str]]] = []
        for sent in doc.sents:
            subjects: List[str] = []
            verbs: List[str] = []
            objects: List[str] = []
            for token in sent:
                if token.dep_ in ("nsubj", "nsubjpass"):
                    subjects.append(token.text)
                    # The head of a nominal subject is the governing
                    # predicate. Accept AUX in addition to VERB so copular
                    # sentences ("She is a doctor", where "is" is tagged
                    # AUX) still yield a verb; previously such heads were
                    # dropped, producing triples with subjects and objects
                    # but an empty verb list.
                    if token.head.pos_ in ("VERB", "AUX"):
                        verbs.append(token.head.text)
                elif token.dep_ in ("dobj", "pobj", "attr"):
                    # NOTE(review): "pobj" is the object of a preposition,
                    # not of the verb — included here for recall, so these
                    # are not strict direct objects.
                    objects.append(token.text)
            # Deduplicate verbs while preserving first-seen order.
            verbs = list(dict.fromkeys(verbs))
            results.append({
                "subjects": subjects,
                "verbs": verbs,
                "objects": objects,
            })
        return results