"""
Master pre-processing pipeline. Runs all NLP stages in sequence.
Returns a PreprocessedDoc object with all annotations attached.
"""
import spacy
import textstat
from dataclasses import dataclass
from loguru import logger
from typing import Dict, List, Tuple

from .spell_corrector import DyslexiaAwareSpellCorrector


@dataclass
class EntitySpan:
text: str
label: str
start_char: int
    end_char: int


@dataclass
class PreprocessedDoc:
original_text: str
corrected_text: str
sentences: List[str]
    entities: List[EntitySpan]  # Never to be modified by the rewriter
    dependency_trees: List[Dict]  # Grammatical skeleton per sentence
    pos_tags: List[List[Tuple[str, str]]]  # (token, POS) pairs per sentence
    readability: Dict[str, float]  # Flesch-Kincaid, Gunning Fog, etc.
    sentence_lengths: List[int]  # Word count per sentence
    protected_spans: List[Tuple[int, int]]  # (start, end) char spans to never touch


class PreprocessingPipeline:
"""Orchestrates all pre-processing stages: spell correction, parsing, NER, readability."""
def __init__(self, model_name: str = "en_core_web_trf"):
# Load spaCy model with fallback
try:
self.nlp = spacy.load(model_name)
except OSError:
logger.warning(f"spaCy model '{model_name}' not found, falling back to 'en_core_web_sm'")
self.nlp = spacy.load("en_core_web_sm")
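            # NOTE: the fallback still assumes the small model is installed,
            # e.g. via `python -m spacy download en_core_web_sm`.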
# Initialise spell corrector
self.spell_corrector = DyslexiaAwareSpellCorrector()
        logger.info("PreprocessingPipeline initialised")

def _extract_readability(self, text: str) -> Dict[str, float]:
"""Compute readability scores (Flesch-Kincaid, Gunning Fog, etc.)."""
        scorers = {
            "flesch_kincaid_grade": textstat.flesch_kincaid_grade,
            "gunning_fog": textstat.gunning_fog,
            "smog_index": textstat.smog_index,
            "automated_readability_index": textstat.automated_readability_index,
            "flesch_reading_ease": textstat.flesch_reading_ease,
            "coleman_liau_index": textstat.coleman_liau_index,
        }
        # Empty input: return zeroed scores rather than passing it to textstat
        if not text or not text.strip():
            return {name: 0.0 for name in scorers}
        return {name: scorer(text) for name, scorer in scorers.items()}

def _extract_dep_tree(self, sent) -> Dict:
"""Extract grammatical skeleton: subject-verb-object per sentence."""
subjects = []
verbs = []
objects = []
for token in sent:
if token.dep_ in ("nsubj", "nsubjpass"):
subjects.append(token.text)
if token.head.pos_ == "VERB":
verbs.append(token.head.text)
elif token.dep_ in ("dobj", "pobj", "attr"):
objects.append(token.text)
return {
"sentence": sent.text,
"subjects": subjects,
"verbs": list(dict.fromkeys(verbs)),
"objects": objects,
"root": sent.root.text if sent.root else "",
        }

def process(self, raw_text: str) -> PreprocessedDoc:
"""Run full pre-processing pipeline on raw text.
7-step pipeline:
1. Spell correction (phonetic + spellcheck + grammar)
2. spaCy parsing
3. Sentence segmentation
4. Named entity recognition
5. Dependency tree extraction
6. POS tagging
7. Readability scoring
"""
if not raw_text or not raw_text.strip():
return PreprocessedDoc(
                original_text=raw_text or "",
                corrected_text=raw_text or "",
sentences=[],
entities=[],
dependency_trees=[],
pos_tags=[],
readability=self._extract_readability(""),
sentence_lengths=[],
protected_spans=[],
)
# Step 1: Spell correction
corrected = self.spell_corrector.correct(raw_text)
# Step 2: Parse corrected text with spaCy
doc = self.nlp(corrected)
        # Step 3: sentence segmentation (whitespace-only sentences are dropped
        # so that sentences, dependency_trees, and pos_tags stay index-aligned)
        spans = [sent for sent in doc.sents if sent.text.strip()]
        sentences = [sent.text.strip() for sent in spans]
        # Step 4: NER — extract entities and the spans the rewriter must never touch
        entities: List[EntitySpan] = []
        protected_spans: List[Tuple[int, int]] = []
        for ent in doc.ents:
            entities.append(EntitySpan(
                text=ent.text,
                label=ent.label_,
                start_char=ent.start_char,
                end_char=ent.end_char,
            ))
            protected_spans.append((ent.start_char, ent.end_char))
        # Step 5: grammatical skeleton (subject/verb/object) per sentence
        dependency_trees = [self._extract_dep_tree(sent) for sent in spans]
        # Step 6: (token, POS) tags per sentence
        pos_tags = [[(token.text, token.pos_) for token in sent] for sent in spans]
# Step 7: Readability
readability = self._extract_readability(corrected)
# Sentence lengths
sentence_lengths = [len(s.split()) for s in sentences]
return PreprocessedDoc(
original_text=raw_text,
corrected_text=corrected,
sentences=sentences,
entities=entities,
dependency_trees=dependency_trees,
pos_tags=pos_tags,
readability=readability,
sentence_lengths=sentence_lengths,
protected_spans=protected_spans,
)
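

# Minimal usage sketch (assumes the spaCy English model and the sibling
# spell_corrector module are importable; the sample text is illustrative):
if __name__ == "__main__":
    pipeline = PreprocessingPipeline()
    result = pipeline.process("Teh quick brown fox jumpt over the lazy dog in Paris.")
    print("Corrected:", result.corrected_text)
    print("Sentences:", result.sentences)
    print("Entities:", [(e.text, e.label) for e in result.entities])
    print("FK grade:", result.readability["flesch_kincaid_grade"])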