| """ |
| Master pre-processing pipeline. Runs all NLP stages in sequence. |
| Returns a PreprocessedDoc object with all annotations attached. |
| """ |
|
|
| import spacy |
| from dataclasses import dataclass, field |
| from typing import List, Dict, Any, Optional |
| from .spell_corrector import DyslexiaAwareSpellCorrector |
| import textstat |
| from loguru import logger |
|
|
|
|
@dataclass
class EntitySpan:
    text: str
    label: str
    start_char: int
    end_char: int


@dataclass
class PreprocessedDoc:
    original_text: str
    corrected_text: str
    sentences: List[str]
    entities: List[EntitySpan]
    dependency_trees: List[Dict]
    pos_tags: List[List[tuple]]
    readability: Dict[str, float]
    sentence_lengths: List[int]
    protected_spans: List[tuple]


class PreprocessingPipeline:
    """Orchestrates all pre-processing stages: spell correction, parsing, NER, readability."""

    def __init__(self, model_name: str = "en_core_web_trf"):
        # Prefer the transformer pipeline; fall back to the small English model
        # so the pipeline still loads in environments where it is not installed.
        try:
            self.nlp = spacy.load(model_name)
        except OSError:
            logger.warning(f"spaCy model '{model_name}' not found, falling back to 'en_core_web_sm'")
            self.nlp = spacy.load("en_core_web_sm")

        self.spell_corrector = DyslexiaAwareSpellCorrector()
        logger.info("PreprocessingPipeline initialised")

    def _extract_readability(self, text: str) -> Dict[str, float]:
        """Compute readability scores (Flesch-Kincaid, Gunning Fog, etc.)."""
        if not text or not text.strip():
            return {
                "flesch_kincaid_grade": 0.0,
                "gunning_fog": 0.0,
                "smog_index": 0.0,
                "automated_readability_index": 0.0,
                "flesch_reading_ease": 0.0,
                "coleman_liau_index": 0.0,
            }
        return {
            "flesch_kincaid_grade": textstat.flesch_kincaid_grade(text),
            "gunning_fog": textstat.gunning_fog(text),
            "smog_index": textstat.smog_index(text),
            "automated_readability_index": textstat.automated_readability_index(text),
            "flesch_reading_ease": textstat.flesch_reading_ease(text),
            "coleman_liau_index": textstat.coleman_liau_index(text),
        }

    def _extract_dep_tree(self, sent) -> Dict:
        """Extract the grammatical skeleton (subject-verb-object) of one sentence."""
        subjects = []
        verbs = []
        objects = []
        for token in sent:
            if token.dep_ in ("nsubj", "nsubjpass"):
                subjects.append(token.text)
                # The verb governing a subject is its syntactic head.
                if token.head.pos_ == "VERB":
                    verbs.append(token.head.text)
            elif token.dep_ in ("dobj", "pobj", "attr"):
                objects.append(token.text)
        return {
            "sentence": sent.text,
            "subjects": subjects,
            "verbs": list(dict.fromkeys(verbs)),  # de-duplicate, preserving order
            "objects": objects,
            "root": sent.root.text if sent.root else "",
        }

    def process(self, raw_text: str) -> PreprocessedDoc:
        """Run full pre-processing pipeline on raw text.

        7-step pipeline:
        1. Spell correction (phonetic + spellcheck + grammar)
        2. spaCy parsing
        3. Sentence segmentation
        4. Named entity recognition
        5. Dependency tree extraction
        6. POS tagging
        7. Readability scoring
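
        Example (illustrative only; requires the spaCy model and spell-corrector
        resources to be installed, and exact annotations vary by model):

            pipeline = PreprocessingPipeline()
            result = pipeline.process("Teh cat sat on the mat in London.")
            result.sentences                     # e.g. ["The cat sat on the mat in London."]
            [e.label for e in result.entities]   # e.g. ["GPE"]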
| """ |
| if not raw_text or not raw_text.strip(): |
| return PreprocessedDoc( |
| original_text=raw_text, |
| corrected_text=raw_text or "", |
| sentences=[], |
| entities=[], |
| dependency_trees=[], |
| pos_tags=[], |
| readability=self._extract_readability(""), |
| sentence_lengths=[], |
| protected_spans=[], |
| ) |

        # 1. Spell correction (dyslexia-aware: phonetic + spellcheck + grammar).
        corrected = self.spell_corrector.correct(raw_text)

        # 2. spaCy parsing of the corrected text.
        doc = self.nlp(corrected)

        # 3. Sentence segmentation (whitespace-only sentences are dropped).
        sentences = [sent.text.strip() for sent in doc.sents if sent.text.strip()]

        # 4. Named entity recognition; entity character offsets are also kept
        #    as protected spans.
        entities = []
        protected_spans = []
        for ent in doc.ents:
            entities.append(EntitySpan(
                text=ent.text,
                label=ent.label_,
                start_char=ent.start_char,
                end_char=ent.end_char,
            ))
            protected_spans.append((ent.start_char, ent.end_char))

        # 5. Dependency tree extraction (one SVO skeleton per sentence).
        dependency_trees = [self._extract_dep_tree(sent) for sent in doc.sents]

        # 6. POS tagging, grouped by sentence.
        pos_tags = [[(token.text, token.pos_) for token in sent] for sent in doc.sents]

        # 7. Readability scoring on the corrected text.
        readability = self._extract_readability(corrected)

        sentence_lengths = [len(s.split()) for s in sentences]

        return PreprocessedDoc(
            original_text=raw_text,
            corrected_text=corrected,
            sentences=sentences,
            entities=entities,
            dependency_trees=dependency_trees,
            pos_tags=pos_tags,
            readability=readability,
            sentence_lengths=sentence_lengths,
            protected_spans=protected_spans,
        )
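

if __name__ == "__main__":
    # Minimal smoke test (illustrative, not part of the pipeline API). It assumes
    # a spaCy English model and the spell-corrector's resources are installed,
    # and that the file is run as a module (python -m <package>.<this_module>)
    # so the relative import above resolves; adjust the path to your layout.
    demo = PreprocessingPipeline()
    result = demo.process("Teh experiment was run in Paris by the research team.")
    logger.info(f"Sentences: {result.sentences}")
    logger.info(f"Entities: {[(e.text, e.label) for e in result.entities]}")
    logger.info(f"Flesch-Kincaid grade: {result.readability['flesch_kincaid_grade']:.1f}")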