"""
Master pre-processing pipeline. Runs all NLP stages in sequence.
Returns a PreprocessedDoc object with all annotations attached.
"""

from dataclasses import dataclass
from typing import Dict, List, Tuple

import spacy
import textstat
from loguru import logger

from .spell_corrector import DyslexiaAwareSpellCorrector


@dataclass
class EntitySpan:
    text: str
    label: str
    start_char: int
    end_char: int


@dataclass
class PreprocessedDoc:
    original_text: str
    corrected_text: str
    sentences: List[str]
    entities: List[EntitySpan]              # Never to be modified by rewriter
    dependency_trees: List[Dict]            # Grammatical skeletons per sentence
    pos_tags: List[List[Tuple[str, str]]]   # (token, POS) per sentence
    readability: Dict[str, float]           # Flesch-Kincaid, Gunning Fog, etc.
    sentence_lengths: List[int]
    protected_spans: List[Tuple[int, int]]  # (start, end) char spans to never touch


class PreprocessingPipeline:
    """Orchestrates all pre-processing stages: spell correction, parsing, NER, readability."""

    def __init__(self, model_name: str = "en_core_web_trf"):
        # Load spaCy model with fallback
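        # (en_core_web_trf is the transformer pipeline: more accurate but
        # slower and heavier; spacy.load raises OSError when the requested
        # model has not been downloaded.)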
        try:
            self.nlp = spacy.load(model_name)
        except OSError:
            logger.warning(f"spaCy model '{model_name}' not found, falling back to 'en_core_web_sm'")
            self.nlp = spacy.load("en_core_web_sm")

        # Initialise spell corrector
        self.spell_corrector = DyslexiaAwareSpellCorrector()
        logger.info("PreprocessingPipeline initialised")

    def _extract_readability(self, text: str) -> Dict[str, float]:
        """Compute readability scores (Flesch-Kincaid, Gunning Fog, etc.)."""
        if not text or not text.strip():
            return {
                "flesch_kincaid_grade": 0.0,
                "gunning_fog": 0.0,
                "smog_index": 0.0,
                "automated_readability_index": 0.0,
                "flesch_reading_ease": 0.0,
                "coleman_liau_index": 0.0,
            }
        return {
            "flesch_kincaid_grade": textstat.flesch_kincaid_grade(text),
            "gunning_fog": textstat.gunning_fog(text),
            "smog_index": textstat.smog_index(text),
            "automated_readability_index": textstat.automated_readability_index(text),
            "flesch_reading_ease": textstat.flesch_reading_ease(text),
            "coleman_liau_index": textstat.coleman_liau_index(text),
        }

    def _extract_dep_tree(self, sent) -> Dict:
        """Extract grammatical skeleton: subject-verb-object per sentence."""
        subjects = []
        verbs = []
        objects = []
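        # Verbs are collected via their subject dependents, so a verb is
        # only recorded when it governs an explicit subject; dict.fromkeys
        # below dedupes the verb list while preserving order.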
        for token in sent:
            if token.dep_ in ("nsubj", "nsubjpass"):
                subjects.append(token.text)
                if token.head.pos_ == "VERB":
                    verbs.append(token.head.text)
            elif token.dep_ in ("dobj", "pobj", "attr"):
                objects.append(token.text)
        return {
            "sentence": sent.text,
            "subjects": subjects,
            "verbs": list(dict.fromkeys(verbs)),
            "objects": objects,
            "root": sent.root.text if sent.root else "",
        }

    def process(self, raw_text: str) -> PreprocessedDoc:
        """Run full pre-processing pipeline on raw text.

        7-step pipeline:
        1. Spell correction (phonetic + spellcheck + grammar)
        2. spaCy parsing
        3. Sentence segmentation
        4. Named entity recognition
        5. Dependency tree extraction
        6. POS tagging
        7. Readability scoring
        """
        if not raw_text or not raw_text.strip():
            return PreprocessedDoc(
                original_text=raw_text or "",
                corrected_text=raw_text or "",
                sentences=[],
                entities=[],
                dependency_trees=[],
                pos_tags=[],
                readability=self._extract_readability(""),
                sentence_lengths=[],
                protected_spans=[],
            )

        # Step 1: Spell correction
        corrected = self.spell_corrector.correct(raw_text)

        # Step 2: Parse corrected text with spaCy
        doc = self.nlp(corrected)

        # Step 3: Sentence segmentation. doc.sents is a generator, so
        # materialise it once; whitespace-only spans are dropped up front
        # so every per-sentence list below stays index-aligned.
        sents = [sent for sent in doc.sents if sent.text.strip()]
        sentences = [sent.text.strip() for sent in sents]

        # Step 4: NER — extract entities and protected spans
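        # Character offsets index into the *corrected* text (the doc was
        # built from `corrected`), so they may not line up with raw_text.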
        entities = []
        protected_spans = []
        for ent in doc.ents:
            entities.append(EntitySpan(
                text=ent.text,
                label=ent.label_,
                start_char=ent.start_char,
                end_char=ent.end_char,
            ))
            protected_spans.append((ent.start_char, ent.end_char))

        # Step 5: Dependency trees per sentence
        dependency_trees = [self._extract_dep_tree(sent) for sent in sents]

        # Step 6: POS tags per sentence
        pos_tags = [[(token.text, token.pos_) for token in sent] for sent in sents]

        # Step 7: Readability
        readability = self._extract_readability(corrected)

        # Sentence lengths
        sentence_lengths = [len(s.split()) for s in sentences]

        return PreprocessedDoc(
            original_text=raw_text,
            corrected_text=corrected,
            sentences=sentences,
            entities=entities,
            dependency_trees=dependency_trees,
            pos_tags=pos_tags,
            readability=readability,
            sentence_lengths=sentence_lengths,
            protected_spans=protected_spans,
        )
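

if __name__ == "__main__":
    # Minimal smoke test (illustrative sketch). Because of the relative
    # import above, run this as a module from the package root, e.g.:
    #   python -m <your_package>.<this_module>
    # where <your_package> and <this_module> are placeholders for your layout.
    pipeline = PreprocessingPipeline()
    result = pipeline.process("Teh cat sat on teh mat in Paris. It was verry happy.")
    print("Corrected: ", result.corrected_text)
    print("Sentences: ", result.sentences)
    print("Entities:  ", [(e.label, e.text) for e in result.entities])
    print("Protected: ", result.protected_spans)
    print("FK grade:  ", result.readability["flesch_kincaid_grade"])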