"""
Master pre-processing pipeline. Runs all NLP stages in sequence.
Returns a PreprocessedDoc object with all annotations attached.
"""

from dataclasses import dataclass
from typing import Dict, List, Tuple

import spacy
import textstat
from loguru import logger

from .spell_corrector import DyslexiaAwareSpellCorrector


@dataclass
class EntitySpan:
    text: str
    label: str
    start_char: int
    end_char: int


@dataclass
class PreprocessedDoc:
    original_text: str
    corrected_text: str
    sentences: List[str]
    entities: List[EntitySpan]              # Never to be modified by rewriter
    dependency_trees: List[Dict]            # Grammatical skeletons per sentence
    pos_tags: List[List[Tuple[str, str]]]   # (token, POS) per sentence
    readability: Dict[str, float]           # Flesch-Kincaid, Gunning Fog, etc.
    sentence_lengths: List[int]
    protected_spans: List[Tuple[int, int]]  # (start, end) char spans to never touch


class PreprocessingPipeline:
    """Orchestrates all pre-processing stages: spell correction, parsing, NER, readability."""

    def __init__(self, model_name: str = "en_core_web_trf"):
        # Load spaCy model with fallback
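        # (en_core_web_trf is the transformer pipeline: more accurate but
        # slower and heavier; spacy.load raises OSError when the requested
        # model has not been downloaded.)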
        try:
            self.nlp = spacy.load(model_name)
        except OSError:
            logger.warning(f"spaCy model '{model_name}' not found, falling back to 'en_core_web_sm'")
            self.nlp = spacy.load("en_core_web_sm")

        # Initialise spell corrector
        self.spell_corrector = DyslexiaAwareSpellCorrector()
        logger.info("PreprocessingPipeline initialised")

    def _extract_readability(self, text: str) -> Dict[str, float]:
        """Compute readability scores (Flesch-Kincaid, Gunning Fog, etc.)."""
        if not text or not text.strip():
            return {
                "flesch_kincaid_grade": 0.0,
                "gunning_fog": 0.0,
                "smog_index": 0.0,
                "automated_readability_index": 0.0,
                "flesch_reading_ease": 0.0,
                "coleman_liau_index": 0.0,
            }
        return {
            "flesch_kincaid_grade": textstat.flesch_kincaid_grade(text),
            "gunning_fog": textstat.gunning_fog(text),
            "smog_index": textstat.smog_index(text),
            "automated_readability_index": textstat.automated_readability_index(text),
            "flesch_reading_ease": textstat.flesch_reading_ease(text),
            "coleman_liau_index": textstat.coleman_liau_index(text),
        }

    def _extract_dep_tree(self, sent) -> Dict:
        """Extract grammatical skeleton: subject-verb-object per sentence."""
        subjects = []
        verbs = []
        objects = []
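        # Verbs are collected via their subject dependents, so a verb is
        # only recorded when it governs an explicit subject; dict.fromkeys
        # below dedupes the verb list while preserving order.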
        for token in sent:
            if token.dep_ in ("nsubj", "nsubjpass"):
                subjects.append(token.text)
                if token.head.pos_ == "VERB":
                    verbs.append(token.head.text)
            elif token.dep_ in ("dobj", "pobj", "attr"):
                objects.append(token.text)
        return {
            "sentence": sent.text,
            "subjects": subjects,
            "verbs": list(dict.fromkeys(verbs)),
            "objects": objects,
            "root": sent.root.text if sent.root else "",
        }

    def process(self, raw_text: str) -> PreprocessedDoc:
        """Run full pre-processing pipeline on raw text.

        7-step pipeline:
        1. Spell correction (phonetic + spellcheck + grammar)
        2. spaCy parsing
        3. Sentence segmentation
        4. Named entity recognition
        5. Dependency tree extraction
        6. POS tagging
        7. Readability scoring
        """
        if not raw_text or not raw_text.strip():
            return PreprocessedDoc(
                original_text=raw_text or "",
                corrected_text=raw_text or "",
                sentences=[],
                entities=[],
                dependency_trees=[],
                pos_tags=[],
                readability=self._extract_readability(""),
                sentence_lengths=[],
                protected_spans=[],
            )

        # Step 1: Spell correction
        corrected = self.spell_corrector.correct(raw_text)

        # Step 2: Parse corrected text with spaCy
        doc = self.nlp(corrected)

        # Step 3: Sentence segmentation. doc.sents is a generator, so
        # materialise it once; whitespace-only spans are dropped up front
        # so every per-sentence list below stays index-aligned.
        sents = [sent for sent in doc.sents if sent.text.strip()]
        sentences = [sent.text.strip() for sent in sents]

        # Step 4: NER — extract entities and protected spans
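        # Character offsets index into the *corrected* text (the doc was
        # built from `corrected`), so they may not line up with raw_text.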
        entities = []
        protected_spans = []
        for ent in doc.ents:
            entities.append(EntitySpan(
                text=ent.text,
                label=ent.label_,
                start_char=ent.start_char,
                end_char=ent.end_char,
            ))
            protected_spans.append((ent.start_char, ent.end_char))

        # Step 5: Dependency trees per sentence
        dependency_trees = [self._extract_dep_tree(sent) for sent in sents]

        # Step 6: POS tags per sentence
        pos_tags = [[(token.text, token.pos_) for token in sent] for sent in sents]

        # Step 7: Readability
        readability = self._extract_readability(corrected)

        # Sentence lengths
        sentence_lengths = [len(s.split()) for s in sentences]

        return PreprocessedDoc(
            original_text=raw_text,
            corrected_text=corrected,
            sentences=sentences,
            entities=entities,
            dependency_trees=dependency_trees,
            pos_tags=pos_tags,
            readability=readability,
            sentence_lengths=sentence_lengths,
            protected_spans=protected_spans,
        )
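

if __name__ == "__main__":
    # Minimal smoke test (illustrative sketch). Because of the relative
    # import above, run this as a module from the package root, e.g.:
    #   python -m <your_package>.<this_module>
    # where <your_package> and <this_module> are placeholders for your layout.
    pipeline = PreprocessingPipeline()
    result = pipeline.process("Teh cat sat on teh mat in Paris. It was verry happy.")
    print("Corrected: ", result.corrected_text)
    print("Sentences: ", result.sentences)
    print("Entities:  ", [(e.label, e.text) for e in result.entities])
    print("Protected: ", result.protected_spans)
    print("FK grade:  ", result.readability["flesch_kincaid_grade"])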