"""
Sentence segmentation module.

Uses spaCy's sentence boundary detection for accurate segmentation
of potentially malformed dyslexic text.
"""
| import spacy | |
| from typing import List | |
| from loguru import logger | |
class SentenceSegmenter:
    """Segments text into sentences using spaCy's sentence boundary detection.

    Tries to load a transformer pipeline for accuracy; if that model is not
    installed, falls back to the small English model.
    """

    def __init__(self, model_name: str = "en_core_web_trf"):
        """Load the spaCy pipeline used for segmentation.

        Args:
            model_name: Name of the spaCy model to load first.

        Raises:
            OSError: If neither *model_name* nor the 'en_core_web_sm'
                fallback model is installed.
        """
        try:
            self.nlp = spacy.load(model_name)
        except OSError:
            logger.warning(f"spaCy model '{model_name}' not found, falling back to 'en_core_web_sm'")
            try:
                self.nlp = spacy.load("en_core_web_sm")
            except OSError as err:
                # Chain the original failure so the caller sees both the
                # requested model and why the fallback also failed.
                raise OSError(
                    f"Neither '{model_name}' nor fallback 'en_core_web_sm' is installed. "
                    "Install one with: python -m spacy download en_core_web_sm"
                ) from err
        logger.info(f"SentenceSegmenter loaded with model: {self.nlp.meta['name']}")

    def segment(self, text: str) -> List[str]:
        """Split text into individual sentences.

        Args:
            text: Raw input text; may be empty, whitespace-only, or malformed.

        Returns:
            Whitespace-stripped, non-empty sentences in original order.
            Empty list for empty or whitespace-only input.
        """
        if not text or not text.strip():
            return []
        doc = self.nlp(text)
        return [sent.text.strip() for sent in doc.sents if sent.text.strip()]