""" Sentence segmentation module. Uses spaCy's sentence boundary detection for accurate segmentation of potentially malformed dyslectic text. """ import spacy from typing import List from loguru import logger class SentenceSegmenter: """Segments text into sentences using spaCy's transformer model.""" def __init__(self, model_name: str = "en_core_web_trf"): try: self.nlp = spacy.load(model_name) except OSError: logger.warning(f"spaCy model '{model_name}' not found, falling back to 'en_core_web_sm'") self.nlp = spacy.load("en_core_web_sm") logger.info(f"SentenceSegmenter loaded with model: {self.nlp.meta['name']}") def segment(self, text: str) -> List[str]: """Split text into individual sentences.""" if not text or not text.strip(): return [] doc = self.nlp(text) sentences = [sent.text.strip() for sent in doc.sents if sent.text.strip()] return sentences