File size: 993 Bytes
12fd5f2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
"""
Sentence segmentation module.
Uses spaCy's sentence boundary detection for accurate segmentation
of potentially malformed dyslectic text.
"""

import spacy
from typing import List
from loguru import logger


class SentenceSegmenter:
    """Split raw text into sentences with a spaCy pipeline.

    The pipeline is loaded once at construction time and reused by every
    call to :meth:`segment`.
    """

    def __init__(self, model_name: str = "en_core_web_trf"):
        """Load the spaCy pipeline used for sentence boundary detection.

        Args:
            model_name: Name of the spaCy model to load. If loading it
                raises ``OSError``, a warning is logged and
                ``en_core_web_sm`` is loaded in its place.

        Any error raised while loading the fallback model is not handled
        here and propagates to the caller.
        """
        try:
            pipeline = spacy.load(model_name)
        except OSError:
            logger.warning(f"spaCy model '{model_name}' not found, falling back to 'en_core_web_sm'")
            pipeline = spacy.load("en_core_web_sm")

        self.nlp = pipeline
        # Log the name reported by the pipeline itself, so the message
        # reflects the fallback when it was used.
        logger.info(f"SentenceSegmenter loaded with model: {self.nlp.meta['name']}")

    def segment(self, text: str) -> List[str]:
        """Return the sentences found in ``text``.

        Args:
            text: The text to segment.

        Returns:
            The sentences in document order, each stripped of surrounding
            whitespace. Sentences that are empty after stripping are
            dropped. Empty or whitespace-only input yields ``[]`` without
            running the pipeline.
        """
        # Guard clause: nothing to segment, so skip the pipeline entirely.
        if not (text and text.strip()):
            return []

        parsed = self.nlp(text)
        trimmed = (span.text.strip() for span in parsed.sents)
        return [sentence for sentence in trimmed if sentence]