File size: 993 Bytes
12fd5f2 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 | """
Sentence segmentation module.
Uses spaCy's sentence boundary detection for accurate segmentation
of potentially malformed dyslectic text.
"""
import spacy
from typing import List
from loguru import logger
class SentenceSegmenter:
    """Split raw text into sentences with a spaCy pipeline.

    The pipeline is loaded once in the constructor and reused for every
    call to :meth:`segment`. The preferred model is tried first; if its
    package is not installed, a (typically smaller) fallback model is
    loaded instead.
    """

    # Component names that set sentence boundaries in stock spaCy pipelines.
    _BOUNDARY_PIPES = ("parser", "senter", "sentencizer")

    def __init__(
        self,
        model_name: str = "en_core_web_trf",
        *,
        fallback_model: str = "en_core_web_sm",
    ):
        """Load the spaCy pipeline used for segmentation.

        Args:
            model_name: Name of the preferred spaCy model package.
            fallback_model: Model loaded when ``model_name`` cannot be
                found. Defaults to ``"en_core_web_sm"``.

        Raises:
            OSError: If neither the preferred nor the fallback model can
                be loaded.
        """
        try:
            self.nlp = spacy.load(model_name)
        except OSError:
            if model_name == fallback_model:
                # Retrying the very same package would fail identically and
                # the "falling back" message would be misleading.
                raise
            logger.warning(
                f"spaCy model '{model_name}' not found, falling back to '{fallback_model}'"
            )
            self.nlp = spacy.load(fallback_model)

        # ``doc.sents`` raises when no component has set sentence boundaries
        # (e.g. a custom pipeline without a parser), so make sure one exists.
        if not any(self.nlp.has_pipe(name) for name in self._BOUNDARY_PIPES):
            logger.warning(
                "Loaded pipeline has no sentence boundary component, adding 'sentencizer'"
            )
            self.nlp.add_pipe("sentencizer")

        logger.info(f"SentenceSegmenter loaded with model: {self.nlp.meta['name']}")

    def segment(self, text: str) -> List[str]:
        """Split text into individual sentences.

        Args:
            text: The text to segment.

        Returns:
            The sentences in document order, each stripped of surrounding
            whitespace. Sentences that are empty after stripping are
            dropped. Empty or whitespace-only input yields ``[]`` without
            running the pipeline.
        """
        if not text or not text.strip():
            return []

        doc = self.nlp(text)
        # Strip once per sentence, then discard the ones that end up empty.
        stripped = (sent.text.strip() for sent in doc.sents)
        return [sentence for sentence in stripped if sentence]
|