# rewrite/src/preprocessing/sentence_segmenter.py
# morpheuslord's picture
# Add files using upload-large-folder tool
# 12fd5f2 verified
"""
Sentence segmentation module.
Uses spaCy's sentence boundary detection for accurate segmentation
of potentially malformed dyslectic text.
"""
import spacy
from typing import List
from loguru import logger
class SentenceSegmenter:
    """Split raw text into sentences with a spaCy pipeline.

    The pipeline named at construction time is loaded with ``spacy.load``;
    if that raises ``OSError`` the ``en_core_web_sm`` pipeline is loaded
    instead. The loaded pipeline is kept on ``self.nlp``.
    """

    def __init__(self, model_name: str = "en_core_web_trf"):
        """Load the spaCy pipeline used for segmentation.

        Args:
            model_name: Name passed to ``spacy.load``. Defaults to
                ``"en_core_web_trf"``.

        Raises:
            OSError: If loading ``model_name`` fails with ``OSError`` and
                loading the ``en_core_web_sm`` fallback fails as well
                (the fallback load is not guarded).
        """
        try:
            pipeline = spacy.load(model_name)
        except OSError:
            # Requested pipeline could not be loaded: warn and use the
            # small English pipeline instead.
            logger.warning(f"spaCy model '{model_name}' not found, falling back to 'en_core_web_sm'")
            pipeline = spacy.load("en_core_web_sm")
        self.nlp = pipeline
        # Report the name from the pipeline's own metadata, so the log shows
        # what was actually loaded rather than what was requested.
        logger.info(f"SentenceSegmenter loaded with model: {self.nlp.meta['name']}")

    def segment(self, text: str) -> List[str]:
        """Return the sentences found in ``text``.

        Args:
            text: Input text. Falsy or whitespace-only input yields an
                empty list without running the pipeline.

        Returns:
            The text of each span in the processed document's ``sents``,
            stripped of surrounding whitespace, in document order. Spans
            that are empty after stripping are omitted.
        """
        if not (text and text.strip()):
            return []

        sentences: List[str] = []
        # The pipeline receives the text exactly as given; only the
        # resulting sentence strings are trimmed.
        for span in self.nlp(text).sents:
            cleaned = span.text.strip()
            if cleaned:
                sentences.append(cleaned)
        return sentences