rewrite / src /preprocessing /sentence_segmenter.py

Add files using upload-large-folder tool

12fd5f2 verified 7 days ago

993 Bytes

	"""
	Sentence segmentation module.
	Uses spaCy's sentence boundary detection for accurate segmentation
	of potentially malformed dyslectic text.
	"""

	import spacy
	from typing import List
	from loguru import logger


	class SentenceSegmenter:
	    """Segment text into sentences using a spaCy pipeline.

	    The pipeline named by ``model_name`` is loaded once in the constructor.
	    If loading it raises ``OSError``, the ``en_core_web_sm`` model is loaded
	    in its place, so the active model is not necessarily the requested one.
	    """

	    # Model loaded when the requested one cannot be loaded.
	    _FALLBACK_MODEL = "en_core_web_sm"

	    def __init__(self, model_name: str = "en_core_web_trf"):
	        """Load the spaCy pipeline used for segmentation.

	        Args:
	            model_name: Name of the spaCy model to load.

	        Raises:
	            OSError: If ``model_name`` cannot be loaded and the fallback
	                model cannot be loaded either (or ``model_name`` already
	                is the fallback model).
	        """
	        try:
	            self.nlp = spacy.load(model_name)
	        except OSError:
	            # Retrying the very same model cannot succeed and the
	            # "falling back" warning would be misleading, so propagate
	            # the original error in that case.
	            if model_name == self._FALLBACK_MODEL:
	                raise
	            logger.warning(
	                f"spaCy model '{model_name}' not found, "
	                f"falling back to '{self._FALLBACK_MODEL}'"
	            )
	            self.nlp = spacy.load(self._FALLBACK_MODEL)
	        logger.info(f"SentenceSegmenter loaded with model: {self.nlp.meta['name']}")

	    def segment(self, text: str) -> List[str]:
	        """Split text into individual sentences.

	        Args:
	            text: Text to segment.

	        Returns:
	            The sentences in the order the pipeline yields them, each
	            stripped of surrounding whitespace; sentences that are empty
	            after stripping are dropped. An empty list is returned when
	            ``text`` is falsy or contains only whitespace.
	        """
	        if not text or not text.strip():
	            return []
	        doc = self.nlp(text)
	        # Strip each sentence once, then discard the ones left empty.
	        stripped = (sent.text.strip() for sent in doc.sents)
	        return [sentence for sentence in stripped if sentence]