""" Sentence Segmentation Module Handles splitting submission text into individual sentences for sentence-level categorization. """ import re from typing import List class SentenceSegmenter: """ Segments text into sentences using rule-based approach. Handles common cases in participatory planning submissions: - Standard sentence endings (. ! ?) - Abbreviations (Dr., Mr., etc.) - Numbered lists (1. Item, 2. Item) - Bullet points """ # Common abbreviations that shouldn't trigger sentence breaks ABBREVIATIONS = { 'Dr', 'Mr', 'Mrs', 'Ms', 'Jr', 'Sr', 'vs', 'etc', 'e.g', 'i.e', 'St', 'Ave', 'Blvd', 'Rd', 'No', 'Vol', 'Fig', 'Inc', 'Ltd', 'Co' } def __init__(self): # Build abbreviation pattern abbrev_pattern = '|'.join([re.escape(a) for a in self.ABBREVIATIONS]) self.abbrev_re = re.compile(f'\\b({abbrev_pattern})\\.', re.IGNORECASE) def segment(self, text: str) -> List[str]: """ Segment text into sentences. Args: text: Input text to segment Returns: List of sentence strings """ if not text or not text.strip(): return [] # Normalize whitespace text = ' '.join(text.split()) # Protect abbreviations temporarily text = self.abbrev_re.sub(r'\1', text) # Split on sentence-ending punctuation # Pattern: period/question/exclamation followed by space and capital letter # OR at end of string sentences = re.split(r'(?<=[.!?])\s+(?=[A-Z])|(?<=[.!?])$', text) # Restore abbreviations sentences = [s.replace('', '.') for s in sentences] # Clean and filter sentences = [self._clean_sentence(s) for s in sentences] sentences = [s for s in sentences if s] # Remove empty return sentences def _clean_sentence(self, sentence: str) -> str: """Clean individual sentence""" # Remove leading/trailing whitespace sentence = sentence.strip() # Remove leading bullet points or numbers sentence = re.sub(r'^[\d\-•\*]+[\.)]\s*', '', sentence) return sentence def segment_submission(text: str) -> List[str]: """ Convenience function to segment a submission into sentences. Args: text: Submission text Returns: List of sentences """ segmenter = SentenceSegmenter() return segmenter.segment(text)