Spaces:
Sleeping
Sleeping
| """ | |
| Sentence Segmentation Module | |
| Handles splitting submission text into individual sentences for | |
| sentence-level categorization. | |
| """ | |
| import re | |
| from typing import List | |
class SentenceSegmenter:
    """
    Segments text into sentences using a rule-based approach.

    A sentence boundary is a '.', '!' or '?' followed by whitespace and an
    ASCII capital letter. On top of that rule the segmenter handles:
    - Abbreviations (Dr., Mr., e.g., ...): their period never ends a sentence
    - Numbered list markers ("1.", "2)"): stripped from the start of a segment
    - Bullet markers ("-", "•", "*"): stripped from the start of a segment

    All whitespace (including newlines) is collapsed to single spaces before
    splitting, so list items are only separated from one another where
    ordinary sentence punctuation separates them.
    """

    # Common abbreviations that shouldn't trigger sentence breaks
    ABBREVIATIONS = {
        'Dr', 'Mr', 'Mrs', 'Ms', 'Jr', 'Sr', 'vs', 'etc', 'e.g', 'i.e',
        'St', 'Ave', 'Blvd', 'Rd', 'No', 'Vol', 'Fig', 'Inc', 'Ltd', 'Co'
    }

    # Entries of ABBREVIATIONS that are also everyday words. Abbreviations are
    # matched case-insensitively, so shielding "No." would also shield the
    # word "no." at the end of a sentence and glue it to the next sentence.
    # Nothing is lost by skipping it: "No." as an abbreviation is followed by
    # a number, and a digit never starts a new sentence under the split rule.
    _ORDINARY_WORDS = frozenset({'no'})

    # Temporary stand-in for the period of a shielded abbreviation
    _PLACEHOLDER = '<ABB>'

    # Sentence-ending punctuation, then whitespace, then a capital letter
    _BOUNDARY_RE = re.compile(r'(?<=[.!?])\s+(?=[A-Z])')

    # Leading list marker. First alternative: digits/bullets closed by '.' or
    # ')' ("1.", "2)", "*)"), but not when a digit follows, so decimals such
    # as "3.5" keep their integer part. Second alternative: a bare bullet
    # followed by whitespace ("- item", "• item").
    _LIST_MARKER_RE = re.compile(r'^(?:[\d\-•*]+[.)](?!\d)\s*|[-•*]+\s+)')

    def __init__(self):
        # Build abbreviation pattern; sorted so the compiled pattern does not
        # depend on set iteration order
        protected = sorted(
            a for a in self.ABBREVIATIONS
            if a.lower() not in self._ORDINARY_WORDS
        )
        abbrev_pattern = '|'.join(re.escape(a) for a in protected)
        self.abbrev_re = re.compile(rf'\b({abbrev_pattern})\.', re.IGNORECASE)

    def segment(self, text: str) -> List[str]:
        """
        Segment text into sentences.

        Args:
            text: Input text to segment

        Returns:
            List of non-empty sentence strings, in order of appearance.
            Empty or whitespace-only input yields an empty list.
        """
        if not text or not text.strip():
            return []

        # Normalize whitespace (newlines and tabs become single spaces)
        text = ' '.join(text.split())

        # Protect abbreviations temporarily: with the period replaced, the
        # boundary rule below cannot fire right after them
        text = self.abbrev_re.sub(
            lambda m: m.group(1) + self._PLACEHOLDER, text
        )

        # Split on sentence-ending punctuation followed by a capital letter
        pieces = self._BOUNDARY_RE.split(text)

        sentences = []
        for piece in pieces:
            # Restore abbreviations, then clean
            restored = piece.replace(self._PLACEHOLDER, '.')
            cleaned = self._clean_sentence(restored)
            if cleaned:  # Drop segments that were only a list marker
                sentences.append(cleaned)
        return sentences

    def _clean_sentence(self, sentence: str) -> str:
        """
        Clean an individual sentence.

        Strips surrounding whitespace and one leading list marker (a number
        or bullet such as "1.", "2)", "-", "•", "*"). A leading decimal
        number like "3.5" is left intact.

        Args:
            sentence: Raw segment produced by the splitter

        Returns:
            The cleaned sentence; may be empty if only a marker was present
        """
        # Remove leading/trailing whitespace
        sentence = sentence.strip()
        # Remove leading bullet points or numbers
        return self._LIST_MARKER_RE.sub('', sentence)
def segment_submission(text: str) -> List[str]:
    """
    Split a submission into sentences with a fresh SentenceSegmenter.

    Args:
        text: Submission text

    Returns:
        The list returned by SentenceSegmenter.segment for this text
    """
    return SentenceSegmenter().segment(text)