participatory-planner / app /sentence_segmenter.py
thadillo
Add advanced training features and HF deployment guide
00aacad
raw
history blame
2.5 kB
"""
Sentence Segmentation Module
Handles splitting submission text into individual sentences for
sentence-level categorization.
"""
import re
from typing import List
class SentenceSegmenter:
    """
    Segments text into sentences using a rule-based approach.

    Handles common cases in participatory planning submissions:
    - Standard sentence endings (. ! ?) followed by whitespace and an
      ASCII capital letter
    - Abbreviations (Dr., Mr., etc.), which do not end a sentence
    - Numbered lists (1. Item, 2) Item), one item per line or inline
    - Bullet points (-, *, •), one item per line

    Known limitation: an abbreviation that really does end a sentence
    ("... benches, etc. We also ...") is not treated as a boundary.
    """

    # Common abbreviations that shouldn't trigger sentence breaks
    ABBREVIATIONS = {
        'Dr', 'Mr', 'Mrs', 'Ms', 'Jr', 'Sr', 'vs', 'etc', 'e.g', 'i.e',
        'St', 'Ave', 'Blvd', 'Rd', 'No', 'Vol', 'Fig', 'Inc', 'Ltd', 'Co'
    }

    # Abbreviations (lower-cased) that are also ordinary words. They are
    # only protected when a number follows ("No. 5"), so that a plain
    # "No. We disagree." is still split into two sentences.
    _NUMBER_ONLY_ABBREVIATIONS = frozenset({'no'})

    # Stand-in for the period of a protected abbreviation
    _PLACEHOLDER = '<ABB>'

    # Sentence boundary: . ! or ? followed by whitespace and then either
    # a capital letter or an inline list number such as "2. " / "2) "
    _BOUNDARY_RE = re.compile(r'(?<=[.!?])\s+(?=[A-Z]|\d+[.)]\s)')

    # Line break directly followed by a list marker: each such line is
    # its own item and must not be merged with the previous line
    _LIST_ITEM_BREAK_RE = re.compile(r'[\r\n]\s*(?=•|[-*]\s|\d+[.)]\s)')

    # Leading bullet or list number. "-" and "*" need trailing whitespace
    # (so "-5 degrees" survives) and a number marker must not be followed
    # by a digit (so "3.5 million" survives).
    _LEADING_MARKER_RE = re.compile(
        r'^(?:•\s*|[-*](?:\s+|$)|\d+[.)](?!\d)\s*)'
    )

    def __init__(self):
        # Sets are unordered; sort (longest first) for a deterministic pattern
        ordered = sorted(self.ABBREVIATIONS, key=lambda a: (-len(a), a))
        alternatives = [
            re.escape(abbrev) + (
                r'(?=\.\s*\d)'
                if abbrev.lower() in self._NUMBER_ONLY_ABBREVIATIONS
                else ''
            )
            for abbrev in ordered
        ]
        # '(?!)' never matches, so an empty abbreviation set protects nothing
        alternation = '|'.join(alternatives) or r'(?!)'
        self.abbrev_re = re.compile(rf'\b({alternation})\.', re.IGNORECASE)

    def segment(self, text: str) -> List[str]:
        """
        Segment text into sentences.

        The text is first cut at line breaks that introduce a list item.
        Each resulting chunk is whitespace-normalized and split at
        sentence-ending punctuation, with abbreviation periods protected
        during the split. List markers are stripped and empty results
        are dropped.

        Args:
            text: Input text to segment

        Returns:
            List of sentence strings (empty for empty or blank input)
        """
        if not text or not text.strip():
            return []

        sentences = []
        for chunk in self._LIST_ITEM_BREAK_RE.split(text):
            # Normalize whitespace (also joins hard-wrapped lines)
            chunk = ' '.join(chunk.split())
            # Protect abbreviations temporarily
            protected = self.abbrev_re.sub(r'\1' + self._PLACEHOLDER, chunk)
            for piece in self._BOUNDARY_RE.split(protected):
                # Restore abbreviations, then clean and filter
                sentence = self._clean_sentence(
                    piece.replace(self._PLACEHOLDER, '.')
                )
                if sentence:
                    sentences.append(sentence)
        return sentences

    def _clean_sentence(self, sentence: str) -> str:
        """Strip surrounding whitespace and one leading bullet or list number."""
        return self._LEADING_MARKER_RE.sub('', sentence.strip())
def segment_submission(text: str) -> List[str]:
    """
    Segment a submission into sentences using a freshly built
    SentenceSegmenter.

    Args:
        text: Submission text

    Returns:
        List of sentences
    """
    return SentenceSegmenter().segment(text)