Spaces:

Thadillo
/

participatory-planner

Sleeping

participatory-planner / app /sentence_segmenter.py

thadillo

Add advanced training features and HF deployment guide

00aacad 2 months ago

2.5 kB

	"""
	Sentence Segmentation Module

	Handles splitting submission text into individual sentences for
	sentence-level categorization.
	"""

	import re
	from typing import List


	class SentenceSegmenter:
	    """
	    Segments text into sentences using a rule-based approach.

	    Handles common cases in participatory planning submissions:
	    - Standard sentence endings (. ! ?)
	    - Abbreviations (Dr., Mr., etc.)
	    - Numbered lists (1. Item, 2. Item)
	    - Bullet points

	    Known limitation: abbreviations are matched case-insensitively, so a
	    sentence that ends in a listed word (for example "no." or "etc.") is
	    not split from the sentence that follows it.
	    """

	    # Common abbreviations that shouldn't trigger sentence breaks
	    ABBREVIATIONS = {
	        'Dr', 'Mr', 'Mrs', 'Ms', 'Jr', 'Sr', 'vs', 'etc', 'e.g', 'i.e',
	        'St', 'Ave', 'Blvd', 'Rd', 'No', 'Vol', 'Fig', 'Inc', 'Ltd', 'Co',
	    }

	    # Stand-in for the period of a protected abbreviation. A NUL character
	    # is used because it is not whitespace, is not sentence punctuation and
	    # is very unlikely to occur in submitted text (a literal NUL in the
	    # input would be turned into a period when abbreviations are restored).
	    _PLACEHOLDER = '\x00'

	    # A list marker: "1." / "2)" style numbering or a "-", "•", "*" bullet.
	    _LIST_MARKER = r'(?:\d+[.)]|[-•*])'

	    # Sentence boundary: whitespace that follows . ! ? and precedes either
	    # a capital letter or a list marker. Trailing punctuation at the very
	    # end of the text needs no split point of its own.
	    _SPLIT_RE = re.compile(
	        r'(?<=[.!?])\s+(?=[A-Z]|' + _LIST_MARKER + r'\s)'
	    )

	    # A list marker at the start of a sentence. It must be followed by
	    # whitespace or the end of the string so that text such as
	    # "1.5 km" or "-5 degrees" is left alone.
	    _LEADING_MARKER_RE = re.compile(r'^' + _LIST_MARKER + r'(?:\s+|$)')

	    def __init__(self):
	        # Build abbreviation pattern. The entries are sorted so the compiled
	        # pattern does not depend on set iteration order, and joined with a
	        # real alternation bar so each abbreviation can match on its own.
	        alternatives = '|'.join(
	            re.escape(abbrev) for abbrev in sorted(self.ABBREVIATIONS)
	        )
	        self.abbrev_re = re.compile(
	            r'\b(' + alternatives + r')\.', re.IGNORECASE
	        )

	    def segment(self, text: str) -> List[str]:
	        """
	        Segment text into sentences.

	        Args:
	            text: Input text to segment

	        Returns:
	            List of sentence strings with leading list markers removed.
	            Empty or whitespace-only input yields an empty list.
	        """
	        if not text or not text.strip():
	            return []

	        # Normalize whitespace (newlines and runs of spaces become one space)
	        text = ' '.join(text.split())

	        # Protect abbreviations temporarily: hide their period so the
	        # split pattern does not treat it as a sentence end.
	        text = self.abbrev_re.sub(
	            lambda match: match.group(1) + self._PLACEHOLDER, text
	        )

	        # Split on sentence-ending punctuation
	        pieces = self._SPLIT_RE.split(text)

	        # Restore abbreviations
	        pieces = [piece.replace(self._PLACEHOLDER, '.') for piece in pieces]

	        # Clean and filter; fragments that were only a list marker
	        # (e.g. "1.") clean down to an empty string and are dropped here.
	        cleaned = [self._clean_sentence(piece) for piece in pieces]
	        return [sentence for sentence in cleaned if sentence]

	    def _clean_sentence(self, sentence: str) -> str:
	        """
	        Clean an individual sentence.

	        Args:
	            sentence: Raw sentence fragment

	        Returns:
	            The fragment with surrounding whitespace and one leading
	            bullet or list number removed.
	        """
	        # Remove leading/trailing whitespace
	        sentence = sentence.strip()

	        # Remove leading bullet points or numbers
	        return self._LEADING_MARKER_RE.sub('', sentence)


	def segment_submission(text: str) -> List[str]:
	    """
	    Split a submission into its sentences.

	    Builds a fresh SentenceSegmenter and returns whatever its
	    ``segment`` method produces for the given text.

	    Args:
	        text: Submission text

	    Returns:
	        List of sentences
	    """
	    return SentenceSegmenter().segment(text)