"""
Text processing utilities for sentence-level categorization.

Handles sentence segmentation and text cleaning.
"""

import re
from typing import List
import logging

logger = logging.getLogger(__name__)

# Minimum number of whitespace-separated words for a usable sentence.
_MIN_WORDS = 3

# Characters treated as bullet markers at the start of a sentence.
_BULLET_CHARS = ('-', '•', '*')

# Fallback splitter: sentence punctuation followed by whitespace and a capital
# letter, or sentence punctuation at the very end of the text.
_SENTENCE_BOUNDARY_RE = re.compile(r'(?<=[.!?])\s+(?=[A-Z])|(?<=[.!?])$')

# Leading list decoration: bullet glyphs, or list numbering such as "1." / "2)".
# The (?!\d) guard keeps decimals like "3.5 percent" intact, and a bare leading
# number ("3 people left") is not matched at all because no "." / ")" follows.
_LEADING_MARKER_RE = re.compile(r'^\s*(?:[-•*]+\s*|\d+[.)](?!\d)\s*)+')

# A run of sentence terminators ("...", "?!") marks a single sentence end.
_TERMINATOR_RUN_RE = re.compile(r'[.!?]+')


class TextProcessor:
    """Handle sentence segmentation and text processing."""

    @staticmethod
    def segment_into_sentences(text: str) -> List[str]:
        """
        Break text into sentences using multiple strategies.

        Strategies:
        1. NLTK punkt tokenizer (primary)
        2. Regex-based fallback when NLTK is missing or fails
        3. Minimum-length filter (sentences shorter than 3 words are dropped)

        Args:
            text: Input text to segment

        Returns:
            List of sentences (empty for empty or whitespace-only input)
        """
        # Tolerate None/empty input instead of failing on .strip()
        text = text.strip() if text else ""
        if not text:
            return []

        # Try NLTK first (better accuracy)
        try:
            import nltk
            from nltk.tokenize import sent_tokenize

            try:
                sentences = sent_tokenize(text)
            except LookupError:
                # Tokenizer data is not installed. Older NLTK releases look up
                # "punkt", newer ones "punkt_tab"; request both so the retry
                # works on either. A failed download leaves the retry raising
                # LookupError again, which the outer handler turns into the
                # regex fallback.
                logger.info("Downloading NLTK punkt tokenizer...")
                for resource in ('punkt', 'punkt_tab'):
                    nltk.download(resource, quiet=True)
                sentences = sent_tokenize(text)
        except Exception as e:
            # Deliberately broad: any NLTK problem (not installed, missing
            # data, no network) must degrade to the regex splitter.
            logger.warning(
                "NLTK tokenization failed (%s), using regex fallback", e
            )
            sentences = TextProcessor._regex_segmentation(text)

        # Trim, drop empties, and drop very short "sentences" (likely not
        # meaningful).
        trimmed = (s.strip() for s in sentences)
        return [s for s in trimmed if len(s.split()) >= _MIN_WORDS]

    @staticmethod
    def _regex_segmentation(text: str) -> List[str]:
        """
        Fallback sentence segmentation using regex.

        This is less accurate than NLTK but works without dependencies.
        Splits after '.', '!' or '?' when followed by whitespace and a
        capital letter, or when at the end of the text.

        Args:
            text: Input text to segment

        Returns:
            List of non-empty, stripped sentence strings
        """
        pieces = (piece.strip() for piece in _SENTENCE_BOUNDARY_RE.split(text))
        return [piece for piece in pieces if piece]

    @staticmethod
    def is_valid_sentence(sentence: str) -> bool:
        """
        Check if sentence is valid for categorization.

        Args:
            sentence: Input sentence

        Returns:
            True if valid, False otherwise
        """
        # Must have at least 3 words
        if len(sentence.split()) < _MIN_WORDS:
            return False

        # Must have some alphabetic characters
        if not any(c.isalpha() for c in sentence):
            return False

        # Not just a list item or fragment: a bulleted line is allowed only
        # if it still has substantial text after the bullet character.
        stripped = sentence.strip()
        if stripped.startswith(_BULLET_CHARS):
            if len(stripped[1:].strip().split()) < _MIN_WORDS:
                return False

        return True

    @staticmethod
    def clean_sentence(sentence: str) -> str:
        """
        Clean a sentence for processing.

        Removes leading bullet markers and list numbering ("1.", "2)"),
        collapses whitespace runs to single spaces, and appends a period
        when the sentence does not already end with '.', '!' or '?'.
        Leading numbers that are part of the sentence ("3 people left",
        "3.5 percent ...") are preserved.

        Args:
            sentence: Input sentence

        Returns:
            Cleaned sentence (empty string if nothing remains)
        """
        # Remove leading bullet points or list numbering
        sentence = _LEADING_MARKER_RE.sub('', sentence)

        # Normalize whitespace (also trims both ends)
        sentence = ' '.join(sentence.split())

        # Ensure it ends with punctuation
        if sentence and sentence[-1] not in '.!?':
            sentence += '.'

        return sentence

    @staticmethod
    def segment_and_clean(text: str) -> List[str]:
        """
        Segment text into sentences and clean them.

        This is the main entry point for text processing.

        Args:
            text: Input text

        Returns:
            List of cleaned, valid sentences
        """
        cleaned = (
            TextProcessor.clean_sentence(sentence)
            for sentence in TextProcessor.segment_into_sentences(text)
        )
        return [s for s in cleaned if TextProcessor.is_valid_sentence(s)]

    @staticmethod
    def get_sentence_count_estimate(text: str) -> int:
        """
        Quick estimate of sentence count without full processing.

        Counts runs of sentence-ending punctuation, so "..." or "?!" count
        as one sentence end rather than several. This is a heuristic:
        abbreviations and decimal points are still counted.

        Args:
            text: Input text

        Returns:
            Estimated sentence count; 0 for empty or whitespace-only text,
            otherwise at least 1
        """
        if not text or not text.strip():
            return 0

        # At least 1 if text exists, even without any terminator
        return max(1, len(_TERMINATOR_RUN_RE.findall(text)))