Spaces:
Sleeping
Sleeping
File size: 5,154 Bytes
71797a4 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 |
"""
Text processing utilities for sentence-level categorization.
Handles sentence segmentation and text cleaning.
"""
import re
from typing import List
import logging
logger = logging.getLogger(__name__)
class TextProcessor:
    """Handle sentence segmentation and text processing."""

    @staticmethod
    def segment_into_sentences(text: str) -> List[str]:
        """
        Break text into sentences using multiple strategies.

        Strategies:
            1. NLTK punkt tokenizer (primary)
            2. Regex-based fallback
            3. Min/max length constraints

        Args:
            text: Input text to segment

        Returns:
            List of sentences, each containing at least 3 words
        """
        # Clean text
        text = text.strip()
        if not text:
            return []

        # Prefer NLTK for accuracy; any failure (missing package, failed
        # download, tokenizer error) falls through to the regex fallback.
        try:
            import nltk
            from nltk.tokenize import sent_tokenize
            try:
                sentences = sent_tokenize(text)
            except LookupError:
                # Tokenizer data not installed. NLTK >= 3.8.2 looks up
                # 'punkt_tab' while older releases use 'punkt' — download
                # both so the retry succeeds on either version.
                logger.info("Downloading NLTK punkt tokenizer...")
                for resource in ('punkt', 'punkt_tab'):
                    nltk.download(resource, quiet=True)
                sentences = sent_tokenize(text)
        except Exception as e:
            # Fallback: regex-based segmentation (no dependencies needed).
            logger.warning("NLTK tokenization failed (%s), using regex fallback", e)
            sentences = TextProcessor._regex_segmentation(text)

        # Strip whitespace and drop empty fragments.
        sentences = [s.strip() for s in sentences if s.strip()]
        # Filter out very short "sentences" (likely not meaningful):
        # require at least 3 words.
        return [s for s in sentences if len(s.split()) >= 3]

    @staticmethod
    def _regex_segmentation(text: str) -> List[str]:
        """
        Fallback sentence segmentation using regex.

        This is less accurate than NLTK but works without dependencies.

        Args:
            text: Input text to segment

        Returns:
            List of stripped, non-empty sentence fragments
        """
        # Split where ., !, or ? is followed by whitespace plus a capital
        # letter, or where it terminates the string.
        pattern = r'(?<=[.!?])\s+(?=[A-Z])|(?<=[.!?])$'
        sentences = re.split(pattern, text)
        return [s.strip() for s in sentences if s.strip()]

    @staticmethod
    def is_valid_sentence(sentence: str) -> bool:
        """
        Check if sentence is valid for categorization.

        Args:
            sentence: Input sentence

        Returns:
            True if valid, False otherwise
        """
        # Must have at least 3 words.
        if len(sentence.split()) < 3:
            return False
        # Must contain some alphabetic characters.
        if not any(c.isalpha() for c in sentence):
            return False
        # Not just a list item or fragment: a bullet marker is allowed
        # only when substantial text (>= 3 words) follows it.
        stripped = sentence.strip()
        if stripped.startswith(('-', '•', '*')):
            if len(stripped[1:].strip().split()) < 3:
                return False
        return True

    @staticmethod
    def clean_sentence(sentence: str) -> str:
        """
        Clean a sentence for processing.

        Args:
            sentence: Input sentence

        Returns:
            Cleaned sentence: leading bullet/number markers removed,
            whitespace normalized, terminal punctuation guaranteed.
        """
        # Remove leading bullet points or list numbering.
        # NOTE(review): this also strips leading digits from a genuine
        # sentence that starts with a number (e.g. "2023 was ...") —
        # confirm that is acceptable for the corpus.
        sentence = re.sub(r'^[\s\-•*\d.]+\s*', '', sentence)
        # Collapse runs of whitespace to single spaces.
        sentence = ' '.join(sentence.split())
        # Ensure it ends with terminal punctuation.
        if sentence and sentence[-1] not in '.!?':
            sentence += '.'
        return sentence.strip()

    @staticmethod
    def segment_and_clean(text: str) -> List[str]:
        """
        Segment text into sentences and clean them.

        This is the main entry point for text processing.

        Args:
            text: Input text

        Returns:
            List of cleaned, valid sentences
        """
        sentences = TextProcessor.segment_into_sentences(text)
        # Clean each sentence and keep only the ones that pass validation.
        result = []
        for sentence in sentences:
            cleaned = TextProcessor.clean_sentence(sentence)
            if TextProcessor.is_valid_sentence(cleaned):
                result.append(cleaned)
        return result

    @staticmethod
    def get_sentence_count_estimate(text: str) -> int:
        """
        Quick estimate of sentence count without full processing.

        Counts sentence-ending punctuation only, so abbreviations and
        decimal points inflate the estimate — intended as a cheap bound,
        not an exact count.

        Args:
            text: Input text

        Returns:
            Estimated sentence count (at least 1)
        """
        count = text.count('.') + text.count('!') + text.count('?')
        # At least 1 if text exists.
        return max(1, count)
|