"""
Text processing utilities for sentence-level categorization.
Handles sentence segmentation and text cleaning.
"""

import re
from typing import List
import logging

logger = logging.getLogger(__name__)

class TextProcessor:
    """Handle sentence segmentation and text processing"""
    
    @staticmethod
    def segment_into_sentences(text: str) -> List[str]:
        """
        Break text into sentences using multiple strategies.
        
        Strategies:
        1. NLTK punkt tokenizer (primary)
        2. Regex-based fallback
        3. Min/max length constraints
        
        Args:
            text: Input text to segment
            
        Returns:
            List of sentences
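
        Example (illustrative; the three-word minimum drops short fragments,
        so "Hello there." is filtered out — same result via NLTK or the
        regex fallback):
            >>> TextProcessor.segment_into_sentences("Hello there. How are you?")
            ['How are you?']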
        """
        # Clean text
        text = text.strip()
        
        if not text:
            return []
        
        # Try NLTK first (better accuracy)
        try:
            import nltk
            # Try to use punkt tokenizer
            try:
                from nltk.tokenize import sent_tokenize
                sentences = sent_tokenize(text)
            except LookupError:
                # Download punkt if not available; newer NLTK releases
                # look up 'punkt_tab' instead, so fetch both
                logger.info("Downloading NLTK punkt tokenizer...")
                nltk.download('punkt', quiet=True)
                nltk.download('punkt_tab', quiet=True)
                from nltk.tokenize import sent_tokenize
                sentences = sent_tokenize(text)
        except Exception as e:
            # Fallback: regex-based segmentation
            logger.warning(f"NLTK tokenization failed ({e}), using regex fallback")
            sentences = TextProcessor._regex_segmentation(text)
        
        # Clean and filter
        sentences = [s.strip() for s in sentences if s.strip()]
        
        # Filter out very short "sentences" (likely not meaningful)
        # Require at least 3 words
        sentences = [s for s in sentences if len(s.split()) >= 3]
        
        return sentences
    
    @staticmethod
    def _regex_segmentation(text: str) -> List[str]:
        """
        Fallback sentence segmentation using regex.
        
        This is less accurate than NLTK but works without dependencies.
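
        Example (illustrative):
            >>> TextProcessor._regex_segmentation("It rained. We stayed in!")
            ['It rained.', 'We stayed in!']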
        """
        # Split after ., !, or ? when followed by whitespace and a capital
        # letter, or at the end of the string
        pattern = r'(?<=[.!?])\s+(?=[A-Z])|(?<=[.!?])$'
        sentences = re.split(pattern, text)
        
        return [s.strip() for s in sentences if s.strip()]
    
    @staticmethod
    def is_valid_sentence(sentence: str) -> bool:
        """
        Check if sentence is valid for categorization.
        
        Args:
            sentence: Input sentence
            
        Returns:
            True if valid, False otherwise
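
        Example (illustrative):
            >>> TextProcessor.is_valid_sentence("This one is fine.")
            True
            >>> TextProcessor.is_valid_sentence("- stub")
            False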
        """
        # Must have at least 3 words
        if len(sentence.split()) < 3:
            return False
        
        # Must have some alphabetic characters
        if not any(c.isalpha() for c in sentence):
            return False
        
        # Not just a list item or fragment
        stripped = sentence.strip()
        if stripped.startswith(('-', '•', '*')):
            # Allow if it has substantial text after the bullet
            if len(stripped[1:].strip().split()) < 3:
                return False
        
        return True
    
    @staticmethod
    def clean_sentence(sentence: str) -> str:
        """
        Clean a sentence for processing.
        
        Args:
            sentence: Input sentence
            
        Returns:
            Cleaned sentence
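
        Example (illustrative):
            >>> TextProcessor.clean_sentence("-  2. strips   leading bullets")
            'strips leading bullets.'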
        """
        # Remove leading bullet points or list numbering (note: this also
        # strips a bare leading number such as "3 " from a sentence)
        sentence = re.sub(r'^[\s\-•*\d.]+\s*', '', sentence)
        
        # Normalize whitespace
        sentence = ' '.join(sentence.split())
        
        # Ensure it ends with punctuation
        if sentence and sentence[-1] not in '.!?':
            sentence += '.'
        
        return sentence.strip()
    
    @staticmethod
    def segment_and_clean(text: str) -> List[str]:
        """
        Segment text into sentences and clean them.
        
        This is the main entry point for text processing.
        
        Args:
            text: Input text
            
        Returns:
            List of cleaned, valid sentences
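
        Example (illustrative; "Ok." is dropped as too short):
            >>> TextProcessor.segment_and_clean("It works well. Ok.")
            ['It works well.']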
        """
        # Segment
        sentences = TextProcessor.segment_into_sentences(text)
        
        # Clean and filter
        result = []
        for sentence in sentences:
            cleaned = TextProcessor.clean_sentence(sentence)
            if TextProcessor.is_valid_sentence(cleaned):
                result.append(cleaned)
        
        return result
    
    @staticmethod
    def get_sentence_count_estimate(text: str) -> int:
        """
        Quick estimate of sentence count without full processing.
        
        Args:
            text: Input text
            
        Returns:
            Estimated sentence count
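
        Example (illustrative; abbreviations would inflate the count):
            >>> TextProcessor.get_sentence_count_estimate("One. Two? Three!")
            3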
        """
        # Count sentence-ending punctuation; abbreviations and decimals
        # inflate this, which is acceptable for a rough estimate
        count = text.count('.') + text.count('!') + text.count('?')
        
        # At least 1 if text exists
        return max(1, count)
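

# Minimal usage sketch (illustrative, not part of the original module):
# feeds a small sample through the main entry point and prints the result.
# If NLTK is unavailable, the regex fallback handles this sample the same way.
if __name__ == "__main__":
    sample = (
        "1. Segmentation splits raw text into sentences. "
        "Cleaning strips bullets and numbering. Ok."
    )
    for s in TextProcessor.segment_and_clean(sample):
        print(s)
    print("Estimated count:", TextProcessor.get_sentence_count_estimate(sample))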