Spaces:
Sleeping
Sleeping
File size: 5,154 Bytes
71797a4 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 |
"""
Text processing utilities for sentence-level categorization.
Handles sentence segmentation and text cleaning.
"""
import re
from typing import List
import logging
logger = logging.getLogger(__name__)
class TextProcessor:
    """Handle sentence segmentation and text processing."""

    @staticmethod
    def segment_into_sentences(text: str) -> List[str]:
        """
        Break text into sentences using multiple strategies.

        Strategies:
            1. NLTK punkt tokenizer (primary)
            2. Regex-based fallback
            3. Min/max length constraints

        Args:
            text: Input text to segment

        Returns:
            List of sentences, each containing at least 3 words
        """
        # Clean text
        text = text.strip()
        if not text:
            return []

        # Prefer NLTK for accuracy; any failure (missing package, failed
        # download, tokenizer error) falls through to the regex fallback.
        try:
            import nltk
            from nltk.tokenize import sent_tokenize
            try:
                sentences = sent_tokenize(text)
            except LookupError:
                # Tokenizer data not installed. NLTK >= 3.8.2 looks up
                # 'punkt_tab' while older releases use 'punkt' — download
                # both so the retry succeeds on either version.
                logger.info("Downloading NLTK punkt tokenizer...")
                for resource in ('punkt', 'punkt_tab'):
                    nltk.download(resource, quiet=True)
                sentences = sent_tokenize(text)
        except Exception as e:
            # Fallback: regex-based segmentation (no dependencies needed).
            logger.warning("NLTK tokenization failed (%s), using regex fallback", e)
            sentences = TextProcessor._regex_segmentation(text)

        # Strip whitespace and drop empty fragments.
        sentences = [s.strip() for s in sentences if s.strip()]
        # Filter out very short "sentences" (likely not meaningful):
        # require at least 3 words.
        return [s for s in sentences if len(s.split()) >= 3]

    @staticmethod
    def _regex_segmentation(text: str) -> List[str]:
        """
        Fallback sentence segmentation using regex.

        This is less accurate than NLTK but works without dependencies.

        Args:
            text: Input text to segment

        Returns:
            List of stripped, non-empty sentence fragments
        """
        # Split where ., !, or ? is followed by whitespace plus a capital
        # letter, or where it terminates the string.
        pattern = r'(?<=[.!?])\s+(?=[A-Z])|(?<=[.!?])$'
        sentences = re.split(pattern, text)
        return [s.strip() for s in sentences if s.strip()]

    @staticmethod
    def is_valid_sentence(sentence: str) -> bool:
        """
        Check if sentence is valid for categorization.

        Args:
            sentence: Input sentence

        Returns:
            True if valid, False otherwise
        """
        # Must have at least 3 words.
        if len(sentence.split()) < 3:
            return False
        # Must contain some alphabetic characters.
        if not any(c.isalpha() for c in sentence):
            return False
        # Not just a list item or fragment: a bullet marker is allowed
        # only when substantial text (>= 3 words) follows it.
        stripped = sentence.strip()
        if stripped.startswith(('-', '•', '*')):
            if len(stripped[1:].strip().split()) < 3:
                return False
        return True

    @staticmethod
    def clean_sentence(sentence: str) -> str:
        """
        Clean a sentence for processing.

        Args:
            sentence: Input sentence

        Returns:
            Cleaned sentence: leading bullet/number markers removed,
            whitespace normalized, terminal punctuation guaranteed.
        """
        # Remove leading bullet points or list numbering.
        # NOTE(review): this also strips leading digits from a genuine
        # sentence that starts with a number (e.g. "2023 was ...") —
        # confirm that is acceptable for the corpus.
        sentence = re.sub(r'^[\s\-•*\d.]+\s*', '', sentence)
        # Collapse runs of whitespace to single spaces.
        sentence = ' '.join(sentence.split())
        # Ensure it ends with terminal punctuation.
        if sentence and sentence[-1] not in '.!?':
            sentence += '.'
        return sentence.strip()

    @staticmethod
    def segment_and_clean(text: str) -> List[str]:
        """
        Segment text into sentences and clean them.

        This is the main entry point for text processing.

        Args:
            text: Input text

        Returns:
            List of cleaned, valid sentences
        """
        sentences = TextProcessor.segment_into_sentences(text)
        # Clean each sentence and keep only the ones that pass validation.
        result = []
        for sentence in sentences:
            cleaned = TextProcessor.clean_sentence(sentence)
            if TextProcessor.is_valid_sentence(cleaned):
                result.append(cleaned)
        return result

    @staticmethod
    def get_sentence_count_estimate(text: str) -> int:
        """
        Quick estimate of sentence count without full processing.

        Counts sentence-ending punctuation only, so abbreviations and
        decimal points inflate the estimate — intended as a cheap bound,
        not an exact count.

        Args:
            text: Input text

        Returns:
            Estimated sentence count (at least 1)
        """
        count = text.count('.') + text.count('!') + text.count('?')
        # At least 1 if text exists.
        return max(1, count)
|