File size: 734 Bytes
162cb6f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
import re

class TextProcessor:
    """Helpers for cleaning raw text and taking simple measurements of it."""

    def clean_text(self, text: str) -> str:
        """Return *text* with whitespace collapsed and page markers removed.

        Steps, in order:

        1. Every run of whitespace (spaces, tabs, newlines) becomes one space.
        2. Every ``Page <digits>`` marker is deleted.
        3. Whitespace is collapsed again, because deleting a marker from the
           middle of a sentence leaves two adjacent spaces behind.
        4. Typographic double quotes (U+201C, U+201D) become ASCII ``"``.
        5. Leading and trailing whitespace is stripped.

        Args:
            text: The raw text to clean.

        Returns:
            The cleaned text; an empty string if nothing but whitespace and
            page markers was supplied.
        """
        # Collapse first so a marker broken across lines ("Page\n3") is
        # turned into "Page 3" and matches the pattern below.
        text = re.sub(r'\s+', ' ', text)
        # Remove page numbers and headers
        text = re.sub(r'Page \d+', '', text)
        # Removing a marker can leave a doubled space where it used to be.
        text = re.sub(r'\s+', ' ', text)
        # Normalize quotes: escapes are used so the curly characters cannot be
        # silently flattened to ASCII by an editor or transfer step.
        text = text.replace('\u201c', '"').replace('\u201d', '"')
        return text.strip()

    def split_into_sentences(self, text: str) -> list:
        """Split *text* into sentences.

        A boundary is any run of whitespace that directly follows ``.``,
        ``!`` or ``?``; the punctuation stays attached to the sentence it
        ends. The rule is purely lexical, so an abbreviation followed by a
        space (for example ``Dr. Smith``) is also treated as a boundary.

        Args:
            text: The text to split.

        Returns:
            A list of non-empty sentences with surrounding whitespace removed.
        """
        sentences = re.split(r'(?<=[.!?])\s+', text)
        # Drop fragments that are empty once stripped (e.g. trailing space).
        return [s.strip() for s in sentences if s.strip()]

    def count_words(self, text: str) -> int:
        """Return the number of whitespace-separated tokens in *text*."""
        return len(text.split())