File size: 2,453 Bytes
92c68e3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
import re
from typing import List, Optional

class TextProcessor:
    """Handles text preprocessing and cleaning.

    Provides whitespace/punctuation normalization, a small set of OCR
    corrections, section splitting and word counting. All methods are pure
    string transformations; the instance only stores two regex pattern
    strings.
    """

    # A run of 0/1 characters with a letter directly on both sides. Only such
    # runs are treated as OCR misreads; stand-alone numbers ("2010", "10") and
    # mixed tokens whose digit run touches a token edge ("B1", "v0") are left
    # untouched because they are far more likely to be genuine digits.
    _OCR_DIGITS_IN_WORD = re.compile(r'(?<=[^\W\d_])[01]+(?=[^\W\d_])')

    # Character mapping applied to the digit runs matched above.
    _OCR_DIGIT_TO_LETTER = str.maketrans({'0': 'O', '1': 'l'})

    def __init__(self):
        """Initialize text processor with its regex pattern strings."""
        # Characters regarded as sentence terminators.
        self.sentence_endings = r'[.!?]'
        # A "word" is any maximal run of word characters (letters, digits, _).
        self.word_pattern = r'\b\w+\b'

    def clean_text(self, text: str) -> str:
        """
        Clean and normalize text

        Collapses all whitespace (including newlines) to single spaces,
        applies OCR corrections and then normalizes punctuation. Because
        newlines are removed here, call ``split_into_sections`` on the raw
        text *before* cleaning if section boundaries are needed.

        Args:
            text: Input text to clean

        Returns:
            str: Cleaned text
        """
        # Remove extra whitespace
        text = ' '.join(text.split())

        # Fix common OCR errors
        text = self._fix_ocr_errors(text)

        # Normalize punctuation
        text = self._normalize_punctuation(text)

        return text.strip()

    def split_into_sections(self, text: str) -> List[str]:
        """
        Split text into logical sections based on content

        A section boundary is either a blank line, or a newline that is
        followed by a header-like marker: an uppercase letter, then any
        characters that are not lowercase ASCII letters, then a colon
        (e.g. ``SUMMARY:``).

        Args:
            text: Input text to split

        Returns:
            List[str]: List of text sections, stripped, with empty
            sections dropped
        """
        # Split on double newlines or section markers
        sections = re.split(r'\n\s*\n|\n(?=[A-Z][^a-z]*:)', text)
        return [s.strip() for s in sections if s.strip()]

    def count_words(self, text: str) -> int:
        """
        Count words in text

        Args:
            text: Input text

        Returns:
            int: Number of matches of ``self.word_pattern`` in the text
        """
        return len(re.findall(self.word_pattern, text))

    def _fix_ocr_errors(self, text: str) -> str:
        """
        Fix common OCR errors

        Corrections applied:
            * every vertical bar ``|`` becomes ``I``;
            * a run of ``0``/``1`` characters that sits between two letters
              becomes ``O``/``l`` respectively (``he11o`` -> ``hello``);
            * any whitespace run becomes a single space.

        Digits that are not enclosed by letters are preserved, so real
        numbers such as ``2010`` or ``100`` are never rewritten.

        Args:
            text: Input text

        Returns:
            str: Text with the corrections applied
        """
        # Vertical bar to I
        text = text.replace('|', 'I')

        # Zero to O / one to l, only inside words (see _OCR_DIGITS_IN_WORD)
        text = self._OCR_DIGITS_IN_WORD.sub(
            lambda match: match.group().translate(self._OCR_DIGIT_TO_LETTER),
            text,
        )

        # Multiple spaces to single space
        return re.sub(r'\s+', ' ', text)

    def _normalize_punctuation(self, text: str) -> str:
        """
        Normalize punctuation marks

        Args:
            text: Input text

        Returns:
            str: Text with repeated periods collapsed, a space inserted
            between sentence punctuation and a following uppercase letter,
            and whitespace before ``.``, ``!``, ``?`` or ``,`` removed
        """
        # Replace multiple periods with single period
        text = re.sub(r'\.{2,}', '.', text)

        # Add space after punctuation if missing
        text = re.sub(r'([.!?])([A-Z])', r'\1 \2', text)

        # Fix spacing around punctuation
        text = re.sub(r'\s+([.!?,])', r'\1', text)

        return text