File size: 5,337 Bytes
3998131
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
"""
Text cleaning and normalization module
Removes headers, footers, page numbers, and fixes formatting
"""

import logging
import re
from typing import Any, Dict, List

from .config import CLEANING_PATTERNS

logger = logging.getLogger(__name__)


class TextCleaner:
    """Cleans and normalizes extracted text.

    Removes page numbers, headers/footers, and table-of-contents artifacts
    using the regex patterns declared in ``CLEANING_PATTERNS``, then fixes
    whitespace, hyphenation, and common mis-encoded punctuation.
    """

    def __init__(self):
        """Initialize the cleaner with pre-compiled regex patterns."""
        # Compile once up front so repeated clean_text() calls don't pay
        # regex-compilation cost per call.
        self.patterns = self._compile_patterns()

    def _compile_patterns(self) -> Dict[str, List[re.Pattern]]:
        """Compile all configured regex patterns for efficiency.

        Returns:
            Mapping of category name -> list of compiled patterns, one entry
            per category in ``CLEANING_PATTERNS``.
        """
        compiled: Dict[str, List[re.Pattern]] = {}
        for category, patterns in CLEANING_PATTERNS.items():
            compiled[category] = [
                re.compile(p, re.MULTILINE | re.IGNORECASE) for p in patterns
            ]
        return compiled

    def clean_text(self, text: str) -> str:
        """
        Apply all cleaning operations to text.

        Args:
            text: Raw text to clean.

        Returns:
            Cleaned text; empty string for empty/None input.
        """
        if not text:
            return ""

        # Pattern-based removal (order: page numbers, headers/footers, TOC).
        text = self._remove_page_numbers(text)
        text = self._remove_headers_footers(text)
        text = self._remove_toc_patterns(text)

        # Structural normalization and final character-level fixes.
        text = self._normalize_whitespace(text)
        text = self._additional_cleaning(text)

        return text.strip()

    def _apply_patterns(self, category: str, text: str) -> str:
        """Strip every match of the given pattern category from *text*."""
        for pattern in self.patterns[category]:
            text = pattern.sub('', text)
        return text

    def _remove_page_numbers(self, text: str) -> str:
        """Remove page numbers."""
        return self._apply_patterns('page_numbers', text)

    def _remove_headers_footers(self, text: str) -> str:
        """Remove common headers and footers."""
        return self._apply_patterns('headers_footers', text)

    def _remove_toc_patterns(self, text: str) -> str:
        """Remove table of contents patterns."""
        return self._apply_patterns('toc_patterns', text)

    def _normalize_whitespace(self, text: str) -> str:
        """Fix excessive whitespace, hyphenation, and in-paragraph line breaks.

        Lines that appear to be wrapped mid-sentence (no sentence-ending
        punctuation, followed by another non-blank line) are joined with a
        space; blank lines are kept as paragraph separators.
        """
        # Collapse runs of 3+ newlines (with blank padding) to one blank line.
        text = re.sub(r'\n\s*\n\s*\n+', '\n\n', text)

        # Collapse horizontal whitespace runs to a single space.
        text = re.sub(r'[ \t]+', ' ', text)

        # Re-join words hyphenated across line breaks ("exam-\nple" -> "example").
        text = re.sub(r'-\s*\n\s*', '', text)

        lines = text.split('\n')
        normalized_lines: List[str] = []

        for i, raw_line in enumerate(lines):
            line = raw_line.strip()
            if not line:
                # BUGFIX: blank lines used to be dropped entirely, which
                # collapsed the '\n\n' paragraph separators this method is
                # documented to preserve. Emit one newline so the previous
                # line's '\n' plus this one keeps the double break.
                if normalized_lines:
                    normalized_lines.append('\n')
                continue
            followed_by_text = i < len(lines) - 1 and bool(lines[i + 1].strip())
            if followed_by_text and not line.endswith(('.', '!', '?', ':', ';')):
                # Mid-sentence wrap: join with the following line.
                normalized_lines.append(line + ' ')
            else:
                normalized_lines.append(line + '\n')

        return ''.join(normalized_lines)

    def _additional_cleaning(self, text: str) -> str:
        """Final pass: stray numbers, artifact lines, punctuation normalization."""
        # Remove standalone numbers that might be page/section numbers.
        text = re.sub(r'\n\s*\d+\s*\n', '\n', text)

        # Drop very short non-blank lines (likely extraction artifacts);
        # blank lines are kept so paragraph breaks survive.
        lines = text.split('\n')
        kept = [line for line in lines if len(line.strip()) > 3 or line.strip() == '']
        text = '\n'.join(kept)

        # Normalize common "smart" punctuation to ASCII equivalents.
        text = text.replace('\u2019', "'")   # right single quotation mark
        text = text.replace('\u2018', "'")   # left single quotation mark
        text = text.replace('\u201c', '"')   # left double quotation mark
        text = text.replace('\u201d', '"')   # right double quotation mark
        text = text.replace('\u2013', '-')   # en dash
        text = text.replace('\u2014', '--')  # em dash

        return text

    def clean_pages(self, pages_data: List[Dict[str, Any]]) -> str:
        """
        Clean text from multiple pages and combine.

        Args:
            pages_data: List of dicts with 'page_number' and 'text' keys;
                pages with missing or empty 'text' are skipped.

        Returns:
            Combined cleaned text, pages joined by a blank line.
        """
        combined_text = []

        for page_data in pages_data:
            page_text = page_data.get('text', '')
            if page_text:
                cleaned = self.clean_text(page_text)
                if cleaned:
                    combined_text.append(cleaned)

        full_text = '\n\n'.join(combined_text)

        # Lazy %-style args so formatting is skipped when INFO is disabled.
        logger.info("Cleaned %d pages into %d characters",
                    len(pages_data), len(full_text))

        return full_text