File size: 7,905 Bytes
c0f31c1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
import logging
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re

class TextPreprocessor:
    """Clean raw document text and reduce it to lemmatized content words.

    The pipeline (see :meth:`preprocess`) optionally strips header/footer
    lines, removes a few PDF extraction artifacts, lower-cases the text,
    drops punctuation, collapses whitespace, removes English stopwords and
    lemmatizes what remains.
    """

    # Line-leading bullet glyphs: bullet, '-', '*', '>', white bullet, circles,
    # diamonds, small squares, medium circles, check marks and crosses.
    # Written as \u escapes so the pattern survives any re-encoding of the file.
    _BULLET_RE = re.compile(
        r'^\s*[\u2022\-*>\u25E6\u25CB\u25C6\u25C7\u25AA\u25AB'
        r'\u26AB\u26AA\u2713\u2714\u2715\u2716\u2717\u2718]'
    )

    # Footer patterns used for slide-like content. Several of these are
    # deliberately unanchored and match anywhere in a line.
    _SLIDE_FOOTER_RE = re.compile(
        '|'.join(f'({p})' for p in (
            r'^\s*\d+\s*$',  # Standalone page number
            r'confidential',  # Confidentiality notices
            r'all rights reserved',
            r'proprietary',
            r'^\s*www\.',  # Website in footer
            r'^\s*https?://',  # URL in footer
            r'\bpage\s+\d+\b',  # "Page X" footer
            r'^\s*[\u00A9\u24B8]\s*\d{4}',  # Copyright notice
        )),
        re.IGNORECASE,
    )

    # Default whole-line patterns used when the caller supplies no pattern.
    _DEFAULT_HEADER_FOOTER_RE = re.compile(
        '|'.join(f'({p})' for p in (
            r'^\s*\d+\s*$',  # Standalone page numbers
            r'^\s*page\s+\d+\s+of\s+\d+\s*$',  # Page X of Y
            r'^\s*[\u00A9\u24B8]\s*\d{4}.*$',  # Copyright lines
            r'^\s*confidential\s*$',  # Confidentiality markers
            r'^\s*https?://.*$',  # URLs alone on a line
            r'^\s*www\..*$',  # Website alone on a line
            r'^\s*[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}\s*$',  # Email addresses
        )),
        re.IGNORECASE,
    )

    def __init__(self):
        """Load the English stopword set and create the lemmatizer.

        Raises:
            Exception: whatever the NLTK calls raise (presumably when the
                required corpora are not installed); the error is logged
                and re-raised unchanged.
        """
        # Create the logger first so the except handler below can always use
        # it; otherwise a failing NLTK call would surface as AttributeError.
        self.logger = logging.getLogger(__name__)
        try:
            self.stopwords = set(stopwords.words('english'))
            self.lemmatizer = WordNetLemmatizer()
        except Exception as e:
            self.logger.error(f"Failed to initialize NLTK resources: {e}")
            raise

    @staticmethod
    def _as_regex(pattern, flags=0):
        """Return *pattern* compiled with *flags*; precompiled patterns pass through unchanged."""
        if hasattr(pattern, 'search'):
            return pattern
        return re.compile(pattern, flags)

    def standardize_case(self, text):
        """Return *text* converted to lower case."""
        return text.lower()

    def remove_punctuation(self, text):
        """Return *text* with every character that is neither a word character nor whitespace removed."""
        return re.sub(r'[^\w\s]', '', text)

    def normalize_whitespace(self, text):
        """Collapse each whitespace run to a single space and strip both ends."""
        return re.sub(r'\s+', ' ', text).strip()

    def remove_stopwords(self, words):
        """Return the items of *words* that are not in ``self.stopwords``."""
        return [word for word in words if word not in self.stopwords]

    def lemmatize_words(self, words):
        """Return each item of *words* passed through ``self.lemmatizer.lemmatize``."""
        return [self.lemmatizer.lemmatize(word) for word in words]

    def remove_headers_and_footers(self, text, aggressive=False, pattern=None):
        """Remove lines that look like page headers or footers.

        Strategy, chosen per document:

        * slide-like content (see :meth:`_appears_to_be_slide`): pattern-based
          footer removal only;
        * otherwise, when *aggressive* is true: drop the first and last two
          lines (*pattern* is not used on this path);
        * otherwise: pattern-based removal with *pattern* or the defaults.

        If the chosen strategy keeps fewer than 70% of the lines, its result
        is discarded. In every case, lines consisting only of digits are
        dropped afterwards.

        Args:
            text: The document text.
            aggressive: Enable positional trimming for non-slide documents.
            pattern: Optional regular expression (string or compiled)
                identifying lines to remove.

        Returns:
            The cleaned lines joined with ``'\\n'``. *text* is returned
            unchanged when it is empty, whitespace-only, has at most four
            lines, or when any error occurs (the error is logged).
        """
        try:
            if not text or not text.strip():
                return text

            lines = text.splitlines()
            if len(lines) <= 4:  # For very short text, don't remove anything
                return text

            if self._appears_to_be_slide(lines):
                # Slide-friendly approach - only remove obvious headers/footers
                cleaned_lines = self._clean_slide_headers_footers(lines, pattern)
            elif aggressive:
                # Traditional document approach - remove first/last few lines
                num_lines = 2
                cleaned_lines = lines[num_lines:-num_lines]
            else:
                # Conservative approach - only remove based on patterns
                cleaned_lines = self._pattern_based_removal(lines, pattern)

            # If we removed too much (over 30% of content), revert to original.
            # `lines` is never mutated above, so it serves as the fallback.
            if len(cleaned_lines) < len(lines) * 0.7:
                self.logger.warning("Header/footer removal eliminated too much content, reverting")
                cleaned_lines = lines

            # Drop digit-only lines, which are likely page numbers. A stripped
            # string that is all digits is necessarily a single token.
            cleaned_lines = [line for line in cleaned_lines
                             if not line.strip().isdigit()]

            return '\n'.join(cleaned_lines)

        except Exception as e:
            self.logger.error(f"Error removing headers/footers: {e}")
            return text  # Return original text on error

    def _appears_to_be_slide(self, lines):
        """Detect if the content appears to be from a slide/presentation.

        Returns True when any of the following holds: there are fewer than
        15 lines; more than 20% of the lines start with a bullet glyph; or
        the first non-empty line is shorter than 60 characters and one of
        the next three non-empty lines starts with a bullet glyph.
        """
        if len(lines) < 15:  # Short content
            return True

        is_bullet = self._BULLET_RE.match
        bullet_lines = sum(1 for line in lines if is_bullet(line))

        # If more than 20% of lines are bullets, likely a slide
        if bullet_lines > len(lines) * 0.2:
            return True

        # Short first line (likely a title) followed closely by bullet points
        non_empty_lines = [line for line in lines if line.strip()]
        if non_empty_lines and len(non_empty_lines[0].strip()) < 60:
            return any(is_bullet(line) for line in non_empty_lines[1:4])

        return False

    def _clean_slide_headers_footers(self, lines, pattern=None):
        """Clean headers/footers from slide-based content.

        Lines matching the caller's *pattern* (searched with the pattern's
        own flags, i.e. case-sensitively for a plain string) are removed
        first, then lines matching the built-in footer patterns
        (case-insensitive). Removal is by content only, never by position.

        Returns:
            A new list with the surviving lines; *lines* is not modified.
        """
        cleaned_lines = list(lines)

        if pattern:
            user_regex = self._as_regex(pattern)
            cleaned_lines = [line for line in cleaned_lines
                             if not user_regex.search(line)]

        footer_regex = self._SLIDE_FOOTER_RE
        return [line for line in cleaned_lines
                if not footer_regex.search(line)]

    def _pattern_based_removal(self, lines, pattern=None):
        """Remove headers/footers based only on patterns, not position.

        Uses *pattern* when given (a string is compiled case-insensitively; a
        precompiled pattern is used as-is), otherwise the built-in defaults:
        standalone page numbers, "page X of Y", copyright lines, a lone
        "confidential", and lines holding only a URL, website or e-mail
        address.

        Returns:
            A new list with the lines that do not match.
        """
        if pattern:
            regex = self._as_regex(pattern, re.IGNORECASE)
        else:
            regex = self._DEFAULT_HEADER_FOOTER_RE

        return [line for line in lines if not regex.search(line)]

    def remove_common_pdf_artifacts(self, text):
        """Strip a few artifacts that commonly appear in PDF-extracted text.

        Removes empty or X-marked ``[ ]`` / ``( )`` form-field boxes,
        ``<<...>>`` spans, and ``obj <digits> <digits> R`` markers.

        Returns:
            The cleaned text, or the text as it stood when an error occurred
            (the error is logged).
        """
        try:
            # Remove form field indicators
            text = re.sub(r'\[\s*\]\s*|\[\s*X\s*\]|\(\s*\)\s*|\(\s*X\s*\)', '', text)

            # Remove common PDF annotations
            text = re.sub(r'<<[^>]*>>', '', text)

            # Remove artifact markers often found in PDFs
            text = re.sub(r'obj\s*\d+\s*\d+\s*R', '', text)

            return text

        except Exception as e:
            self.logger.error(f"Error removing PDF artifacts: {e}")
            return text

    def preprocess(self, text, remove_headers_footers=True, aggressive_removal=False):
        """Run the full cleaning pipeline on *text*.

        Args:
            text: The raw document text.
            remove_headers_footers: Strip header/footer lines first.
            aggressive_removal: Forwarded as ``aggressive`` to
                :meth:`remove_headers_and_footers`.

        Returns:
            The remaining lemmatized words joined by single spaces.

        Raises:
            Exception: any error raised by a pipeline step is logged and
                re-raised unchanged.
        """
        try:
            if remove_headers_footers:
                text = self.remove_headers_and_footers(text, aggressive=aggressive_removal)

            text = self.remove_common_pdf_artifacts(text)

            text = self.standardize_case(text)
            text = self.remove_punctuation(text)
            text = self.normalize_whitespace(text)

            words = text.split()
            words = self.remove_stopwords(words)
            words = self.lemmatize_words(words)

            return ' '.join(words)
        except Exception as e:
            self.logger.error(f"Error preprocessing text: {e}")
            raise