Spaces:

NavyDevilDoc
/

AI_Toolkit

Sleeping

File size: 7,905 Bytes

c0f31c1

import logging
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re

class TextPreprocessor:
    """Clean raw document text and reduce it to lemmatized content words.

    The pipeline (see ``preprocess``) optionally strips header/footer lines,
    removes a few PDF extraction artifacts, lower-cases the text, drops
    punctuation, collapses whitespace, removes English stop words and
    lemmatizes what is left.

    Attributes:
        stopwords: Set of English stop words loaded from the NLTK corpus.
        lemmatizer: ``WordNetLemmatizer`` used by ``lemmatize_words``.
        logger: Module-level logger used for warnings and errors.
    """

    # Thresholds used by the header/footer heuristics.
    _MIN_LINES_FOR_REMOVAL = 5      # texts of 4 lines or fewer are left untouched
    _SLIDE_MAX_LINES = 15           # fewer lines than this counts as a slide
    _SLIDE_BULLET_RATIO = 0.2       # more than this share of bullet lines => slide
    _TITLE_MAX_CHARS = 60           # a first line shorter than this may be a title
    _AGGRESSIVE_TRIM_LINES = 2      # lines cut from each end in aggressive mode
    _MIN_KEPT_RATIO = 0.7           # keeping less than this reverts the removal

    _PUNCTUATION_RE = re.compile(r'[^\w\s]')
    _WHITESPACE_RE = re.compile(r'\s+')

    # A line that starts (after optional indentation) with a bullet glyph.
    _BULLET_RE = re.compile(r'^\s*[•\-\*\>\◦\○\◆\◇\▪\▫\⚫\⚪\✓\✔\✕\✖\✗\✘]')

    # Footer patterns for slide-like content; several are deliberately
    # unanchored, so they match anywhere in a line.
    _SLIDE_FOOTER_PATTERNS = (
        r'^\s*\d+\s*$',  # Standalone page number
        r'confidential',  # Confidentiality notices
        r'all rights reserved',
        r'proprietary',
        r'^\s*www\.',  # Website in footer
        r'^\s*https?://',  # URL in footer
        r'\bpage\s+\d+\b',  # "Page X" footer
        r'^\s*[©Ⓒ]\s*\d{4}',  # Copyright notice
    )
    _SLIDE_FOOTER_RE = re.compile(
        '|'.join(f'({p})' for p in _SLIDE_FOOTER_PATTERNS), re.IGNORECASE
    )

    # Default header/footer patterns for regular documents; every pattern
    # must match the whole line.
    _DEFAULT_LINE_PATTERNS = (
        r'^\s*\d+\s*$',  # Standalone page numbers
        r'^\s*page\s+\d+\s+of\s+\d+\s*$',  # Page X of Y
        r'^\s*[©Ⓒ]\s*\d{4}.*$',  # Copyright lines
        r'^\s*confidential\s*$',  # Confidentiality markers
        r'^\s*https?://.*$',  # URLs alone on a line
        r'^\s*www\..*$',  # Website alone on a line
        r'^\s*[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}\s*$',  # Email addresses
    )
    _DEFAULT_LINE_RE = re.compile(
        '|'.join(f'({p})' for p in _DEFAULT_LINE_PATTERNS), re.IGNORECASE
    )

    # Applied in order by ``remove_common_pdf_artifacts``.
    _PDF_ARTIFACT_RES = (
        # Form field indicators: "[ ]", "[X]", "( )", "(X)"
        re.compile(r'\[\s*\]\s*|\[\s*X\s*\]|\(\s*\)\s*|\(\s*X\s*\)'),
        # "<< ... >>" annotation blocks
        re.compile(r'<<[^>]*>>'),
        # "obj <n> <n> R" object markers
        re.compile(r'obj\s*\d+\s*\d+\s*R'),
    )

    def __init__(self):
        """Load the NLTK stop-word list and create the lemmatizer.

        Raises:
            Exception: Whatever the NLTK calls raise is logged and re-raised
                unchanged.
        """
        # The logger must exist before the try block: the except handler
        # uses it, and creating it afterwards would replace the real NLTK
        # error with an AttributeError.
        self.logger = logging.getLogger(__name__)
        try:
            self.stopwords = set(stopwords.words('english'))
            self.lemmatizer = WordNetLemmatizer()
        except Exception as e:
            self.logger.error("Failed to initialize NLTK resources: %s", e)
            raise

    def standardize_case(self, text):
        """Return ``text`` converted to lower case."""
        return text.lower()

    def remove_punctuation(self, text):
        """Return ``text`` without characters that are neither word characters nor whitespace."""
        return self._PUNCTUATION_RE.sub('', text)

    def normalize_whitespace(self, text):
        """Collapse each whitespace run to one space and strip both ends."""
        return self._WHITESPACE_RE.sub(' ', text).strip()

    def remove_stopwords(self, words):
        """Return the items of ``words`` that are not in ``self.stopwords``."""
        return [word for word in words if word not in self.stopwords]

    def lemmatize_words(self, words):
        """Return the lemmatized form of every item in ``words``."""
        return [self.lemmatizer.lemmatize(word) for word in words]

    def remove_headers_and_footers(self, text, aggressive=False, pattern=None):
        """Strip lines that look like page headers or footers.

        Args:
            text: Text to clean. Empty/blank text and text of four lines or
                fewer is returned unchanged.
            aggressive: When true and the text is not slide-like, drop the
                first and last two lines instead of matching patterns.
                Ignored for slide-like text.
            pattern: Optional regular expression for lines to drop. It is
                matched case-sensitively (in addition to the built-in footer
                patterns) for slide-like text, and case-insensitively
                (replacing the default patterns) otherwise. It is not used
                in aggressive mode.

        Returns:
            The remaining lines joined with ``"\\n"``. If fewer than 70% of
            the lines would survive, the removal is reverted. Lines that
            consist only of digits are always dropped. On any error the
            original ``text`` is returned.
        """
        try:
            if not text or not text.strip():
                return text

            lines = text.splitlines()
            if len(lines) < self._MIN_LINES_FOR_REMOVAL:
                # Too short to tell headers/footers from content.
                return text

            if self._appears_to_be_slide(lines):
                # Slides: position says little, so rely on patterns only.
                cleaned_lines = self._clean_slide_headers_footers(lines, pattern)
            elif aggressive:
                trim = self._AGGRESSIVE_TRIM_LINES
                cleaned_lines = lines[trim:-trim]
            else:
                cleaned_lines = self._pattern_based_removal(lines, pattern)

            # Losing more than 30% of the lines means the heuristic misfired.
            if len(cleaned_lines) < len(lines) * self._MIN_KEPT_RATIO:
                self.logger.warning("Header/footer removal eliminated too much content, reverting")
                cleaned_lines = lines

            # Digit-only lines are treated as stray page numbers.
            cleaned_lines = [line for line in cleaned_lines
                             if not line.strip().isdigit()]

            return '\n'.join(cleaned_lines)

        except Exception as e:
            # Best effort: never lose the text because cleaning failed.
            self.logger.error("Error removing headers/footers: %s", e)
            return text

    def _appears_to_be_slide(self, lines):
        """Return True if ``lines`` look like slide/presentation content.

        The text counts as a slide when it has fewer than 15 lines, when
        more than 20% of its lines start with a bullet glyph, or when its
        first non-empty line is shorter than 60 characters and one of the
        next three non-empty lines starts with a bullet glyph.
        """
        if len(lines) < self._SLIDE_MAX_LINES:
            return True

        is_bullet = self._BULLET_RE.match
        bullet_lines = sum(1 for line in lines if is_bullet(line))
        if bullet_lines > len(lines) * self._SLIDE_BULLET_RATIO:
            return True

        # Short first line (likely a title) followed closely by a bullet.
        non_empty_lines = [line for line in lines if line.strip()]
        if non_empty_lines and len(non_empty_lines[0].strip()) < self._TITLE_MAX_CHARS:
            return any(is_bullet(line) for line in non_empty_lines[1:4])

        return False

    def _clean_slide_headers_footers(self, lines, pattern=None):
        """Return ``lines`` without slide footer lines.

        Lines matching the optional caller ``pattern`` (case-sensitive) are
        dropped first, then lines matching any built-in footer pattern
        (case-insensitive). The input list is not modified.
        """
        if pattern:
            matches_custom = re.compile(pattern).search
            lines = [line for line in lines if not matches_custom(line)]

        matches_footer = self._SLIDE_FOOTER_RE.search
        return [line for line in lines if not matches_footer(line)]

    def _pattern_based_removal(self, lines, pattern=None):
        """Return ``lines`` without header/footer lines, judged by pattern only.

        Uses the caller's ``pattern`` when given, otherwise the default
        whole-line patterns. Matching is case-insensitive in both cases.
        """
        if pattern:
            matcher = re.compile(pattern, re.IGNORECASE)
        else:
            matcher = self._DEFAULT_LINE_RE

        return [line for line in lines if not matcher.search(line)]

    def remove_common_pdf_artifacts(self, text):
        """Remove form-field markers, ``<<...>>`` blocks and ``obj n n R`` markers.

        Returns:
            The cleaned text. If a substitution fails, the error is logged
            and the text as it stood before the failing step is returned.
        """
        try:
            for artifact_re in self._PDF_ARTIFACT_RES:
                text = artifact_re.sub('', text)
            return text

        except Exception as e:
            self.logger.error("Error removing PDF artifacts: %s", e)
            return text

    def preprocess(self, text, remove_headers_footers=True, aggressive_removal=False):
        """Run the full cleaning pipeline on ``text``.

        Args:
            text: Raw text to process.
            remove_headers_footers: Whether to run
                ``remove_headers_and_footers`` first.
            aggressive_removal: Passed to ``remove_headers_and_footers`` as
                ``aggressive``.

        Returns:
            The remaining lemmatized words joined by single spaces.

        Raises:
            Exception: Any error from the pipeline steps is logged and
                re-raised.
        """
        try:
            if remove_headers_footers:
                text = self.remove_headers_and_footers(text, aggressive=aggressive_removal)

            text = self.remove_common_pdf_artifacts(text)

            text = self.standardize_case(text)
            text = self.remove_punctuation(text)
            text = self.normalize_whitespace(text)

            words = self.remove_stopwords(text.split())
            words = self.lemmatize_words(words)

            return ' '.join(words)
        except Exception as e:
            self.logger.error("Error preprocessing text: %s", e)
            raise