File size: 5,337 Bytes
3998131 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 | """
Text cleaning and normalization module
Removes headers, footers, page numbers, and fixes formatting
"""
import re
import logging
from typing import List, Dict
from .config import CLEANING_PATTERNS
logger = logging.getLogger(__name__)
class TextCleaner:
"""Cleans and normalizes extracted text"""
def __init__(self):
"""Initialize text cleaner with compiled patterns"""
self.patterns = self._compile_patterns()
def _compile_patterns(self) -> Dict[str, List[re.Pattern]]:
"""Compile all regex patterns for efficiency"""
compiled = {}
for category, patterns in CLEANING_PATTERNS.items():
compiled[category] = [re.compile(p, re.MULTILINE | re.IGNORECASE) for p in patterns]
return compiled
def clean_text(self, text: str) -> str:
"""
Apply all cleaning operations to text
Args:
text: Raw text to clean
Returns:
Cleaned text
"""
if not text:
return ""
# Remove page numbers
text = self._remove_page_numbers(text)
# Remove headers and footers
text = self._remove_headers_footers(text)
# Remove table of contents patterns
text = self._remove_toc_patterns(text)
# Fix line breaks and whitespace
text = self._normalize_whitespace(text)
# Additional cleaning
text = self._additional_cleaning(text)
return text.strip()
def _remove_page_numbers(self, text: str) -> str:
"""Remove page numbers"""
for pattern in self.patterns['page_numbers']:
text = pattern.sub('', text)
return text
def _remove_headers_footers(self, text: str) -> str:
"""Remove common headers and footers"""
for pattern in self.patterns['headers_footers']:
text = pattern.sub('', text)
return text
def _remove_toc_patterns(self, text: str) -> str:
"""Remove table of contents patterns"""
for pattern in self.patterns['toc_patterns']:
text = pattern.sub('', text)
return text
def _normalize_whitespace(self, text: str) -> str:
"""Fix excessive whitespace and line breaks"""
# Replace multiple blank lines with double newline
text = re.sub(r'\n\s*\n\s*\n+', '\n\n', text)
# Replace multiple spaces/tabs with single space
text = re.sub(r'[ \t]+', ' ', text)
# Fix broken words (hyphenation at line breaks)
text = re.sub(r'-\s*\n\s*', '', text)
# Normalize line breaks within paragraphs
# Keep double line breaks (paragraph separators)
lines = text.split('\n')
normalized_lines = []
for i, line in enumerate(lines):
line = line.strip()
if line:
# Check if this line and next are both non-empty (within paragraph)
if i < len(lines) - 1 and lines[i + 1].strip():
# Check if line ends with sentence-ending punctuation
if not line.endswith(('.', '!', '?', ':', ';')):
# Join with next line
normalized_lines.append(line + ' ')
else:
normalized_lines.append(line + '\n')
else:
normalized_lines.append(line + '\n')
text = ''.join(normalized_lines)
return text
def _additional_cleaning(self, text: str) -> str:
"""Additional cleaning operations"""
# Remove standalone numbers that might be page/section numbers
text = re.sub(r'\n\s*\d+\s*\n', '\n', text)
# Remove very short lines (likely artifacts)
lines = text.split('\n')
cleaned_lines = [line for line in lines if len(line.strip()) > 3 or line.strip() == '']
text = '\n'.join(cleaned_lines)
# Normalize unicode characters
text = text.replace('\u2019', "'") # Right single quotation mark
text = text.replace('\u2018', "'") # Left single quotation mark
text = text.replace('\u201c', '"') # Left double quotation mark
text = text.replace('\u201d', '"') # Right double quotation mark
text = text.replace('\u2013', '-') # En dash
text = text.replace('\u2014', '--') # Em dash
return text
def clean_pages(self, pages_data: List[Dict[str, any]]) -> str:
"""
Clean text from multiple pages and combine
Args:
pages_data: List of dicts with 'page_number' and 'text'
Returns:
Combined cleaned text
"""
combined_text = []
for page_data in pages_data:
page_text = page_data.get('text', '')
if page_text:
cleaned = self.clean_text(page_text)
if cleaned:
combined_text.append(cleaned)
# Join pages with double newline
full_text = '\n\n'.join(combined_text)
logger.info(f"Cleaned {len(pages_data)} pages into {len(full_text)} characters")
return full_text
|