# setu/module_a/cleaners.py
"""
Text cleaning and normalization module
Removes headers, footers, page numbers, and fixes formatting
"""
import logging
import re
from typing import Any, Dict, List

from .config import CLEANING_PATTERNS
logger = logging.getLogger(__name__)
class TextCleaner:
    """Cleans and normalizes extracted text.

    Applies a fixed pipeline: regex removal of page numbers, headers/footers,
    and table-of-contents lines (patterns come from ``CLEANING_PATTERNS`` in
    the package config), then whitespace normalization, then a final cleanup
    pass (stray numbers, artifact lines, "smart" punctuation to ASCII).
    """

    # Translation table mapping common "smart" punctuation to ASCII.
    # Applied in a single C-level pass via str.translate; equivalent to the
    # chained replaces since no replacement output is itself a mapped key.
    _PUNCTUATION_MAP = str.maketrans({
        '\u2018': "'",   # left single quotation mark
        '\u2019': "'",   # right single quotation mark
        '\u201c': '"',   # left double quotation mark
        '\u201d': '"',   # right double quotation mark
        '\u2013': '-',   # en dash
        '\u2014': '--',  # em dash
    })

    def __init__(self) -> None:
        """Initialize the cleaner with pre-compiled regex patterns."""
        self.patterns = self._compile_patterns()

    def _compile_patterns(self) -> Dict[str, List[re.Pattern]]:
        """Compile every configured pattern once so later calls reuse them.

        Returns:
            Mapping of category name -> list of compiled patterns, all
            compiled with MULTILINE | IGNORECASE.
        """
        return {
            category: [re.compile(p, re.MULTILINE | re.IGNORECASE) for p in patterns]
            for category, patterns in CLEANING_PATTERNS.items()
        }

    def clean_text(self, text: str) -> str:
        """Apply all cleaning operations to text.

        Args:
            text: Raw text to clean.

        Returns:
            Cleaned text, stripped of leading/trailing whitespace;
            empty string for empty/None input.
        """
        if not text:
            return ""
        text = self._remove_page_numbers(text)
        text = self._remove_headers_footers(text)
        text = self._remove_toc_patterns(text)
        text = self._normalize_whitespace(text)
        text = self._additional_cleaning(text)
        return text.strip()

    def _apply_patterns(self, text: str, category: str) -> str:
        """Delete every match of the compiled patterns in *category*.

        Uses .get() so a category missing from the config is a no-op
        rather than a KeyError.
        """
        for pattern in self.patterns.get(category, ()):
            text = pattern.sub('', text)
        return text

    def _remove_page_numbers(self, text: str) -> str:
        """Remove page-number artifacts."""
        return self._apply_patterns(text, 'page_numbers')

    def _remove_headers_footers(self, text: str) -> str:
        """Remove common headers and footers."""
        return self._apply_patterns(text, 'headers_footers')

    def _remove_toc_patterns(self, text: str) -> str:
        """Remove table-of-contents patterns."""
        return self._apply_patterns(text, 'toc_patterns')

    def _normalize_whitespace(self, text: str) -> str:
        """Collapse excess whitespace and re-flow lines broken mid-sentence.

        Steps:
        - Collapse 3+ consecutive (possibly blank-padded) newlines to one
          blank line (paragraph separator).
        - Collapse runs of spaces/tabs to a single space.
        - Re-join words hyphenated across a line break ("exam-\\nple").
        - Heuristic re-flow: a non-final line that does not end in sentence
          punctuation is joined to the following non-empty line with a space.
        """
        text = re.sub(r'\n\s*\n\s*\n+', '\n\n', text)
        text = re.sub(r'[ \t]+', ' ', text)
        text = re.sub(r'-\s*\n\s*', '', text)  # de-hyphenate wrapped words

        lines = text.split('\n')
        last = len(lines) - 1
        pieces: List[str] = []
        for i, line in enumerate(lines):
            line = line.strip()
            if not line:
                pieces.append('\n')  # preserve blank line as paragraph break
                continue
            within_paragraph = i < last and lines[i + 1].strip()
            if within_paragraph and not line.endswith(('.', '!', '?', ':', ';')):
                pieces.append(line + ' ')  # mid-sentence break: merge with next
            else:
                pieces.append(line + '\n')
        return ''.join(pieces)

    def _additional_cleaning(self, text: str) -> str:
        """Final cleanup pass.

        Heuristics:
        - A line containing only digits is assumed to be a leftover page or
          section number and is dropped.
        - Non-blank lines of 3 characters or fewer are treated as extraction
          artifacts and removed (blank lines are kept as paragraph breaks).
        - "Smart" quotes and dashes are mapped to ASCII equivalents.
        """
        text = re.sub(r'\n\s*\d+\s*\n', '\n', text)
        text = '\n'.join(
            line for line in text.split('\n')
            if len(line.strip()) > 3 or not line.strip()
        )
        return text.translate(self._PUNCTUATION_MAP)

    def clean_pages(self, pages_data: List[Dict[str, Any]]) -> str:
        """Clean text from multiple pages and combine.

        Args:
            pages_data: List of dicts, each expected to carry a 'text' key
                (entries with missing or empty text are skipped).

        Returns:
            Cleaned page texts joined by blank lines.
        """
        cleaned_pages = []
        for page_data in pages_data:
            page_text = page_data.get('text', '')
            if page_text:
                cleaned = self.clean_text(page_text)
                if cleaned:
                    cleaned_pages.append(cleaned)
        full_text = '\n\n'.join(cleaned_pages)
        # Lazy %-args: message is only formatted if INFO logging is enabled.
        logger.info("Cleaned %d pages into %d characters",
                    len(pages_data), len(full_text))
        return full_text