|
|
""" |
|
|
Text cleaning and normalization module |
|
|
Removes headers, footers, page numbers, and fixes formatting |
|
|
""" |
|
|
|
|
|
import logging
import re
from typing import Any, Dict, List

from .config import CLEANING_PATTERNS
|
|
|
|
|
# Module-level logger named after this module so log output can be filtered per module.
logger = logging.getLogger(__name__)
|
|
|
|
|
|
|
|
class TextCleaner:
    """Cleans and normalizes extracted text.

    Configured regex patterns (page numbers, headers/footers, table-of-contents
    lines) are compiled once per instance and removed from the text; the
    remaining text then has its whitespace, line breaks and typographic
    punctuation normalized.
    """

    # A line ending in one of these is treated as complete and is never
    # joined with the line that follows it.
    _SENTENCE_ENDINGS = ('.', '!', '?', ':', ';')

    # Non-blank lines whose stripped length is at most this are dropped as noise.
    _MAX_NOISE_LENGTH = 3

    # Three or more line breaks (optionally separated by whitespace).
    _BLANK_RUN_RE = re.compile(r'\n\s*\n\s*\n+')

    # Runs of spaces and tabs within a line.
    _HSPACE_RUN_RE = re.compile(r'[ \t]+')

    # A word split by a hyphen at a line end ("exam-\nple"). Requiring a word
    # character on both sides keeps punctuation dashes ("foo -\nbar") and never
    # matches across a blank line.
    _HYPHEN_BREAK_RE = re.compile(r'(?<=\w)-[ \t]*\n[ \t]*(?=\w)')

    # A line holding only digits. The closing newline is a lookahead so it
    # stays available as the opening newline of an immediately following
    # number-only line.
    _BARE_NUMBER_RE = re.compile(r'\n\s*\d+\s*(?=\n)')

    # Typographic quotes and dashes mapped to their ASCII equivalents.
    _PUNCTUATION_MAP = str.maketrans({
        '\u2019': "'",
        '\u2018': "'",
        '\u201c': '"',
        '\u201d': '"',
        '\u2013': '-',
        '\u2014': '--',
    })

    def __init__(self):
        """Initialize text cleaner with compiled patterns."""
        self.patterns = self._compile_patterns()

    def _compile_patterns(self) -> Dict[str, List[re.Pattern]]:
        """Compile every pattern in ``CLEANING_PATTERNS`` once, up front.

        Returns:
            Mapping of category name to its compiled patterns. Every pattern
            is compiled with ``re.MULTILINE | re.IGNORECASE``.
        """
        flags = re.MULTILINE | re.IGNORECASE
        return {
            category: [re.compile(pattern, flags) for pattern in patterns]
            for category, patterns in CLEANING_PATTERNS.items()
        }

    def clean_text(self, text: str) -> str:
        """Apply all cleaning operations to text.

        Args:
            text: Raw text to clean.

        Returns:
            Cleaned text with surrounding whitespace stripped, or ``""`` when
            ``text`` is empty or otherwise falsy.
        """
        if not text:
            return ""

        # Order matters: pattern-based removal runs on the raw line layout,
        # before whitespace normalization joins lines together.
        steps = (
            self._remove_page_numbers,
            self._remove_headers_footers,
            self._remove_toc_patterns,
            self._normalize_whitespace,
            self._additional_cleaning,
        )
        for step in steps:
            text = step(text)

        return text.strip()

    def _apply_patterns(self, category: str, text: str) -> str:
        """Delete every match of each compiled pattern in ``category``."""
        for pattern in self.patterns[category]:
            text = pattern.sub('', text)
        return text

    def _remove_page_numbers(self, text: str) -> str:
        """Remove text matching the 'page_numbers' patterns."""
        return self._apply_patterns('page_numbers', text)

    def _remove_headers_footers(self, text: str) -> str:
        """Remove text matching the 'headers_footers' patterns."""
        return self._apply_patterns('headers_footers', text)

    def _remove_toc_patterns(self, text: str) -> str:
        """Remove text matching the 'toc_patterns' patterns."""
        return self._apply_patterns('toc_patterns', text)

    def _normalize_whitespace(self, text: str) -> str:
        """Fix excessive whitespace and line breaks.

        Collapses runs of blank lines and of spaces/tabs, re-joins words that
        were hyphenated across a line break, then joins each line that does
        not end in sentence punctuation with the non-blank line after it.

        Blank lines are dropped from the result, so paragraphs end up
        separated by a single newline; every emitted line ends with either
        a space (joined) or a newline.

        Args:
            text: Text to normalize.

        Returns:
            Normalized text.
        """
        text = self._BLANK_RUN_RE.sub('\n\n', text)
        text = self._HSPACE_RUN_RE.sub(' ', text)
        text = self._HYPHEN_BREAK_RE.sub('', text)

        stripped = [line.strip() for line in text.split('\n')]
        last_index = len(stripped) - 1
        pieces = []

        for index, line in enumerate(stripped):
            if not line:
                continue
            followed_by_text = index < last_index and bool(stripped[index + 1])
            if followed_by_text and not line.endswith(self._SENTENCE_ENDINGS):
                # Mid-sentence line break: continue on the same output line.
                pieces.append(line + ' ')
            else:
                pieces.append(line + '\n')

        return ''.join(pieces)

    def _additional_cleaning(self, text: str) -> str:
        """Remove leftover noise lines and normalize typographic punctuation.

        Drops lines consisting only of digits, drops non-blank lines of three
        characters or fewer (blank lines are kept), and replaces curly quotes
        and en/em dashes with ASCII equivalents.

        Args:
            text: Text to clean.

        Returns:
            Cleaned text.
        """
        text = self._BARE_NUMBER_RE.sub('', text)

        kept = []
        for line in text.split('\n'):
            content = line.strip()
            if not content or len(content) > self._MAX_NOISE_LENGTH:
                kept.append(line)
        text = '\n'.join(kept)

        return text.translate(self._PUNCTUATION_MAP)

    def clean_pages(self, pages_data: List[Dict[str, Any]]) -> str:
        """Clean text from multiple pages and combine.

        Pages whose 'text' entry is missing or empty, or that are empty after
        cleaning, are skipped.

        Args:
            pages_data: List of dicts with 'page_number' and 'text'.

        Returns:
            Cleaned page texts joined with a blank line between pages.
        """
        cleaned_pages = []

        for page_data in pages_data:
            page_text = page_data.get('text', '')
            if not page_text:
                continue
            cleaned = self.clean_text(page_text)
            if cleaned:
                cleaned_pages.append(cleaned)

        full_text = '\n\n'.join(cleaned_pages)

        logger.info(
            "Cleaned %d pages into %d characters",
            len(pages_data),
            len(full_text),
        )

        return full_text
|
|
|