import re
from typing import List, Optional


class TextProcessor:
    """Handles text preprocessing and cleaning.

    The public entry points are ``clean_text``, ``split_into_sections`` and
    ``count_words``. The underscore-prefixed methods are the individual
    cleaning passes that ``clean_text`` chains together.
    """

    # Patterns are compiled once at class-definition time instead of being
    # re-resolved on every call.
    _WHITESPACE_RUN = re.compile(r'\s+')
    # A run of 0/1 characters with a letter directly on BOTH sides.
    # ``[^\W\d_]`` is the usual spelling of "any Unicode letter".
    _OCR_DIGIT_RUN = re.compile(r'(?<=[^\W\d_])[01]+(?=[^\W\d_])')
    _OCR_DIGIT_TO_LETTER = str.maketrans({'0': 'O', '1': 'l'})
    # Blank line, or a newline followed by a heading-like "Xxx:" prefix.
    # ``\n`` is excluded from the heading body so the lookahead cannot
    # run past the end of the candidate line.
    _SECTION_BREAK = re.compile(r'\n\s*\n|\n(?=[A-Z][^a-z\n]*:)')
    _REPEATED_DOTS = re.compile(r'\.{2,}')
    _MISSING_SPACE = re.compile(r'([.!?])([A-Z])')
    _SPACE_BEFORE_PUNCT = re.compile(r'\s+([.!?,])')

    def __init__(self):
        """Initialize text processor."""
        # Not referenced by the methods shown here; kept because it is a
        # public attribute that external code may read.
        self.sentence_endings = r'[.!?]'
        # Regex used by ``count_words``; may be overridden per instance.
        self.word_pattern = r'\b\w+\b'

    def clean_text(self, text: str) -> str:
        """
        Clean and normalize text.

        Runs three passes in order: collapse all whitespace to single
        spaces, repair common OCR misreads, then normalize punctuation.

        Args:
            text: Input text to clean

        Returns:
            str: Cleaned text; an empty string for empty or ``None`` input
        """
        if not text:
            return ''

        collapsed = ' '.join(text.split())
        repaired = self._fix_ocr_errors(collapsed)
        normalized = self._normalize_punctuation(repaired)
        return normalized.strip()

    def split_into_sections(self, text: str) -> List[str]:
        """
        Split text into logical sections based on content.

        A section boundary is either a blank line (optionally containing
        whitespace) or a newline that is immediately followed by a line
        starting with an uppercase letter, containing no lowercase letters
        up to a colon (e.g. ``SUMMARY:``).

        Args:
            text: Input text to split

        Returns:
            List[str]: Non-empty sections with surrounding whitespace
            removed; an empty list for empty or ``None`` input
        """
        if not text:
            return []

        pieces = (piece.strip() for piece in self._SECTION_BREAK.split(text))
        return [piece for piece in pieces if piece]

    def count_words(self, text: str) -> int:
        """
        Count words in text.

        A word is any match of ``self.word_pattern``.

        Args:
            text: Input text

        Returns:
            int: Word count; 0 for empty or ``None`` input
        """
        if not text:
            return 0
        return len(re.findall(self.word_pattern, text))

    def _fix_ocr_errors(self, text: str) -> str:
        """Fix common OCR errors.

        Applies, in order:
        - ``|`` is replaced by ``I``.
        - ``0`` becomes ``O`` and ``1`` becomes ``l``, but only inside a run
          of 0/1 characters that has a letter on both sides (``w0rld``,
          ``he11o``). Standalone numbers and trailing digits such as
          ``2021`` or ``v1`` are left untouched, so numeric content is not
          corrupted.
        - Runs of whitespace are collapsed to a single space.

        Args:
            text: Text to repair

        Returns:
            str: Repaired text
        """
        text = text.replace('|', 'I')
        text = self._OCR_DIGIT_RUN.sub(
            lambda match: match.group(0).translate(self._OCR_DIGIT_TO_LETTER),
            text,
        )
        return self._WHITESPACE_RUN.sub(' ', text)

    def _normalize_punctuation(self, text: str) -> str:
        """Normalize punctuation marks.

        Applies, in order:
        - Two or more consecutive periods collapse to one.
        - A space is inserted between ``.``/``!``/``?`` and a directly
          following uppercase ASCII letter. Note that this also affects
          dotted abbreviations (``U.S.A`` becomes ``U. S. A``).
        - Whitespace directly before ``.``, ``!``, ``?`` or ``,`` is removed.

        Args:
            text: Text to normalize

        Returns:
            str: Text with normalized punctuation
        """
        text = self._REPEATED_DOTS.sub('.', text)
        text = self._MISSING_SPACE.sub(r'\1 \2', text)
        return self._SPACE_BEFORE_PUNCT.sub(r'\1', text)