# ============================================================
# FILE: src/text_cleaner.py
# ============================================================
# PURPOSE:
# Clean extracted text before chunking.
#
# Cleaning should improve text quality without destroying meaning.
#
# Do NOT blindly remove:
# - punctuation
# - numbers
# - headings
# - table labels
# - legal references
#
# These can be important for retrieval.
# ============================================================

import re

# Compiled once at import time; clean_text() may be called once per document.
_SPACE_RUN = re.compile(r"[ ]+")
# A line made up solely of spaces (tabs are already spaces by the time this
# pattern is applied).
_SPACE_ONLY_LINE = re.compile(r"^[ ]+$", re.MULTILINE)
_EXCESS_NEWLINES = re.compile(r"\n{3,}")


def clean_text(text: str) -> str:
    """
    Clean text while preserving meaning.

    Only whitespace is altered; punctuation, numbers, headings, table
    labels and legal references pass through untouched.

    Steps:
    1. Normalize line endings ("\\r\\n" and bare "\\r" become "\\n").
    2. Replace tabs with spaces.
    3. Collapse runs of spaces into a single space.
    4. Empty out lines that contain only spaces, so they count as blank.
    5. Reduce runs of three or more newlines to one blank line.
    6. Strip leading and trailing whitespace.

    Args:
        text: The raw extracted text.

    Returns:
        The cleaned text. An empty or whitespace-only input yields "".
    """
    # "\r\n" must be handled before the bare "\r" replacement; otherwise
    # every Windows line ending would turn into two newlines.
    text = text.replace("\r\n", "\n").replace("\r", "\n")
    text = text.replace("\t", " ")
    text = _SPACE_RUN.sub(" ", text)
    # Without this step "a\n \n \nb" has no run of three consecutive
    # newlines, so the visually blank lines would survive the collapse below.
    text = _SPACE_ONLY_LINE.sub("", text)
    text = _EXCESS_NEWLINES.sub("\n\n", text)
    return text.strip()