| # ============================================================ | |
| # FILE: src/text_cleaner.py | |
| # ============================================================ | |
| # PURPOSE: | |
| # Clean extracted text before chunking. | |
| # | |
| # Cleaning should improve text quality without destroying meaning. | |
| # | |
| # Do NOT blindly remove: | |
| # - punctuation | |
| # - numbers | |
| # - headings | |
| # - table labels | |
| # - legal references | |
| # | |
| # These can be important for retrieval. | |
| # ============================================================ | |
| import re | |
def clean_text(text: str) -> str:
    """Clean extracted text while preserving its meaning.

    Only whitespace is normalized. Punctuation, numbers, headings, labels
    and every other non-whitespace character are left untouched.

    Steps:
        1. Normalize line endings (CRLF and bare CR become LF).
        2. Replace each tab with a single space.
        3. Collapse runs of spaces into one space.
        4. Drop trailing spaces at the end of every line, so that lines
           containing only whitespace count as blank lines.
        5. Reduce three or more consecutive newlines to exactly two
           (at most one blank line between paragraphs).
        6. Strip leading and trailing whitespace from the whole text.

    Args:
        text: Raw text to clean.

    Returns:
        The cleaned text. An empty or whitespace-only input yields "".
    """
    text = text.replace("\r\n", "\n").replace("\r", "\n")
    text = text.replace("\t", " ")
    text = re.sub(r" +", " ", text)
    # Without this step a "blank" line that holds only spaces or tabs hides
    # from the newline-run pattern below and excessive blank lines survive.
    text = re.sub(r" +\n", "\n", text)
    text = re.sub(r"\n{3,}", "\n\n", text)
    return text.strip()