# ============================================================
# FILE: src/text_cleaner.py
# ============================================================
# PURPOSE:
# Clean extracted text before chunking.
#
# Cleaning should improve text quality without destroying meaning.
#
# Do NOT blindly remove:
# - punctuation
# - numbers
# - headings
# - table labels
# - legal references
#
# These can be important for retrieval.
# ============================================================

import re


def clean_text(text: str) -> str:
    """
    Clean text while preserving meaning.

    Only whitespace is normalized; punctuation, numbers, headings and
    other content characters are never removed.

    Steps:
    1. Normalize line endings (CRLF and bare CR become LF).
    2. Replace tabs with spaces.
    3. Collapse runs of spaces into a single space.
    4. Remove trailing spaces at the end of each line, so that
       whitespace-only lines become truly empty.
    5. Reduce three or more consecutive newlines to one blank line.
    6. Strip leading and trailing whitespace from the whole text.

    Args:
        text: Raw extracted text.

    Returns:
        The cleaned text. An empty or whitespace-only input yields "".
    """

    # CRLF must be handled before bare CR, otherwise "\r\n" would turn
    # into two newlines.
    text = text.replace("\r\n", "\n").replace("\r", "\n")
    text = text.replace("\t", " ")
    text = re.sub(r" +", " ", text)

    # Spaces are already collapsed, so at most one space can precede a
    # newline. Dropping it turns lines such as " " into empty lines;
    # without this, the blank-line reduction below would not recognize
    # them as blank and would leave "\n \n \n" runs untouched.
    text = text.replace(" \n", "\n")

    text = re.sub(r"\n{3,}", "\n\n", text)

    return text.strip()