"""Text cleaning and normalization for parsed PDF content."""

import logging
import re

logger = logging.getLogger(__name__)


class TextCleaner:
    """Cleans and normalizes raw text extracted from PDFs."""

    def clean(self, raw_text: str) -> str:
        """Clean raw text by removing artifacts and normalizing whitespace.

        Args:
            raw_text: The raw text extracted from a PDF page.

        Returns:
            Cleaned and normalized text string.
        """
        text = raw_text
        # Remove null bytes and control characters (keep newlines and tabs)
        text = re.sub(r"[\x00-\x08\x0b\x0c\x0e-\x1f\x7f]", "", text)
        # Normalize unicode whitespace to regular spaces
        text = re.sub(r"\u00a0", " ", text)
        # Remove soft hyphens
        text = text.replace("\u00ad", "")
        # Collapse multiple spaces into one
        text = re.sub(r"[ \t]+", " ", text)
        # Collapse three or more newlines into two
        text = re.sub(r"\n{3,}", "\n\n", text)
        # Strip leading/trailing whitespace per line
        text = "\n".join(line.strip() for line in text.splitlines())
        # Strip leading/trailing whitespace overall
        text = text.strip()
        return text

    def remove_headers_footers(self, text: str) -> str:
        """Remove repeating headers and footers from text.

        Args:
            text: Text that may contain headers/footers.

        Returns:
            Text with headers and footers removed.
        """
        lines = text.splitlines()
        if len(lines) < 3:
            return text
        # Remove common page-number-only lines (e.g. "  3  ", "- 12 -", "Side 5")
        cleaned_lines: list[str] = []
        for line in lines:
            stripped = line.strip()
            # Skip standalone page numbers
            if re.match(r"^[-–—]?\s*\d{1,4}\s*[-–—]?$", stripped):
                continue
            # Skip lines like "Side 3" or "Page 3" (Danish/English)
            if re.match(r"^(side|page)\s+\d{1,4}$", stripped, re.IGNORECASE):
                continue
            cleaned_lines.append(line)
        return "\n".join(cleaned_lines)