Spaces:

LucaCappelletti94
/

talking-snake

Sleeping

File size: 31,268 Bytes

"""PDF text extraction and cleaning for TTS processing."""

from __future__ import annotations

import io
import re
from collections import Counter
from dataclasses import dataclass

from pdfminer.high_level import extract_pages
from pdfminer.layout import (
    LAParams,
    LTAnno,
    LTChar,
    LTPage,
    LTTextBoxHorizontal,
    LTTextLineHorizontal,
)


@dataclass
class TextBlock:
    """A block of text with positional metadata."""

    text: str
    y_ratio: float  # 0.0 = bottom, 1.0 = top
    font_size: float
    page_num: int
    x0: float = 0.0  # Left edge position for table detection
    x1: float = 0.0  # Right edge position for table detection


def _is_caption(text: str) -> bool:
    """Check if text is a figure/table caption.

    Captions typically start with:
    - "Figure 1:", "Fig. 2:", "Figure 1."
    - "Table 1:", "Table 2."
    - "Exhibit A:", "Chart 1:"
    - "Source:", "Note:", "Notes:"

    Args:
        text: Text to check.

    Returns:
        True if text appears to be a caption.
    """
    text = text.strip()
    if not text:
        return False

    # Common caption patterns (case-insensitive start)
    caption_patterns = [
        r"^fig(?:ure)?\.?\s*\d",
        r"^table\.?\s*\d",
        r"^exhibit\.?\s*[a-z0-9]",
        r"^chart\.?\s*\d",
        r"^graph\.?\s*\d",
        r"^diagram\.?\s*\d",
        r"^plate\.?\s*\d",
        r"^scheme\.?\s*\d",
        r"^box\.?\s*\d",
        r"^panel\.?\s*[a-z0-9]",
        r"^appendix\.?\s*[a-z0-9]",
        r"^source\s*:",
        r"^sources\s*:",
        r"^note\s*:",
        r"^notes\s*:",
        r"^data\s*:",
        r"^\*\s*p\s*[<>=]",  # Statistical notes like "* p < 0.05"
        r"^legend\s*:",
    ]

    text_lower = text.lower()
    for pattern in caption_patterns:
        if re.match(pattern, text_lower):
            return True

    return False


def _is_table_like_text(text: str) -> bool:
    """Check if text looks like table content.

    Tables often have:
    - Very short text fragments
    - Mostly numbers or single words
    - Lots of whitespace-separated values
    - Column headers or row labels
    - Short phrases without sentence structure

    Args:
        text: Text to check.

    Returns:
        True if the text appears to be table content.
    """
    text = text.strip()

    # Very short fragments are likely table cells
    if len(text) < 5:
        return True

    # Count numbers vs letters
    digits = sum(1 for c in text if c.isdigit())
    letters = sum(1 for c in text if c.isalpha())

    # Mostly numbers with few letters (like "123.45" or "2024")
    if digits > 0 and letters < 3 and digits >= letters:
        return True

    # Check for patterns common in tables
    # Multiple tab-separated or heavily spaced values
    if "\t" in text or "  " in text:
        parts = re.split(r"\s{2,}|\t", text)
        if len(parts) >= 3:
            # Multiple short parts suggests table row
            short_parts = sum(1 for p in parts if len(p.strip()) < 15)
            if short_parts >= len(parts) * 0.6:
                return True

    # Single words that look like column headers
    words = text.split()
    if len(words) == 1 and len(text) < 20:
        # Common table headers/labels
        table_keywords = {
            "total",
            "sum",
            "avg",
            "average",
            "mean",
            "count",
            "min",
            "max",
            "date",
            "time",
            "year",
            "month",
            "day",
            "name",
            "id",
            "no",
            "no.",
            "value",
            "amount",
            "price",
            "cost",
            "qty",
            "quantity",
            "unit",
            "row",
            "column",
            "col",
            "item",
            "description",
            "desc",
            "note",
            "status",
            "type",
            "category",
            "code",
            "ref",
            "reference",
        }
        if text.lower() in table_keywords:
            return True

    # Short phrases without sentence structure (likely table cells)
    # Table cells typically:
    # - Are short (< 50 chars)
    # - Don't end with sentence-ending punctuation
    # - Don't start with lowercase (unless very short)
    # - Have few words (< 8)
    if len(text) < 50 and len(words) < 8:
        # Doesn't end like a sentence
        if not text.rstrip().endswith((".", "!", "?", ":")):
            # Common table cell patterns
            text_lower = text.lower()

            # Technical/status phrases common in tables
            table_phrases = [
                "supported",
                "not supported",
                "yes",
                "no",
                "n/a",
                "none",
                "required",
                "optional",
                "enabled",
                "disabled",
                "active",
                "inactive",
                "read-only",
                "read only",
                "write",
                "read/write",
                "read-write",
                "must be",
                "can be",
                "should be",
                "will be",
                "available",
                "unavailable",
                "pending",
                "completed",
                "failed",
                "true",
                "false",
                "default",
                "custom",
                "manual",
                "automatic",
                "identical",
                "different",
                "same",
                "other",
            ]
            for phrase in table_phrases:
                if phrase in text_lower:
                    return True

            # Looks like a label or header (Title Case or ALL CAPS, short)
            if len(words) <= 4 and len(text) < 40:
                # Check if it's Title Case or contains common label patterns
                if text.istitle() or text.isupper():
                    return True
                # Two-three word phrases that look like labels
                if len(words) in (2, 3) and all(w[0].isupper() for w in words if w):
                    return True

    return False


def _filter_table_blocks(blocks: list[TextBlock]) -> list[TextBlock]:
    """Filter out blocks that appear to be part of tables.

    Detects tables by looking for:
    - Multiple blocks at similar Y positions (table rows)
    - Blocks with table-like content

    Args:
        blocks: List of text blocks.

    Returns:
        Filtered list with table content removed.
    """
    if not blocks:
        return blocks

    # Group blocks by page and approximate Y position (row detection)
    # Blocks within 1% of page height are considered same row
    filtered = []

    for page_num in set(b.page_num for b in blocks):
        page_blocks = [b for b in blocks if b.page_num == page_num]

        # Group by Y position (rounded to detect rows)
        y_groups: dict[float, list[TextBlock]] = {}
        for block in page_blocks:
            y_key = round(block.y_ratio, 2)  # Group within ~1% of page
            if y_key not in y_groups:
                y_groups[y_key] = []
            y_groups[y_key].append(block)

        for y_key, row_blocks in y_groups.items():
            # If many blocks at same Y position, likely a table row
            if len(row_blocks) >= 3:
                # Check if most blocks look like table cells
                table_like = sum(1 for b in row_blocks if _is_table_like_text(b.text))
                if table_like >= len(row_blocks) * 0.5:
                    # Skip this entire row - it's a table
                    continue

            # Filter individual blocks that look like table content
            for block in row_blocks:
                if not _is_table_like_text(block.text):
                    filtered.append(block)

    # Sort by page and position (top to bottom)
    filtered.sort(key=lambda b: (b.page_num, -b.y_ratio))
    return filtered


def extract_text_blocks(pdf_bytes: bytes) -> list[TextBlock]:
    """Extract text blocks from PDF with positional information.

    Args:
        pdf_bytes: Raw PDF file content.

    Returns:
        List of TextBlock objects with text and metadata.
    """
    blocks: list[TextBlock] = []
    pdf_file = io.BytesIO(pdf_bytes)

    laparams = LAParams(
        line_margin=0.5,
        word_margin=0.1,
        char_margin=2.0,
        boxes_flow=0.5,
    )

    for page_num, page_layout in enumerate(extract_pages(pdf_file, laparams=laparams), start=1):
        if not isinstance(page_layout, LTPage):
            continue

        page_height = page_layout.height

        for element in page_layout:
            if not isinstance(element, LTTextBoxHorizontal):
                continue

            # Extract characters with their font sizes
            # LTChar has font size, LTAnno is whitespace (use size=-1 to always keep)
            chars_with_sizes: list[tuple[str, float]] = []
            for line in element:
                if isinstance(line, LTTextLineHorizontal):
                    for char in line:
                        if isinstance(char, LTChar):
                            chars_with_sizes.append((char.get_text(), char.size))
                        elif isinstance(char, LTAnno):
                            # Whitespace/newlines - always keep (use -1 as marker)
                            chars_with_sizes.append((char.get_text(), -1))

            if not chars_with_sizes:
                text = element.get_text().strip()
                if text:
                    blocks.append(
                        TextBlock(
                            text=text,
                            y_ratio=element.y0 / page_height if page_height > 0 else 0.5,
                            font_size=10.0,
                            page_num=page_num,
                        )
                    )
                continue

            # Find dominant font size (most common, excluding whitespace markers)
            font_sizes = [size for _, size in chars_with_sizes if size > 0]
            if not font_sizes:
                continue
            size_counts = Counter(round(s, 1) for s in font_sizes)
            dominant_size = max(size_counts, key=lambda x: size_counts[x])

            # Filter out superscript/subscript characters (< 70% of dominant size)
            # Keep whitespace (size=-1) and normal-sized characters
            min_size = dominant_size * 0.7
            filtered_text = "".join(
                char for char, size in chars_with_sizes if size < 0 or size >= min_size
            )

            text = filtered_text.strip()
            if not text:
                continue

            # Calculate Y position as ratio (0=bottom, 1=top)
            y_ratio = element.y0 / page_height if page_height > 0 else 0.5

            avg_font_size = sum(font_sizes) / len(font_sizes) if font_sizes else 10.0

            blocks.append(
                TextBlock(
                    text=text,
                    y_ratio=y_ratio,
                    font_size=avg_font_size,
                    page_num=page_num,
                )
            )

    return blocks


def get_page_count(pdf_bytes: bytes) -> int:
    """Get the number of pages in a PDF.

    Args:
        pdf_bytes: Raw PDF file content.

    Returns:
        Number of pages in the PDF.
    """
    pdf_file = io.BytesIO(pdf_bytes)
    laparams = LAParams()
    page_count = sum(1 for _ in extract_pages(pdf_file, laparams=laparams))
    return page_count


def extract_text(pdf_bytes: bytes) -> str:
    """Extract and clean text from a PDF file.

    Args:
        pdf_bytes: Raw PDF file content.

    Returns:
        Cleaned text suitable for TTS.
    """
    blocks = extract_text_blocks(pdf_bytes)
    if not blocks:
        return ""

    # Filter out table content first
    blocks = _filter_table_blocks(blocks)

    cleaned_blocks = clean_text_blocks(blocks)
    text = "\n\n".join(block.text for block in cleaned_blocks)

    # Apply TTS-specific normalization
    return normalize_for_tts(text)


def clean_text_blocks(blocks: list[TextBlock]) -> list[TextBlock]:
    """Remove headers, footers, page numbers, and other artifacts.

    Applies multiple heuristics:
    1. Remove blocks in top/bottom margins (likely headers/footers)
    2. Remove repeated text across pages (likely running headers)
    3. Remove standalone page numbers
    4. Remove very short lines that look like artifacts

    Args:
        blocks: List of TextBlock objects.

    Returns:
        Filtered list of TextBlock objects.
    """
    if not blocks:
        return []

    # Find repeated text patterns (headers/footers)
    text_counts = Counter(block.text for block in blocks)
    total_pages = max(block.page_num for block in blocks)
    repeated_threshold = max(2, total_pages // 2)
    repeated_texts = {text for text, count in text_counts.items() if count >= repeated_threshold}

    # Calculate median font size for filtering
    font_sizes = sorted(block.font_size for block in blocks)
    median_font_size = font_sizes[len(font_sizes) // 2] if font_sizes else 10.0

    cleaned: list[TextBlock] = []

    for block in blocks:
        # Skip if in header zone (top 10%)
        if block.y_ratio > 0.90:
            continue

        # Skip if in footer zone (bottom 10%)
        if block.y_ratio < 0.10:
            continue

        # Skip repeated text (running headers/footers)
        if block.text in repeated_texts:
            continue

        # Skip standalone page numbers
        if is_page_number(block.text):
            continue

        # Skip figure/table captions
        if _is_caption(block.text):
            continue

        # Skip very short lines with small font (likely captions/footnotes)
        if len(block.text) < 20 and block.font_size < median_font_size * 0.8:
            continue

        cleaned.append(block)

    return cleaned


def is_page_number(text: str) -> bool:
    """Check if text is likely a page number.

    Args:
        text: Text to check.

    Returns:
        True if text appears to be a page number.
    """
    text = text.strip()

    # Pure number
    if text.isdigit():
        return True

    # Roman numerals
    if re.match(r"^[ivxlcdmIVXLCDM]+$", text):
        return True

    # "Page N" or "N of M" patterns
    if re.match(r"^(page\s*)?\d+(\s*(of|/)\s*\d+)?$", text, re.IGNORECASE):
        return True

    # "- N -" pattern
    if re.match(r"^[-–—]\s*\d+\s*[-–—]$", text):
        return True

    return False


def clean_text(text: str) -> str:
    """Clean raw text for TTS processing.

    This is a simpler function for cleaning already-extracted text,
    without the positional information.

    Args:
        text: Raw text to clean.

    Returns:
        Cleaned text suitable for TTS.
    """
    lines = text.split("\n")
    cleaned_lines: list[str] = []

    for line in lines:
        line = line.strip()

        # Skip empty lines
        if not line:
            continue

        # Skip standalone page numbers
        if is_page_number(line):
            continue

        # Skip very short lines (likely artifacts)
        if len(line) < 3:
            continue

        cleaned_lines.append(line)

    # Rejoin with proper spacing
    result = "\n".join(cleaned_lines)

    # === FIX HYPHENATED/SPLIT WORDS ===
    # These are words broken across lines, common in PDFs and web content

    # Pattern 1: word-\nword (hyphen at end of line) -> rejoin word
    result = re.sub(r"(\w)-\n\s*(\w)", r"\1\2", result)

    # Pattern 2: word-\n  word (hyphen + newline + spaces)
    result = re.sub(r"(\w)-\s*\n\s*(\w)", r"\1\2", result)

    # Pattern 3: word- word (hyphen + space, often from copy-paste)
    result = re.sub(r"(\w)- (\w)", r"\1\2", result)

    # Pattern 4: Lines ending with hyphen followed by lowercase (likely continuation)
    result = re.sub(r"-\n([a-z])", r"\1", result)

    # === FIX LINE BREAK ARTIFACTS ===
    # Join lines that don't end with sentence-ending punctuation
    # This handles text that was wrapped at fixed width

    # Replace single newlines (not paragraph breaks) with spaces
    # Keep double newlines as paragraph separators
    result = re.sub(r"(?<![.!?:;\n])\n(?!\n)", " ", result)

    # Normalize whitespace
    result = re.sub(r"\n{3,}", "\n\n", result)
    result = re.sub(r"[ \t]+", " ", result)

    # Apply TTS-specific normalization
    result = normalize_for_tts(result)

    return result.strip()


def normalize_for_tts(text: str) -> str:
    """Normalize text for natural TTS pronunciation.

    Handles special characters, punctuation, and formatting that can
    cause TTS models to slow down or mispronounce.

    Args:
        text: Text to normalize.

    Returns:
        Normalized text optimized for TTS.
    """
    # === REMOVE ACADEMIC/PAPER ARTIFACTS ===
    # Remove inline citations like (Smith et al., 2020) or (Smith, 2020; Jones, 2019)
    # Also handles (Chen, 2018; Lee et al., 2020)
    text = re.sub(r"\([^()]*\b\d{4}[a-z]?\b[^()]*\)", "", text)

    # Remove author-year citations like "Smith (2020)" or "Smith et al. (2020)"
    text = re.sub(
        r"\b[A-Z][a-z]+(?:\s+(?:et\s+al\.?|and|&)\s+[A-Z][a-z]+)?\s*\(\d{4}[a-z]?\)", "", text
    )

    # Clean up "by [Author]" patterns - remove the author part, keep "by" for grammar
    # "by Smith" -> "" (will be cleaned up), "study by Smith found" -> "study found"
    text = re.sub(
        r"\bby\s+[A-Z][a-z]+(?:\s+(?:et\s+al\.?|and|&)\s+[A-Z][a-z]+)?\s*,?\s*(?=found|showed|demonstrated|reported|observed|noted|suggested|concluded|argued|claimed|stated|proposed|discovered|revealed|indicated|confirmed)",
        "",
        text,
    )

    # Remove orphaned "et al." and similar
    text = re.sub(r"\s+et\s+al\.?,?\s*", " ", text)

    # Remove figure/table references like "see Figure 1" or "(see Table 2)"
    text = re.sub(
        r"\(?see\s+(?:Figure|Fig\.?|Table|Exhibit|Chart|Graph|Appendix)\s*\d+[a-z]?\)?",
        "",
        text,
        flags=re.IGNORECASE,
    )

    # Remove standalone figure/table references like "Figure 1 shows" -> "shows"
    text = re.sub(
        r"(?:Figure|Fig\.?|Table|Exhibit|Chart|Graph)\s*\d+[a-z]?\s*(?:shows?|depicts?|illustrates?|presents?|displays?|summarizes?)",
        "",
        text,
        flags=re.IGNORECASE,
    )

    # Remove section references like "Section 2.1" or "Chapter 3" (with surrounding context)
    text = re.sub(
        r"(?:in|see|as\s+(?:shown|described|discussed)\s+in|according\s+to)\s+(?:Section|Chapter|Part)\s*\d+(?:\.\d+)*,?\s*",
        "",
        text,
        flags=re.IGNORECASE,
    )
    text = re.sub(r"(?:Section|Chapter|Part)\s*\d+(?:\.\d+)*", "", text, flags=re.IGNORECASE)

    # Remove equation references like "Equation 1" or "Eq. (2)"
    text = re.sub(r"(?:Equation|Eq\.?)\s*\(?\d+\)?", "", text, flags=re.IGNORECASE)

    # Remove DOIs
    text = re.sub(r"(?:doi:|DOI:?)\s*10\.\d{4,}/[^\s]+", "", text, flags=re.IGNORECASE)

    # Remove arXiv references
    text = re.sub(r"arXiv:\d{4}\.\d{4,}(?:v\d+)?", "", text, flags=re.IGNORECASE)

    # Remove ISSN/ISBN numbers
    text = re.sub(r"(?:ISSN|ISBN)[:\s]*[\d-]+", "", text, flags=re.IGNORECASE)

    # Remove page ranges like "pp. 123-456" or "p. 42" or "pages 10-20"
    text = re.sub(r"(?:p{1,2}\.?|pages?)\s*\d+(?:\s*[-–—]\s*\d+)?", "", text, flags=re.IGNORECASE)

    # Remove volume/issue numbers like "Vol. 12, No. 3" (entire phrase)
    text = re.sub(
        r"(?:Vol(?:ume)?\.?\s*\d+,?\s*)?(?:Issue|No\.?)\s*\d+,?\s*", "", text, flags=re.IGNORECASE
    )
    text = re.sub(r"Vol(?:ume)?\.?\s*\d+,?\s*", "", text, flags=re.IGNORECASE)

    # Remove copyright notices
    text = re.sub(r"©\s*\d{4}[^.]*\.", "", text)
    text = re.sub(r"Copyright\s*©?\s*\d{4}[^.]*\.", "", text, flags=re.IGNORECASE)

    # Remove "All rights reserved" and similar
    text = re.sub(r"All rights reserved\.?", "", text, flags=re.IGNORECASE)

    # Remove asterisks used for footnote markers
    text = re.sub(r"\*{1,3}(?=\s|$)", "", text)

    # === NORMALIZE NEWLINES FIRST ===
    # Convert various newline formats to standard \n
    text = text.replace("\r\n", "\n").replace("\r", "\n")

    # Replace single newlines (mid-sentence line breaks) with spaces
    # Keep double newlines as paragraph separators
    # First, normalize multiple newlines to exactly two
    text = re.sub(r"\n{3,}", "\n\n", text)

    # Replace single newlines that aren't paragraph breaks with spaces
    # A single newline not preceded by sentence-ending punctuation is likely a line wrap
    text = re.sub(r"(?<![.!?:\n])\n(?!\n)", " ", text)

    # === CODE AND TECHNICAL CONTENT ===
    # Handle common programming patterns that read poorly

    # === REMOVE URLS AND TECHNICAL STRINGS FIRST ===
    # URLs (various formats) - remove completely
    text = re.sub(r"https?://[^\s<>\"')\]]+", "", text)
    text = re.sub(r"www\.[^\s<>\"')\]]+", "", text)
    text = re.sub(r"ftp://[^\s<>\"')\]]+", "", text)

    # UUIDs (with or without dashes) - must come before git hash pattern
    uuid_pattern = (
        r"\b[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-" r"[0-9a-fA-F]{4}-[0-9a-fA-F]{12}\b"
    )
    text = re.sub(uuid_pattern, "", text)

    # Git commit hashes (7-40 hex chars standalone)
    text = re.sub(r"(?<![a-zA-Z0-9])[0-9a-f]{7,40}(?![a-zA-Z0-9])", "", text, flags=re.IGNORECASE)

    # Hex color codes (#fff, #ffffff)
    text = re.sub(r"#[0-9a-fA-F]{3,8}\b", "", text)

    # Long hex/base64 strings (likely encoded data)
    text = re.sub(r"\b[A-Za-z0-9+/]{20,}={0,2}\b", "", text)

    # File paths (Unix and Windows style)
    text = re.sub(r"[/\\][\w./\\-]+\.\w+", "", text)

    # IP addresses
    text = re.sub(r"\b\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}\b", "", text)

    # Port numbers after colon
    text = re.sub(r":\d{2,5}\b", "", text)

    # Remove email addresses
    text = re.sub(r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}", "", text)

    # SHA/MD5 style hashes with prefix
    text = re.sub(r"\b(sha\d*|md5|hash)[:\s]*[0-9a-f]+\b", "", text, flags=re.IGNORECASE)

    # CamelCase: split into words (e.g., "getUserName" -> "get User Name")
    text = re.sub(r"([a-z])([A-Z])", r"\1 \2", text)

    # snake_case: replace underscores with spaces
    text = re.sub(r"(\w)_(\w)", r"\1 \2", text)

    # Function calls: "func()" -> "func"
    text = re.sub(r"(\w+)\(\)", r"\1", text)

    # Arrow functions/operators: -> and =>
    text = text.replace("->", " returns ")
    text = text.replace("=>", " arrow ")

    # Common code operators spoken naturally
    text = text.replace("!=", " not equals ")
    text = text.replace("==", " equals ")
    text = text.replace("===", " strictly equals ")
    text = text.replace("!==", " strictly not equals ")
    text = text.replace("&&", " and ")
    text = text.replace("||", " or ")
    text = text.replace("++", " increment ")
    text = text.replace("--", " decrement ")

    # File extensions: ".py" -> " dot py" (only for common extensions)
    ext_pattern = r"\.(py|js|ts|html|css|json|xml|md|txt|csv|pdf)\b"
    text = re.sub(ext_pattern, r" dot \1", text, flags=re.IGNORECASE)

    # Remove standalone hashes/pound signs (not hashtags)
    text = re.sub(r"(?<!\w)#(?!\w)", "", text)

    # Backticks (often used in markdown for code)
    text = text.replace("`", "")

    # Triple quotes
    text = text.replace('"""', "")
    text = text.replace("'''", "")

    # === UNICODE NORMALIZATION ===

    # Remove superscript characters (often footnote references)
    # Includes Unicode superscript digits, letters, and modifier letters
    superscripts = (
        "⁰¹²³⁴⁵⁶⁷⁸⁹"  # Superscript digits
        "⁺⁻⁼⁽⁾"  # Superscript operators
        "ⁿⁱ"  # Common superscript letters
        "ᵃᵇᶜᵈᵉᶠᵍʰⁱʲᵏˡᵐⁿᵒᵖʳˢᵗᵘᵛʷˣʸᶻ"  # Superscript lowercase
        "ᴬᴮᴰᴱᴳᴴᴵᴶᴷᴸᴹᴺᴼᴾᴿᵀᵁⱽᵂᴬᴭᴮᴯᴰᴱᴲᴳᴴᴵᴶᴷᴸᴹᴺᴻᴼᴽᴾᴿᵀᵁᵂ"  # Superscript uppercase
        "ᶦᶧᶨᶩᶪᶫᶬᶭᶮᶯᶰᶱᶲᶳᶴᶵᶶᶷᶸᶹᶺᶻᶼᶽᶾᶿ"  # More modifier letters
        "ʰʱʲʳʴʵʶʷʸʹʺʻʼʽˀˁˆˇˈˉˊˋˌˍˎˏːˑ"  # Modifier letters
    )
    for char in superscripts:
        text = text.replace(char, "")

    # Also use regex to catch any remaining superscript-like characters
    # Unicode categories for superscripts and modifiers
    text = re.sub(r"[\u2070-\u209F]", "", text)  # Superscripts and Subscripts block
    text = re.sub(r"[\u1D2C-\u1D6A]", "", text)  # Phonetic Extensions (modifier letters)
    text = re.sub(r"[\u1D78-\u1D7F]", "", text)  # More phonetic extensions
    text = re.sub(r"[\u02B0-\u02FF]", "", text)  # Spacing Modifier Letters

    # Remove subscript characters
    subscripts = "₀₁₂₃₄₅₆₇₈₉₊₋₌₍₎ₐₑₒₓₔₕₖₗₘₙₚₛₜ"
    for char in subscripts:
        text = text.replace(char, "")

    # Convert smart quotes to simple quotes
    text = text.replace("\u201c", '"').replace("\u201d", '"')
    text = text.replace("\u2018", "'").replace("\u2019", "'")
    text = text.replace("\u201e", '"').replace("\u201f", '"')

    # Normalize dashes to standard hyphen or remove
    text = text.replace("–", "-")  # en-dash
    text = text.replace("—", " - ")  # em-dash (add spaces for pause)
    text = text.replace("―", " - ")  # horizontal bar
    text = text.replace("‐", "-")  # Unicode hyphen
    text = text.replace("‑", "-")  # non-breaking hyphen
    text = text.replace("⁃", "-")  # hyphen bullet
    text = text.replace("−", "-")  # minus sign

    # Normalize ellipsis
    text = text.replace("…", "...")
    text = re.sub(r"\.{4,}", "...", text)  # Limit to 3 dots

    # Normalize other Unicode punctuation
    text = text.replace("•", ",")  # Bullet points
    text = text.replace("·", " ")  # Middle dot
    text = text.replace("‧", " ")  # Hyphenation point
    text = text.replace("※", " ")  # Reference mark
    text = text.replace("†", "")  # Dagger (footnote)
    text = text.replace("‡", "")  # Double dagger
    text = text.replace("§", "section ")
    text = text.replace("¶", "")  # Pilcrow
    text = text.replace("©", "copyright ")
    text = text.replace("®", " registered ")
    text = text.replace("™", " trademark ")
    text = text.replace("°", " degrees ")

    # === SPACING AROUND PUNCTUATION ===
    # Ensure proper spacing around dashes used as separators
    text = re.sub(r"\s*-\s*-\s*", " - ", text)  # Double dash
    text = re.sub(r"(\w)\s*-\s*(\w)", r"\1 - \2", text)  # Word-dash-word with spaces

    # Fix missing space after punctuation
    text = re.sub(r"([.!?])([A-Z])", r"\1 \2", text)
    text = re.sub(r",([A-Za-z])", r", \1", text)

    # Fix multiple punctuation marks
    text = re.sub(r"[,]{2,}", ",", text)
    text = re.sub(r"[;]{2,}", ";", text)
    text = re.sub(r"[:]{2,}", ":", text)
    text = re.sub(r"[!]{2,}", "!", text)
    text = re.sub(r"[?]{2,}", "?", text)

    # === NUMBERS AND SPECIAL NOTATIONS ===
    # Convert common fractions
    text = text.replace("½", " one half ")
    text = text.replace("⅓", " one third ")
    text = text.replace("⅔", " two thirds ")
    text = text.replace("¼", " one quarter ")
    text = text.replace("¾", " three quarters ")
    text = text.replace("⅕", " one fifth ")
    text = text.replace("⅖", " two fifths ")
    text = text.replace("⅗", " three fifths ")
    text = text.replace("⅘", " four fifths ")
    text = text.replace("⅙", " one sixth ")
    text = text.replace("⅚", " five sixths ")
    text = text.replace("⅛", " one eighth ")
    text = text.replace("⅜", " three eighths ")
    text = text.replace("⅝", " five eighths ")
    text = text.replace("⅞", " seven eighths ")

    # Handle percentage and math symbols
    text = text.replace("%", " percent")
    text = text.replace("&", " and ")
    text = text.replace("+", " plus ")
    text = text.replace("=", " equals ")
    text = text.replace("<", " less than ")
    text = text.replace(">", " greater than ")
    text = text.replace("≤", " less than or equal to ")
    text = text.replace("≥", " greater than or equal to ")
    text = text.replace("≠", " not equal to ")
    text = text.replace("±", " plus or minus ")
    text = text.replace("×", " times ")
    text = text.replace("÷", " divided by ")

    # === ABBREVIATIONS AND SPECIAL CASES ===
    # Common abbreviations that might cause issues
    text = re.sub(r"\be\.g\.", "for example", text, flags=re.IGNORECASE)
    text = re.sub(r"\bi\.e\.", "that is", text, flags=re.IGNORECASE)
    text = re.sub(r"\betc\.", "etcetera", text, flags=re.IGNORECASE)
    text = re.sub(r"\bvs\.", "versus", text, flags=re.IGNORECASE)
    text = re.sub(r"\bDr\.", "Doctor", text)
    text = re.sub(r"\bMr\.", "Mister", text)
    text = re.sub(r"\bMrs\.", "Missus", text)
    text = re.sub(r"\bMs\.", "Miss", text)
    text = re.sub(r"\bProf\.", "Professor", text)
    text = re.sub(r"\bSt\.", "Saint", text)
    text = re.sub(r"\bNo\.\s*(\d)", r"Number \1", text)
    text = re.sub(r"\bFig\.", "Figure", text, flags=re.IGNORECASE)
    text = re.sub(r"\bVol\.", "Volume", text, flags=re.IGNORECASE)
    text = re.sub(r"\bpp\.", "pages", text, flags=re.IGNORECASE)
    text = re.sub(r"\bp\.\s*(\d)", r"page \1", text, flags=re.IGNORECASE)

    # === BRACKETS AND PARENTHESES ===
    # Remove or simplify brackets that might cause pauses
    text = re.sub(r"\[([^\]]+)\]", r"(\1)", text)  # Square to round
    text = re.sub(r"\{([^}]+)\}", r"(\1)", text)  # Curly to round

    # Remove citation numbers like [1], [2,3], [1-5]
    text = re.sub(r"\[\d+(?:[-,]\d+)*\]", "", text)
    text = re.sub(r"\(\d+(?:[-,]\d+)*\)", "", text)

    # === CLEANUP ===
    # Remove standalone special characters
    text = re.sub(r"\s+[#@*^~`|\\]+\s+", " ", text)

    # Remove content in angle brackets (often HTML/XML artifacts)
    text = re.sub(r"<[^>]+>", "", text)

    # Remove spaces before punctuation
    text = re.sub(r"\s+([.,;:!?])", r"\1", text)

    # Ensure space after punctuation (but not before another punctuation)
    text = re.sub(r"([.,;:!?])([^\s.,;:!?'\"])", r"\1 \2", text)

    # === FINAL WHITESPACE NORMALIZATION ===
    # This must happen LAST after all substitutions that can create gaps

    # Collapse all whitespace (spaces, tabs, multiple spaces) to single space
    # Do this per-line to preserve intentional paragraph breaks
    lines = text.split("\n")
    normalized_lines = []
    for line in lines:
        # Replace any sequence of whitespace with single space
        line = re.sub(r"[ \t]+", " ", line)
        # Strip leading/trailing whitespace from each line
        line = line.strip()
        normalized_lines.append(line)

    text = "\n".join(normalized_lines)

    # Remove excessive blank lines (keep max 1 blank line between paragraphs)
    text = re.sub(r"\n{3,}", "\n\n", text)

    # Remove blank lines at start/end
    text = text.strip()

    return text