"""
Adapted and supplemented from original at https://github.com/KittenML/KittenTTS/blob/main/kittentts/preprocess.py
See license at: https://github.com/KittenML/KittenTTS/blob/main/LICENSE (Apache 2.0)
"""

import re
import unicodedata

# ─────────────────────────────────────────────
# Number → Words conversion
# ─────────────────────────────────────────────

# Words for 0–19, indexed by value. Index 0 is the empty string so that a zero
# digit contributes nothing when larger numbers are composed.
_ONES = [
    '',
    'one',
    'two',
    'three',
    'four',
    'five',
    'six',
    'seven',
    'eight',
    'nine',
    'ten',
    'eleven',
    'twelve',
    'thirteen',
    'fourteen',
    'fifteen',
    'sixteen',
    'seventeen',
    'eighteen',
    'nineteen',
]
# Words for the tens digit, indexed by that digit. Slots 0 and 1 are empty
# because values below twenty are looked up in _ONES instead.
_TENS = ['', '', 'twenty', 'thirty', 'forty', 'fifty', 'sixty', 'seventy', 'eighty', 'ninety']
# Names of successive three-digit groups, lowest group first (units have no name).
_SCALE = ['', 'thousand', 'million', 'billion', 'trillion']

# Cardinal word → ordinal word for the forms that are not built by appending "th".
_ORDINAL_EXCEPTIONS = {
    'one': 'first',
    'two': 'second',
    'three': 'third',
    'four': 'fourth',
    'five': 'fifth',
    'six': 'sixth',
    'seven': 'seventh',
    'eight': 'eighth',
    'nine': 'ninth',
    'twelve': 'twelfth',
}

# Currency symbol → singular unit name (callers append "s" for plurals).
_CURRENCY_SYMBOLS = {
    '$': 'dollar',
    '€': 'euro',
    '£': 'pound',
    '¥': 'yen',
    '₹': 'rupee',
    '₩': 'won',
    '₿': 'bitcoin',
}

# Scale suffix → spoken scale word. Single-letter keys are upper-case; the
# spelled-out keys are lower-case and map to themselves.
_CURRENCY_SCALE_MAP = {
    'K': 'thousand',
    'M': 'million',
    'B': 'billion',
    'T': 'trillion',
    'thousand': 'thousand',
    'million': 'million',
    'billion': 'billion',
    'trillion': 'trillion',
}

# Roman numeral symbols with their values, largest first, including the
# subtractive pairs (CM, CD, XC, XL, IX, IV).
_ROMAN = [
    (1000, 'M'),
    (900, 'CM'),
    (500, 'D'),
    (400, 'CD'),
    (100, 'C'),
    (90, 'XC'),
    (50, 'L'),
    (40, 'XL'),
    (10, 'X'),
    (9, 'IX'),
    (5, 'V'),
    (4, 'IV'),
    (1, 'I'),
]
# Upper-case Roman numeral token (thousands, hundreds, tens, units). Every group
# is optional, so the pattern can also match the empty string at a word
# boundary; callers have to skip empty matches.
_RE_ROMAN = re.compile(r'\b(M{0,4})(CM|CD|D?C{0,3})(XC|XL|L?X{0,3})(IX|IV|V?I{0,3})\b')


def _three_digits_to_words(n: int) -> str:
    """Spell out an integer in the range 0–999; zero yields an empty string."""
    if n == 0:
        return ''
    hundreds_digit, below_hundred = divmod(n, 100)
    words = []
    if hundreds_digit:
        words.append(f'{_ONES[hundreds_digit]} hundred')
    if below_hundred >= 20:
        # Twenty and above: tens word, hyphenated with the ones word if any.
        tens_digit, ones_digit = divmod(below_hundred, 10)
        tens_text = _TENS[tens_digit]
        ones_text = _ONES[ones_digit]
        words.append(tens_text if not ones_text else f'{tens_text}-{ones_text}')
    elif below_hundred:
        # One to nineteen have dedicated words.
        words.append(_ONES[below_hundred])
    return ' '.join(words)


def number_to_words(n: int) -> str:
    """
    Convert an integer to its English word representation.

    Non-int arguments are coerced with ``int()``. Values of a quadrillion or
    more are expressed as a multiple of the largest known scale word instead
    of silently losing their leading digits.

    Examples:
        1200      → "twelve hundred"
        1000      → "one thousand"
        1_000_000 → "one million"
        -42       → "negative forty-two"
        0         → "zero"
        10**15    → "one thousand trillion"
    """
    if not isinstance(n, int):
        n = int(n)
    if n == 0:
        return 'zero'
    if n < 0:
        return f'negative {number_to_words(-n)}'

    # Round hundreds below 2000 are read as "<count> hundred"
    # (e.g. 1200 → "twelve hundred"). Exact multiples of 1000 are excluded so
    # that 1000 stays "one thousand" rather than "ten hundred".
    if 100 <= n <= 9999 and n % 100 == 0 and n % 1000 != 0:
        hundreds = n // 100
        if hundreds < 20:
            return f'{_ONES[hundreds]} hundred'

    parts = []
    last_index = len(_SCALE) - 1
    for index, scale in enumerate(_SCALE):
        if index == last_index and n >= 1000:
            # No larger scale word exists: spell the whole remaining count and
            # attach the largest scale, so no high-order digits are dropped.
            parts.append(f'{number_to_words(n)} {scale}')
            break
        chunk = n % 1000
        if chunk:
            chunk_words = _three_digits_to_words(chunk)
            parts.append(f'{chunk_words} {scale}' if scale else chunk_words)
        n //= 1000
        if n == 0:
            break

    # Groups were collected lowest first; speech order is highest first.
    return ' '.join(reversed(parts))


def float_to_words(value, decimal_sep: str = 'point') -> str:
    """
    Convert a float (or numeric string) to words, reading decimal digits individually.

    Accepts a string to preserve trailing zeros (e.g. "1.50" → "one point five zero").
    Values written in exponent notation (including floats whose text form uses
    it, such as 1e-05) are expanded to plain decimal notation first.

    Args:
        value: A number or a numeric string, optionally starting with "-".
        decimal_sep: Word spoken for the decimal point.

    Returns:
        The spoken form, prefixed with "negative" for negative input.

    Raises:
        ValueError: If the text is not a plain decimal number.

    Examples:
        3.14   → "three point one four"
        -0.5   → "negative zero point five"
        "3.10" → "three point one zero"
        1.007  → "one point zero zero seven"
        1e-05  → "zero point zero zero zero zero one"
    """
    text = value if isinstance(value, str) else f'{value}'
    negative = text.startswith('-')
    if negative:
        text = text[1:]

    if 'e' in text or 'E' in text:
        # Exponent notation cannot be read digit by digit; rewrite it as a
        # plain decimal string without going through binary floating point.
        from decimal import Decimal, InvalidOperation

        try:
            text = format(Decimal(text), 'f')
        except InvalidOperation:
            raise ValueError(f'cannot convert {value!r} to words') from None

    if '.' in text:
        int_part, dec_part = text.split('.', 1)
        int_words = number_to_words(int(int_part)) if int_part else 'zero'
        if dec_part:
            # Read each decimal digit individually; "0" → "zero"
            digit_map = ['zero'] + _ONES[1:]
            dec_words = ' '.join(digit_map[int(d)] for d in dec_part)
            result = f'{int_words} {decimal_sep} {dec_words}'
        else:
            # "5." has no fractional digits: avoid a dangling separator word.
            result = int_words
    else:
        result = number_to_words(int(text))

    return f'negative {result}' if negative else result


def roman_to_int(s: str) -> int:
    """
    Return the integer value of a Roman numeral (case-insensitive).

    Raises KeyError if the string contains a character that is not a Roman
    numeral symbol.
    """
    symbol_values = dict(zip('IVXLCDM', (1, 5, 10, 50, 100, 500, 1000)))
    total = 0
    right_neighbour = 0
    # Scan right to left: a symbol smaller than the one to its right is
    # subtractive (the I in IV), anything else is additive.
    for symbol in s.upper()[::-1]:
        worth = symbol_values[symbol]
        if worth < right_neighbour:
            total -= worth
        else:
            total += worth
        right_neighbour = worth
    return total


# ─────────────────────────────────────────────
# Regex patterns
# ─────────────────────────────────────────────

# URLs: "http://", "https://" or "www." followed by everything up to the next whitespace.
_RE_URL = re.compile(r'https?://\S+|www\.\S+')
# E-mail addresses: local part, "@", domain label, and an alphabetic suffix of two or more letters.
_RE_EMAIL = re.compile(r'\b[\w.+-]+@[\w-]+\.[a-z]{2,}\b', re.IGNORECASE)
# Hashtags (#word) and mentions (@word).
_RE_HASHTAG = re.compile(r'#\w+')
_RE_MENTION = re.compile(r'@\w+')
# Anything between "<" and the next ">".
_RE_HTML = re.compile(r'<[^>]+>')
# Any single character that is neither a word character nor whitespace.
_RE_PUNCT = re.compile(r'[^\w\s]')
# Runs of whitespace.
_RE_SPACES = re.compile(r'\s+')
# Standalone upper-case "AI" (case-sensitive).
_RE_AI = re.compile(r'\bAI\b')
# A ".com" suffix in any casing.
_RE_DOT_COM = re.compile(r'\.com\b', re.IGNORECASE)
# Single symbols that are replaced by spoken words.
_RE_PLUS = re.compile(r'\+')
_RE_AMPERSAND = re.compile(r'&')
_RE_AT_SYMBOL = re.compile(r'@')
# Runs of carriage returns and/or line feeds.
_RE_NEWLINE = re.compile(r'[\r\n]+')
_RE_TILDE = re.compile(r'~')

# Month abbreviation → full name. "May" has no entry because its abbreviation
# is the full name.
_MONTH_MAP = {
    'Jan': 'January',
    'Feb': 'February',
    'Mar': 'March',
    'Apr': 'April',
    'Jun': 'June',
    'Jul': 'July',
    'Aug': 'August',
    'Sep': 'September',
    'Sept': 'September',
    'Oct': 'October',
    'Nov': 'November',
    'Dec': 'December',
}

# Title Case month abbreviation, optionally followed by a period, that is
# followed by a number or ends the text ("Jan 5", "Jan. 5", "in Dec.").
# The word boundary sits BEFORE the optional period: a boundary placed after it
# can never hold in front of whitespace or the end of the text, which would
# stop "Jan. 5" from matching at all.
# "May" is handled separately because it is also a common word.
_RE_MONTHS = re.compile(r'\b(Jan|Feb|Mar|Apr|Jun|Jul|Aug|Sep|Sept|Oct|Nov|Dec)\b\.?(?=\s*\d|\s*$)')
_RE_MAY = re.compile(r'\bMay\b(?=\s*\d)')  # "May" only counts as a month when a number follows (May 5)

# Number: optional minus, digits/commas, optional fraction. The match may not
# start right after a letter, so neither the hyphen nor the digit of "gpt-3",
# "gpl-3", "v-2" is read as a negative number.
# NOTE: `[\d,]+` also accepts leading/trailing commas ("3," or a lone ",").
_RE_NUMBER = re.compile(r'(?<![a-zA-Z])-?[\d,]+(?:\.\d+)?')

# Ordinals: 1st, 2nd, 3rd, 4th … 21st, 101st … (suffix in any casing)
_RE_ORDINAL = re.compile(r'\b(\d+)(st|nd|rd|th)\b', re.IGNORECASE)

# Percentages: 50%, 3.5%, -2% (whitespace before the % sign is allowed)
_RE_PERCENT = re.compile(r'(-?[\d,]+(?:\.\d+)?)\s*%')

# Currency: $100, €1,200.50, £50, $85K, $2.5M (optional scale suffix)
# Groups: 1 = symbol, 2 = amount, 3 = scale suffix or None.
# NOTE: because of IGNORECASE the single-letter suffixes also match in lower case.
# NOTE: the `\s*` in front of the optional suffix can consume the whitespace
# after the amount even when no suffix follows ("$5 for" matches "$5 ").
_RE_CURRENCY = re.compile(
    r'([$€£¥₹₩₿])\s*([\d,]+(?:\.\d+)?)\s*(million|billion|trillion|thousand|[KMBT])?\b',
    re.IGNORECASE,
)

# Time: 3:30pm, 14:00, 3:30 AM — requires 2-digit minutes so "3:0" (score) doesn't match
# Groups: 1 = hours, 2 = minutes, 3 = seconds or None, 4 = am/pm or None.
# NOTE: as with currency, the `\s*` before the optional am/pm can consume the
# whitespace after the time when a word follows ("3:30 tomorrow").
_RE_TIME = re.compile(r'\b(\d{1,2}):(\d{2})(?::(\d{2}))?\s*(am|pm)?\b', re.IGNORECASE)

# Ranges: 10-20, 100-200 (both sides numeric, hyphen between them, no word
# character directly before or after)
_RE_RANGE = re.compile(r'(?<!\w)(\d+)-(\d+)(?!\w)')

# Version/model names: gpt-3, gpt-3.5, Python-3.10, GPL-3
# Letter(s) + hyphen + digit(s) [+ more version parts]
_RE_MODEL_VER = re.compile(r'\b([a-zA-Z][a-zA-Z0-9]*)-(\d[\d.]*)(?=[^\d.]|$)')

# Measurement units attached to numbers (optional whitespace between): 100km, 50kg, 25°C, 5GB
_RE_UNIT = re.compile(
    r'(\d+(?:\.\d+)?)\s*(km|kg|mg|ml|gb|mb|kb|tb|hz|khz|mhz|ghz|mph|kph|°[cCfF]|[cCfF]°|ms|ns|µs)\b',
    re.IGNORECASE,
)

# Scale suffixes (uppercase only to avoid ambiguity): 7B, 340M, 1.5K, 2T
# The number must not follow a letter and the suffix must not be followed by a
# letter or digit (so "5MB" is left to the unit regex).
_RE_SCALE = re.compile(r'(?<![a-zA-Z])(\d+(?:\.\d+)?)\s*([KMBT])(?![a-zA-Z\d])')

# Scientific notation: 1e-4, 2.5e10, 6.022E23 (group 1 = coefficient, group 2 = exponent)
_RE_SCI = re.compile(r'(?<![a-zA-Z\d])(-?\d+(?:\.\d+)?)[eE]([+-]?\d+)(?![a-zA-Z\d])')

# Fractions: 1/2, 3/4, 2/3 (whitespace around the slash is allowed)
_RE_FRACTION = re.compile(r'\b(\d+)\s*/\s*(\d+)\b')

# Decades: 80s, 90s, 1980s, 2020s (number ending in 0 followed by 's');
# group 1 holds the digits before that final 0.
_RE_DECADE = re.compile(r'\b(\d{1,3})0s\b')

# Leading decimal (no digit before the dot): .5, .75
_RE_LEAD_DEC = re.compile(r'(?<!\d)\.([\d])')


# ─────────────────────────────────────────────
# Expansion helpers
# ─────────────────────────────────────────────
def expand_abbreviations(text: str) -> str:
    """
    Rewrite abbreviations that have to be handled while the original casing is intact.

    - standalone upper-case "AI" becomes "A.I." (case-sensitive)
    - ".com" in any casing becomes " dot com"
    """
    spelled_ai = _RE_AI.sub('A.I.', text)
    return _RE_DOT_COM.sub(' dot com', spelled_ai)


def expand_symbols(text: str) -> str:
    """
    Replace "+", "&" and "@" with the space-padded words "plus", "and" and "at".
    """
    spoken_symbols = (
        (_RE_PLUS, ' plus '),
        (_RE_AMPERSAND, ' and '),
        (_RE_AT_SYMBOL, ' at '),
    )
    for pattern, spoken in spoken_symbols:
        text = pattern.sub(spoken, text)
    return text


def _ordinal_suffix(n: int) -> str:
    """
    Return the ordinal word for n.

    Only the final word of the cardinal form is converted, whether it is
    joined to the rest by a space or by a hyphen.

    Examples:
        1     → "first"
        5     → "fifth"
        20    → "twentieth"
        21    → "twenty-first"
        21005 → "twenty-one thousand fifth"
    """
    word = number_to_words(n)

    # Split off the last word at whichever separator occurs last. Splitting on
    # the last hyphen alone would cut "twenty-one thousand five" in the wrong
    # place and try to turn "one thousand five" into an ordinal.
    cut = max(word.rfind(' '), word.rfind('-'))
    if cut == -1:
        prefix, joiner, last = '', '', word
    else:
        prefix, joiner, last = word[:cut], word[cut], word[cut + 1 :]

    if last in _ORDINAL_EXCEPTIONS:
        last_ord = _ORDINAL_EXCEPTIONS[last]
    elif last.endswith('y'):
        # twenty → twentieth, thirty → thirtieth, …
        last_ord = last[:-1] + 'ieth'
    else:
        # ten → tenth, hundred → hundredth, thousand → thousandth, …
        last_ord = last + 'th'

    return f'{prefix}{joiner}{last_ord}'


def expand_ordinals(text: str) -> str:
    """
    Spell out digit-based ordinals (digits followed by st/nd/rd/th).

    Examples:
        "1st place"    → "first place"
        "2nd floor"    → "second floor"
        "3rd base"     → "third base"
        "21st century" → "twenty-first century"
        "100th day"    → "one hundredth day"
    """
    # Only the digits matter; the written suffix is discarded.
    return _RE_ORDINAL.sub(lambda match: _ordinal_suffix(int(match.group(1))), text)


def expand_percentages(text: str) -> str:
    """
    Expand percentage expressions.

    The digits are passed on as text rather than through ``float``, so
    trailing zeros are kept and very small values do not fail on Python's
    exponent notation.

    Examples:
        "50% off"     → "fifty percent off"
        "3.5% rate"   → "three point five percent rate"
        "3.50% rate"  → "three point five zero percent rate"
        "-2% change"  → "negative two percent change"
    """

    def _replace(m: re.Match) -> str:
        raw = m.group(1).replace(',', '')
        # The pattern also accepts runs made only of commas (",%"); there is
        # no number to read in that case, so leave the text untouched.
        if not any(ch.isdigit() for ch in raw):
            return m.group()
        words = float_to_words(raw) if '.' in raw else number_to_words(int(raw))
        return f'{words} percent'

    return _RE_PERCENT.sub(_replace, text)


def expand_newlines(text: str) -> str:
    """Replace each run of line breaks with a period and a space so TTS pauses there."""
    sentence_break = '. '
    return _RE_NEWLINE.sub(sentence_break, text)


def expand_tilde(text: str) -> str:
    """Replace every "~" with the word "about" followed by a space."""
    spoken = 'about '
    return _RE_TILDE.sub(spoken, text)


def expand_currency(text: str) -> str:
    """
    Expand currency amounts, including optional scale suffixes.

    The fractional part is read as cents and truncated to two digits. The
    unit is singular for an amount of exactly one ("one dollar").

    Examples:
        "$100"         → "one hundred dollars"
        "€1,200.50"    → "twelve hundred euros and fifty cents"
        "£9.99"        → "nine pounds and ninety-nine cents"
        "$1.50"        → "one dollar and fifty cents"
        "$85K"         → "eighty-five thousand dollars"
        "$2.5M"        → "two point five million dollars"
        "$5 for lunch" → "five dollars for lunch"
    """

    def _replace(m: re.Match) -> str:
        matched = m.group(0)
        symbol = m.group(1)
        raw = m.group(2).replace(',', '')
        scale_suffix = m.group(3)
        unit = _CURRENCY_SYMBOLS.get(symbol, '')

        int_part, _, dec_part = raw.partition('.')
        if not int_part:
            # The amount consisted of commas only ("$," / "$,.5"): nothing to read.
            return matched

        # Handle Scaled Currency ($17.5 billion or $17.5B)
        if scale_suffix:
            # Normalize suffix (e.g., 'B' or 'billion' -> 'billion')
            scale_word = _CURRENCY_SCALE_MAP.get(scale_suffix.upper(), scale_suffix.lower())
            num = float_to_words(raw) if '.' in raw else number_to_words(int(raw))
            return f'{num} {scale_word} {unit}{"s" if unit else ""}'.strip()

        # Handle Standard Currency ($17 or $17.50)
        whole = int(int_part)
        words = number_to_words(whole)
        result = f'{words} {unit}{"" if whole == 1 else "s"}' if unit else words
        if dec_part:
            cents = int(dec_part[:2].ljust(2, '0'))
            if cents:
                result += f' and {number_to_words(cents)} cent{"s" if cents != 1 else ""}'

        # The pattern may have consumed the whitespace after the amount while
        # looking for a scale suffix; put it back so the next word is not
        # glued onto the expansion ("five dollarsfor").
        trailing_ws = matched[len(matched.rstrip()) :]
        return result + trailing_ws

    return _RE_CURRENCY.sub(_replace, text)


def expand_time(text: str) -> str:
    """
    Expand time expressions.

    Non-zero seconds are appended as "and N seconds". Whitespace that the
    pattern consumed after the time is restored.

    Examples:
        "3:30pm"        → "three thirty pm"
        "14:00"         → "fourteen hundred"
        "9:05 AM"       → "nine oh five am"
        "12:00pm"       → "twelve pm"
        "12:30:45"      → "twelve thirty and forty-five seconds"
        "3:30 tomorrow" → "three thirty tomorrow"
    """

    def _replace(m: re.Match) -> str:
        matched = m.group(0)
        h = int(m.group(1))
        mins = int(m.group(2))
        meridiem = m.group(4)
        suffix = f' {meridiem.lower()}' if meridiem else ''
        h_words = number_to_words(h)

        if mins == 0:
            # "fourteen hundred" for bare times, plain "twelve pm" with am/pm.
            spoken = h_words if meridiem else f'{h_words} hundred'
        elif mins < 10:
            spoken = f'{h_words} oh {number_to_words(mins)}'
        else:
            spoken = f'{h_words} {number_to_words(mins)}'

        # Seconds are captured by the pattern; read them instead of dropping them.
        if m.group(3) is not None:
            secs = int(m.group(3))
            if secs:
                spoken += f' and {number_to_words(secs)} second{"" if secs == 1 else "s"}'

        # When no am/pm follows, the pattern may still have consumed the
        # whitespace after the time; restore it so the next word stays separate.
        trailing_ws = matched[len(matched.rstrip()) :]
        return f'{spoken}{suffix}{trailing_ws}'

    return _RE_TIME.sub(_replace, text)


def expand_ranges(text: str) -> str:
    """
    Read "<number>-<number>" as "<number> to <number>".

    Examples:
        "10-20 items"   → "ten to twenty items"
        "pages 100-200" → "pages one hundred to two hundred"
        "2020-2024"     → "two thousand twenty to two thousand twenty-four"
    """

    def _spell(match: re.Match) -> str:
        low, high = (number_to_words(int(bound)) for bound in match.groups())
        return f'{low} to {high}'

    return _RE_RANGE.sub(_spell, text)


def expand_model_names(text: str) -> str:
    """
    Replace the hyphen in letter-hyphen-number names with a space so the
    number is not later read as negative.

    Examples:
        "GPT-3"       → "GPT 3"
        "gpt-3.5"     → "gpt 3.5"
        "GPL-3"       → "GPL 3"
        "Python-3.10" → "Python 3.10"
        "v2.0"        stays "v2.0" (no hyphen)
        "IPv6"        stays "IPv6"
    """

    def _unhyphenate(match: re.Match) -> str:
        name = match.group(1)
        version = match.group(2)
        return f'{name} {version}'

    return _RE_MODEL_VER.sub(_unhyphenate, text)


def expand_units(text: str) -> str:
    """
    Expand common measurement units attached to numbers.

    The digits are passed on as text rather than through ``float``, so
    trailing zeros are kept and very small values do not fail on Python's
    exponent notation.

    Examples:
        "100km"  → "one hundred kilometers"
        "50kg"   → "fifty kilograms"
        "25°C"   → "twenty-five degrees Celsius"
        "5GB"    → "five gigabytes"
        "2.50kg" → "two point five zero kilograms"
    """
    # Keys are lower-case; the matched unit is lower-cased before lookup.
    _unit_map = {
        'km': 'kilometers',
        'kg': 'kilograms',
        'mg': 'milligrams',
        'ml': 'milliliters',
        'gb': 'gigabytes',
        'mb': 'megabytes',
        'kb': 'kilobytes',
        'tb': 'terabytes',
        'hz': 'hertz',
        'khz': 'kilohertz',
        'mhz': 'megahertz',
        'ghz': 'gigahertz',
        'mph': 'miles per hour',
        'kph': 'kilometers per hour',
        'ms': 'milliseconds',
        'ns': 'nanoseconds',
        'µs': 'microseconds',
        '°c': 'degrees Celsius',
        'c°': 'degrees Celsius',
        '°f': 'degrees Fahrenheit',
        'f°': 'degrees Fahrenheit',
    }

    def _replace(m: re.Match) -> str:
        raw = m.group(1)
        symbol = m.group(2)
        # Fall back to the unit as written if it is not in the table.
        expanded = _unit_map.get(symbol.lower(), symbol)
        num = float_to_words(raw) if '.' in raw else number_to_words(int(raw))
        return f'{num} {expanded}'

    return _RE_UNIT.sub(_replace, text)


def expand_roman_numerals(text: str, context_words: bool = True) -> str:
    """
    Expand upper-case Roman numerals that appear as standalone tokens.

    Single-letter numerals (I, V, X, L, C, D, M) are ambiguous with ordinary
    letters and words. With ``context_words`` enabled they are expanded only
    when the word immediately before them is a title-like word.

    Args:
        text: Input text.
        context_words: Require a preceding title word for single-letter
            numerals. When False, single letters are always expanded.

    Examples:
        "World War II"     → "World War two"
        "World War I"      → "World War one"
        "Chapter IV"       → "Chapter four"
        "Louis XIV"        → "Louis fourteen"
        "mix I with V"     → left unchanged (ambiguous single letters)
        "Vitamin C"        → left unchanged (ambiguous single letter)
    """
    # A title word directly in front of the numeral, separated by whitespace only.
    title_word_before = re.compile(
        r'\b(?:war|chapter|part|volume|act|scene|book|section|article|'
        r'king|queen|pope|louis|henry|edward|george|william|james|'
        r'phase|round|level|stage|class|type|version|episode|season)\s+$',
        re.IGNORECASE,
    )

    def _replace(m: re.Match) -> str:
        roman = m.group(0)
        if not roman:
            # Every part of the pattern is optional, so it also matches the
            # empty string at word boundaries.
            return roman
        if context_words and len(roman) == 1:
            start = m.start()
            preceding = text[max(0, start - 30) : start]
            if not title_word_before.search(preceding):
                return roman
        # The pattern only matches valid numeral letters, so the conversion
        # cannot hit an unknown symbol.
        return number_to_words(roman_to_int(roman))

    return _RE_ROMAN.sub(_replace, text)


def normalize_leading_decimals(text: str) -> str:
    """
    Insert the missing "0" in front of decimals written without an integer part.

    Examples:
        ".5 teaspoons"    → "0.5 teaspoons"
        "-.25 adjustment" → "-0.25 adjustment"
    """
    # Signed form first (-.5 → -0.5), then any remaining bare ".5".
    signed_done = re.sub(r'(?<!\d)(-)\.([\d])', r'\g<1>0.\2', text)
    unsigned_done = _RE_LEAD_DEC.sub(r'0.\1', signed_done)
    return unsigned_done


def expand_scientific_notation(text: str) -> str:
    """
    Read numbers in scientific notation as "<coefficient> times ten to the <exponent>".

    Examples:
        "1e-4"     → "one times ten to the negative four"
        "2.5e10"   → "two point five times ten to the ten"
        "6.022E23" → "six point zero two two times ten to the twenty-three"
    """

    def _spell(match: re.Match) -> str:
        mantissa = match.group(1)
        exponent = int(match.group(2))
        if '.' in mantissa:
            mantissa_words = float_to_words(mantissa)
        else:
            mantissa_words = number_to_words(int(mantissa))
        # The exponent's sign is spoken separately from its magnitude.
        sign = 'negative ' if exponent < 0 else ''
        magnitude_words = number_to_words(abs(exponent))
        return f'{mantissa_words} times ten to the {sign}{magnitude_words}'

    return _RE_SCI.sub(_spell, text)


def expand_scale_suffixes(text: str) -> str:
    """
    Spell out upper-case K/M/B/T suffixes attached to numbers.

    Examples:
        "7B parameters" → "seven billion parameters"
        "340M model"    → "three hundred forty million model"
        "1.5K salary"   → "one point five thousand salary"
    """
    scale_words = {'K': 'thousand', 'M': 'million', 'B': 'billion', 'T': 'trillion'}

    def _spell(match: re.Match) -> str:
        amount, letter = match.groups()
        if '.' in amount:
            spoken = float_to_words(amount)
        else:
            spoken = number_to_words(int(amount))
        return f'{spoken} {scale_words.get(letter, letter)}'

    return _RE_SCALE.sub(_spell, text)


def expand_fractions(text: str) -> str:
    """
    Spell out simple numeric fractions; a zero denominator is left as written.

    Examples:
        "1/2 cup"  → "one half cup"
        "3/4 mile" → "three quarters mile"
        "2/3 done" → "two thirds done"
        "5/8 inch" → "five eighths inch"
    """

    def _spell(match: re.Match) -> str:
        numerator = int(match.group(1))
        denominator = int(match.group(2))
        if denominator == 0:
            return match.group()
        single = numerator == 1
        if denominator == 2:
            part = 'half' if single else 'halves'
        elif denominator == 4:
            part = 'quarter' if single else 'quarters'
        else:
            # Other denominators use the ordinal word, pluralised with "s".
            part = _ordinal_suffix(denominator)
            if not single:
                part += 's'
        return f'{number_to_words(numerator)} {part}'

    return _RE_FRACTION.sub(_spell, text)


def expand_decades(text: str) -> str:
    """
    Spell out decade expressions such as "80s" or "1980s".

    Examples:
        "the 80s"    → "the eighties"
        "the 1980s"  → "the nineteen eighties"
        "the 2020s"  → "the twenty twenties"
        "'90s music" → "'nineties music"
    """
    # Indexed by the decade digit (the digit in front of the final 0).
    decade_names = (
        'hundreds',
        'tens',
        'twenties',
        'thirties',
        'forties',
        'fifties',
        'sixties',
        'seventies',
        'eighties',
        'nineties',
    )

    def _spell(match: re.Match) -> str:
        stem = int(match.group(1))  # 8 for "80s", 198 for "1980s"
        leading, decade_digit = divmod(stem, 10)
        decade = decade_names[decade_digit]
        if stem < 10:
            return decade
        # Everything in front of the decade digit is read as a number (19 for 198).
        return f'{number_to_words(leading)} {decade}'

    return _RE_DECADE.sub(_spell, text)


def expand_ip_addresses(text: str) -> str:
    """
    Read dotted quads digit by digit, with "dot" between the octets.

    Examples:
        "192.168.1.1"  → "one nine two dot one six eight dot one dot one"
        "10.0.0.1"     → "one zero dot zero dot zero dot one"
    """
    digit_names = dict(
        zip(
            '0123456789',
            ('zero', 'one', 'two', 'three', 'four', 'five', 'six', 'seven', 'eight', 'nine'),
        )
    )

    def _spell(match: re.Match) -> str:
        spoken_octets = []
        for octet in match.groups():
            spoken_octets.append(' '.join(digit_names[ch] for ch in octet))
        return ' dot '.join(spoken_octets)

    return re.sub(r'\b(\d{1,3})\.(\d{1,3})\.(\d{1,3})\.(\d{1,3})\b', _spell, text)


def expand_phone_numbers(text: str) -> str:
    """
    Read hyphenated US phone numbers digit by digit.

    Intended to run before range expansion, which would otherwise claim the hyphens.

    Examples:
        "555-1234"       → "five five five one two three four"
        "555-123-4567"   → "five five five one two three four five six seven"
        "1-800-555-0199" → "one eight zero zero five five five zero one nine nine"
    """
    digit_names = dict(
        zip(
            '0123456789',
            ('zero', 'one', 'two', 'three', 'four', 'five', 'six', 'seven', 'eight', 'nine'),
        )
    )

    def _speak(match: re.Match) -> str:
        spoken_groups = (' '.join(digit_names[ch] for ch in group) for group in match.groups())
        return ' '.join(spoken_groups)

    # Longest form first so a shorter pattern cannot claim part of a longer number.
    patterns = (
        # 11-digit: 1-800-555-0199
        r'(?<!\d-)(?<!\d)\b(\d{1,2})-(\d{3})-(\d{3})-(\d{4})\b(?!-\d)',
        # 10-digit: 555-123-4567
        r'(?<!\d-)(?<!\d)\b(\d{3})-(\d{3})-(\d{4})\b(?!-\d)',
        # 7-digit local: 555-1234 (not adjacent to another digit-hyphen group)
        r'(?<!\d-)\b(\d{3})-(\d{4})\b(?!-\d)',
    )
    for pattern in patterns:
        text = re.sub(pattern, _speak, text)
    return text


def expand_months(text: str) -> str:
    """
    Replace month abbreviations matched by the month pattern with full names.

    The pattern only accepts abbreviations in date-like positions, e.g.
    "Jan 5" → "January 5".
    """
    # Standard abbreviations: look the matched abbreviation up, keeping it as
    # written if the table has no entry.
    expanded = _RE_MONTHS.sub(
        lambda match: _MONTH_MAP.get(match.group(1), match.group(1)),
        text,
    )
    # "May" in front of a number is replaced with itself, i.e. left as it is.
    return _RE_MAY.sub('May', expanded)


# ─────────────────────────────────────────────
# Core preprocessing functions
# ─────────────────────────────────────────────


def replace_numbers(text: str, replace_floats: bool = True) -> str:
    """
    Replace all numeric tokens with their word equivalents.

    Args:
        text: Input text.
        replace_floats: Read the fractional part digit by digit. When False,
            only the integer part is spoken and the fraction is dropped.

    Commas next to a number ("3, 4") are kept as punctuation, and integers
    are converted exactly, without a detour through ``float``.

    Examples:
        "There are 1200 students" → "There are twelve hundred students"
        "Pi is 3.14"              → "Pi is three point one four"
        "3, 4 and 5"              → "three, four and five"
        "gpt-3 rocks"             → "gpt-three rocks"  (hyphen not treated as minus)
    """

    def _replace(m: re.Match) -> str:
        token = m.group()
        # The pattern also matches bare commas; those are not numbers.
        if not any(ch.isdigit() for ch in token):
            return token

        # Commas around the digits are sentence punctuation, not thousands
        # separators: set them aside and put them back afterwards.
        body = token.rstrip(',')
        trailing = token[len(body) :]
        stripped = body.lstrip(',')
        leading = body[: len(body) - len(stripped)]
        raw = stripped.replace(',', '')

        try:
            if '.' in raw and replace_floats:
                # Pass raw string so trailing zeros are preserved ("1.50" → "one point five zero")
                words = float_to_words(raw)
            else:
                # Convert the integer digits directly; float() would round
                # integers above 2**53.
                words = number_to_words(int(raw.partition('.')[0]))
        except ValueError:
            return token
        return f'{leading}{words}{trailing}'

    return _RE_NUMBER.sub(_replace, text)


def to_lowercase(text: str) -> str:
    """Return a lower-cased copy of the text."""
    lowered = text.lower()
    return lowered


def remove_urls(text: str, replacement: str = '') -> str:
    """Substitute every URL with ``replacement`` and strip the ends of the result."""
    without_urls = _RE_URL.sub(replacement, text)
    return without_urls.strip()


def remove_emails(text: str, replacement: str = '') -> str:
    """Substitute every e-mail address with ``replacement`` and strip the ends of the result."""
    without_emails = _RE_EMAIL.sub(replacement, text)
    return without_emails.strip()


def remove_html_tags(text: str) -> str:
    """Replace each HTML tag with a single space."""
    blank = ' '
    return _RE_HTML.sub(blank, text)


def remove_hashtags(text: str, replacement: str = '') -> str:
    """Substitute every hashtag (e.g. #NLP) with ``replacement``."""
    without_hashtags = _RE_HASHTAG.sub(replacement, text)
    return without_hashtags


def remove_mentions(text: str, replacement: str = '') -> str:
    """Substitute every @mention with ``replacement``."""
    without_mentions = _RE_MENTION.sub(replacement, text)
    return without_mentions


def remove_punctuation(text: str) -> str:
    """Replace every character that is neither a word character nor whitespace with a space."""
    blank = ' '
    return _RE_PUNCT.sub(blank, text)


def remove_extra_whitespace(text: str) -> str:
    """Squeeze each run of whitespace down to one space and trim both ends."""
    squeezed = _RE_SPACES.sub(' ', text)
    return squeezed.strip()


def normalize_unicode(text: str, form: str = 'NFC') -> str:
    """Return the text in the requested Unicode normalization form (NFC, NFD, NFKC or NFKD)."""
    normalized = unicodedata.normalize(form, text)
    return normalized


def remove_accents(text: str) -> str:
    """Strip diacritics by decomposing the text and dropping the combining marks."""
    decomposed = unicodedata.normalize('NFD', text)
    base_characters = [ch for ch in decomposed if unicodedata.category(ch) != 'Mn']
    return ''.join(base_characters)


def expand_contractions(text: str) -> str:
    """
    Expand common English contractions.

    Both the straight apostrophe (') and the typographic one (’) are
    recognised. Matching is case-insensitive; fixed replacements such as
    "cannot" are always emitted in lower case.

    Examples:
        "don't"   → "do not"
        "don’t"   → "do not"
        "they're" → "they are"
        "I've"    → "I have"
    """
    # Order matters: the irregular forms have to be rewritten before the
    # generic "n't" rule gets a chance to split them.
    contractions = (
        (r"\bcan['’]t\b", 'cannot'),
        (r"\bwon['’]t\b", 'will not'),
        (r"\bshan['’]t\b", 'shall not'),
        (r"\bain['’]t\b", 'is not'),
        (r"\blet['’]s\b", 'let us'),
        (r"\b(\w+)n['’]t\b", r'\1 not'),
        (r"\b(\w+)['’]re\b", r'\1 are'),
        (r"\b(\w+)['’]ve\b", r'\1 have'),
        (r"\b(\w+)['’]ll\b", r'\1 will'),
        (r"\b(\w+)['’]d\b", r'\1 would'),
        (r"\b(\w+)['’]m\b", r'\1 am'),
        (r"\bit['’]s\b", 'it is'),
    )
    for pattern, replacement in contractions:
        text = re.sub(pattern, replacement, text, flags=re.IGNORECASE)
    return text


def remove_stopwords(text: str, stopwords: set | None = None) -> str:
    """
    Remove stopwords from text.

    Args:
        stopwords: Set of words to remove. Uses a built-in English set if None.
    """
    if stopwords is None:
        stopwords = {
            'a',
            'an',
            'the',
            'and',
            'or',
            'but',
            'in',
            'on',
            'at',
            'to',
            'for',
            'of',
            'with',
            'by',
            'from',
            'is',
            'was',
            'are',
            'were',
            'be',
            'been',
            'being',
            'have',
            'has',
            'had',
            'do',
            'does',
            'did',
            'will',
            'would',
            'could',
            'should',
            'may',
            'might',
            'this',
            'that',
            'these',
            'those',
            'it',
            'its',
            'i',
            'me',
            'my',
            'we',
            'our',
            'you',
            'your',
            'he',
            'she',
            'him',
            'her',
            'they',
            'them',
            'their',
        }
    tokens = text.split()
    return ' '.join(t for t in tokens if t.lower() not in stopwords)


# ─────────────────────────────────────────────
# Pipeline helper
# ─────────────────────────────────────────────


class TextPreprocessor:
    """
    Configurable preprocessing pipeline.

    Usage:
        pp = TextPreprocessor(
            lowercase=True,
            replace_numbers=True,
            remove_urls=True,
            remove_html=True,
            remove_punctuation=True,
        )
        clean = pp("GPT-3 costs $0.002 per token — 50% cheaper than before!")
        # → "gpt three costs zero dollars and zero point two cents per token fifty percent cheaper than before"
    """

    def __init__(
        self,
        lowercase: bool = True,
        replace_numbers: bool = True,
        replace_floats: bool = True,
        expand_newlines: bool = True,
        expand_tilde: bool = True,
        expand_abbreviations: bool = True,
        expand_symbols: bool = True,
        expand_contractions: bool = True,
        expand_model_names: bool = True,
        expand_ordinals: bool = True,
        expand_percentages: bool = True,
        expand_currency: bool = True,
        expand_time: bool = True,
        expand_ranges: bool = True,
        expand_units: bool = True,
        expand_scale_suffixes: bool = True,
        expand_scientific_notation: bool = True,
        expand_fractions: bool = True,
        expand_decades: bool = True,
        expand_phone_numbers: bool = True,
        expand_ip_addresses: bool = True,
        normalize_leading_decimals: bool = True,
        expand_roman_numerals: bool = False,
        remove_urls: bool = True,
        remove_emails: bool = True,
        remove_html: bool = True,
        remove_hashtags: bool = False,
        remove_mentions: bool = False,
        remove_punctuation: bool = True,
        remove_stopwords: bool = False,
        stopwords: set | None = None,
        normalize_unicode: bool = True,
        remove_accents: bool = False,
        remove_extra_whitespace: bool = True,
    ):
        self.config = {k: v for k, v in locals().items() if k != 'self'}
        self._stopwords = stopwords

    def __call__(self, text: str) -> str:
        return self.process(text)

    def process(self, text: str) -> str:
        cfg = self.config
        if cfg.get('expand_abbreviations'):
            text = expand_abbreviations(text)
            text = expand_months(text)
        if cfg.get('expand_newlines'):
            text = expand_newlines(text)
        if cfg.get('expand_symbols'):
            text = expand_symbols(text)
        if cfg.get('expand_tilde'):
            text = expand_tilde(text)
        if cfg['normalize_unicode']:
            text = normalize_unicode(text)
        if cfg['remove_html']:
            text = remove_html_tags(text)
        if cfg['remove_urls']:
            text = remove_urls(text)
        if cfg['remove_emails']:
            text = remove_emails(text)
        if cfg['remove_hashtags']:
            text = remove_hashtags(text)
        if cfg['remove_mentions']:
            text = remove_mentions(text)
        if cfg['expand_contractions']:
            text = expand_contractions(text)
        # IP addresses before normalize_leading_decimals (IPs contain dots before digits)
        if cfg['expand_ip_addresses']:
            text = expand_ip_addresses(text)
        # Normalise bare leading decimals early so downstream regexes see "0.5" not ".5"
        if cfg['normalize_leading_decimals']:
            text = normalize_leading_decimals(text)
        # Expand special forms before generic number replacement
        if cfg['expand_currency']:
            text = expand_currency(text)
        if cfg['expand_percentages']:
            text = expand_percentages(text)
        # Scientific notation before model-name expansion (e.g. "1e-4" contains "e-4")
        if cfg['expand_scientific_notation']:
            text = expand_scientific_notation(text)
        if cfg['expand_time']:
            text = expand_time(text)
        if cfg['expand_ordinals']:
            text = expand_ordinals(text)
        if cfg['expand_units']:
            text = expand_units(text)
        # Scale suffixes after units (units handles "MB"/"GB"; this handles bare "B"/"M")
        if cfg['expand_scale_suffixes']:
            text = expand_scale_suffixes(text)
        if cfg['expand_fractions']:
            text = expand_fractions(text)
        if cfg['expand_decades']:
            text = expand_decades(text)
        # Phone numbers before ranges, otherwise NNN-NNNN is treated as a range
        if cfg['expand_phone_numbers']:
            text = expand_phone_numbers(text)
        if cfg['expand_ranges']:
            text = expand_ranges(text)
        if cfg['expand_model_names']:
            text = expand_model_names(text)
        if cfg['expand_roman_numerals']:
            text = expand_roman_numerals(text)
        if cfg['replace_numbers']:
            text = replace_numbers(text, replace_floats=cfg['replace_floats'])
        if cfg['remove_accents']:
            text = remove_accents(text)
        if cfg['remove_punctuation']:
            text = remove_punctuation(text)
        if cfg['lowercase']:
            text = to_lowercase(text)
        if cfg['remove_stopwords']:
            text = remove_stopwords(text, self._stopwords)
        if cfg['remove_extra_whitespace']:
            text = remove_extra_whitespace(text)

        return text