Spaces:

grimshaw
/

tts-api

Sleeping

teddybear082

more linting

ec11370 4 months ago

34.4 kB

	"""
	Adapted and supplemented from the original at https://github.com/KittenML/KittenTTS/blob/main/kittentts/preprocess.py
	See license at: https://github.com/KittenML/KittenTTS/blob/main/LICENSE (Apache 2.0)
	"""

	import re
	import unicodedata

	# ─────────────────────────────────────────────
	# Number → Words conversion
	# ─────────────────────────────────────────────

	# Words for the values 0-19, indexed by value; index 0 is the empty string.
	_ONES = [
	    '',
	    *'one two three four five six seven eight nine ten'.split(),
	    *'eleven twelve thirteen fourteen fifteen sixteen seventeen eighteen nineteen'.split(),
	]
	# Words for a tens digit, indexed by that digit; indices 0 and 1 are empty placeholders.
	_TENS = ['', ''] + 'twenty thirty forty fifty sixty seventy eighty ninety'.split()
	# Names of successive three-digit groups, least significant group first.
	_SCALE = ['', *'thousand million billion trillion'.split()]

	# Number words whose ordinal form is looked up directly instead of being derived.
	_ORDINAL_EXCEPTIONS = dict(
	    one='first',
	    two='second',
	    three='third',
	    four='fourth',
	    five='fifth',
	    six='sixth',
	    seven='seventh',
	    eight='eighth',
	    nine='ninth',
	    twelve='twelfth',
	)

	# Singular spoken name for each supported currency symbol.
	_CURRENCY_SYMBOLS = dict(
	    [
	        ('$', 'dollar'),
	        ('€', 'euro'),
	        ('£', 'pound'),
	        ('¥', 'yen'),
	        ('₹', 'rupee'),
	        ('₩', 'won'),
	        ('₿', 'bitcoin'),
	    ]
	)

	# Scale suffixes (letter or full word) mapped to the spoken scale word.
	_CURRENCY_SCALE_MAP = dict(
	    K='thousand',
	    M='million',
	    B='billion',
	    T='trillion',
	    thousand='thousand',
	    million='million',
	    billion='billion',
	    trillion='trillion',
	)

	# Roman numeral values in descending order, including the subtractive pairs.
	_ROMAN = [
	    (1000, 'M'),
	    (900, 'CM'),
	    (500, 'D'),
	    (400, 'CD'),
	    (100, 'C'),
	    (90, 'XC'),
	    (50, 'L'),
	    (40, 'XL'),
	    (10, 'X'),
	    (9, 'IX'),
	    (5, 'V'),
	    (4, 'IV'),
	    (1, 'I'),
	]
	# Upper-case Roman numeral as a whole token: thousands, hundreds, tens, ones.
	# Every group is optional, so this also matches the empty string at a word
	# boundary; callers must skip empty matches.
	# The alternations use a plain "|" (an escaped "\|" would match a literal pipe).
	_RE_ROMAN = re.compile(r'\b(M{0,4})(CM|CD|D?C{0,3})(XC|XL|L?X{0,3})(IX|IV|V?I{0,3})\b')


	def _three_digits_to_words(n: int) -> str:
	    """Spell out one three-digit group (0–999); 0 yields an empty string."""
	    if n == 0:
	        return ''
	    hundreds, remainder = divmod(n, 100)
	    words = [f'{_ONES[hundreds]} hundred'] if hundreds else []
	    if remainder >= 20:
	        # Tens and ones are joined with a hyphen ("forty-two"); round tens stand alone.
	        tens_digit, ones_digit = divmod(remainder, 10)
	        tens_word = _TENS[tens_digit]
	        ones_word = _ONES[ones_digit]
	        words.append(f'{tens_word}-{ones_word}' if ones_word else tens_word)
	    elif remainder:
	        # 1–19 have their own single word.
	        words.append(_ONES[remainder])
	    return ' '.join(words)


	def number_to_words(n: int) -> str:
	    """
	    Convert an integer to its English word representation.

	    Non-int input is coerced with ``int()``. Values too large for the biggest
	    named scale are expressed as a multiple of that scale instead of having
	    their high-order digits silently dropped.

	    Examples:
	        1200          → "twelve hundred"
	        1000          → "one thousand"
	        1_000_000     → "one million"
	        -42           → "negative forty-two"
	        0             → "zero"
	        10**15        → "one thousand trillion"
	    """
	    if not isinstance(n, int):
	        n = int(n)
	    if n == 0:
	        return 'zero'
	    if n < 0:
	        return f'negative {number_to_words(-n)}'

	    # X00–X900 read as "X hundred" (e.g. 1200 → "twelve hundred").
	    # Exact multiples of 1000 are excluded (1000 → "one thousand", not "ten hundred").
	    if 100 <= n <= 9999 and n % 100 == 0 and n % 1000 != 0:
	        hundreds = n // 100
	        if hundreds < 20:
	            return f'{_ONES[hundreds]} hundred'

	    parts = []
	    last_index = len(_SCALE) - 1
	    for index, scale in enumerate(_SCALE):
	        if index == last_index and n >= 1000:
	            # No larger scale word exists: spell the whole remaining count
	            # in front of the largest scale rather than truncating it.
	            parts.append(f'{number_to_words(n)} {scale}')
	            break
	        chunk = n % 1000
	        if chunk:
	            chunk_words = _three_digits_to_words(chunk)
	            parts.append(f'{chunk_words} {scale}'.strip() if scale else chunk_words)
	        n //= 1000
	        if n == 0:
	            break

	    # Groups were collected least-significant first.
	    return ' '.join(reversed(parts))


	def float_to_words(value, decimal_sep: str = 'point') -> str:
	    """
	    Convert a float (or numeric string) to words, reading decimal digits individually.

	    Pass a string to preserve trailing zeros (e.g. "1.50" → "one point five zero").
	    Floats whose default text form uses an exponent (e.g. 1e-05) are rewritten
	    in plain positional notation first, so they no longer raise ValueError.

	    Args:
	        value: A number or a numeric string, optionally starting with "-".
	        decimal_sep: Word spoken in place of the decimal point.

	    Raises:
	        ValueError: If the text is not a plain decimal number.

	    Examples:
	        3.14    → "three point one four"
	        -0.5    → "negative zero point five"
	        "3.10"  → "three point one zero"
	        1.007   → "one point zero zero seven"
	        1e-05   → "zero point zero zero zero zero one"
	    """
	    if isinstance(value, str):
	        text = value
	    else:
	        text = f'{value}'
	        if isinstance(value, float) and 'e' in text.lower():
	            # Exponent form has no usable "." split; expand it exactly.
	            from decimal import Decimal

	            text = format(Decimal(text), 'f')

	    negative = text.startswith('-')
	    if negative:
	        text = text[1:]

	    if '.' in text:
	        int_part, dec_part = text.split('.', 1)
	        int_words = number_to_words(int(int_part)) if int_part else 'zero'
	        # Read each decimal digit individually; index 0 → "zero".
	        digit_map = ['zero'] + _ONES[1:]
	        dec_words = ' '.join(digit_map[int(d)] for d in dec_part)
	        result = f'{int_words} {decimal_sep} {dec_words}'
	    else:
	        result = number_to_words(int(text))

	    return f'negative {result}' if negative else result


	def roman_to_int(s: str) -> int:
	    """
	    Convert a Roman numeral string (any letter case) to an integer.

	    A symbol smaller than the one to its right is subtracted, otherwise added.
	    An empty string yields 0; an unknown character raises KeyError.
	    """
	    numeral_values = {'I': 1, 'V': 5, 'X': 10, 'L': 50, 'C': 100, 'D': 500, 'M': 1000}
	    values = [numeral_values[ch] for ch in s.upper()]
	    total = 0
	    for idx, value in enumerate(values):
	        following = values[idx + 1] if idx + 1 < len(values) else 0
	        total += value if value >= following else -value
	    return total


	# ─────────────────────────────────────────────
	# Regex patterns
	# ─────────────────────────────────────────────

	# http(s) URLs, or bare "www." hosts, up to the next whitespace.
	# The alternation is a plain "|" (an escaped "\|" would match a literal pipe).
	_RE_URL = re.compile(r'https?://\S+|www\.\S+')
	# E-mail addresses; the domain may have any number of dot-separated labels
	# (e.g. "user@mail.example.co.uk") before the final alphabetic TLD.
	_RE_EMAIL = re.compile(r'\b[\w.+-]+@(?:[\w-]+\.)+[a-z]{2,}\b', re.IGNORECASE)
	_RE_HASHTAG = re.compile(r'#\w+')
	_RE_MENTION = re.compile(r'@\w+')
	_RE_HTML = re.compile(r'<[^>]+>')
	# Anything that is neither a word character nor whitespace.
	_RE_PUNCT = re.compile(r'[^\w\s]')
	_RE_SPACES = re.compile(r'\s+')
	# Case-sensitive on purpose: only the upper-case token "AI".
	_RE_AI = re.compile(r'\bAI\b')
	_RE_DOT_COM = re.compile(r'\.com\b', re.IGNORECASE)
	_RE_PLUS = re.compile(r'\+')
	_RE_AMPERSAND = re.compile(r'&')
	_RE_AT_SYMBOL = re.compile(r'@')
	# One or more consecutive line breaks.
	_RE_NEWLINE = re.compile(r'[\r\n]+')
	_RE_TILDE = re.compile(r'~')

	# Month abbreviations mapped to full names. "May" has no entry because it is
	# already a full name.
	_MONTH_MAP = {
	    'Jan': 'January',
	    'Feb': 'February',
	    'Mar': 'March',
	    'Apr': 'April',
	    'Jun': 'June',
	    'Jul': 'July',
	    'Aug': 'August',
	    'Sep': 'September',
	    'Sept': 'September',
	    'Oct': 'October',
	    'Nov': 'November',
	    'Dec': 'December',
	}

	# Title-case month abbreviation, optionally followed by a period, that is
	# followed by whitespace and a digit (or by trailing whitespace at end of text).
	# The word boundary sits BEFORE the optional period: "\.?\b" could never match
	# "Jan. 5", because there is no word boundary between "." and a space.
	# The alternations use a plain "|" (an escaped "\|" would match a literal pipe).
	_RE_MONTHS = re.compile(r'\b(Jan|Feb|Mar|Apr|Jun|Jul|Aug|Sep|Sept|Oct|Nov|Dec)\b\.?(?=\s\d|\s$)')
	# "May" is a common word, so it is matched only when followed by a number ("May 5").
	_RE_MAY = re.compile(r'\bMay\b(?=\s*\d)')

	# Number: do NOT match a leading minus if it is immediately preceded by a letter
	# (so the hyphen in "gpt-3", "gpl-3", "v-2" is not read as a minus sign).
	_RE_NUMBER = re.compile(r'(?<![a-zA-Z])-?[\d,]+(?:\.\d+)?')

	# Ordinals: 1st, 2nd, 3rd, 4th … 21st, 101st …
	_RE_ORDINAL = re.compile(r'\b(\d+)(st|nd|rd|th)\b', re.IGNORECASE)

	# Percentages: 50%, 3.5%
	_RE_PERCENT = re.compile(r'(-?[\d,]+(?:\.\d+)?)\s*%')

	# Currency: $100, €1,200.50, £50, $85K, $2.5M, $3 billion (optional scale suffix).
	# The whitespace before the suffix lives inside the optional group so that
	# "$5 to go" matches only "$5" and the following space is not swallowed.
	_RE_CURRENCY = re.compile(
	    r'([$€£¥₹₩₿])\s*([\d,]+(?:\.\d+)?)(?:\s*(million|billion|trillion|thousand|[KMBT]))?\b',
	    re.IGNORECASE,
	)

	# Time: 3:30pm, 14:00, 3:30 AM — requires 2-digit minutes so "3:0" (score) doesn't match.
	# As with currency, whitespace is consumed only when an am/pm marker follows it.
	_RE_TIME = re.compile(r'\b(\d{1,2}):(\d{2})(?::(\d{2}))?(?:\s*(am|pm))?\b', re.IGNORECASE)

	# Ranges: 10-20, 100-200 (both sides numeric, hyphen between them)
	_RE_RANGE = re.compile(r'(?<!\w)(\d+)-(\d+)(?!\w)')

	# Version/model names: gpt-3, gpt-3.5, Python-3.10, GPL-3
	# Letter(s)/digits + hyphen + digit(s) [+ more version parts]
	_RE_MODEL_VER = re.compile(r'\b([a-zA-Z][a-zA-Z0-9]*)-(\d[\d.]*)(?=[^\d.]|$)')

	# Measurement units glued to numbers: 100km, 50kg, 25°C, 5GB
	_RE_UNIT = re.compile(
	    r'(\d+(?:\.\d+)?)\s*(km|kg|mg|ml|gb|mb|kb|tb|hz|khz|mhz|ghz|mph|kph|°[cCfF]|[cCfF]°|ms|ns|µs)\b',
	    re.IGNORECASE,
	)

	# Scale suffixes (uppercase only to avoid ambiguity): 7B, 340M, 1.5K, 2T
	# Must NOT be followed by a letter or digit (so "MB" is left to the unit regex).
	_RE_SCALE = re.compile(r'(?<![a-zA-Z])(\d+(?:\.\d+)?)\s*([KMBT])(?![a-zA-Z\d])')

	# Scientific notation: 1e-4, 2.5e10, 6.022E23
	_RE_SCI = re.compile(r'(?<![a-zA-Z\d])(-?\d+(?:\.\d+)?)[eE]([+-]?\d+)(?![a-zA-Z\d])')

	# Fractions: 1/2, 3/4, 2 / 3 (optional whitespace around the slash)
	_RE_FRACTION = re.compile(r'\b(\d+)\s*/\s*(\d+)\b')

	# Decades: 80s, 90s, 1980s, 2020s (number ending in 0 followed by 's')
	_RE_DECADE = re.compile(r'\b(\d{1,3})0s\b')

	# Leading decimal (no digit before the dot): .5, .75
	_RE_LEAD_DEC = re.compile(r'(?<!\d)\.([\d])')


	# ─────────────────────────────────────────────
	# Expansion helpers
	# ─────────────────────────────────────────────
	def expand_abbreviations(text: str) -> str:
	    """
	    Respell abbreviations that must be handled before lowercasing.

	    AI   -> A.I.
	    .com -> dot com
	    """
	    # The "AI" pattern is case-sensitive, so lower-case "ai" is left alone.
	    spelled_out = _RE_AI.sub('A.I.', text)
	    return _RE_DOT_COM.sub(' dot com', spelled_out)


	def expand_symbols(text: str) -> str:
	    """Replace the symbols ``+``, ``&`` and ``@`` with space-padded words."""
	    substitutions = (
	        (_RE_PLUS, ' plus '),
	        (_RE_AMPERSAND, ' and '),
	        (_RE_AT_SYMBOL, ' at '),
	    )
	    for pattern, spoken in substitutions:
	        text = pattern.sub(spoken, text)
	    return text


	def _ordinal_suffix(n: int) -> str:
	    """
	    Return the ordinal word for n.

	    Only the final word of the cardinal form is converted.

	    Examples:
	        1   → "first"
	        5   → "fifth"
	        20  → "twentieth"
	        21  → "twenty-first"
	        100 → "one hundredth"
	    """
	    word = number_to_words(n)
	    # For hyphenated compounds like "twenty-one", convert only the last part.
	    if '-' in word:
	        prefix, last = word.rsplit('-', 1)
	        joiner = '-'
	    else:
	        # A single word gives prefix == joiner == ''.
	        prefix, joiner, last = word.rpartition(' ')

	    last_ord = _ORDINAL_EXCEPTIONS.get(last)
	    if last_ord is None:
	        if last.endswith('y'):
	            # twenty → twentieth (a plain "th" would give "twentyth")
	            last_ord = last[:-1] + 'ieth'
	        elif last.endswith('t'):
	            last_ord = last + 'h'
	        elif last.endswith('e'):
	            last_ord = last[:-1] + 'th'
	        else:
	            last_ord = last + 'th'

	    return f'{prefix}{joiner}{last_ord}' if prefix else last_ord


	def expand_ordinals(text: str) -> str:
	    """
	    Replace digit-plus-suffix ordinals with their ordinal words.

	    Examples:
	        "1st place"    → "first place"
	        "2nd floor"    → "second floor"
	        "3rd base"     → "third base"
	        "21st century" → "twenty-first century"
	        "100th day"    → "one hundredth day"
	    """
	    # Group 1 holds the digits; the st/nd/rd/th suffix is discarded.
	    return _RE_ORDINAL.sub(lambda match: _ordinal_suffix(int(match.group(1))), text)


	def expand_percentages(text: str) -> str:
	    """
	    Expand percentage expressions.

	    Decimal values are passed on as the matched text rather than via ``float``,
	    so trailing zeros are kept and very small values such as "0.00001%" are
	    read digit by digit instead of raising. A match without usable digits
	    (e.g. ",%") is left untouched.

	    Examples:
	        "50% off"     → "fifty percent off"
	        "3.5% rate"   → "three point five percent rate"
	        "3.50% rate"  → "three point five zero percent rate"
	        "-2% change"  → "negative two percent change"
	    """

	    def _replace(m: re.Match) -> str:
	        raw = m.group(1).replace(',', '')
	        try:
	            spoken = float_to_words(raw) if '.' in raw else number_to_words(int(raw))
	        except ValueError:
	            # The pattern allows comma-only "numbers"; keep the original text.
	            return m.group()
	        return f'{spoken} percent'

	    return _RE_PERCENT.sub(_replace, text)


	def expand_newlines(text: str) -> str:
	    """Turn each run of newlines/carriage returns into '. ' so the TTS pauses there."""
	    sentence_break = '. '
	    return _RE_NEWLINE.sub(sentence_break, text)


	def expand_tilde(text: str) -> str:
	    """Replace every '~' with the word 'about' followed by a space."""
	    spoken = 'about '
	    return _RE_TILDE.sub(spoken, text)


	def expand_currency(text: str) -> str:
	    """
	    Expand currency amounts, including optional scale suffixes.

	    The unit is singular when the whole amount is exactly one ("one dollar and
	    fifty cents"). Fractional amounts are always read as "cents", using the
	    first two decimal digits. A match without usable digits (e.g. "$,") is
	    left untouched.

	    Examples:
	        "$100"      → "one hundred dollars"
	        "€1,200.50" → "twelve hundred euros and fifty cents"
	        "£9.99"     → "nine pounds and ninety-nine cents"
	        "$1.50"     → "one dollar and fifty cents"
	        "$85K"      → "eighty-five thousand dollars"
	        "$2.5M"     → "two point five million dollars"
	    """

	    def _replace(m: re.Match) -> str:
	        raw = m.group(2).replace(',', '')
	        scale_suffix = m.group(3)
	        unit = _CURRENCY_SYMBOLS.get(m.group(1), '')

	        try:
	            # Scaled currency ($17.5 billion or $17.5B)
	            if scale_suffix:
	                # Normalise the suffix: 'B' or 'billion' -> 'billion'
	                scale_word = _CURRENCY_SCALE_MAP.get(scale_suffix.upper(), scale_suffix.lower())
	                num = float_to_words(raw) if '.' in raw else number_to_words(int(raw))
	                return f'{num} {scale_word} {unit}{"s" if unit else ""}'.strip()

	            # Standard currency ($17 or $17.50)
	            int_part, _, dec_part = raw.partition('.')
	            whole = int(int_part)
	        except ValueError:
	            # The pattern allows comma-only "numbers"; keep the original text.
	            return m.group()

	        result = number_to_words(whole)
	        if unit:
	            result += f' {unit}' if whole == 1 else f' {unit}s'

	        # Only the first two decimals count as cents: ".5" → 50, ".505" → 50.
	        cents = int(dec_part[:2].ljust(2, '0')) if dec_part else 0
	        if cents:
	            result += f' and {number_to_words(cents)} cent{"s" if cents != 1 else ""}'
	        return result

	    return _RE_CURRENCY.sub(_replace, text)


	def expand_time(text: str) -> str:
	    """
	    Expand time expressions.

	    A non-zero seconds field is spoken as well; previously it was matched
	    (and therefore removed from the text) but never read out.

	    Examples:
	        "3:30pm"   → "three thirty pm"
	        "14:00"    → "fourteen hundred"
	        "9:05 AM"  → "nine oh five am"
	        "12:00pm"  → "twelve pm"
	        "12:30:45" → "twelve thirty and forty-five seconds"
	    """

	    def _replace(m: re.Match) -> str:
	        hour_words = number_to_words(int(m.group(1)))
	        mins = int(m.group(2))
	        secs = int(m.group(3)) if m.group(3) else 0
	        meridiem = m.group(4)
	        suffix = f' {meridiem.lower()}' if meridiem else ''

	        if mins == 0:
	            # On the hour: "twelve pm" with a marker, "fourteen hundred" without.
	            spoken = hour_words if meridiem else f'{hour_words} hundred'
	        elif mins < 10:
	            spoken = f'{hour_words} oh {number_to_words(mins)}'
	        else:
	            spoken = f'{hour_words} {number_to_words(mins)}'

	        if secs:
	            spoken += f' and {number_to_words(secs)} second{"s" if secs != 1 else ""}'
	        return f'{spoken}{suffix}'

	    return _RE_TIME.sub(_replace, text)


	def expand_ranges(text: str) -> str:
	    """
	    Read "low-high" numeric ranges as "<low> to <high>".

	    Both ends are spelled as plain cardinal numbers.

	    Examples:
	        "10-20 items"   → "ten to twenty items"
	        "pages 100-200" → "pages one hundred to two hundred"
	        "2020-2024"     → "two thousand twenty to two thousand twenty-four"
	    """

	    def _spell_range(match: re.Match) -> str:
	        low, high = (number_to_words(int(bound)) for bound in match.groups())
	        return f'{low} to {high}'

	    return _RE_RANGE.sub(_spell_range, text)


	def expand_model_names(text: str) -> str:
	    """
	    Replace the hyphen in letter-hyphen-number names with a space, so the
	    number is not later misread as negative.

	    Examples:
	        "GPT-3"       → "GPT 3"
	        "gpt-3.5"     → "gpt 3.5"
	        "GPL-3"       → "GPL 3"
	        "Python-3.10" → "Python 3.10"
	        "v2.0"        stays as "v2.0"  (no hyphen — handled by number replacement)
	        "IPv6"        stays as "IPv6"
	    """

	    def _space_out(match: re.Match) -> str:
	        name, version = match.group(1), match.group(2)
	        return f'{name} {version}'

	    return _RE_MODEL_VER.sub(_space_out, text)


	def expand_units(text: str) -> str:
	    """
	    Expand common measurement units glued to numbers.

	    Decimal values are passed on as the matched text rather than via ``float``,
	    so trailing zeros are kept ("1.50kg") and very small values ("0.00001km")
	    are read digit by digit instead of raising.

	    Examples:
	        "100km"  → "one hundred kilometers"
	        "50kg"   → "fifty kilograms"
	        "25°C"   → "twenty-five degrees Celsius"
	        "5GB"    → "five gigabytes"
	        "1.50kg" → "one point five zero kilograms"
	    """
	    # Keys are lower-case; the matched unit is lower-cased before lookup.
	    _unit_map = {
	        'km': 'kilometers',
	        'kg': 'kilograms',
	        'mg': 'milligrams',
	        'ml': 'milliliters',
	        'gb': 'gigabytes',
	        'mb': 'megabytes',
	        'kb': 'kilobytes',
	        'tb': 'terabytes',
	        'hz': 'hertz',
	        'khz': 'kilohertz',
	        'mhz': 'megahertz',
	        'ghz': 'gigahertz',
	        'mph': 'miles per hour',
	        'kph': 'kilometers per hour',
	        'ms': 'milliseconds',
	        'ns': 'nanoseconds',
	        'µs': 'microseconds',
	        '°c': 'degrees Celsius',
	        'c°': 'degrees Celsius',
	        '°f': 'degrees Fahrenheit',
	        'f°': 'degrees Fahrenheit',
	    }

	    def _replace(m: re.Match) -> str:
	        raw = m.group(1)
	        # Unknown units fall back to the text exactly as matched.
	        expanded = _unit_map.get(m.group(2).lower(), m.group(2))
	        num = float_to_words(raw) if '.' in raw else number_to_words(int(raw))
	        return f'{num} {expanded}'

	    return _RE_UNIT.sub(_replace, text)


	def expand_roman_numerals(text: str, context_words: bool = True) -> str:
	    """
	    Expand upper-case Roman numerals that appear as standalone tokens.

	    Args:
	        text: Input text.
	        context_words: When True (default), a single-letter numeral is only
	            expanded if a title-like word ("Chapter", "King", "Part", …)
	            occurs in the 30 characters before it. When False, single
	            letters are expanded unconditionally.

	    Single letters are ambiguous ("I" the pronoun, "Vitamin C", "Plan D"), so
	    the context check applies to every one-letter numeral, not only I/V/X.

	    Examples:
	        "World War II" → "World War two"
	        "Chapter IV"   → "Chapter four"
	        "Louis XIV"    → "Louis fourteen"
	        "mix I with V" → left unchanged (ambiguous single letters)
	    """
	    # The alternations use a plain "|" (an escaped "\|" would match a literal pipe).
	    title_words = re.compile(
	        r'\b(war|chapter|part|volume|act|scene|book|section|article|'
	        r'king|queen|pope|louis|henry|edward|george|william|james|'
	        r'phase|round|level|stage|class|type|version|episode|season)\b',
	        re.IGNORECASE,
	    )

	    def _replace(m: re.Match) -> str:
	        roman = m.group(0)
	        # The numeral pattern can match the empty string at a word boundary.
	        if not roman.strip():
	            return roman
	        if context_words and len(roman) == 1:
	            # Match offsets refer to the original ``text`` passed to sub().
	            start = m.start()
	            preceding = text[max(0, start - 30) : start]
	            if not title_words.search(preceding):
	                return roman
	        try:
	            val = roman_to_int(roman)
	        except KeyError:
	            # Not a Roman numeral character; leave the token as it was.
	            return roman
	        return number_to_words(val) if val else roman

	    return _RE_ROMAN.sub(_replace, text)


	def normalize_leading_decimals(text: str) -> str:
	    """
	    Insert the missing "0" in front of bare leading-decimal numbers so the
	    number pipeline can handle them.

	    Examples:
	        ".5 teaspoons"    → "0.5 teaspoons"
	        "-.25 adjustment" → "-0.25 adjustment"
	    """
	    # First the signed form (-.5 → -0.5), then any remaining bare form (.5 → 0.5).
	    signed_fixed = re.sub(r'(?<!\d)(-)\.([\d])', r'\g<1>0.\2', text)
	    return _RE_LEAD_DEC.sub(r'0.\1', signed_fixed)


	def expand_scientific_notation(text: str) -> str:
	    """
	    Read scientific-notation numbers as "<coefficient> times ten to the <exponent>".

	    Examples:
	        "1e-4"     → "one times ten to the negative four"
	        "2.5e10"   → "two point five times ten to the ten"
	        "6.022E23" → "six point zero two two times ten to the twenty-three"
	    """

	    def _spell(match: re.Match) -> str:
	        coefficient, exponent = match.group(1), int(match.group(2))
	        if '.' in coefficient:
	            coefficient_words = float_to_words(coefficient)
	        else:
	            coefficient_words = number_to_words(int(coefficient))
	        # The exponent's sign is spoken separately from its magnitude.
	        sign_word = 'negative ' if exponent < 0 else ''
	        magnitude_words = number_to_words(abs(exponent))
	        return f'{coefficient_words} times ten to the {sign_word}{magnitude_words}'

	    return _RE_SCI.sub(_spell, text)


	def expand_scale_suffixes(text: str) -> str:
	    """
	    Spell out upper-case K/M/B/T scale suffixes attached to numbers.

	    Examples:
	        "7B parameters" → "seven billion parameters"
	        "340M model"    → "three hundred forty million model"
	        "1.5K salary"   → "one point five thousand salary"
	        "$100K budget"  → "$100K budget"  (currency handled upstream)
	    """
	    scale_words = {'K': 'thousand', 'M': 'million', 'B': 'billion', 'T': 'trillion'}

	    def _spell(match: re.Match) -> str:
	        amount, letter = match.groups()
	        if '.' in amount:
	            amount_words = float_to_words(amount)
	        else:
	            amount_words = number_to_words(int(amount))
	        return f'{amount_words} {scale_words.get(letter, letter)}'

	    return _RE_SCALE.sub(_spell, text)


	def expand_fractions(text: str) -> str:
	    """
	    Spell out simple numeric fractions; a zero denominator is left untouched.

	    Examples:
	        "1/2 cup"  → "one half cup"
	        "3/4 mile" → "three quarters mile"
	        "2/3 done" → "two thirds done"
	        "5/8 inch" → "five eighths inch"
	    """

	    def _spell(match: re.Match) -> str:
	        numerator, denominator = int(match.group(1)), int(match.group(2))
	        if not denominator:
	            return match.group()
	        plural = numerator != 1
	        if denominator == 2:
	            part = 'halves' if plural else 'half'
	        elif denominator == 4:
	            part = 'quarters' if plural else 'quarter'
	        else:
	            # Every other denominator uses its ordinal word, pluralised with "s".
	            part = _ordinal_suffix(denominator) + ('s' if plural else '')
	        return f'{number_to_words(numerator)} {part}'

	    return _RE_FRACTION.sub(_spell, text)


	def expand_decades(text: str) -> str:
	    """
	    Spell out decade expressions (a number ending in 0 followed by "s").

	    Examples:
	        "the 80s"   → "the eighties"
	        "the 1980s" → "the nineteen eighties"
	        "the 2020s" → "the twenty twenties"
	    """
	    # Indexed by the decade digit (the digit just before the final 0).
	    decade_names = (
	        'hundreds',
	        'tens',
	        'twenties',
	        'thirties',
	        'forties',
	        'fifties',
	        'sixties',
	        'seventies',
	        'eighties',
	        'nineties',
	    )

	    def _spell(match: re.Match) -> str:
	        # Group 1 is the number without its final 0: "80s" → 8, "1980s" → 198.
	        century_part, decade_digit = divmod(int(match.group(1)), 10)
	        decade_word = decade_names[decade_digit]
	        if century_part == 0:
	            return decade_word
	        return f'{number_to_words(century_part)} {decade_word}'

	    return _RE_DECADE.sub(_spell, text)


	def expand_ip_addresses(text: str) -> str:
	    """
	    Expand IPv4-style dotted quads to spoken digits per octet.

	    ``\\d`` also matches non-ASCII decimal digits, so each digit is converted
	    with ``int()`` before lookup; a plain dict keyed by "0"–"9" would raise
	    KeyError on such input.

	    Examples:
	        "192.168.1.1" → "one nine two dot one six eight dot one dot one"
	        "10.0.0.1"    → "one zero dot zero dot zero dot one"
	    """
	    digit_words = ('zero', 'one', 'two', 'three', 'four', 'five', 'six', 'seven', 'eight', 'nine')

	    def _octet(s: str) -> str:
	        return ' '.join(digit_words[int(c)] for c in s)

	    def _replace(m: re.Match) -> str:
	        return ' dot '.join(_octet(g) for g in m.groups())

	    return re.sub(r'\b(\d{1,3})\.(\d{1,3})\.(\d{1,3})\.(\d{1,3})\b', _replace, text)


	def expand_phone_numbers(text: str) -> str:
	    """
	    Expand US-style phone numbers to spoken digits, so that a later range
	    expansion does not claim the hyphens.

	    ``\\d`` also matches non-ASCII decimal digits, so each digit is converted
	    with ``int()`` before lookup; a plain dict keyed by "0"–"9" would raise
	    KeyError on such input.

	    Examples:
	        "555-1234"       → "five five five one two three four"
	        "555-123-4567"   → "five five five one two three four five six seven"
	        "1-800-555-0199" → "one eight zero zero five five five zero one nine nine"
	    """
	    digit_words = ('zero', 'one', 'two', 'three', 'four', 'five', 'six', 'seven', 'eight', 'nine')

	    def _digits(s: str) -> str:
	        return ' '.join(digit_words[int(c)] for c in s)

	    def _join(m: re.Match) -> str:
	        return ' '.join(_digits(g) for g in m.groups())

	    # Longest pattern first to avoid partial matches. The look-arounds stop a
	    # shorter pattern from matching inside a longer digit-hyphen sequence.
	    patterns = (
	        # 11-digit: 1-800-555-0199
	        r'(?<!\d-)(?<!\d)\b(\d{1,2})-(\d{3})-(\d{3})-(\d{4})\b(?!-\d)',
	        # 10-digit: 555-123-4567
	        r'(?<!\d-)(?<!\d)\b(\d{3})-(\d{3})-(\d{4})\b(?!-\d)',
	        # 7-digit local: 555-1234
	        r'(?<!\d-)\b(\d{3})-(\d{4})\b(?!-\d)',
	    )
	    for pattern in patterns:
	        text = re.sub(pattern, _join, text)
	    return text


	def expand_months(text: str) -> str:
	    """
	    Expand month abbreviations (Jan, Feb, …) to full month names.

	    Only abbreviations matched by the month pattern are touched; any matched
	    abbreviation without a table entry is kept as-is.
	    """
	    # 1. Standard abbreviations, looked up in the month table.
	    expanded = _RE_MONTHS.sub(lambda match: _MONTH_MAP.get(match.group(1), match.group(1)), text)

	    # 2. "May" followed by a digit is replaced by itself, i.e. left as a word.
	    return _RE_MAY.sub('May', expanded)


	# ─────────────────────────────────────────────
	# Core preprocessing functions
	# ─────────────────────────────────────────────


	def replace_numbers(text: str, replace_floats: bool = True) -> str:
	    """
	    Replace all numeric tokens with their word equivalents.

	    Integers are parsed directly with ``int`` (not via ``float``), so long
	    digit strings keep every digit. Tokens that cannot be parsed (for example
	    a lone comma, which the number pattern also matches) are left unchanged.

	    Args:
	        text: Input text.
	        replace_floats: When True, decimals are read digit by digit
	            ("3.14" → "three point one four"); when False, only the integer
	            part is spoken.

	    Examples:
	        "There are 1200 students" → "There are twelve hundred students"
	        "Pi is 3.14"              → "Pi is three point one four"
	        "gpt-3 rocks"             → "gpt-three rocks"  (hyphen not treated as minus)
	    """

	    def _replace(m: re.Match) -> str:
	        raw = m.group().replace(',', '')
	        try:
	            if '.' in raw and replace_floats:
	                # Pass the raw string so trailing zeros are preserved
	                # ("1.50" → "one point five zero").
	                return float_to_words(raw)
	            # Integer part only; an empty or sign-only integer part counts as 0.
	            int_text = raw.partition('.')[0]
	            if '.' in raw and int_text in ('', '-'):
	                int_text = '0'
	            return number_to_words(int(int_text))
	        except (ValueError, OverflowError):
	            return m.group()

	    return _RE_NUMBER.sub(_replace, text)


	def to_lowercase(text: str) -> str:
	    """Return ``text`` with every cased character lower-cased."""
	    lowered = text.lower()
	    return lowered


	def remove_urls(text: str, replacement: str = '') -> str:
	    """Substitute every URL match with ``replacement`` and strip the ends of the result."""
	    without_urls = _RE_URL.sub(replacement, text)
	    return without_urls.strip()


	def remove_emails(text: str, replacement: str = '') -> str:
	    """Substitute every e-mail address match with ``replacement`` and strip the ends."""
	    without_emails = _RE_EMAIL.sub(replacement, text)
	    return without_emails.strip()


	def remove_html_tags(text: str) -> str:
	    """Replace each HTML tag with a single space."""
	    tag_placeholder = ' '
	    return _RE_HTML.sub(tag_placeholder, text)


	def remove_hashtags(text: str, replacement: str = '') -> str:
	    """Substitute every hashtag (e.g. #NLP) with ``replacement``."""
	    cleaned = _RE_HASHTAG.sub(replacement, text)
	    return cleaned


	def remove_mentions(text: str, replacement: str = '') -> str:
	    """Substitute every @mention with ``replacement``."""
	    cleaned = _RE_MENTION.sub(replacement, text)
	    return cleaned


	def remove_punctuation(text: str) -> str:
	    """Replace each punctuation character matched by the module pattern with a space."""
	    blank = ' '
	    return _RE_PUNCT.sub(blank, text)


	def remove_extra_whitespace(text: str) -> str:
	    """Collapse each whitespace run to one space, then strip both ends."""
	    collapsed = _RE_SPACES.sub(' ', text)
	    return collapsed.strip()


	def normalize_unicode(text: str, form: str = 'NFC') -> str:
	    """Return ``text`` in the given Unicode normalization form (NFC, NFD, NFKC or NFKD)."""
	    normalized = unicodedata.normalize(form, text)
	    return normalized


	def remove_accents(text: str) -> str:
	    """Strip diacritical marks by decomposing (NFD) and dropping non-spacing marks."""
	    kept = []
	    for char in unicodedata.normalize('NFD', text):
	        # Category "Mn" is the combining marks left behind by the decomposition.
	        if unicodedata.category(char) != 'Mn':
	            kept.append(char)
	    return ''.join(kept)


	def expand_contractions(text: str) -> str:
	    """
	    Expand common English contractions.

	    Both the ASCII apostrophe (') and the typographic apostrophe (U+2019) are
	    recognised. Matching is case-insensitive; fixed replacements such as
	    "cannot" are always lower-case.

	    Examples:
	        "don't"   → "do not"
	        "they're" → "they are"
	        "I've"    → "I have"
	    """
	    apos = "['\u2019]"
	    # Order matters: the irregular forms must be replaced before the generic
	    # "n't" rule sees them.
	    rules = (
	        (rf"\bcan{apos}t\b", 'cannot'),
	        (rf"\bwon{apos}t\b", 'will not'),
	        (rf"\bshan{apos}t\b", 'shall not'),
	        (rf"\bain{apos}t\b", 'is not'),
	        (rf"\blet{apos}s\b", 'let us'),
	        (rf"\b(\w+)n{apos}t\b", r'\1 not'),
	        (rf"\b(\w+){apos}re\b", r'\1 are'),
	        (rf"\b(\w+){apos}ve\b", r'\1 have'),
	        (rf"\b(\w+){apos}ll\b", r'\1 will'),
	        (rf"\b(\w+){apos}d\b", r'\1 would'),
	        (rf"\b(\w+){apos}m\b", r'\1 am'),
	        (rf"\bit{apos}s\b", 'it is'),
	    )
	    for pattern, replacement in rules:
	        text = re.sub(pattern, replacement, text, flags=re.IGNORECASE)
	    return text


	def remove_stopwords(text: str, stopwords: set \| None = None) -> str:
	"""
	Remove stopwords from text.

	Args:
	stopwords: Set of words to remove. Uses a built-in English set if None.
	"""
	if stopwords is None:
	stopwords = {
	'a',
	'an',
	'the',
	'and',
	'or',
	'but',
	'in',
	'on',
	'at',
	'to',
	'for',
	'of',
	'with',
	'by',
	'from',
	'is',
	'was',
	'are',
	'were',
	'be',
	'been',
	'being',
	'have',
	'has',
	'had',
	'do',
	'does',
	'did',
	'will',
	'would',
	'could',
	'should',
	'may',
	'might',
	'this',
	'that',
	'these',
	'those',
	'it',
	'its',
	'i',
	'me',
	'my',
	'we',
	'our',
	'you',
	'your',
	'he',
	'she',
	'him',
	'her',
	'they',
	'them',
	'their',
	}
	tokens = text.split()
	return ' '.join(t for t in tokens if t.lower() not in stopwords)


	# ─────────────────────────────────────────────
	# Pipeline helper
	# ─────────────────────────────────────────────


	class TextPreprocessor:
	    """
	    Configurable preprocessing pipeline.

	    Each constructor flag switches one step on or off; ``process`` (also
	    reachable by calling the instance) applies the enabled steps in a fixed
	    order: Unicode normalisation, removal steps, text expansions, numeric
	    expansions, then final clean-up.

	    Usage:
	        pp = TextPreprocessor(
	            lowercase=True,
	            replace_numbers=True,
	            remove_urls=True,
	            remove_html=True,
	            remove_punctuation=True,
	        )
	        clean = pp("It is 50% cheaper!")
	        # → "it is fifty percent cheaper"
	    """

	    def __init__(
	        self,
	        lowercase: bool = True,
	        replace_numbers: bool = True,
	        replace_floats: bool = True,
	        expand_newlines: bool = True,
	        expand_tilde: bool = True,
	        expand_abbreviations: bool = True,
	        expand_symbols: bool = True,
	        expand_contractions: bool = True,
	        expand_model_names: bool = True,
	        expand_ordinals: bool = True,
	        expand_percentages: bool = True,
	        expand_currency: bool = True,
	        expand_time: bool = True,
	        expand_ranges: bool = True,
	        expand_units: bool = True,
	        expand_scale_suffixes: bool = True,
	        expand_scientific_notation: bool = True,
	        expand_fractions: bool = True,
	        expand_decades: bool = True,
	        expand_phone_numbers: bool = True,
	        expand_ip_addresses: bool = True,
	        normalize_leading_decimals: bool = True,
	        expand_roman_numerals: bool = False,
	        remove_urls: bool = True,
	        remove_emails: bool = True,
	        remove_html: bool = True,
	        remove_hashtags: bool = False,
	        remove_mentions: bool = False,
	        remove_punctuation: bool = True,
	        remove_stopwords: bool = False,
	        stopwords: set | None = None,
	        normalize_unicode: bool = True,
	        remove_accents: bool = False,
	        remove_extra_whitespace: bool = True,
	    ):
	        # Snapshot every constructor argument (all locals except ``self``).
	        self.config = {k: v for k, v in locals().items() if k != 'self'}
	        self._stopwords = stopwords

	    def __call__(self, text: str) -> str:
	        """Shorthand for ``process(text)``."""
	        return self.process(text)

	    def process(self, text: str) -> str:
	        """Run every enabled step over ``text`` and return the result."""
	        cfg = self.config

	        # Normalise and strip markup/addresses FIRST. These steps look for
	        # "@", ".com", "www." etc., so they must run before the expansions
	        # below rewrite those characters ("@" → " at ", ".com" → " dot com").
	        if cfg['normalize_unicode']:
	            text = normalize_unicode(text)
	        if cfg['remove_html']:
	            text = remove_html_tags(text)
	        if cfg['remove_urls']:
	            text = remove_urls(text)
	        if cfg['remove_emails']:
	            text = remove_emails(text)
	        if cfg['remove_hashtags']:
	            text = remove_hashtags(text)
	        if cfg['remove_mentions']:
	            text = remove_mentions(text)

	        if cfg.get('expand_abbreviations'):
	            text = expand_abbreviations(text)
	            # Month expansion is tied to the abbreviation switch.
	            text = expand_months(text)
	        if cfg.get('expand_newlines'):
	            text = expand_newlines(text)
	        if cfg.get('expand_tilde'):
	            text = expand_tilde(text)
	        if cfg['expand_contractions']:
	            text = expand_contractions(text)
	        # IP addresses before normalize_leading_decimals (IPs contain dots before digits)
	        if cfg['expand_ip_addresses']:
	            text = expand_ip_addresses(text)
	        # Normalise bare leading decimals early so downstream regexes see "0.5" not ".5"
	        if cfg['normalize_leading_decimals']:
	            text = normalize_leading_decimals(text)
	        # Expand special forms before generic number replacement
	        if cfg['expand_currency']:
	            text = expand_currency(text)
	        if cfg['expand_percentages']:
	            text = expand_percentages(text)
	        # Scientific notation before model-name expansion (e.g. "1e-4" contains "e-4")
	        if cfg['expand_scientific_notation']:
	            text = expand_scientific_notation(text)
	        # Symbols only after scientific notation, so the "+" in "1e+4" is
	        # still there when the exponent is parsed.
	        if cfg.get('expand_symbols'):
	            text = expand_symbols(text)
	        if cfg['expand_time']:
	            text = expand_time(text)
	        if cfg['expand_ordinals']:
	            text = expand_ordinals(text)
	        if cfg['expand_units']:
	            text = expand_units(text)
	        # Scale suffixes after units (units handles "MB"/"GB"; this handles bare "B"/"M")
	        if cfg['expand_scale_suffixes']:
	            text = expand_scale_suffixes(text)
	        if cfg['expand_fractions']:
	            text = expand_fractions(text)
	        if cfg['expand_decades']:
	            text = expand_decades(text)
	        # Phone numbers before ranges, otherwise NNN-NNNN is treated as a range
	        if cfg['expand_phone_numbers']:
	            text = expand_phone_numbers(text)
	        if cfg['expand_ranges']:
	            text = expand_ranges(text)
	        if cfg['expand_model_names']:
	            text = expand_model_names(text)
	        if cfg['expand_roman_numerals']:
	            text = expand_roman_numerals(text)
	        if cfg['replace_numbers']:
	            text = replace_numbers(text, replace_floats=cfg['replace_floats'])

	        # Final clean-up.
	        if cfg['remove_accents']:
	            text = remove_accents(text)
	        if cfg['remove_punctuation']:
	            text = remove_punctuation(text)
	        if cfg['lowercase']:
	            text = to_lowercase(text)
	        if cfg['remove_stopwords']:
	            text = remove_stopwords(text, self._stopwords)
	        if cfg['remove_extra_whitespace']:
	            text = remove_extra_whitespace(text)

	        return text