| """ |
| Adapted and supplemented from origional at https://github.com/KittenML/KittenTTS/blob/main/kittentts/preprocess.py |
| See license at: https://github.com/KittenML/KittenTTS/blob/main/LICENSE (Apache 2.0) |
| """ |
|
|
| import re |
| import unicodedata |
|
|
| |
| |
| |
|
|
| _ONES = [ |
| '', |
| 'one', |
| 'two', |
| 'three', |
| 'four', |
| 'five', |
| 'six', |
| 'seven', |
| 'eight', |
| 'nine', |
| 'ten', |
| 'eleven', |
| 'twelve', |
| 'thirteen', |
| 'fourteen', |
| 'fifteen', |
| 'sixteen', |
| 'seventeen', |
| 'eighteen', |
| 'nineteen', |
| ] |
| _TENS = ['', '', 'twenty', 'thirty', 'forty', 'fifty', 'sixty', 'seventy', 'eighty', 'ninety'] |
| _SCALE = ['', 'thousand', 'million', 'billion', 'trillion'] |
|
|
| _ORDINAL_EXCEPTIONS = { |
| 'one': 'first', |
| 'two': 'second', |
| 'three': 'third', |
| 'four': 'fourth', |
| 'five': 'fifth', |
| 'six': 'sixth', |
| 'seven': 'seventh', |
| 'eight': 'eighth', |
| 'nine': 'ninth', |
| 'twelve': 'twelfth', |
| } |
|
|
| _CURRENCY_SYMBOLS = { |
| '$': 'dollar', |
| '€': 'euro', |
| '£': 'pound', |
| '¥': 'yen', |
| '₹': 'rupee', |
| '₩': 'won', |
| '₿': 'bitcoin', |
| } |
|
|
| _CURRENCY_SCALE_MAP = { |
| 'K': 'thousand', |
| 'M': 'million', |
| 'B': 'billion', |
| 'T': 'trillion', |
| 'thousand': 'thousand', |
| 'million': 'million', |
| 'billion': 'billion', |
| 'trillion': 'trillion', |
| } |
|
|
| _ROMAN = [ |
| (1000, 'M'), |
| (900, 'CM'), |
| (500, 'D'), |
| (400, 'CD'), |
| (100, 'C'), |
| (90, 'XC'), |
| (50, 'L'), |
| (40, 'XL'), |
| (10, 'X'), |
| (9, 'IX'), |
| (5, 'V'), |
| (4, 'IV'), |
| (1, 'I'), |
| ] |
| _RE_ROMAN = re.compile(r'\b(M{0,4})(CM|CD|D?C{0,3})(XC|XL|L?X{0,3})(IX|IV|V?I{0,3})\b') |
|
|
|
|
| def _three_digits_to_words(n: int) -> str: |
| """Convert a number 0–999 to English words.""" |
| if n == 0: |
| return '' |
| parts = [] |
| hundreds = n // 100 |
| remainder = n % 100 |
| if hundreds: |
| parts.append(f'{_ONES[hundreds]} hundred') |
| if remainder < 20: |
| if remainder: |
| parts.append(_ONES[remainder]) |
| else: |
| tens_word = _TENS[remainder // 10] |
| ones_word = _ONES[remainder % 10] |
| parts.append(f'{tens_word}-{ones_word}' if ones_word else tens_word) |
| return ' '.join(parts) |
|
|
|
|
| def number_to_words(n: int) -> str: |
| """ |
| Convert an integer to its English word representation. |
| |
| Examples: |
| 1200 → "twelve hundred" |
| 1000 → "one thousand" |
| 1_000_000 → "one million" |
| -42 → "negative forty-two" |
| 0 → "zero" |
| """ |
| if not isinstance(n, int): |
| n = int(n) |
| if n == 0: |
| return 'zero' |
| if n < 0: |
| return f'negative {number_to_words(-n)}' |
|
|
| |
| |
| if 100 <= n <= 9999 and n % 100 == 0 and n % 1000 != 0: |
| hundreds = n // 100 |
| if hundreds < 20: |
| return f'{_ONES[hundreds]} hundred' |
|
|
| parts = [] |
| for _i, scale in enumerate(_SCALE): |
| chunk = n % 1000 |
| if chunk: |
| chunk_words = _three_digits_to_words(chunk) |
| parts.append(f'{chunk_words} {scale}'.strip() if scale else chunk_words) |
| n //= 1000 |
| if n == 0: |
| break |
|
|
| return ' '.join(reversed(parts)) |
|
|
|
|
| def float_to_words(value, decimal_sep: str = 'point') -> str: |
| """ |
| Convert a float (or numeric string) to words, reading decimal digits individually. |
| Accepts a string to preserve trailing zeros (e.g. "1.50" → "one point five zero"). |
| |
| Examples: |
| 3.14 → "three point one four" |
| -0.5 → "negative zero point five" |
| "3.10" → "three point one zero" |
| 1.007 → "one point zero zero seven" |
| """ |
| text = value if isinstance(value, str) else f'{value}' |
| negative = text.startswith('-') |
| if negative: |
| text = text[1:] |
|
|
| if '.' in text: |
| int_part, dec_part = text.split('.', 1) |
| int_words = number_to_words(int(int_part)) if int_part else 'zero' |
| |
| digit_map = ['zero'] + _ONES[1:] |
| dec_words = ' '.join(digit_map[int(d)] for d in dec_part) |
| result = f'{int_words} {decimal_sep} {dec_words}' |
| else: |
| result = number_to_words(int(text)) |
|
|
| return f'negative {result}' if negative else result |
|
|
|
|
| def roman_to_int(s: str) -> int: |
| """Convert a Roman numeral string to an integer.""" |
| val = {'I': 1, 'V': 5, 'X': 10, 'L': 50, 'C': 100, 'D': 500, 'M': 1000} |
| result = 0 |
| prev = 0 |
| for ch in reversed(s.upper()): |
| curr = val[ch] |
| result += curr if curr >= prev else -curr |
| prev = curr |
| return result |
|
|
|
|
| |
| |
| |
|
|
| _RE_URL = re.compile(r'https?://\S+|www\.\S+') |
| _RE_EMAIL = re.compile(r'\b[\w.+-]+@[\w-]+\.[a-z]{2,}\b', re.IGNORECASE) |
| _RE_HASHTAG = re.compile(r'#\w+') |
| _RE_MENTION = re.compile(r'@\w+') |
| _RE_HTML = re.compile(r'<[^>]+>') |
| _RE_PUNCT = re.compile(r'[^\w\s]') |
| _RE_SPACES = re.compile(r'\s+') |
| _RE_AI = re.compile(r'\bAI\b') |
| _RE_DOT_COM = re.compile(r'\.com\b', re.IGNORECASE) |
| _RE_PLUS = re.compile(r'\+') |
| _RE_AMPERSAND = re.compile(r'&') |
| _RE_AT_SYMBOL = re.compile(r'@') |
| _RE_NEWLINE = re.compile(r'[\r\n]+') |
| _RE_TILDE = re.compile(r'~') |
|
|
| _MONTH_MAP = { |
| 'Jan': 'January', |
| 'Feb': 'February', |
| 'Mar': 'March', |
| 'Apr': 'April', |
| 'Jun': 'June', |
| 'Jul': 'July', |
| 'Aug': 'August', |
| 'Sep': 'September', |
| 'Sept': 'September', |
| 'Oct': 'October', |
| 'Nov': 'November', |
| 'Dec': 'December', |
| } |
|
|
| |
| |
| _RE_MONTHS = re.compile(r'\b(Jan|Feb|Mar|Apr|Jun|Jul|Aug|Sep|Sept|Oct|Nov|Dec)\.?\b(?=\s*\d|\s*$)') |
| _RE_MAY = re.compile(r'\bMay\b(?=\s*\d)') |
|
|
| |
| |
| _RE_NUMBER = re.compile(r'(?<![a-zA-Z])-?[\d,]+(?:\.\d+)?') |
|
|
| |
| _RE_ORDINAL = re.compile(r'\b(\d+)(st|nd|rd|th)\b', re.IGNORECASE) |
|
|
| |
| _RE_PERCENT = re.compile(r'(-?[\d,]+(?:\.\d+)?)\s*%') |
|
|
| |
| _RE_CURRENCY = re.compile( |
| r'([$€£¥₹₩₿])\s*([\d,]+(?:\.\d+)?)\s*(million|billion|trillion|thousand|[KMBT])?\b', |
| re.IGNORECASE, |
| ) |
|
|
| |
| _RE_TIME = re.compile(r'\b(\d{1,2}):(\d{2})(?::(\d{2}))?\s*(am|pm)?\b', re.IGNORECASE) |
|
|
| |
| _RE_RANGE = re.compile(r'(?<!\w)(\d+)-(\d+)(?!\w)') |
|
|
| |
| |
| _RE_MODEL_VER = re.compile(r'\b([a-zA-Z][a-zA-Z0-9]*)-(\d[\d.]*)(?=[^\d.]|$)') |
|
|
| |
| _RE_UNIT = re.compile( |
| r'(\d+(?:\.\d+)?)\s*(km|kg|mg|ml|gb|mb|kb|tb|hz|khz|mhz|ghz|mph|kph|°[cCfF]|[cCfF]°|ms|ns|µs)\b', |
| re.IGNORECASE, |
| ) |
|
|
| |
| |
| _RE_SCALE = re.compile(r'(?<![a-zA-Z])(\d+(?:\.\d+)?)\s*([KMBT])(?![a-zA-Z\d])') |
|
|
| |
| _RE_SCI = re.compile(r'(?<![a-zA-Z\d])(-?\d+(?:\.\d+)?)[eE]([+-]?\d+)(?![a-zA-Z\d])') |
|
|
| |
| _RE_FRACTION = re.compile(r'\b(\d+)\s*/\s*(\d+)\b') |
|
|
| |
| _RE_DECADE = re.compile(r'\b(\d{1,3})0s\b') |
|
|
| |
| _RE_LEAD_DEC = re.compile(r'(?<!\d)\.([\d])') |
|
|
|
|
| |
| |
| |
| def expand_abbreviations(text: str) -> str: |
| """ |
| Handles specific abbreviations before lowercase normalization. |
| AI -> A.I. |
| .com -> dot com |
| """ |
| |
| text = _RE_AI.sub('A.I.', text) |
| |
| text = _RE_DOT_COM.sub(' dot com', text) |
| return text |
|
|
|
|
| def expand_symbols(text: str) -> str: |
| """ |
| Translates mathematical and connector symbols to words. |
| """ |
| text = _RE_PLUS.sub(' plus ', text) |
| text = _RE_AMPERSAND.sub(' and ', text) |
| text = _RE_AT_SYMBOL.sub(' at ', text) |
| return text |
|
|
|
|
| def _ordinal_suffix(n: int) -> str: |
| """Return the ordinal word for n (e.g. 1 → 'first', 5 → 'fifth', 21 → 'twenty-first').""" |
| word = number_to_words(n) |
| |
| if '-' in word: |
| prefix, last = word.rsplit('-', 1) |
| joiner = '-' |
| else: |
| parts = word.rsplit(' ', 1) |
| prefix, last, joiner = (parts[0], parts[1], ' ') if len(parts) == 2 else ('', parts[0], '') |
|
|
| |
| for base, ordinal in _ORDINAL_EXCEPTIONS.items(): |
| if last == base: |
| last_ord = ordinal |
| break |
| else: |
| |
| if last.endswith('t'): |
| last_ord = last + 'h' |
| elif last.endswith('e'): |
| last_ord = last[:-1] + 'th' |
| else: |
| last_ord = last + 'th' |
|
|
| return f'{prefix}{joiner}{last_ord}' if prefix else last_ord |
|
|
|
|
| def expand_ordinals(text: str) -> str: |
| """ |
| Convert ordinal numbers to words. |
| |
| Examples: |
| "1st place" → "first place" |
| "2nd floor" → "second floor" |
| "3rd base" → "third base" |
| "21st century" → "twenty-first century" |
| "100th day" → "one hundredth day" |
| """ |
|
|
| def _replace(m: re.Match) -> str: |
| return _ordinal_suffix(int(m.group(1))) |
|
|
| return _RE_ORDINAL.sub(_replace, text) |
|
|
|
|
| def expand_percentages(text: str) -> str: |
| """ |
| Expand percentage expressions. |
| |
| Examples: |
| "50% off" → "fifty percent off" |
| "3.5% rate" → "three point five percent rate" |
| "-2% change" → "negative two percent change" |
| """ |
|
|
| def _replace(m: re.Match) -> str: |
| raw = m.group(1).replace(',', '') |
| if '.' in raw: |
| return float_to_words(float(raw)) + ' percent' |
| return number_to_words(int(raw)) + ' percent' |
|
|
| return _RE_PERCENT.sub(_replace, text) |
|
|
|
|
| def expand_newlines(text: str) -> str: |
| """Change newlines/returns to a period and space for TTS pausing.""" |
| return _RE_NEWLINE.sub('. ', text) |
|
|
|
|
| def expand_tilde(text: str) -> str: |
| """Change ~ to 'about'.""" |
| return _RE_TILDE.sub('about ', text) |
|
|
|
|
| def expand_currency(text: str) -> str: |
| """ |
| Expand currency amounts, including optional scale suffixes. |
| |
| Examples: |
| "$100" → "one hundred dollars" |
| "€1,200.50" → "twelve hundred euros and fifty cents" |
| "£9.99" → "nine pounds and ninety-nine cents" |
| "$85K" → "eighty five thousand dollars" |
| "$2.5M" → "two point five million dollars" |
| """ |
|
|
| def _replace(m: re.Match) -> str: |
| symbol = m.group(1) |
| raw = m.group(2).replace(',', '') |
| scale_suffix = m.group(3) |
| unit = _CURRENCY_SYMBOLS.get(symbol, '') |
|
|
| |
| if scale_suffix: |
| |
| scale_word = _CURRENCY_SCALE_MAP.get(scale_suffix.upper(), scale_suffix.lower()) |
| num = float_to_words(raw) if '.' in raw else number_to_words(int(raw)) |
| return f'{num} {scale_word} {unit}{"s" if unit else ""}'.strip() |
|
|
| |
| if '.' in raw: |
| int_part, dec_part = raw.split('.', 1) |
| dec_val = int(dec_part[:2].ljust(2, '0')) |
| int_words = number_to_words(int(int_part)) |
| result = f'{int_words} {unit}s' if unit else int_words |
| if dec_val: |
| cents = number_to_words(dec_val) |
| result += f' and {cents} cent{"s" if dec_val != 1 else ""}' |
| else: |
| val = int(raw) |
| words = number_to_words(val) |
| result = f'{words} {unit}{"s" if val != 1 and unit else ""}' if unit else words |
| return result |
|
|
| return _RE_CURRENCY.sub(_replace, text) |
|
|
|
|
| def expand_time(text: str) -> str: |
| """ |
| Expand time expressions. |
| |
| Examples: |
| "3:30pm" → "three thirty pm" |
| "14:00" → "fourteen hundred" |
| "9:05 AM" → "nine oh five am" |
| "12:00pm" → "twelve pm" |
| """ |
|
|
| def _replace(m: re.Match) -> str: |
| h = int(m.group(1)) |
| mins = int(m.group(2)) |
| suffix = (' ' + m.group(4).lower()) if m.group(4) else '' |
| h_words = number_to_words(h) |
| if mins == 0: |
| return f'{h_words} hundred{suffix}' if not m.group(4) else f'{h_words}{suffix}' |
| elif mins < 10: |
| return f'{h_words} oh {number_to_words(mins)}{suffix}' |
| else: |
| return f'{h_words} {number_to_words(mins)}{suffix}' |
|
|
| return _RE_TIME.sub(_replace, text) |
|
|
|
|
| def expand_ranges(text: str) -> str: |
| """ |
| Expand numeric ranges. |
| |
| Examples: |
| "10-20 items" → "ten to twenty items" |
| "pages 100-200" → "pages one hundred to two hundred" |
| "2020-2024" → "twenty twenty to twenty twenty-four" |
| """ |
|
|
| def _replace(m: re.Match) -> str: |
| lo = number_to_words(int(m.group(1))) |
| hi = number_to_words(int(m.group(2))) |
| return f'{lo} to {hi}' |
|
|
| return _RE_RANGE.sub(_replace, text) |
|
|
|
|
| def expand_model_names(text: str) -> str: |
| """ |
| Normalise version/model names that use letter-hyphen-number patterns, |
| so the number is not misread as negative. |
| |
| Examples: |
| "GPT-3" → "GPT 3" |
| "gpt-3.5" → "gpt 3.5" |
| "GPL-3" → "GPL 3" |
| "Python-3.10"→ "Python 3.10" |
| "v2.0" stays as "v2.0" (no hyphen — handled by number replacement) |
| "IPv6" stays as "IPv6" |
| """ |
| return _RE_MODEL_VER.sub(lambda m: f'{m.group(1)} {m.group(2)}', text) |
|
|
|
|
| def expand_units(text: str) -> str: |
| """ |
| Expand common measurement units glued to numbers. |
| |
| Examples: |
| "100km" → "one hundred kilometers" |
| "50kg" → "fifty kilograms" |
| "25°C" → "twenty-five degrees Celsius" |
| "5GB" → "five gigabytes" |
| """ |
| _unit_map = { |
| 'km': 'kilometers', |
| 'kg': 'kilograms', |
| 'mg': 'milligrams', |
| 'ml': 'milliliters', |
| 'gb': 'gigabytes', |
| 'mb': 'megabytes', |
| 'kb': 'kilobytes', |
| 'tb': 'terabytes', |
| 'hz': 'hertz', |
| 'khz': 'kilohertz', |
| 'mhz': 'megahertz', |
| 'ghz': 'gigahertz', |
| 'mph': 'miles per hour', |
| 'kph': 'kilometers per hour', |
| 'ms': 'milliseconds', |
| 'ns': 'nanoseconds', |
| 'µs': 'microseconds', |
| '°c': 'degrees Celsius', |
| 'c°': 'degrees Celsius', |
| '°f': 'degrees Fahrenheit', |
| 'f°': 'degrees Fahrenheit', |
| } |
|
|
| def _replace(m: re.Match) -> str: |
| raw = m.group(1) |
| unit = m.group(2).lower() |
| expanded = _unit_map.get(unit, m.group(2)) |
| num = float_to_words(float(raw)) if '.' in raw else number_to_words(int(raw)) |
| return f'{num} {expanded}' |
|
|
| return _RE_UNIT.sub(_replace, text) |
|
|
|
|
| def expand_roman_numerals(text: str, context_words: bool = True) -> str: |
| """ |
| Expand Roman numerals that appear as standalone tokens (optionally |
| only when preceded by a title-like word to avoid false positives). |
| |
| Examples: |
| "World War II" → "World War two" |
| "Chapter IV" → "Chapter four" |
| "Louis XIV" → "Louis fourteen" |
| "mix I with V" → left unchanged (ambiguous single letters) |
| """ |
| _TITLE_WORDS = re.compile( |
| r'\b(war|chapter|part|volume|act|scene|book|section|article|' |
| r'king|queen|pope|louis|henry|edward|george|william|james|' |
| r'phase|round|level|stage|class|type|version|episode|season)\b', |
| re.IGNORECASE, |
| ) |
|
|
| def _replace(m: re.Match) -> str: |
| roman = m.group(0) |
| if not roman.strip(): |
| return roman |
| |
| if len(roman) == 1 and roman in 'IVX': |
| |
| start = m.start() |
| preceding = text[max(0, start - 30) : start] |
| if not _TITLE_WORDS.search(preceding): |
| return roman |
| try: |
| val = roman_to_int(roman) |
| if val == 0: |
| return roman |
| return number_to_words(val) |
| except Exception: |
| return roman |
|
|
| return _RE_ROMAN.sub(_replace, text) |
|
|
|
|
| def normalize_leading_decimals(text: str) -> str: |
| """ |
| Normalise bare leading-decimal floats so the number pipeline handles them. |
| |
| Examples: |
| ".5 teaspoons" → "0.5 teaspoons" |
| "-.25 adjustment" → "-0.25 adjustment" |
| """ |
| |
| text = re.sub(r'(?<!\d)(-)\.([\d])', r'\g<1>0.\2', text) |
| return _RE_LEAD_DEC.sub(r'0.\1', text) |
|
|
|
|
| def expand_scientific_notation(text: str) -> str: |
| """ |
| Expand scientific-notation numbers to spoken form. |
| |
| Examples: |
| "1e-4" → "one times ten to the negative four" |
| "2.5e10" → "two point five times ten to the ten" |
| "6.022E23"→ "six point zero two two times ten to the twenty three" |
| """ |
|
|
| def _replace(m: re.Match) -> str: |
| coeff_raw = m.group(1) |
| exp = int(m.group(2)) |
| coeff_words = ( |
| float_to_words(coeff_raw) if '.' in coeff_raw else number_to_words(int(coeff_raw)) |
| ) |
| exp_words = number_to_words(abs(exp)) |
| sign = 'negative ' if exp < 0 else '' |
| return f'{coeff_words} times ten to the {sign}{exp_words}' |
|
|
| return _RE_SCI.sub(_replace, text) |
|
|
|
|
| def expand_scale_suffixes(text: str) -> str: |
| """ |
| Expand standalone uppercase scale suffixes attached to numbers. |
| |
| Examples: |
| "7B parameters" → "seven billion parameters" |
| "340M model" → "three hundred forty million model" |
| "1.5K salary" → "one point five thousand salary" |
| "$100K budget" → "$100K budget" (currency handled upstream) |
| """ |
| _map = {'K': 'thousand', 'M': 'million', 'B': 'billion', 'T': 'trillion'} |
|
|
| def _replace(m: re.Match) -> str: |
| raw = m.group(1) |
| suffix = m.group(2) |
| scale_word = _map.get(suffix, suffix) |
| num = float_to_words(raw) if '.' in raw else number_to_words(int(raw)) |
| return f'{num} {scale_word}' |
|
|
| return _RE_SCALE.sub(_replace, text) |
|
|
|
|
| def expand_fractions(text: str) -> str: |
| """ |
| Expand simple numeric fractions. |
| |
| Examples: |
| "1/2 cup" → "one half cup" |
| "3/4 mile" → "three quarters mile" |
| "2/3 done" → "two thirds done" |
| "5/8 inch" → "five eighths inch" |
| """ |
|
|
| def _replace(m: re.Match) -> str: |
| num = int(m.group(1)) |
| den = int(m.group(2)) |
| if den == 0: |
| return m.group() |
| num_words = number_to_words(num) |
| if den == 2: |
| denom_word = 'half' if num == 1 else 'halves' |
| elif den == 4: |
| denom_word = 'quarter' if num == 1 else 'quarters' |
| else: |
| denom_word = _ordinal_suffix(den) |
| if num != 1: |
| denom_word += 's' |
| return f'{num_words} {denom_word}' |
|
|
| return _RE_FRACTION.sub(_replace, text) |
|
|
|
|
| def expand_decades(text: str) -> str: |
| """ |
| Expand decade expressions to words. |
| |
| Examples: |
| "the 80s" → "the eighties" |
| "the 1980s" → "the nineteen eighties" |
| "the 2020s" → "the twenty twenties" |
| "'90s music" → "nineties music" |
| """ |
| _decade_map = { |
| 0: 'hundreds', |
| 1: 'tens', |
| 2: 'twenties', |
| 3: 'thirties', |
| 4: 'forties', |
| 5: 'fifties', |
| 6: 'sixties', |
| 7: 'seventies', |
| 8: 'eighties', |
| 9: 'nineties', |
| } |
|
|
| def _replace(m: re.Match) -> str: |
| base = int(m.group(1)) |
| decade_digit = base % 10 |
| decade_word = _decade_map.get(decade_digit, '') |
| if base < 10: |
| return decade_word |
| century_part = base // 10 |
| return f'{number_to_words(century_part)} {decade_word}' |
|
|
| return _RE_DECADE.sub(_replace, text) |
|
|
|
|
| def expand_ip_addresses(text: str) -> str: |
| """ |
| Expand IPv4 addresses to spoken digits per octet. |
| |
| Examples: |
| "192.168.1.1" → "one nine two dot one six eight dot one dot one" |
| "10.0.0.1" → "one zero dot zero dot zero dot one" |
| """ |
| _d = { |
| '0': 'zero', |
| '1': 'one', |
| '2': 'two', |
| '3': 'three', |
| '4': 'four', |
| '5': 'five', |
| '6': 'six', |
| '7': 'seven', |
| '8': 'eight', |
| '9': 'nine', |
| } |
|
|
| def _octet(s: str) -> str: |
| return ' '.join(_d[c] for c in s) |
|
|
| def _replace(m: re.Match) -> str: |
| return ' dot '.join(_octet(g) for g in m.groups()) |
|
|
| return re.sub(r'\b(\d{1,3})\.(\d{1,3})\.(\d{1,3})\.(\d{1,3})\b', _replace, text) |
|
|
|
|
| def expand_phone_numbers(text: str) -> str: |
| """ |
| Expand US phone numbers to spoken digits before range expansion claims the hyphens. |
| |
| Examples: |
| "555-1234" → "five five five one two three four" |
| "555-123-4567" → "five five five one two three four five six seven" |
| "1-800-555-0199" → "one eight zero zero five five five zero one nine nine" |
| """ |
| _d = { |
| '0': 'zero', |
| '1': 'one', |
| '2': 'two', |
| '3': 'three', |
| '4': 'four', |
| '5': 'five', |
| '6': 'six', |
| '7': 'seven', |
| '8': 'eight', |
| '9': 'nine', |
| } |
|
|
| def _digits(s: str) -> str: |
| return ' '.join(_d[c] for c in s) |
|
|
| def _join(*groups) -> str: |
| return ' '.join(_digits(g) for g in groups) |
|
|
| |
| |
| text = re.sub( |
| r'(?<!\d-)(?<!\d)\b(\d{1,2})-(\d{3})-(\d{3})-(\d{4})\b(?!-\d)', |
| lambda m: _join(*m.groups()), |
| text, |
| ) |
| |
| text = re.sub( |
| r'(?<!\d-)(?<!\d)\b(\d{3})-(\d{3})-(\d{4})\b(?!-\d)', lambda m: _join(*m.groups()), text |
| ) |
| |
| text = re.sub(r'(?<!\d-)\b(\d{3})-(\d{4})\b(?!-\d)', lambda m: _join(*m.groups()), text) |
| return text |
|
|
|
|
| def expand_months(text: str) -> str: |
| """ |
| Expands Jan, Feb, etc. to January, February. |
| Only triggers if the abbreviation is likely a date. |
| """ |
|
|
| def _replace(m: re.Match) -> str: |
| return _MONTH_MAP.get(m.group(1), m.group(1)) |
|
|
| |
| text = _RE_MONTHS.sub(_replace, text) |
|
|
| |
| text = _RE_MAY.sub('May', text) |
|
|
| return text |
|
|
|
|
| |
| |
| |
|
|
|
|
| def replace_numbers(text: str, replace_floats: bool = True) -> str: |
| """ |
| Replace all numeric tokens with their word equivalents. |
| |
| Examples: |
| "There are 1200 students" → "There are twelve hundred students" |
| "Pi is 3.14" → "Pi is three point one four" |
| "gpt-3 rocks" → "gpt-3 rocks" (hyphen not treated as minus) |
| """ |
|
|
| def _replace(m: re.Match) -> str: |
| raw = m.group().replace(',', '') |
| try: |
| if '.' in raw and replace_floats: |
| |
| return float_to_words(raw) |
| else: |
| return number_to_words(int(float(raw))) |
| except (ValueError, OverflowError): |
| return m.group() |
|
|
| return _RE_NUMBER.sub(_replace, text) |
|
|
|
|
| def to_lowercase(text: str) -> str: |
| """Convert text to lowercase.""" |
| return text.lower() |
|
|
|
|
| def remove_urls(text: str, replacement: str = '') -> str: |
| """Remove URLs from text.""" |
| return _RE_URL.sub(replacement, text).strip() |
|
|
|
|
| def remove_emails(text: str, replacement: str = '') -> str: |
| """Remove email addresses from text.""" |
| return _RE_EMAIL.sub(replacement, text).strip() |
|
|
|
|
| def remove_html_tags(text: str) -> str: |
| """Strip HTML tags from text.""" |
| return _RE_HTML.sub(' ', text) |
|
|
|
|
| def remove_hashtags(text: str, replacement: str = '') -> str: |
| """Remove hashtags (e.g. #NLP) from text.""" |
| return _RE_HASHTAG.sub(replacement, text) |
|
|
|
|
| def remove_mentions(text: str, replacement: str = '') -> str: |
| """Remove @mentions from text.""" |
| return _RE_MENTION.sub(replacement, text) |
|
|
|
|
| def remove_punctuation(text: str) -> str: |
| """Remove all punctuation characters.""" |
| return _RE_PUNCT.sub(' ', text) |
|
|
|
|
| def remove_extra_whitespace(text: str) -> str: |
| """Collapse multiple whitespace characters into a single space and strip ends.""" |
| return _RE_SPACES.sub(' ', text).strip() |
|
|
|
|
| def normalize_unicode(text: str, form: str = 'NFC') -> str: |
| """Normalize unicode characters (NFC, NFD, NFKC, or NFKD).""" |
| return unicodedata.normalize(form, text) |
|
|
|
|
| def remove_accents(text: str) -> str: |
| """Remove diacritical marks (accents) from characters.""" |
| nfkd = unicodedata.normalize('NFD', text) |
| return ''.join(c for c in nfkd if unicodedata.category(c) != 'Mn') |
|
|
|
|
| def expand_contractions(text: str) -> str: |
| """ |
| Expand common English contractions. |
| |
| Examples: |
| "don't" → "do not" |
| "they're" → "they are" |
| "I've" → "I have" |
| """ |
| contractions = { |
| r"\bcan't\b": 'cannot', |
| r"\bwon't\b": 'will not', |
| r"\bshan't\b": 'shall not', |
| r"\bain't\b": 'is not', |
| r"\blet's\b": 'let us', |
| r"\b(\w+)n't\b": r'\1 not', |
| r"\b(\w+)'re\b": r'\1 are', |
| r"\b(\w+)'ve\b": r'\1 have', |
| r"\b(\w+)'ll\b": r'\1 will', |
| r"\b(\w+)'d\b": r'\1 would', |
| r"\b(\w+)'m\b": r'\1 am', |
| r"\bit's\b": 'it is', |
| } |
| for pattern, replacement in contractions.items(): |
| text = re.sub(pattern, replacement, text, flags=re.IGNORECASE) |
| return text |
|
|
|
|
| def remove_stopwords(text: str, stopwords: set | None = None) -> str: |
| """ |
| Remove stopwords from text. |
| |
| Args: |
| stopwords: Set of words to remove. Uses a built-in English set if None. |
| """ |
| if stopwords is None: |
| stopwords = { |
| 'a', |
| 'an', |
| 'the', |
| 'and', |
| 'or', |
| 'but', |
| 'in', |
| 'on', |
| 'at', |
| 'to', |
| 'for', |
| 'of', |
| 'with', |
| 'by', |
| 'from', |
| 'is', |
| 'was', |
| 'are', |
| 'were', |
| 'be', |
| 'been', |
| 'being', |
| 'have', |
| 'has', |
| 'had', |
| 'do', |
| 'does', |
| 'did', |
| 'will', |
| 'would', |
| 'could', |
| 'should', |
| 'may', |
| 'might', |
| 'this', |
| 'that', |
| 'these', |
| 'those', |
| 'it', |
| 'its', |
| 'i', |
| 'me', |
| 'my', |
| 'we', |
| 'our', |
| 'you', |
| 'your', |
| 'he', |
| 'she', |
| 'him', |
| 'her', |
| 'they', |
| 'them', |
| 'their', |
| } |
| tokens = text.split() |
| return ' '.join(t for t in tokens if t.lower() not in stopwords) |
|
|
|
|
| |
| |
| |
|
|
|
|
| class TextPreprocessor: |
| """ |
| Configurable preprocessing pipeline. |
| |
| Usage: |
| pp = TextPreprocessor( |
| lowercase=True, |
| replace_numbers=True, |
| remove_urls=True, |
| remove_html=True, |
| remove_punctuation=True, |
| ) |
| clean = pp("GPT-3 costs $0.002 per token — 50% cheaper than before!") |
| # → "gpt three costs zero dollars and zero point two cents per token fifty percent cheaper than before" |
| """ |
|
|
| def __init__( |
| self, |
| lowercase: bool = True, |
| replace_numbers: bool = True, |
| replace_floats: bool = True, |
| expand_newlines: bool = True, |
| expand_tilde: bool = True, |
| expand_abbreviations: bool = True, |
| expand_symbols: bool = True, |
| expand_contractions: bool = True, |
| expand_model_names: bool = True, |
| expand_ordinals: bool = True, |
| expand_percentages: bool = True, |
| expand_currency: bool = True, |
| expand_time: bool = True, |
| expand_ranges: bool = True, |
| expand_units: bool = True, |
| expand_scale_suffixes: bool = True, |
| expand_scientific_notation: bool = True, |
| expand_fractions: bool = True, |
| expand_decades: bool = True, |
| expand_phone_numbers: bool = True, |
| expand_ip_addresses: bool = True, |
| normalize_leading_decimals: bool = True, |
| expand_roman_numerals: bool = False, |
| remove_urls: bool = True, |
| remove_emails: bool = True, |
| remove_html: bool = True, |
| remove_hashtags: bool = False, |
| remove_mentions: bool = False, |
| remove_punctuation: bool = True, |
| remove_stopwords: bool = False, |
| stopwords: set | None = None, |
| normalize_unicode: bool = True, |
| remove_accents: bool = False, |
| remove_extra_whitespace: bool = True, |
| ): |
| self.config = {k: v for k, v in locals().items() if k != 'self'} |
| self._stopwords = stopwords |
|
|
| def __call__(self, text: str) -> str: |
| return self.process(text) |
|
|
| def process(self, text: str) -> str: |
| cfg = self.config |
| if cfg.get('expand_abbreviations'): |
| text = expand_abbreviations(text) |
| text = expand_months(text) |
| if cfg.get('expand_newlines'): |
| text = expand_newlines(text) |
| if cfg.get('expand_symbols'): |
| text = expand_symbols(text) |
| if cfg.get('expand_tilde'): |
| text = expand_tilde(text) |
| if cfg['normalize_unicode']: |
| text = normalize_unicode(text) |
| if cfg['remove_html']: |
| text = remove_html_tags(text) |
| if cfg['remove_urls']: |
| text = remove_urls(text) |
| if cfg['remove_emails']: |
| text = remove_emails(text) |
| if cfg['remove_hashtags']: |
| text = remove_hashtags(text) |
| if cfg['remove_mentions']: |
| text = remove_mentions(text) |
| if cfg['expand_contractions']: |
| text = expand_contractions(text) |
| |
| if cfg['expand_ip_addresses']: |
| text = expand_ip_addresses(text) |
| |
| if cfg['normalize_leading_decimals']: |
| text = normalize_leading_decimals(text) |
| |
| if cfg['expand_currency']: |
| text = expand_currency(text) |
| if cfg['expand_percentages']: |
| text = expand_percentages(text) |
| |
| if cfg['expand_scientific_notation']: |
| text = expand_scientific_notation(text) |
| if cfg['expand_time']: |
| text = expand_time(text) |
| if cfg['expand_ordinals']: |
| text = expand_ordinals(text) |
| if cfg['expand_units']: |
| text = expand_units(text) |
| |
| if cfg['expand_scale_suffixes']: |
| text = expand_scale_suffixes(text) |
| if cfg['expand_fractions']: |
| text = expand_fractions(text) |
| if cfg['expand_decades']: |
| text = expand_decades(text) |
| |
| if cfg['expand_phone_numbers']: |
| text = expand_phone_numbers(text) |
| if cfg['expand_ranges']: |
| text = expand_ranges(text) |
| if cfg['expand_model_names']: |
| text = expand_model_names(text) |
| if cfg['expand_roman_numerals']: |
| text = expand_roman_numerals(text) |
| if cfg['replace_numbers']: |
| text = replace_numbers(text, replace_floats=cfg['replace_floats']) |
| if cfg['remove_accents']: |
| text = remove_accents(text) |
| if cfg['remove_punctuation']: |
| text = remove_punctuation(text) |
| if cfg['lowercase']: |
| text = to_lowercase(text) |
| if cfg['remove_stopwords']: |
| text = remove_stopwords(text, self._stopwords) |
| if cfg['remove_extra_whitespace']: |
| text = remove_extra_whitespace(text) |
|
|
| return text |
|
|