""" Tera.VO Text Processing Module Full text normalization and encoding pipeline built from scratch. """ import re import numpy as np import inflect _inflect_engine = inflect.engine() # === Symbol Set === _pad = '_' _eos = '~' _bos = '^' _punctuation = '!\'(),.:;? -"' _letters = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz' symbols = [_pad] + [_bos] + [_eos] + list(_punctuation) + list(_letters) symbol_to_id = {s: i for i, s in enumerate(symbols)} id_to_symbol = {i: s for i, s in enumerate(symbols)} NUM_SYMBOLS = len(symbols) class TextProcessor: """Complete text processing pipeline for Tera.VO""" def __init__(self): self.symbol_to_id = symbol_to_id self.id_to_symbol = id_to_symbol self.num_symbols = NUM_SYMBOLS self.abbreviations = { 'mr.': 'mister', 'mrs.': 'missus', 'dr.': 'doctor', 'prof.': 'professor', 'sr.': 'senior', 'jr.': 'junior', 'st.': 'saint', 'vs.': 'versus', 'etc.': 'etcetera', 'govt.': 'government', 'dept.': 'department', 'jan.': 'january', 'feb.': 'february', 'mar.': 'march', 'apr.': 'april', 'aug.': 'august', 'sep.': 'september', 'oct.': 'october', 'nov.': 'november', 'dec.': 'december', 'approx.': 'approximately', 'univ.': 'university', } def normalize_text(self, text): """Full normalization pipeline""" text = text.strip() text = self._expand_abbreviations(text) text = self._expand_numbers(text) text = self._expand_symbols(text) text = self._collapse_whitespace(text) return text def _expand_abbreviations(self, text): for abbr, full in self.abbreviations.items(): text = re.sub(re.escape(abbr), full, text, flags=re.IGNORECASE) return text def _expand_numbers(self, text): text = re.sub( r'\$(\d+\.?\d*)', lambda m: self._currency(m.group(1)), text ) text = re.sub( r'(\d+\.?\d*)%', lambda m: self._number_words(m.group(1)) + ' percent', text ) text = re.sub( r'(\d+)(st|nd|rd|th)\b', lambda m: self._ordinal(int(m.group(1))), text ) text = re.sub( r'\b\d+\.?\d*\b', lambda m: self._number_words(m.group(0)), text ) return text def _currency(self, amount_str): parts = amount_str.split('.') dollars = int(parts[0]) result = self._number_words(str(dollars)) result += ' dollar' + ('s' if dollars != 1 else '') if len(parts) > 1 and int(parts[1]) > 0: cents = int(parts[1][:2].ljust(2, '0')) result += ' and ' + self._number_words(str(cents)) result += ' cent' + ('s' if cents != 1 else '') return result def _number_words(self, num_str): try: num = float(num_str) if num == int(num): return _inflect_engine.number_to_words(int(num)) return _inflect_engine.number_to_words(num_str) except (ValueError, TypeError): return num_str def _ordinal(self, num): try: return _inflect_engine.ordinal( _inflect_engine.number_to_words(num) ) except Exception: return str(num) def _expand_symbols(self, text): replacements = { '&': ' and ', '@': ' at ', '#': ' hash ', '+': ' plus ', '=': ' equals ', '/': ' slash ', } for sym, word in replacements.items(): text = text.replace(sym, word) return text def _collapse_whitespace(self, text): return re.sub(r'\s+', ' ', text).strip() def text_to_sequence(self, text): """Convert normalized text to integer sequence""" text = self.normalize_text(text) seq = [self.symbol_to_id[_bos]] for ch in text: if ch in self.symbol_to_id: seq.append(self.symbol_to_id[ch]) seq.append(self.symbol_to_id[_eos]) return seq def sequence_to_text(self, sequence): """Convert integer sequence back to text""" chars = [] for idx in sequence: if idx in self.id_to_symbol: s = self.id_to_symbol[idx] if s not in [_pad, _bos, _eos]: chars.append(s) return ''.join(chars) def pad_sequence(self, seq, max_len): """Pad or truncate sequence""" if len(seq) >= max_len: return seq[:max_len] return seq + [self.symbol_to_id[_pad]] * (max_len - len(seq)) def get_vocab_size(self): return self.num_symbols text_processor = TextProcessor()