Spaces:

vedaco
/

Tera.VO

Runtime error

File size: 4,778 Bytes

c9e92b5

"""
Tera.VO Text Processing Module
Full text normalization and encoding pipeline built from scratch.
"""

import re
import numpy as np
import inflect

# Single module-wide inflect engine; TextProcessor uses it to spell out
# cardinal numbers (number_to_words) and ordinals (ordinal).
_inflect_engine = inflect.engine()

# === Symbol Set ===
# Special markers: padding, end-of-sequence and beginning-of-sequence.
_pad = '_'
_eos = '~'
_bos = '^'
# Characters that may appear in an encoded sequence.
_punctuation = '!\'(),.:;? -"'
_letters = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz'

# Ordering fixes the integer ids: pad, bos, eos, then punctuation, then letters.
symbols = [_pad, _bos, _eos, *_punctuation, *_letters]
NUM_SYMBOLS = len(symbols)
# Forward (symbol -> id) and reverse (id -> symbol) lookup tables.
symbol_to_id = dict(zip(symbols, range(NUM_SYMBOLS)))
id_to_symbol = dict(enumerate(symbols))


class TextProcessor:
    """Text normalization and symbol-encoding pipeline for Tera.VO.

    Normalization expands abbreviations, spells out numbers (currency,
    percentages, ordinals, plain numbers), replaces a few symbols with
    words and collapses whitespace. Encoding maps each character of the
    normalized text to its id in the module-level symbol table.
    """

    # Numeric patterns. A decimal point is only consumed when digits follow
    # it, so a sentence-ending period ("It costs $5.") stays in the text
    # instead of being swallowed into the number.
    _CURRENCY_RE = re.compile(r'\$(\d+(?:\.\d+)?)')
    _PERCENT_RE = re.compile(r'(\d+(?:\.\d+)?)%')
    _ORDINAL_RE = re.compile(r'(\d+)(st|nd|rd|th)\b')
    _NUMBER_RE = re.compile(r'\b\d+(?:\.\d+)?\b')
    _WHITESPACE_RE = re.compile(r'\s+')

    # Single-character symbols and the (space-padded) words replacing them.
    _SYMBOL_TABLE = str.maketrans({
        '&': ' and ', '@': ' at ', '#': ' hash ',
        '+': ' plus ', '=': ' equals ', '/': ' slash ',
    })

    def __init__(self):
        # The lookup tables are the shared module-level objects, not copies.
        self.symbol_to_id = symbol_to_id
        self.id_to_symbol = id_to_symbol
        self.num_symbols = NUM_SYMBOLS

        # Lower-case abbreviation -> spoken form; matched case-insensitively.
        self.abbreviations = {
            'mr.': 'mister', 'mrs.': 'missus', 'dr.': 'doctor',
            'prof.': 'professor', 'sr.': 'senior', 'jr.': 'junior',
            'st.': 'saint', 'vs.': 'versus', 'etc.': 'etcetera',
            'govt.': 'government', 'dept.': 'department',
            'jan.': 'january', 'feb.': 'february', 'mar.': 'march',
            'apr.': 'april', 'aug.': 'august', 'sep.': 'september',
            'oct.': 'october', 'nov.': 'november', 'dec.': 'december',
            'approx.': 'approximately', 'univ.': 'university',
        }

    def normalize_text(self, text):
        """Run the full normalization pipeline on ``text`` and return it.

        Order matters: currency is expanded (inside ``_expand_numbers``)
        before ``_expand_symbols`` runs, and whitespace is collapsed last
        because the earlier steps may insert extra spaces.
        """
        text = text.strip()
        text = self._expand_abbreviations(text)
        text = self._expand_numbers(text)
        text = self._expand_symbols(text)
        return self._collapse_whitespace(text)

    def _expand_abbreviations(self, text):
        """Replace each known abbreviation with its spoken form.

        Matching is case-insensitive and only fires when the abbreviation
        is not preceded by a word character, so "first." or "grammar." are
        left alone while "St. Paul" becomes "saint Paul".
        """
        for abbr, full in self.abbreviations.items():
            pattern = r'(?<!\w)' + re.escape(abbr)
            text = re.sub(pattern, full, text, flags=re.IGNORECASE)
        return text

    def _expand_numbers(self, text):
        """Spell out currency, percentages, ordinals and plain numbers.

        The specific forms are handled first so that the generic number
        pass only sees digits that none of them claimed.
        """
        text = self._CURRENCY_RE.sub(
            lambda m: self._currency(m.group(1)), text
        )
        text = self._PERCENT_RE.sub(
            lambda m: self._number_words(m.group(1)) + ' percent', text
        )
        text = self._ORDINAL_RE.sub(
            lambda m: self._ordinal(int(m.group(1))), text
        )
        text = self._NUMBER_RE.sub(
            lambda m: self._number_words(m.group(0)), text
        )
        return text

    def _currency(self, amount_str):
        """Return the spoken form of a dollar amount such as ``"5.50"``.

        Only the first two fractional digits are used as cents (a single
        digit is right-padded, so ``"5.5"`` means fifty cents). The cents
        clause is omitted when the fraction is missing, empty or rounds
        down to zero cents.
        """
        parts = amount_str.split('.')
        dollars = int(parts[0])
        result = self._number_words(str(dollars))
        result += ' dollar' + ('s' if dollars != 1 else '')

        fraction = parts[1] if len(parts) > 1 else ''
        # An empty fraction ("5.") must not reach int(), which would raise.
        cents = int(fraction[:2].ljust(2, '0')) if fraction else 0
        if cents > 0:
            result += ' and ' + self._number_words(str(cents))
            result += ' cent' + ('s' if cents != 1 else '')
        return result

    def _number_words(self, num_str):
        """Return ``num_str`` spelled out, or unchanged if it is not numeric.

        Whole values (including ones written like ``"5.0"``) are passed to
        the inflect engine as integers; other values are passed through as
        the original string.
        """
        try:
            num = float(num_str)
            if num == int(num):
                return _inflect_engine.number_to_words(int(num))
            return _inflect_engine.number_to_words(num_str)
        except (ValueError, TypeError, OverflowError):
            # OverflowError: digit strings too long for a finite float.
            return num_str

    def _ordinal(self, num):
        """Return the ordinal word form of integer ``num``.

        Best effort: if the inflect engine fails for any reason the plain
        digits are returned instead of raising.
        """
        try:
            return _inflect_engine.ordinal(
                _inflect_engine.number_to_words(num)
            )
        except Exception:
            return str(num)

    def _expand_symbols(self, text):
        """Replace ``& @ # + = /`` with space-padded English words."""
        return text.translate(self._SYMBOL_TABLE)

    def _collapse_whitespace(self, text):
        """Squeeze every whitespace run to one space and trim both ends."""
        return self._WHITESPACE_RE.sub(' ', text).strip()

    def text_to_sequence(self, text):
        """Normalize ``text`` and encode it as a list of symbol ids.

        The result starts with the BOS id and ends with the EOS id.
        Characters that are not in the symbol table are silently dropped.
        """
        table = self.symbol_to_id
        normalized = self.normalize_text(text)
        body = [table[ch] for ch in normalized if ch in table]
        return [table[_bos]] + body + [table[_eos]]

    def sequence_to_text(self, sequence):
        """Decode a sequence of symbol ids back to text.

        Unknown ids and the pad/BOS/EOS markers are skipped.
        """
        specials = (_pad, _bos, _eos)
        table = self.id_to_symbol
        return ''.join(
            table[idx]
            for idx in sequence
            if idx in table and table[idx] not in specials
        )

    def pad_sequence(self, seq, max_len):
        """Return ``seq`` truncated or right-padded to ``max_len`` items.

        Truncation keeps the first ``max_len`` ids, so a trailing EOS id
        is cut off when the sequence is too long. Padding uses the pad id.
        """
        if len(seq) >= max_len:
            return seq[:max_len]
        return seq + [self.symbol_to_id[_pad]] * (max_len - len(seq))

    def get_vocab_size(self):
        """Return the number of symbols in the vocabulary."""
        return self.num_symbols


# Ready-to-use module-level instance, constructed once at import time.
text_processor = TextProcessor()