import re

# Two-character sequences that text_to_symbols replaces with a single
# uppercase symbol. Input is lowercased first, so these uppercase outputs
# cannot collide with pass-through letters.
DIGRAPH_MAP = {
    "th": "T",
    "sh": "S",
    "ch": "C",
    "ph": "F",
    "oo": "U",
    "ee": "I",
    "ai": "A",
    "ou": "W",
}

# Anything that is not a lowercase ASCII letter, digit, whitespace, or one of
# the kept punctuation characters (, . ; : ! ? ' -).
_DISALLOWED_RE = re.compile(r"[^a-z0-9\s,.;:!?'-]")
_WHITESPACE_RE = re.compile(r"\s+")

# Punctuation characters that are emitted as the "|" symbol.
_PUNCTUATION = frozenset(",.;:!?")


def normalize_text(text: str) -> str:
    """Return *text* lowercased, filtered, and whitespace-normalized.

    Characters other than ``a-z``, ``0-9``, whitespace, and ``,.;:!?'-`` are
    replaced by a space; runs of whitespace are then collapsed to a single
    space and leading/trailing whitespace is removed.

    Args:
        text: Arbitrary input string.

    Returns:
        The normalized string, with no leading or trailing whitespace.
    """
    filtered = _DISALLOWED_RE.sub(" ", text.lower())
    # Strip last: the substitution above can introduce spaces at either end
    # (e.g. "(hello)" -> " hello "), which an earlier strip would miss.
    return _WHITESPACE_RE.sub(" ", filtered).strip()


def text_to_symbols(text: str) -> list[str]:
    """Convert *text* into a list of single-character symbols.

    The text is first passed through ``normalize_text``. It is then scanned
    left to right: a two-character sequence found in ``DIGRAPH_MAP`` becomes
    its mapped symbol, any of ``,.;:!?`` becomes ``"|"``, and every other
    character (including spaces, apostrophes and hyphens) is emitted as-is.

    Args:
        text: Arbitrary input string.

    Returns:
        The list of symbols, empty if the normalized text is empty.
    """
    normalized = normalize_text(text)
    symbols: list[str] = []
    i = 0
    while i < len(normalized):
        # Digraphs take priority over single characters and consume two.
        pair = normalized[i : i + 2]
        if pair in DIGRAPH_MAP:
            symbols.append(DIGRAPH_MAP[pair])
            i += 2
            continue

        ch = normalized[i]
        symbols.append("|" if ch in _PUNCTUATION else ch)
        i += 1
    return symbols