File size: 895 Bytes
0a88ee7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
import re


# Two-letter sequences that text_to_symbols collapses into a single uppercase
# symbol. Keys are lowercase because lookups are made against text that has
# already been lowercased by normalize_text.
DIGRAPH_MAP: dict[str, str] = {
    # Consonant letter pairs.
    "th": "T",
    "sh": "S",
    "ch": "C",
    "ph": "F",
    # Vowel letter pairs.
    "oo": "U",
    "ee": "I",
    "ai": "A",
    "ou": "W",
}


def normalize_text(text: str) -> str:
    """Return *text* lowercased and reduced to a small ASCII character set.

    Every character other than ``a-z``, ``0-9``, whitespace and the
    punctuation ``, . ; : ! ? ' -`` is replaced by a space. Runs of
    whitespace are then collapsed to a single space and the result is
    trimmed, so the returned string never starts or ends with a space.

    >>> normalize_text('  "Hello,   World!"  ')
    'hello, world!'
    """
    lowered = text.lower()
    cleaned = re.sub(r"[^a-z0-9\s,.;:!?'-]", " ", lowered)
    collapsed = re.sub(r"\s+", " ", cleaned)
    # Trim last: disallowed characters at either edge have just been turned
    # into spaces, so trimming before the substitution would leave a stray
    # leading/trailing space in the result.
    return collapsed.strip()


def text_to_symbols(text: str) -> list[str]:
    """Convert *text* into a flat list of one-character symbols.

    The input is first passed through ``normalize_text``. The normalized
    string is then scanned left to right:

    * a two-character sequence found in ``DIGRAPH_MAP`` is consumed as a
      unit and emitted as its mapped symbol;
    * any of ``, . ; : ! ?`` is emitted as ``"|"``;
    * every other character (letters, digits, spaces, ``'`` and ``-``) is
      emitted unchanged.
    """
    cleaned = normalize_text(text)
    total = len(cleaned)
    out: list[str] = []
    pos = 0
    while pos < total:
        # Digraphs take priority over single-character handling. Near the end
        # of the string the slice is shorter than two characters and simply
        # fails the lookup.
        mapped = DIGRAPH_MAP.get(cleaned[pos : pos + 2])
        if mapped is not None:
            out.append(mapped)
            pos += 2
        else:
            current = cleaned[pos]
            out.append("|" if current in ",.;:!?" else current)
            pos += 1
    return out