# File size: 900 Bytes
# f95e1e6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
import re

# Lookup table: contracted/variant spelling -> expanded form.
# Keys are written in lowercase because normalize_text() lowercases every
# regex match before looking it up here.
CONTRACTIONS = {
    "a'": "aw",
    "an": "anw",
    "n'": "ne",
    "n": "ne",
    "ulu": "u",
    "b'a": "be a",
    "bɛ": "be",
    "nka": "nga",
    "loru": "duru",
}


def _compile_contraction_pattern():
    """Build the case-insensitive regex that finds CONTRACTIONS keys in text."""
    # Escape each key so it is matched literally, then order the escaped keys
    # longest-first: alternatives are tried left to right, so a longer key
    # such as "nka" must be tried before a shorter key it starts with ("n").
    escaped_keys = [re.escape(key) for key in CONTRACTIONS]
    escaped_keys.sort(key=len, reverse=True)
    alternation = '|'.join(escaped_keys)

    # [^\W\d_] is a word character that is neither a decimal digit nor an
    # underscore. The negative lookbehind/lookahead reject a key that
    # directly touches such a character, i.e. a key embedded in a longer word.
    # NOTE(review): because of the trailing lookahead, a key ending in an
    # apostrophe cannot match when a letter follows it; in "n'i" the bare "n"
    # key matches instead (giving "ne'i") - confirm this is intended.
    return re.compile(
        r'(?<![^\W\d_])(' + alternation + r')(?![^\W\d_])',
        flags=re.IGNORECASE | re.UNICODE
    )


PATTERN = _compile_contraction_pattern()

# A "?" together with any whitespace (possibly none) directly before it.
SPACE_QUESTION_RE = re.compile(r'\s*\?')

def normalize_text(text: str) -> str:
    """Return *text* with question-mark spacing fixed and contractions expanded.

    The passes run in this order:

    1. Every "?" plus any whitespace directly before it becomes " ?".
    2. Every PATTERN match is replaced by the CONTRACTIONS entry for the
       lowercased match.
    3. The first character of the result is upper-cased; an empty string is
       returned unchanged.
    """
    def _expand(match):
        # PATTERN is case-insensitive while the table keys are lowercase.
        return CONTRACTIONS[match.group(0).lower()]

    spaced = SPACE_QUESTION_RE.sub(' ?', text)
    expanded = PATTERN.sub(_expand, spaced)

    # Slicing (rather than indexing) keeps this safe for an empty string.
    head, tail = expanded[:1], expanded[1:]
    return head.upper() + tail


# Sample input for the demo below; kept at module level so the name remains
# available to anything that imports this file.
sentence = "bbk'a b'a di n'i sonna. na a sɔɔni? a be na'a nɔfɛ?"

# Print the demo only when the file is executed directly, so that importing
# normalize_text from another module produces no output.
if __name__ == "__main__":
    print(normalize_text(sentence))