Spaces:
Running
Running
File size: 900 Bytes
f95e1e6 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 |
import re
# Contraction -> expanded form.
# Every regex match is lowercased before it is looked up in this table, so
# the keys are forced to lowercase once here; a key typed with an uppercase
# letter would otherwise be matched by PATTERN and then fail the lookup.
CONTRACTIONS = {
    contraction.lower(): expansion
    for contraction, expansion in {
        "a'": "aw",
        "an": "anw",
        "n'": "ne",
        "n": "ne",
        "ulu": "u",
        "b'a": "be a",
        "bɛ": "be",
        "nka": "nga",
        "loru": "duru",
    }.items()
}

# Alternation of the escaped keys, longest first: the regex engine takes the
# first alternative that matches, so "n'" has to be tried before "n".
# `[^\W\d_]` matches a single letter; the negative lookbehind/lookahead
# therefore reject a key that touches a letter on either side.
PATTERN = re.compile(
    r'(?<![^\W\d_])('
    + '|'.join(sorted(map(re.escape, CONTRACTIONS), key=len, reverse=True))
    + r')(?![^\W\d_])',
    flags=re.IGNORECASE | re.UNICODE,
)

# A (possibly empty) run of whitespace followed by a literal question mark.
SPACE_QUESTION_RE = re.compile(r'\s*\?')
def normalize_text(text: str) -> str:
    """Return *text* with question-mark spacing, contractions and casing normalized.

    Three passes run in order:

    1. Any whitespace (possibly none) directly before a ``?`` is replaced
       by exactly one space.
    2. Every ``PATTERN`` match is replaced by its ``CONTRACTIONS`` entry,
       looked up by the lowercased match text.
    3. The first character of the result is upper-cased; slicing keeps
       this safe for an empty string.
    """

    def _expand(match):
        # The pattern is case-insensitive, the table keys are lowercase.
        return CONTRACTIONS[match.group(0).lower()]

    spaced = SPACE_QUESTION_RE.sub(' ?', text)
    expanded = PATTERN.sub(_expand, spaced)
    head, tail = expanded[:1], expanded[1:]
    return head.upper() + tail
# Demo: run the normalizer on a sample sentence and print the result.
sentence = "bbk'a b'a di n'i sonna. na a sɔɔni? a be na'a nɔfɛ?"
print(normalize_text(sentence))