# Spaces:
# Running
# Running
import re

# Contracted / variant spelling -> expanded replacement.
# Keys must be written in lowercase: normalize_text() lowercases every
# match before looking it up here.
CONTRACTIONS = {
    "a'": "aw",
    "an": "anw",
    "n'": "ne",
    "n": "ne",
    "ulu": "u",
    "b'a": "be a",
    "bɛ": "be",
    "nka": "nga",
    "loru": "duru",
}

# One alternation over all (escaped) keys, longest first so that a longer
# key such as "n'" is tried before its prefix "n".
# [^\W\d_] matches a single letter, so the lookbehind/lookahead pair only
# accepts a key that is neither preceded nor followed by a letter.
# NOTE(review): because of the trailing lookahead, keys that end in an
# apostrophe ("a'", "n'") cannot match when a letter follows immediately.
# In "n'i" the shorter key "n" matches instead, producing "ne'i".
# Confirm whether that is the intended result.
PATTERN = re.compile(
    r'(?<![^\W\d_])('
    + '|'.join(sorted(map(re.escape, CONTRACTIONS), key=len, reverse=True))
    + r')(?![^\W\d_])',
    flags=re.IGNORECASE | re.UNICODE
)

# Any run of whitespace (possibly empty) directly before a question mark.
SPACE_QUESTION_RE = re.compile(r'\s*\?')


def normalize_text(text: str) -> str:
    """Return *text* with normalized spacing, expansions and capitalization.

    Three steps are applied in order:

    1. Every ``?`` ends up preceded by exactly one space.
    2. Every standalone occurrence of a ``CONTRACTIONS`` key (matched
       case-insensitively) is replaced by its mapped value.
    3. The first character of the result is upper-cased.

    An empty string is returned unchanged.
    """
    # 1) normalize spacing before ?
    text = SPACE_QUESTION_RE.sub(' ?', text)
    # 2) expand contractions; the match is lowercased because PATTERN is
    #    case-insensitive while the CONTRACTIONS keys are lowercase.
    text = PATTERN.sub(lambda m: CONTRACTIONS[m.group(0).lower()], text)
    # 3) capitalize first letter; slicing keeps this safe for "".
    return text[:1].upper() + text[1:]


# Demo: normalize a sample sentence and print the result.
sentence = "bbk'a b'a di n'i sonna. na a sɔɔni? a be na'a nɔfɛ?"
print(normalize_text(sentence))