Spaces:
Running
Running
Upload normalize_bm_words.py
Browse files- normalize_bm_words.py +38 -0
normalize_bm_words.py
ADDED
|
@@ -0,0 +1,38 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import re
|
| 2 |
+
|
| 3 |
+
# Contracted / variant spelling -> normalized spelling.
# Keys are lowercased while the table is built: PATTERN matches
# case-insensitively and the lookup is done with the lowercased match text, so
# a key written with an uppercase letter would otherwise match and then raise
# KeyError at substitution time.
CONTRACTIONS = {
    key.lower(): expansion
    for key, expansion in {
        "a'": "aw",
        "an": "anw",
        "n'": "ne",
        "n": "ne",
        "ulu": "u",
        "b'a": "be a",
        "bɛ": "be",
        "nka": "nga",
        "loru": "duru",
    }.items()
}

# A single alternation over every key, escaped and ordered longest-first so
# that a longer key (e.g. "n'") is tried before a shorter key that is its
# prefix ("n").
# The lookarounds reject a match that is directly preceded or followed by a
# letter: [^\W\d_] is "a word character that is neither a digit nor '_'".
# Digits, underscores, apostrophes, whitespace and punctuation therefore all
# act as boundaries.
# NOTE(review): because of the trailing lookahead, "a'" and "n'" cannot match
# when a letter follows the apostrophe; in "n'i" the bare "n" key matches
# instead and the result is "ne'i". Confirm that this is the intended output.
PATTERN = re.compile(
    r"(?<![^\W\d_])("
    + "|".join(
        re.escape(key)
        for key in sorted(CONTRACTIONS, key=len, reverse=True)
    )
    + r")(?![^\W\d_])",
    flags=re.IGNORECASE | re.UNICODE,
)

# Optional run of whitespace followed by a literal question mark.
SPACE_QUESTION_RE = re.compile(r"\s*\?")
|
| 25 |
+
|
| 26 |
+
def normalize_text(text: str) -> str:
    """Return *text* with question marks spaced, contractions expanded and
    the first character uppercased.

    The steps run in this order:

    1. Every "?" together with any whitespace directly before it is replaced
       by " ?" (exactly one space, then the question mark).
    2. Every PATTERN match is replaced by the CONTRACTIONS entry for the
       lowercased matched text.
    3. The first character of the result is uppercased; the rest is left
       untouched. Slicing is used, so an empty string comes back empty.
    """

    def _expand(match):
        # Table keys are lowercase while matching is case-insensitive.
        return CONTRACTIONS[match.group(0).lower()]

    spaced = SPACE_QUESTION_RE.sub(' ?', text)
    expanded = PATTERN.sub(_expand, spaced)

    head, tail = expanded[:1], expanded[1:]
    return head.upper() + tail
|
| 35 |
+
|
| 36 |
+
|
| 37 |
+
# Demo: run the normalizer on a sample sentence and print the result.
# NOTE(review): these statements are at module level, so they also execute
# when the file is imported; consider an `if __name__ == "__main__":` guard.
sentence = "bbk'a b'a di n'i sonna. na a sɔɔni? a be na'a nɔfɛ?"
print(
    normalize_text(sentence)
)
|