Spaces:
Running
Running
Delete normalize_bm_words.py
#4
by
Gaoussin
- opened
- normalize_bm_words.py +0 -38
normalize_bm_words.py
DELETED
|
@@ -1,38 +0,0 @@
|
|
| 1 |
-
import re
|
| 2 |
-
|
| 3 |
-
# Contraction -> expanded form.  Keys are lowercased once, here, because
# PATTERN matches case-insensitively and callers look the match up with
# ``match.group(0).lower()``; a key that was not lowercase would match the
# text but then raise KeyError on lookup.
CONTRACTIONS = {
    contraction.lower(): expansion
    for contraction, expansion in {
        "a'": "aw",
        "an": "anw",
        "n'": "ne",
        "n": "ne",
        "ulu": "u",
        "b'a": "be a",
        "bɛ": "be",
        "nka": "nga",
        "loru": "duru",
    }.items()
}

# One alternation over every (escaped) contraction key.
#   * ``[^\W\d_]`` is "a word character that is neither a digit nor an
#     underscore", i.e. a letter; the negative lookbehind/lookahead therefore
#     reject a key that is directly preceded or followed by a letter.
#   * Alternatives are ordered longest-first because a regex alternation
#     tries its branches left to right, so a short key listed earlier would
#     win over a longer key starting at the same position.
PATTERN = re.compile(
    r'(?<![^\W\d_])('
    + '|'.join(sorted(map(re.escape, CONTRACTIONS), key=len, reverse=True))
    + r')(?![^\W\d_])',
    flags=re.IGNORECASE | re.UNICODE,
)

# A question mark together with any run of whitespace directly before it.
SPACE_QUESTION_RE = re.compile(r'\s*\?')
|
| 25 |
-
|
| 26 |
-
def normalize_text(text: str) -> str:
    """Return *text* with question-mark spacing fixed, contractions expanded
    and the first character uppercased.

    The steps run in this order:

    1. Every ``?`` (plus any whitespace directly before it) is rewritten as
       a single space followed by ``?``.
    2. Every substring matched by ``PATTERN`` is replaced by the value stored
       in ``CONTRACTIONS`` under the lowercased match.
    3. The first character of the result is uppercased.
    """

    def _expand(match):
        # PATTERN is case-insensitive, so look the match up in lowercase.
        return CONTRACTIONS[match.group(0).lower()]

    spaced = SPACE_QUESTION_RE.sub(' ?', text)
    expanded = PATTERN.sub(_expand, spaced)

    # Slicing (rather than indexing) keeps the empty string from raising.
    head, tail = expanded[:1], expanded[1:]
    return head.upper() + tail
|
| 35 |
-
|
| 36 |
-
|
| 37 |
-
# Demo: normalise one sample sentence and print the result.
sentence = "bbk'a b'a di n'i sonna. na a sɔɔni? a be na'a nɔfɛ?"
print(normalize_text(sentence))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|