Gaoussin committed on
Commit
f95e1e6
·
verified ·
1 Parent(s): 63110aa

Upload normalize_bm_words.py

Browse files
Files changed (1) hide show
  1. normalize_bm_words.py +38 -0
normalize_bm_words.py ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
"""Expand a fixed set of short word forms and tidy question-mark spacing."""

import re

# Lookup table: short/variant word form -> replacement text.
# Keys must be lowercase, because every match is lowercased before the lookup.
CONTRACTIONS = {
    "a'": "aw",
    "an": "anw",
    "n'": "ne",
    "n": "ne",
    "ulu": "u",
    "b'a": "be a",
    "bɛ": "be",
    "nka": "nga",
    "loru": "duru",
}

# Single alternation over all keys. Keys are escaped (they are literals, not
# regex fragments) and sorted longest-first so that a longer key such as "n'"
# is tried before a shorter key ("n") that could match at the same position.
#
# `[^\W\d_]` means "a word character that is neither a digit nor an
# underscore", so the lookbehind/lookahead reject any candidate that has such
# a character directly before or after it.
#
# NOTE(review): because of the trailing lookahead, a key that ends with an
# apostrophe ("a'", "n'") cannot match when a letter follows immediately.
# In "n'i" the bare "n" key matches instead, producing "ne'i". Confirm that
# this is the intended result.
PATTERN = re.compile(
    r'(?<![^\W\d_])('
    + '|'.join(sorted(map(re.escape, CONTRACTIONS), key=len, reverse=True))
    + r')(?![^\W\d_])',
    flags=re.IGNORECASE | re.UNICODE
)

# Any run of whitespace (possibly empty) followed by a question mark.
SPACE_QUESTION_RE = re.compile(r'\s*\?')


def normalize_text(text: str) -> str:
    """Return *text* with normalized question marks and expanded word forms.

    Three steps are applied in order:

    1. Every ``?`` ends up preceded by exactly one space; whatever
       whitespace preceded it (including none) is replaced.
    2. Every standalone occurrence of a ``CONTRACTIONS`` key is replaced by
       its value. Matching is case-insensitive and the replacement is
       inserted as written in the table (lowercase).
    3. The first character of the result is upper-cased.

    Args:
        text: The input string; may be empty.

    Returns:
        The normalized string. An empty input yields an empty string.
    """
    # 1) normalize spacing before "?"
    text = SPACE_QUESTION_RE.sub(' ?', text)

    # 2) expand word forms; lowercase the match so the lookup hits the
    #    lowercase keys even when the input was matched case-insensitively
    text = PATTERN.sub(lambda m: CONTRACTIONS[m.group(0).lower()], text)

    # 3) slicing (rather than text[0]) keeps this safe for the empty string
    return text[:1].upper() + text[1:]


# Demo: runs at module level, so it also executes when the module is imported.
sentence = "bbk'a b'a di n'i sonna. na a sɔɔni? a be na'a nɔfɛ?"
print(normalize_text(sentence))