Gaoussin committed on
Commit
894584d
·
verified ·
1 Parent(s): 50a21cf

Upload normalize_bm_output.py

Browse files
Files changed (1) hide show
  1. normalize_bm_output.py +34 -0
normalize_bm_output.py ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
"""Post-process generated text: tidy '?' spacing and apply contractions.

The sample sentence and the file name suggest the text is Bambara;
confirm against the caller before relying on that.
"""
import re

# Word pairs mapped to the contracted form that replaces them.
# Keys are lowercase; matched text is lowercased before lookup.
CONTRACTIONS = {
    "ka a": "k'a",
    "be a": "b'a",
    "ne be": "n'be",
    "taa a": "ta'a",
    "ko o": "k'o",
}

# Keys are regex-escaped and tried longest-first so a longer phrase is
# preferred over a shorter alternative at the same position.
# `[^\W\d_]` means "a word character that is neither a digit nor an
# underscore", i.e. a letter; the lookarounds therefore reject matches
# that are glued to a letter on either side.
PATTERN = re.compile(
    r"(?<![^\W\d_])("
    + "|".join(sorted(map(re.escape, CONTRACTIONS), key=len, reverse=True))
    + r")(?![^\W\d_])",
    flags=re.IGNORECASE | re.UNICODE,
)

# Any run of whitespace (possibly empty) directly before a question mark.
SPACE_QUESTION_RE = re.compile(r"\s*\?")


def _contract(match: "re.Match[str]") -> str:
    """Return the contracted form for one PATTERN match.

    The match is case-insensitive, so the text is lowercased to find its
    key. If the matched text started with an uppercase letter, the first
    letter of the contraction is uppercased too, so a sentence-initial
    capital inside a longer text is not lost.
    """
    matched = match.group(0)
    # Fall back to the untouched text rather than raising KeyError if a
    # case-insensitive match does not lowercase to a known key.
    contracted = CONTRACTIONS.get(matched.lower(), matched)
    if matched[:1].isupper():
        contracted = contracted[:1].upper() + contracted[1:]
    return contracted


def normalize_output(text: str) -> str:
    """Normalize a piece of output text.

    Steps, in order:
      1. Put exactly one space before every '?'.
      2. Replace each phrase listed in CONTRACTIONS with its contraction.
      3. Uppercase the first character of the result.

    Args:
        text: The text to normalize. May be empty.

    Returns:
        The normalized text; an empty input yields an empty string.
    """
    # 1) normalize spacing before ?
    text = SPACE_QUESTION_RE.sub(" ?", text)

    # 2) apply contractions
    text = PATTERN.sub(_contract, text)

    # 3) capitalize first letter safely (slicing tolerates empty text)
    return text[:1].upper() + text[1:]


if __name__ == "__main__":
    sentence = "a be ka a di ne ma ne be taa a fɛ?"
    print(normalize_output(sentence))