Afro_voice_ai / src /data /bam_normalize.py
jefffffff9
Phase 3: Voice-to-Voice S2S pipeline — F5-TTS, LLM brain, CER metric
8952fff
Raw
History Blame Contribute Delete
2.28 kB
"""
Bambara phonetic normalizer.
Unifies French-influenced and informal spellings to the standard
N'Ko-derived Bambara orthography used in most NLP datasets.
Key rules (most impactful for ASR training):
ou → u    French vowel → Bambara standard
gn → ɲ    French nasal palatal
ny → ɲ    English nasal palatal notation
dj → j    French palatal affricate
ch → c    French palatalized consonant
oo → ɔ    long open-o (common informal spelling)
ee → ɛ    long open-e (common informal spelling)
These rules run left-to-right on lower-cased text. They are conservative:
only unambiguous substitutions are applied so as not to corrupt words that
happen to contain these letter sequences in a non-phonemic context.
Usage:
from src.data.bam_normalize import normalize
text = normalize("I ni ce, a bɛ djourou la")
# → "i ni ce, a bɛ juru la"
"""
from __future__ import annotations
import re
import unicodedata
# ── Replacement table (order matters β€” longest match first) ─────────────────
_RULES: list[tuple[str, str]] = [
("ou", "u"), # most frequent French influence
("dj", "j"), # palatal affricate
("gn", "Ι²"), # nasal palatal (French orthography)
("ny", "Ι²"), # nasal palatal (English-style notation)
("ch", "c"), # palatalized stop
("oo", "Ι”"), # long open-o (informal doubling)
("ee", "Ι›"), # long open-e (informal doubling)
]
# Compile once for speed
_PATTERN = re.compile(
"|".join(re.escape(src) for src, _ in _RULES)
)
_REPLACEMENTS = {src: dst for src, dst in _RULES}
def normalize(text: str) -> str:
"""
Apply phonetic normalization to a Bambara text string.
Steps:
1. Unicode NFC normalization (collapse combining characters).
2. Lowercase.
3. Apply phoneme substitution rules.
4. Collapse multiple spaces.
"""
text = unicodedata.normalize("NFC", text)
text = text.lower()
text = _PATTERN.sub(lambda m: _REPLACEMENTS[m.group(0)], text)
text = re.sub(r" {2,}", " ", text).strip()
return text
def normalize_batch(texts: list[str]) -> list[str]:
    """Run ``normalize`` over each string in *texts*.

    Returns a new list holding the normalized strings in input order.
    """
    return list(map(normalize, texts))