Spaces:

Gaoussin
/

bm-translator

Running

bm-translator / normalize_bm_words.py

Upload normalize_bm_words.py

f95e1e6 verified about 1 month ago

900 Bytes

	import re

	# Contracted / variant spellings mapped to their normalized replacement.
	# Keys are written in lowercase: PATTERN matches case-insensitively and the
	# matched text is lowercased before it is looked up in this table.
	CONTRACTIONS = {
	    "a'": "aw",
	    "an": "anw",
	    "n'": "ne",
	    "n": "ne",
	    "ulu": "u",
	    "b'a": "be a",
	    "bɛ": "be",
	    "nka": "nga",
	    "loru": "duru",
	}

	# Alternation of every key, escaped and ordered longest-first so that a longer
	# key (e.g. "n'") is tried before a shorter key it starts with ("n").
	# The lookarounds reject a match that is directly preceded or followed by a
	# letter: [^\W\d_] is "a word character that is neither a digit nor '_'".
	# The separator must be a bare "|" (regex alternation); an escaped pipe would
	# turn the whole key list into one literal string.
	# NOTE(review): because of the trailing lookahead, keys ending in an
	# apostrophe ("a'", "n'") only match when the next character is not a
	# letter, so "n'i" is rewritten through the shorter "n" key -- confirm this
	# is the intended behaviour.
	PATTERN = re.compile(
	    r'(?<![^\W\d_])('
	    + '|'.join(sorted(map(re.escape, CONTRACTIONS), key=len, reverse=True))
	    + r')(?![^\W\d_])',
	    flags=re.IGNORECASE | re.UNICODE,
	)

	# Optional run of whitespace followed by a question mark.
	SPACE_QUESTION_RE = re.compile(r'\s*\?')

	def normalize_text(text: str) -> str:
	    """Normalize question-mark spacing, expand contractions, capitalize.

	    The steps run in this order:

	    1. Every "?" together with any whitespace directly before it is
	       replaced by a single space followed by "?".
	    2. Every PATTERN match is replaced by its CONTRACTIONS entry, looked up
	       by the lowercased matched text.
	    3. The first character of the result is upper-cased; the remainder is
	       left as it is.

	    Args:
	        text: The text to normalize.

	    Returns:
	        The normalized text. An empty string is returned unchanged.
	    """
	    # 1) normalize spacing before ?
	    text = SPACE_QUESTION_RE.sub(' ?', text)

	    # 2) expand contractions; the match is lowercased because PATTERN is
	    #    case-insensitive while the CONTRACTIONS keys are lowercase
	    text = PATTERN.sub(lambda m: CONTRACTIONS[m.group(0).lower()], text)

	    # 3) capitalize first letter; slicing (not text[0]) keeps "" from raising
	    return text[:1].upper() + text[1:]


	# Demo: normalize a sample sentence and print the result.
	# NOTE(review): these statements are at module top level, so they also run
	# (and print) whenever this file is imported -- consider moving them under
	# an `if __name__ == "__main__":` guard if that is not intended.
	sentence = "bbk'a b'a di n'i sonna. na a sɔɔni? a be na'a nɔfɛ?"
	print(normalize_text(sentence))