Spaces:

Gaoussin
/

bm-translator

Running

App Files Files Community

bm-translator / normalize_bm_input.py

Gaoussin

Upload 3 files

fec37b6 verified about 1 month ago

raw

history blame contribute delete

3.11 kB

	import re

	# Lookup table consumed by normalize_bm_input().
	# Each key is a lowercase contracted or variant spelling; each value is the
	# text that replaces it.  Entry order is kept as-is because the function
	# sorts by key length with a stable sort, so ties keep this order.
	DE_CONTRACTIONS = {
	    # Apostrophe forms whose replacement is a phrase of several words.
	    "k'a": "ka a",
	    "a b'a": "a be a",
	    "n'be": "ne be",
	    "n'b'a": "ne be a",
	    "b'a": "be a",
	    "k'o": "ko o",
	    "b'i": "be i",
	    "k'i": "ka i",
	    "k'aw": "ka aw",

	    # Single words whose replacement is another single word.
	    "kɔkɔ": "kɔgɔ",
	    "bɛ": "be",
	}

	def normalize_bm_input(
	    text: str,
	    *,
	    contractions: "dict[str, str] | None" = None,
	) -> str:
	    """Expand contracted forms and single-word variants found in ``text``.

	    Processing order:

	    1. The whole text is lowercased so it can match the lowercase keys.
	    2. Every key whose value contains a space is replaced by that value
	       wherever the key occurs between word boundaries (longest keys
	       first).
	    3. Every remaining token that is exactly equal to a key whose value
	       has no space is replaced by that value.
	    4. The first character of the result is upper-cased.

	    Args:
	        text: Text to normalize.  Original casing is not preserved: apart
	            from the first character, the result is lowercase.
	        contractions: Optional mapping of lowercase contracted form to
	            its expansion.  When omitted, the module-level
	            ``DE_CONTRACTIONS`` table is used.  Keys should start and end
	            with a word character; single-word keys should consist of
	            word characters and apostrophes only.

	    Returns:
	        The normalized text; an empty string for empty input.
	    """
	    table = DE_CONTRACTIONS if contractions is None else contractions

	    # Characters accepted wherever a key contains "'": the ASCII apostrophe,
	    # U+2019 (right single quotation mark) and U+02BC (modifier letter
	    # apostrophe).
	    apostrophes = "'\u2019\u02bc"
	    any_apostrophe = "[" + apostrophes + "]"

	    text = text.lower()

	    # --- Pass 1: keys that expand to a multi-word phrase ---
	    # Longest keys first, so "a b'a" and "n'b'a" are consumed before the
	    # shorter "b'a" they contain.  sorted() is stable, so keys of equal
	    # length keep the table's own order.
	    phrase_rules = sorted(
	        ((key, value) for key, value in table.items() if " " in value),
	        key=lambda rule: len(rule[0]),
	        reverse=True,
	    )
	    for contracted, expanded in phrase_rules:
	        # Escape each apostrophe-free piece and rejoin the pieces with the
	        # apostrophe character class.
	        key_pattern = any_apostrophe.join(
	            re.escape(piece) for piece in contracted.split("'")
	        )
	        # Word boundaries keep "b'a" from matching inside "b'adi".  The
	        # callable replacement inserts the phrase verbatim instead of
	        # treating it as a replacement template.
	        text = re.sub(
	            r"\b" + key_pattern + r"\b",
	            lambda _match, phrase=expanded: phrase,
	            text,
	        )

	    # --- Pass 2: keys that map to a single word ---
	    word_rules = {
	        key: value for key, value in table.items() if " " not in value
	    }

	    def swap_word(match):
	        """Return the replacement for the matched token, or the token itself."""
	        token = match.group(0)
	        return word_rules.get(token, token)

	    # A token is a run of word characters and apostrophes.  Other
	    # punctuation ("-", ",", "...") therefore separates words even when no
	    # whitespace is present, while an unexpanded apostrophe form such as
	    # "b'adi" is still looked up as one token.
	    text = re.sub(r"\b[\w" + apostrophes + r"]+\b", swap_word, text)

	    # Slicing keeps this safe for the empty string.
	    return text[:1].upper() + text[1:]

	# --- Example Usage ---

	#input_text_4 = "k'a di a b'i fɛ kɔkɔ n'b'a fɔ. Bɛ jɛ."

	#print(f"Original Text: {input_text_4}")
	#normalized_4 = normalize_bm_input(input_text_4)
	#print(f"Normalized Text: {normalized_4}\n")

	# Expected Output: Ka a di a be i fɛ kɔgɔ ne be a fɔ. be jɛ.
	# (The input is lowercased and only the first character of the result is re-capitalized.)