import re

# De-contraction table.
# Keys are the contracted / variant forms to look for (lowercase).
# Values are the forms they are replaced with.
DE_CONTRACTIONS = {
    # Apostrophe contractions that expand to a multi-word phrase
    "k'a": "ka a",
    "a b'a": "a be a",
    "n'be": "ne be",
    "n'b'a": "ne be a",
    "b'a": "be a",
    "k'o": "ko o",
    "b'i": "be i",
    "k'i": "ka i",
    "k'aw": "ka aw",
    # Single-word forms that are replaced by another single word
    "kɔkɔ": "kɔgɔ",
    "bɛ": "be",
}


def _build_phrase_pattern(phrases):
    """Compile one regex that matches any of *phrases* between word boundaries."""
    # Longest alternatives first: "n'b'a" must be tried before "b'a", which
    # would otherwise match inside it (the apostrophe creates a \b boundary).
    # sorted() is stable, so equal-length keys keep their table order.
    ordered = sorted(phrases, key=len, reverse=True)
    if not ordered:
        # Empty table: a pattern that can never match.
        return re.compile(r"(?!)")
    alternation = "|".join(re.escape(phrase) for phrase in ordered)
    return re.compile(r"\b(?:" + alternation + r")\b")


# The table is split by the shape of the VALUE (the expanded form): values
# containing a space are expanded by phrase matching, the rest by per-token
# lookup. These derived tables are built once at import time; rebuild them if
# DE_CONTRACTIONS is ever modified at runtime.
_MULTI_WORD_EXPANSIONS = {
    short: full for short, full in DE_CONTRACTIONS.items() if " " in full
}
_SINGLE_WORD_EXPANSIONS = {
    short: full for short, full in DE_CONTRACTIONS.items() if " " not in full
}
_MULTI_WORD_PATTERN = _build_phrase_pattern(_MULTI_WORD_EXPANSIONS)
# A token is a run of non-whitespace that starts and ends on a word character,
# so surrounding punctuation ("bɛ," / "(bɛ)") is left outside the match while
# an inner apostrophe keeps "b'adi" together as one token.
_TOKEN_PATTERN = re.compile(r"\b\S+\b")


def normalize_bm_input(text: str) -> str:
    """De-contract (expand) the forms listed in ``DE_CONTRACTIONS``.

    Processing steps:

    1. The whole input is lowercased so that matching is case-insensitive.
    2. Contracted forms whose expansion is a multi-word phrase are replaced
       wherever they appear between word boundaries (so ``b'a`` is expanded
       but ``b'adi`` is not).
    3. Every remaining token that is exactly a single-word key
       (e.g. ``kɔkɔ``) is replaced by its single-word value.
    4. The first character of the result is upper-cased for presentation.

    Args:
        text: The string to normalize.

    Returns:
        The normalized string. Because of step 1, everything except the
        first character is lowercase, including letters that started later
        sentences in the input. An empty input yields an empty string.
    """
    lowered = text.lower()

    # Single pass over the text; the matched text is always one of the keys.
    expanded = _MULTI_WORD_PATTERN.sub(
        lambda match: _MULTI_WORD_EXPANSIONS[match.group(0)], lowered
    )

    # Tokens that are not in the table are written back unchanged.
    normalized = _TOKEN_PATTERN.sub(
        lambda match: _SINGLE_WORD_EXPANSIONS.get(match.group(0), match.group(0)),
        expanded,
    )

    # Slicing (rather than indexing) keeps the empty string safe.
    return normalized[:1].upper() + normalized[1:]


# --- Example Usage ---
# input_text_4 = "k'a di a b'i fɛ kɔkɔ n'b'a fɔ. Bɛ jɛ."
# print(f"Original Text: {input_text_4}")
# normalized_4 = normalize_bm_input(input_text_4)
# print(f"Normalized Text: {normalized_4}\n")
# Output: Ka a di a be i fɛ kɔgɔ ne be a fɔ. be jɛ.
# (The input is lowercased as a whole, so only the very first letter of the
# result is capitalized.)