Spaces:
Running
Running
| import re | |
# Lookup table used by normalize_bm_input.
# Each key is a contracted or variant spelling to look for (in lowercase);
# each value is the text that replaces it.
DE_CONTRACTIONS = {
    # Apostrophe contractions: every value in this group contains a space.
    "k'a": "ka a",
    "a b'a": "a be a",
    "n'be": "ne be",
    "n'b'a": "ne be a",
    "b'a": "be a",
    "k'o": "ko o",
    "b'i": "be i",
    "k'i": "ka i",
    "k'aw": "ka aw",
    # Whole-word substitutions: every value in this group is a single word.
    "kɔkɔ": "kɔgɔ",
    "bɛ": "be",
}
def normalize_bm_input(text: str) -> str:
    """Expand the contracted forms listed in DE_CONTRACTIONS.

    The text is lowercased, every dictionary key found in it is replaced by
    its value, and the first letter of the text and of each following
    sentence is capitalized again.

    Args:
        text: Raw input text.

    Returns:
        The normalized text. An empty string is returned unchanged.
    """
    # Apostrophe variants accepted inside a contraction: ASCII ', the
    # typographic quotes U+2019 / U+2018 and the modifier apostrophe U+02BC.
    # Without this, a contraction typed with a "smart" apostrophe is missed.
    apostrophe = "['\u2019\u2018\u02bc]"

    # The dictionary keys are lowercase, so matching is done on lowercase text.
    text = text.lower()

    # Entries whose replacement contains a space are matched as literal
    # phrases; the remaining ones go through a per-token lookup below.
    multi_word = {k: v for k, v in DE_CONTRACTIONS.items() if " " in v}
    single_word = {k: v for k, v in DE_CONTRACTIONS.items() if " " not in v}

    # Longest keys first, so that e.g. "n'b'a" is expanded as a whole before
    # the shorter "b'a" it contains gets a chance to match.
    longest_first = sorted(
        multi_word.items(), key=lambda item: len(item[0]), reverse=True
    )
    for contracted, expanded in longest_first:
        # Escape the literal pieces and re-join them with the apostrophe
        # class; the word boundaries keep "b'a" from matching inside "b'adi".
        pieces = (re.escape(piece) for piece in contracted.split("'"))
        pattern = r"\b" + apostrophe.join(pieces) + r"\b"
        text = re.sub(pattern, expanded, text)

    # Token-level substitutions. A token is a maximal run of non-space
    # characters that starts and ends on a word character, so a key glued to
    # other text by punctuation (e.g. "bɛ-kɔkɔ") is left untouched.
    text = re.sub(
        r"\b\S+\b",
        lambda match: single_word.get(match.group(0), match.group(0)),
        text,
    )

    # Lowercasing flattened every sentence-initial capital, so restore it at
    # the very start of the text and after each '.', '!' or '?' that is
    # followed by whitespace.
    return re.sub(
        r"(^|[.!?]\s+)(\w)",
        lambda match: match.group(1) + match.group(2).upper(),
        text,
    )
| # --- Example Usage --- | |
| #input_text_4 = "k'a di a b'i fɛ kɔkɔ n'b'a fɔ. Bɛ jɛ." | |
| #print(f"Original Text: {input_text_4}") | |
| #normalized_4 = normalize_bm_input(input_text_4) | |
| #print(f"Normalized Text: {normalized_4}\n") | |
| # Expected Output: Ka a di a be i fɛ kɔgɔ ne be a fɔ. Be jɛ. |