Spaces:
Running
Running
File size: 3,113 Bytes
fec37b6 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 |
import re
# Lookup table for the normaliser.
# Each key is a form to search for in the input; each value is the text that
# replaces it.
DE_CONTRACTIONS = {
    # Apostrophe contractions whose replacement spans several words.
    "k'a": "ka a",
    "a b'a": "a be a",
    "n'be": "ne be",
    "n'b'a": "ne be a",
    "b'a": "be a",
    "k'o": "ko o",
    "b'i": "be i",
    "k'i": "ka i",
    "k'aw": "ka aw",
    # Plain words (no apostrophe) replaced by another single word.
    "kɔkɔ": "kɔgɔ",
    "bɛ": "be",
}
def normalize_bm_input(text: str) -> str:
    """Lowercase *text* and expand the forms listed in ``DE_CONTRACTIONS``.

    Steps, in order:

    1. Lowercase the whole string so lookups are case-insensitive.
    2. Replace every word that is a key whose value has no space
       (e.g. ``kɔkɔ`` -> ``kɔgɔ``).
    3. Expand every key whose value contains a space
       (e.g. ``k'a`` -> ``ka a``), longest key first.
    4. Upper-case the first character of the result.

    Args:
        text: The string to normalise.

    Returns:
        The normalised string. An empty input yields an empty string.
    """
    # Apostrophe variants accepted wherever a key contains "'": ASCII,
    # typographic (U+2019) and modifier-letter (U+02BC).  Smart-quote input
    # would otherwise never match any contraction key.
    apostrophe_class = "['\u2019\u02bc]"

    text = text.lower()

    # Split the table on the VALUE: a space in the value marks an expansion
    # into several words, no space marks a one-for-one word replacement.
    single_word = {k: v for k, v in DE_CONTRACTIONS.items() if " " not in v}
    # Longest keys first so that e.g. "n'b'a" is consumed before the shorter
    # "b'a" contained in it.  The sort is stable, so equal-length keys keep
    # their table order.
    multi_word = sorted(
        ((k, v) for k, v in DE_CONTRACTIONS.items() if " " in v),
        key=lambda item: len(item[0]),
        reverse=True,
    )

    # --- Single-word replacements -------------------------------------
    # r"\w+" visits each word on its own.  The previous r"\b\S+\b" matched
    # whole non-space runs, so words joined by punctuation ("kɔkɔ,bɛ",
    # "n'bɛ") were looked up as one token and silently skipped.
    # Running this pass first lets a form such as "n'bɛ" become "n'be" and
    # then be expanded below.
    text = re.sub(
        r"\w+",
        lambda match: single_word.get(match.group(0), match.group(0)),
        text,
    )

    # --- Multi-word expansions ----------------------------------------
    for contracted, expanded in multi_word:
        # Escape the literal pieces and join them with the apostrophe class.
        key_pattern = apostrophe_class.join(
            re.escape(piece) for piece in contracted.split("'")
        )
        # Word boundaries keep "b'a" from matching inside "b'adi".
        # A callable replacement inserts the expansion verbatim instead of
        # treating it as a regex template.
        text = re.sub(
            r"\b" + key_pattern + r"\b",
            lambda _match, replacement=expanded: replacement,
            text,
        )

    # Capitalise only the first character; slicing keeps "" safe.
    return text[:1].upper() + text[1:]
# --- Example Usage ---
#input_text_4 = "k'a di a b'i fɛ kɔkɔ n'b'a fɔ. Bɛ jɛ."
#print(f"Original Text: {input_text_4}")
#normalized_4 = normalize_bm_input(input_text_4)
#print(f"Normalized Text: {normalized_4}\n")
# Expected Output: Ka a di a be i fɛ kɔgɔ ne be a fɔ. be jɛ.  (input is lowercased; only the first character is re-capitalized)