File size: 3,113 Bytes
fec37b6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
import re

# Lookup table used by normalize_bm_input().
#   key   -> the form to search for in the (lower-cased) input text
#   value -> the text that replaces it
# An entry whose value contains a space is handled as a contraction that
# expands into several words; an entry whose value has no space is handled
# as a one-word-for-one-word replacement.
DE_CONTRACTIONS = {
    # Apostrophe contractions that expand into several words.
    "k'a": "ka a",
    "a b'a": "a be a",
    "n'be": "ne be",
    "n'b'a": "ne be a",
    "b'a": "be a",
    "k'o": "ko o",
    "b'i": "be i",
    "k'i": "ka i",
    "k'aw": "ka aw",

    # One-word replacements (no apostrophe in the key, no space in the value).
    "kɔkɔ": "kɔgɔ",
    "bɛ": "be",
}

# Characters accepted in the input wherever a dictionary key contains an
# ASCII apostrophe: ', RIGHT/LEFT SINGLE QUOTATION MARK and MODIFIER LETTER
# APOSTROPHE (text typed on phones or pasted from word processors often
# carries one of the typographic variants).
_APOSTROPHE_CLASS = "['\u2019\u2018\u02bc]"

# Translation table mapping the typographic variants back to "'" so that a
# matched span can be looked up under its dictionary key.
_TO_ASCII_APOSTROPHE = str.maketrans({"\u2019": "'", "\u2018": "'", "\u02bc": "'"})


def _key_regex(key):
    """Return a regex fragment matching *key* literally, with any apostrophe variant."""
    return _APOSTROPHE_CLASS.join(re.escape(part) for part in key.split("'"))


def normalize_bm_input(text: str, contractions: "dict[str, str] | None" = None) -> str:
    """
    Expand contracted forms and apply one-word replacements in *text*.

    The text is lower-cased, every dictionary key found as a whole word (or
    whole word sequence) is replaced by its value, and the first character
    of the result is upper-cased.

    Args:
        text: The string to normalize. An empty string is returned unchanged.
        contractions: Optional mapping of contracted form -> replacement.
            Defaults to the module-level DE_CONTRACTIONS table. Keys are
            matched against lower-cased text, so they should be lower-case.

    Returns:
        The normalized string. Apart from its first character the result is
        entirely lower-case, because the whole input is lower-cased first.
    """
    mapping = DE_CONTRACTIONS if contractions is None else contractions

    # Matching is done on lower-cased text so keys only need one spelling.
    text = text.lower()

    # Longest keys first, so that e.g. "n'b'a" is consumed before the shorter
    # "b'a" it contains. Empty keys are dropped: r"\b\b" would match at every
    # word boundary.
    longest_first = sorted(
        ((key, value) for key, value in mapping.items() if key),
        key=lambda item: len(item[0]),
        reverse=True,
    )

    # --- Phase 1: contractions whose replacement contains a space ---
    # Applied one key at a time, in longest-first order.
    for contracted_form, expanded_phrase in longest_first:
        if " " not in expanded_phrase:
            continue
        # \b on both sides keeps "b'a" from matching inside "b'adi".
        pattern = r"\b" + _key_regex(contracted_form) + r"\b"
        # A callable replacement inserts the phrase literally; a plain string
        # would have backslashes interpreted as escapes/group references.
        text = re.sub(pattern, lambda _match, repl=expanded_phrase: repl, text)

    # --- Phase 2: one-word replacements (no space in the value) ---
    # Built from longest_first so the alternation also tries long keys first.
    single_word = {key: value for key, value in longest_first if " " not in value}
    if single_word:
        # One pass over the text with a single alternation: a replacement
        # produced here is never re-examined, so entries cannot chain.
        # Word boundaries (rather than whitespace) delimit the match, so
        # "kɔkɔ-bɛ" or "kɔkɔ,bɛ" are normalized as well.
        alternation = "|".join(_key_regex(key) for key in single_word)
        pattern = r"\b(?:" + alternation + r")\b"

        def replace_single_word(match):
            """Return the replacement for the matched key."""
            matched = match.group(0)
            if matched in single_word:
                return single_word[matched]
            # The input may have used a typographic apostrophe.
            canonical = matched.translate(_TO_ASCII_APOSTROPHE)
            return single_word.get(canonical, matched)

        text = re.sub(pattern, replace_single_word, text)

    # Upper-case only the first character; slicing keeps "" safe.
    return text[:1].upper() + text[1:]

# --- Example Usage ---

#input_text_4 = "k'a di a b'i fɛ kɔkɔ n'b'a fɔ. Bɛ jɛ."

#print(f"Original Text: {input_text_4}")
#normalized_4 = normalize_bm_input(input_text_4)
#print(f"Normalized Text: {normalized_4}\n")

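# Expected Output: Ka a di a be i fɛ kɔgɔ ne be a fɔ. be jɛ.
# (The whole input is lowercased and only the first character of the result
# is capitalized, so the second sentence starts with a lowercase "be".)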