File size: 2,595 Bytes
894584d
 
fec37b6
894584d
fec37b6
894584d
fec37b6
894584d
fec37b6
 
 
 
 
894584d
 
fec37b6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
894584d
fec37b6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
894584d
 
fec37b6
 
 
894584d
fec37b6
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
import re

# Contraction table: maps an uncontracted word sequence to its contracted
# spelling. Keys are one or more words separated by single spaces; the
# normalizer lowercases them and tries longer keys before shorter ones.
CONTRACTIONS = {
    # Multi-word contractions (keys are space-separated)
    "ka a": "k'a",
    "a be a": "a b'a",
    "be a": "b'a",
    "ko o": "k'o",
    "di i": "d'i",
    "be i": "b'i",
    # Example single-word contraction (disabled):
    # "kaa": "k'aa",
}

# Gap allowed between the words of a multi-word key: one or more whitespace
# characters other than line breaks, so a contraction never swallows a newline.
_WORD_GAP = r"[^\S\r\n]+"


def _compile_contractions(table):
    """Return ``(pattern, lookup)`` for *table*.

    ``lookup`` maps each key, lowercased and with its words joined by a single
    space, to its contracted form. ``pattern`` is one compiled regex that
    matches any of those keys as whole words, or ``None`` when the table has
    no usable keys.
    """
    lookup = {}
    for phrase, contracted in table.items():
        words = phrase.lower().split()
        if words:  # skip empty / whitespace-only keys
            # First definition wins if two keys normalize to the same phrase.
            lookup.setdefault(" ".join(words), contracted)
    if not lookup:
        return None, lookup

    # Regex alternation takes the first alternative that matches, not the
    # longest one, so longer phrases must be listed first ("a be a" before
    # "be a").
    ordered = sorted(lookup, key=len, reverse=True)
    alternation = "|".join(
        _WORD_GAP.join(re.escape(word) for word in phrase.split(" "))
        for phrase in ordered
    )
    return re.compile(r"\b(?:" + alternation + r")\b"), lookup


def normalize_bm_output(text: str) -> str:
    """Lowercase *text*, apply the ``CONTRACTIONS`` table, capitalize the start.

    Every key of ``CONTRACTIONS`` (single- or multi-word) is matched as a
    whole-word phrase. The words of a multi-word key may be separated by any
    run of spaces or tabs, but not by a line break. All keys are applied in
    one left-to-right pass, so text produced by one replacement is never
    rewritten by another rule.

    Args:
        text: The string to normalize.

    Returns:
        The lowercased, contracted text with its first character uppercased.
        An empty string is returned unchanged.

    Raises:
        AttributeError: If *text* has no ``lower`` method (e.g. ``None``).
    """
    text = text.lower()

    # Built per call so that runtime edits to CONTRACTIONS are honoured; the
    # table is small and ``re`` caches compiled patterns internally.
    pattern, lookup = _compile_contractions(CONTRACTIONS)
    if pattern is not None:

        def contract(match):
            # Collapse the matched whitespace so the phrase matches its key.
            return lookup[" ".join(match.group(0).split())]

        # A callable replacement inserts the contracted form literally
        # (no backslash-escape processing of the table's values).
        text = pattern.sub(contract, text)

    # Only the very first character is uppercased; the rest stays lowercase.
    return text[:1].upper() + text[1:]

# --- Example usage (the sample sentence exercises the multi-word contractions) ---

#input_text_4 = "ka a di a be i fɛ kɔgɔ ne be a fɔ."

#print(f"Original Text: {input_text_4}")
#normalized_4 = normalize_bm_output(input_text_4)
#print(f"Normalized Text: {normalized_4}\n")