Gaoussin committed on
Commit
3e56fc4
·
verified ·
1 Parent(s): ab77b7f

Delete normalize_bm_input.py

Browse files
Files changed (1) hide show
  1. normalize_bm_input.py +0 -80
normalize_bm_input.py DELETED
@@ -1,80 +0,0 @@
1
import re

# De-contraction table for Bambara input text.
# Keys are the forms to look for (written in lowercase, with an ASCII
# apostrophe); values are the text each one is replaced with.
DE_CONTRACTIONS = {
    # Contracted forms that expand to several words (value contains a space).
    "k'a": "ka a",
    "a b'a": "a be a",
    "n'be": "ne be",
    "n'b'a": "ne be a",
    "b'a": "be a",
    "k'o": "ko o",
    "b'i": "be i",
    "k'i": "ka i",
    "k'aw": "ka aw",

    # Single-word replacements (value contains no space).
    "kɔkɔ": "kɔgɔ",
    "bɛ": "be",
}

# Characters accepted wherever a key contains an apostrophe: ASCII "'",
# U+2019 / U+2018 (typographic quotes) and U+02BC (modifier letter
# apostrophe). Without this, "k\u2019a" would never match the key "k'a".
_APOSTROPHE_CLASS = "['\u2019\u2018\u02bc]"


def _contraction_pattern(contracted_form):
    """Return a compiled whole-word regex for one contracted form."""
    # Escape each apostrophe-separated piece on its own so the apostrophe
    # itself can be swapped for the tolerant character class.
    pieces = (re.escape(piece) for piece in contracted_form.split("'"))
    return re.compile(r"\b" + _APOSTROPHE_CLASS.join(pieces) + r"\b")


def normalize_bm_input(text: str) -> str:
    """Expand contracted forms in *text* using ``DE_CONTRACTIONS``.

    The input is lowercased, every multi-word contraction is expanded
    (longest contracted form first), single-word entries are substituted,
    and finally the first character of the result is uppercased.

    Because the whole input is lowercased, only the very first character of
    the returned string is uppercase; capitals elsewhere in the input are
    not preserved.

    Args:
        text: The raw input string.

    Returns:
        The normalized string. An empty input yields an empty string.

    Example:
        >>> normalize_bm_input("k'a di a b'i fɛ kɔkɔ n'b'a fɔ. Bɛ jɛ.")
        'Ka a di a be i fɛ kɔgɔ ne be a fɔ. be jɛ.'
    """
    # Dictionary keys are lowercase, so match against lowercase text.
    text = text.lower()

    # --- Part 1: contractions whose expansion is a multi-word phrase ---
    # The split is made on the VALUE (expanded form), not on the key.
    multi_word = [(k, v) for k, v in DE_CONTRACTIONS.items() if " " in v]

    # Longest contracted form first, so that e.g. "n'b'a" is expanded before
    # the shorter "b'a" it contains gets a chance to match inside it.
    multi_word.sort(key=lambda item: len(item[0]), reverse=True)

    for contracted_form, expanded_phrase in multi_word:
        # Word boundaries keep "b'a" from matching inside "b'adi".
        # A callable replacement inserts the phrase literally; a plain string
        # would be parsed as a template (backslashes, group references).
        text = _contraction_pattern(contracted_form).sub(
            lambda _match, phrase=expanded_phrase: phrase, text
        )

    # --- Part 2: single-word replacements (e.g. 'kɔkɔ' -> 'kɔgɔ') ---
    single_word = {k: v for k, v in DE_CONTRACTIONS.items() if " " not in v}

    def replace_single_word(match):
        """Return the replacement for the matched token, or the token itself."""
        token = match.group(0)
        return single_word.get(token, token)

    # Visit every whitespace-delimited token, trimmed to its outer word
    # boundaries, and swap the ones that appear in the table.
    text = re.sub(r"\b\S+\b", replace_single_word, text)

    # Capitalize the first character for presentation; slicing keeps this
    # safe for the empty string.
    return text[:1].upper() + text[1:]