Spaces:
Running
Running
Delete normalize_bm_output.py
Browse files- normalize_bm_output.py +0 -67
normalize_bm_output.py
DELETED
|
@@ -1,67 +0,0 @@
|
|
| 1 |
-
import re
|
| 2 |
-
|
| 3 |
-
# Define the contractions dictionary
|
| 4 |
-
# Lookup table of lowercase phrases and the contracted form that replaces them.
# Keys containing a space are matched as whole phrases; keys without a space
# are looked up word by word in normalize_bm_output.
CONTRACTIONS = {
    # Multi-word contractions (keys are space-separated).
    "ka a": "k'a",
    "a be a": "a b'a",
    "be a": "b'a",
    "ko o": "k'o",
    "di i": "d'i",
    "be i": "b'i",
    # Example of a single-word entry (disabled; enable only if this
    # contraction is actually wanted):
    # "kaa": "k'aa",
}
|
| 15 |
-
|
| 16 |
-
def normalize_bm_output(text: str, contractions: "dict[str, str] | None" = None) -> str:
    """Lowercase *text*, apply contractions, and capitalise the first character.

    Processing happens in three steps:

    1. The whole text is lowercased.
    2. Every table key that contains a space is replaced as a whole phrase
       (delimited by regex word boundaries), longest key first so that a
       longer phrase wins over a shorter phrase it contains.
    3. Every table key without a space is replaced wherever it occurs as a
       complete word.

    Args:
        text: The string to normalise.
        contractions: Optional mapping of phrase -> contracted form. When
            omitted (``None``), the module-level ``CONTRACTIONS`` table is
            used.

    Returns:
        The normalised text with its first character upper-cased. An empty
        input yields an empty string.
    """
    table = CONTRACTIONS if contractions is None else contractions

    # Lowercase up front so lowercase keys match regardless of input casing.
    text = text.lower()

    # --- Multi-word phrases -------------------------------------------------
    # Longest first; sorted() is stable, so equal-length keys keep table order.
    phrases = sorted(
        ((key, value) for key, value in table.items() if " " in key),
        key=lambda item: len(item[0]),
        reverse=True,
    )
    for phrase, contracted in phrases:
        pattern = r"\b" + re.escape(phrase) + r"\b"
        # A callable replacement inserts the value literally; a plain string
        # would be parsed as a template, so backslashes in a value would
        # raise or be mis-expanded.
        text = re.sub(
            pattern,
            lambda _match, replacement=contracted: replacement,
            text,
            flags=re.IGNORECASE,
        )

    # --- Single words -------------------------------------------------------
    single_words = {key: value for key, value in table.items() if " " not in key}
    if single_words:
        # Skipped entirely when there is nothing to look up, which avoids a
        # Python-level callback for every word in the text.
        text = re.sub(
            r"\b\w+\b",
            lambda match: single_words.get(match.group(0), match.group(0)),
            text,
        )

    return text[:1].upper() + text[1:]
|
| 60 |
-
|
| 61 |
-
# --- Example Usage with both types of contractions ---
|
| 62 |
-
|
| 63 |
-
#input_text_4 = "ka a di a be i fɛ kɔgɔ ne be a fɔ."
|
| 64 |
-
|
| 65 |
-
#print(f"Original Text: {input_text_4}")
|
| 66 |
-
#normalized_4 = normalize_bm_output(input_text_4)
|
| 67 |
-
#print(f"Normalized Text: {normalized_4}\n")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|