Spaces:

Gaoussin
/

bm-translator

Running

App Files Files Community

bm-translator / normalize_bm_output.py

Gaoussin

Upload 3 files

fec37b6 verified about 1 month ago

raw

history blame

2.6 kB

	import re

	# Lookup table mapping an uncontracted spelling to its contracted form.
	CONTRACTIONS = {
	    # Multi-word phrases: the words of each key are separated by one space.
	    "ka a": "k'a",
	    "a be a": "a b'a",
	    "be a": "b'a",
	    "ko o": "k'o",
	    "di i": "d'i",
	    "be i": "b'i",
	    # Sample single-word entry (a key without spaces), left disabled;
	    # it assumes "kaa" is a contraction that is actually wanted:
	    # "kaa": "k'aa",
	}

	def normalize_bm_output(
	    text: str, *, contractions: "dict[str, str] | None" = None
	) -> str:
	    """Lowercase ``text``, contract known phrases, and capitalise the first character.

	    Each key of the contraction table that occurs in the lowercased text as a
	    whole word or phrase (delimited by regex word boundaries) is replaced by
	    its value.  All keys are matched in a single left-to-right pass, so text
	    produced by one replacement is never rewritten by another rule.  When
	    several keys match at the same position, the longest key wins.

	    Args:
	        text: The string to normalise.
	        contractions: Optional table mapping an uncontracted spelling to its
	            contracted form.  Keys may be single words or space-separated
	            phrases.  When omitted, the module-level ``CONTRACTIONS`` table
	            is used.

	    Returns:
	        The lowercased text with contractions applied and its first character
	        upper-cased.  An empty input yields an empty string.
	    """
	    table = CONTRACTIONS if contractions is None else contractions

	    # The whole text is lowercased before matching, so keys are lowercased as
	    # well; empty keys are dropped because they would match at every position.
	    text = text.lower()
	    lowered = {key.lower(): value for key, value in table.items() if key}

	    if lowered:
	        # Longest key first: at a given position the regex engine tries the
	        # alternatives in order, so e.g. "a be a" is preferred over "a".
	        alternatives = sorted(lowered, key=len, reverse=True)
	        pattern = r"\b(?:" + "|".join(map(re.escape, alternatives)) + r")\b"
	        # A callable replacement inserts the value literally, without the
	        # backslash/group-reference processing applied to string templates.
	        text = re.sub(pattern, lambda match: lowered[match.group(0)], text)

	    return text[:1].upper() + text[1:]

	# --- Example usage (the sample sentence exercises the multi-word contractions) ---

	#input_text_4 = "ka a di a be i fɛ kɔgɔ ne be a fɔ."

	#print(f"Original Text: {input_text_4}")
	#normalized_4 = normalize_bm_output(input_text_4)
	#print(f"Normalized Text: {normalized_4}\n")