Spaces:

Gaoussin
/

bm-translator

Running

App Files Files Community

Gaoussin committed on Dec 14, 2025

Commit

fec37b6

verified ·

1 Parent(s): 2553a5d

Upload 3 files

Browse files

Files changed (3) hide show

main.py +20 -11
normalize_bm_input.py +80 -0
normalize_bm_output.py +56 -23

main.py CHANGED Viewed

@@ -4,8 +4,8 @@ from fastapi import FastAPI, HTTPException
 from pydantic import BaseModel
 # Note: Keep the imports together for clarity
 from transformers import NllbTokenizer, AutoModelForSeq2SeqLM, Seq2SeqTrainer, Seq2SeqTrainingArguments, DataCollatorForSeq2Seq
-from normalize_bm_words import normalize_text
-from normalize_bm_output import normalize_output
 # =====================
 # 1️⃣ Environment / Cache
@@ -71,15 +71,24 @@ def translateTo(text, src, tgt):
 @app.post("/translate")
 def translate(request: TranslationRequest):
     try:
-        is_bm = request.src_lang == "bam_Latn"
-        text = normalize_text(request.text) if is_bm else request.text
-        result = translateTo(text, request.src_lang, request.tgt_lang)
-        normal_result = normalize_output(result) if is_bm else result
-        # List structure: [translation, model_version]
-        translation_list = [normal_result, model_name]
         return [translation_list]
     except Exception as e:
         print(f"An error occurred during translation: {e}")

 from pydantic import BaseModel
 # Note: Keep the imports together for clarity
 from transformers import NllbTokenizer, AutoModelForSeq2SeqLM, Seq2SeqTrainer, Seq2SeqTrainingArguments, DataCollatorForSeq2Seq
+from normalize_bm_input import normalize_bm_input
+from normalize_bm_output import normalize_bm_output
 # =====================
 # 1️⃣ Environment / Cache
 @app.post("/translate")
 def translate(request: TranslationRequest):
     try:
+        ###
+        # Check if the source language is Bambara for input normalization
+        is_src_bm = request.src_lang == "bam_Latn"
+        # Check if the target language is Bambara for output normalization
+        is_tgt_bm = request.tgt_lang == "bam_Latn"
+        # --- 1. Input Normalization (Pre-processing) ---
+        # Normalize the input ONLY if the source language is Bambara.
+        text_to_translate = normalize_bm_input(request.text) if is_src_bm else request.text
+        print("before",text_to_translate)
+        # --- 2. Core Translation ---
+        result = translateTo(text_to_translate, request.src_lang, request.tgt_lang)
+        print("after",result)
+        # --- 3. Output Normalization (Post-processing) ---
+        # Normalize the result (de-contract) ONLY if the target language is Bambara.
+        final_translation = normalize_bm_output(result) if is_tgt_bm else result
+        # --- 4. Final Output ---
+        translation_list = [final_translation, model_name]
+        ###
         return [translation_list]
     except Exception as e:
         print(f"An error occurred during translation: {e}")

normalize_bm_input.py ADDED Viewed

	@@ -0,0 +1,80 @@

+import re
# De-contraction table for Bambara input.
# Keys are the forms to look for; values are what they are replaced with.
DE_CONTRACTIONS = {
    # Apostrophe contractions that expand to a multi-word phrase.
    "k'a": "ka a",
    "a b'a": "a be a",
    "n'be": "ne be",
    "n'b'a": "ne be a",
    "b'a": "be a",
    "k'o": "ko o",
    "b'i": "be i",
    "k'i": "ka i",
    "k'aw": "ka aw",
    # Whole-word substitutions (the value is a single word).
    "kɔkɔ": "kɔgɔ",
    "bɛ": "be",
}

# Characters accepted wherever a key contains an ASCII apostrophe:
# ' (U+0027), right single quotation mark (U+2019) and modifier letter
# apostrophe (U+02BC), so text typed on phones matches the same rules.
_APOSTROPHE_CLASS = "['\u2019\u02bc]"


def _build_phrase_rules(table):
    """Compile the multi-word expansion rules of *table*, longest key first."""
    # Longest key first so that e.g. "n'b'a" is handled before "b'a".
    ordered = sorted(
        ((key, value) for key, value in table.items() if ' ' in value),
        key=lambda item: len(item[0]),
        reverse=True,
    )
    rules = []
    for contracted, expanded in ordered:
        core = _APOSTROPHE_CLASS.join(
            re.escape(part) for part in contracted.split("'")
        )
        # \b on both sides keeps "b'a" from matching inside "b'adi".
        rules.append((re.compile(r'\b' + core + r'\b', re.IGNORECASE), expanded))
    return tuple(rules)


# Built once at import time instead of on every request.  Rebuild these if
# DE_CONTRACTIONS is ever modified at runtime.
_PHRASE_RULES = _build_phrase_rules(DE_CONTRACTIONS)
_WORD_RULES = {key: value for key, value in DE_CONTRACTIONS.items() if ' ' not in value}
_TOKEN_RE = re.compile(r'\b\S+\b')


def _carry_case(matched, replacement):
    """Return *replacement*, capitalised if *matched* began with an uppercase letter."""
    if matched[:1].isupper():
        return replacement[:1].upper() + replacement[1:]
    return replacement


def normalize_bm_input(text: str) -> str:
    """
    Expand the contracted forms listed in DE_CONTRACTIONS.

    Matching is case-insensitive, but text that is not part of a match keeps
    its original capitalisation (proper nouns, sentence starts).  A
    replacement is capitalised when the form it replaces started with an
    uppercase letter, so "Bɛ jɛ." becomes "Be jɛ.".

    Args:
        text: Source sentence(s) to normalise.

    Returns:
        The text with contractions expanded and its first character
        upper-cased.  An empty string is returned unchanged.
    """
    # --- Part 1: contractions that expand to a multi-word phrase ---
    for pattern, expanded in _PHRASE_RULES:
        text = pattern.sub(
            lambda match, expanded=expanded: _carry_case(match.group(0), expanded),
            text,
        )

    # --- Part 2: whole-word substitutions (e.g. 'kɔkɔ' -> 'kɔgɔ') ---
    def replace_single_word(match):
        """Swap a token for its table entry; leave unknown tokens untouched."""
        token = match.group(0)
        replacement = _WORD_RULES.get(token.lower())
        if replacement is None:
            return token
        return _carry_case(token, replacement)

    text = _TOKEN_RE.sub(replace_single_word, text)

    # Capitalise the first letter of the result for presentation.
    return text[:1].upper() + text[1:]
+# --- Example Usage ---
+#input_text_4 = "k'a di a b'i fɛ kɔkɔ n'b'a fɔ. Bɛ jɛ."
+#print(f"Original Text: {input_text_4}")
+#normalized_4 = normalize_bm_input(input_text_4)
+#print(f"Normalized Text: {normalized_4}\n")
+# Expected Output: Ka a di a be i fɛ kɔgɔ ne be a fɔ. Be jɛ.

normalize_bm_output.py CHANGED Viewed

@@ -1,34 +1,67 @@
 import re
-# Normalize keys once (lowercase)
 CONTRACTIONS = {
     "ka a": "k'a",
     "be a": "b'a",
-    "ne be": "n'be",
-    "taa a": "ta'a",
-    "ko o": "k'o"
 }
-# Escape + longest-first (critical for correctness)
-PATTERN = re.compile(
-    r'(?<![^\W\d_])('
-    + '|'.join(sorted(map(re.escape, CONTRACTIONS), key=len, reverse=True))
-    + r')(?![^\W\d_])',
-    flags=re.IGNORECASE | re.UNICODE
-)
-SPACE_QUESTION_RE = re.compile(r'\s*\?')
-def normalize_output(text: str) -> str:
-    # 1) normalize spacing before ?
-    text = SPACE_QUESTION_RE.sub(' ?', text)
-    # 2) expand contractions
-    text = PATTERN.sub(lambda m: CONTRACTIONS[m.group(0).lower()], text)
-    # 3) capitalize first letter safely
     return text[:1].upper() + text[1:]
-#sentence = "a be ka a di ne ma ne be taa a fɛ?"
-#print(normalize_text(sentence))

 import re
# Contraction table applied to Bambara output.
# Keys are the expanded forms to look for; values are their contracted forms.
CONTRACTIONS = {
    # Multi-word contractions (keys are space-separated).
    "ka a": "k'a",
    "a be a": "a b'a",
    "be a": "b'a",
    "ko o": "k'o",
    "di i": "d'i",
    "be i": "b'i",
    # Keys without a space are treated as whole-word substitutions;
    # none are defined at the moment.
}

# Multi-word rules, compiled once at import time instead of on every call.
# Longest key first so "a be a" is tried before "be a".  No IGNORECASE flag
# is needed because the text is lower-cased before matching.  Rebuild these
# if CONTRACTIONS is ever modified at runtime.
_PHRASE_RULES = tuple(
    (re.compile(r'\b' + re.escape(phrase) + r'\b'), contracted)
    for phrase, contracted in sorted(
        ((key, value) for key, value in CONTRACTIONS.items() if ' ' in key),
        key=lambda item: len(item[0]),
        reverse=True,
    )
)
_WORD_RULES = {key: value for key, value in CONTRACTIONS.items() if ' ' not in key}
_WORD_RE = re.compile(r'\b\w+\b')


def normalize_bm_output(text: str) -> str:
    """
    Apply the contractions listed in CONTRACTIONS to a translated string.

    The whole text is lower-cased first, the multi-word phrases are replaced
    one rule at a time (longest phrase first), any single-word rules are then
    applied per word, and finally the first character is upper-cased.

    Args:
        text: Translated sentence(s) to normalise.

    Returns:
        The lower-cased, contracted text with its first character
        upper-cased.  An empty string is returned unchanged.
    """
    text = text.lower()

    # --- Part 1: multi-word contractions ---
    # Rules are applied sequentially, so a later rule sees the output of
    # the earlier ones.
    for pattern, contracted in _PHRASE_RULES:
        text = pattern.sub(contracted, text)

    # --- Part 2: single-word contractions ---
    # Skipped entirely while the table has no single-word keys.
    if _WORD_RULES:
        text = _WORD_RE.sub(
            lambda match: _WORD_RULES.get(match.group(0), match.group(0)),
            text,
        )

    return text[:1].upper() + text[1:]
+# --- Example Usage with both types of contractions ---
+#input_text_4 = "ka a di a be i fɛ kɔgɔ ne be a fɔ."
+#print(f"Original Text: {input_text_4}")
+#normalized_4 = normalize_bm_output(input_text_4)
+#print(f"Normalized Text: {normalized_4}\n")