Spaces:
Running
Running
myHugginfacePull
#1
by
Gaoussin
- opened
- main.py +31 -27
- normalize_bm_input.py +0 -80
- normalize_bm_output.py +0 -67
main.py
CHANGED
|
@@ -2,17 +2,9 @@ import os
|
|
| 2 |
import torch
|
| 3 |
from fastapi import FastAPI, HTTPException
|
| 4 |
from pydantic import BaseModel
|
| 5 |
-
|
| 6 |
# Note: Keep the imports together for clarity
|
| 7 |
-
from transformers import
|
| 8 |
-
|
| 9 |
-
AutoModelForSeq2SeqLM,
|
| 10 |
-
Seq2SeqTrainer,
|
| 11 |
-
Seq2SeqTrainingArguments,
|
| 12 |
-
DataCollatorForSeq2Seq,
|
| 13 |
-
)
|
| 14 |
-
from normalize_bm_input import normalize_bm_input
|
| 15 |
-
from normalize_bm_output import normalize_bm_output
|
| 16 |
|
| 17 |
# =====================
|
| 18 |
# 1️⃣ Environment / Cache
|
|
@@ -34,7 +26,7 @@ print(f"Using device: {device}")
|
|
| 34 |
# =====================
|
| 35 |
# Charger le modèle et le tokenizer NLLB
|
| 36 |
try:
|
| 37 |
-
model_name = "Gaoussin/
|
| 38 |
tokenizer = NllbTokenizer.from_pretrained(model_name)
|
| 39 |
# Move model to the selected device (CPU or GPU)
|
| 40 |
model = AutoModelForSeq2SeqLM.from_pretrained(model_name).to(device)
|
|
@@ -48,13 +40,20 @@ except Exception as e:
|
|
| 48 |
# =====================
|
| 49 |
app = FastAPI()
|
| 50 |
|
| 51 |
-
|
| 52 |
# Input schema
|
| 53 |
class TranslationRequest(BaseModel):
|
| 54 |
text: str
|
| 55 |
src_lang: str # e.g., "bam_Latn"
|
| 56 |
tgt_lang: str # e.g., "fra_Latn"
|
| 57 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 58 |
|
| 59 |
# =====================
|
| 60 |
# 5️⃣ Translation function - Restored to user's original logic
|
|
@@ -62,36 +61,41 @@ class TranslationRequest(BaseModel):
|
|
| 62 |
def translateTo(text, src, tgt):
|
| 63 |
tokenizer.src_lang = src
|
| 64 |
tokenizer.tgt_lang = tgt
|
| 65 |
-
print(
|
| 66 |
-
|
| 67 |
# Prepare input for the model
|
| 68 |
# We explicitly move the inputs to the same device as the model
|
| 69 |
inputs = tokenizer(text, return_tensors="pt").to(device)
|
| 70 |
-
|
| 71 |
# Generate translation using the user's logic
|
| 72 |
output = model.generate(**inputs, max_length=128)
|
| 73 |
-
|
| 74 |
# Decode the output
|
| 75 |
return tokenizer.decode(output[0], skip_special_tokens=True)
|
| 76 |
|
| 77 |
-
|
| 78 |
# =====================
|
| 79 |
# 6️⃣ API Endpoints - Applying the Response Model
|
| 80 |
# =====================
|
| 81 |
-
@app.post("/translate")
|
| 82 |
def translate(request: TranslationRequest):
|
| 83 |
try:
|
| 84 |
-
#
|
| 85 |
-
|
| 86 |
-
|
| 87 |
-
|
| 88 |
-
|
| 89 |
-
|
|
|
|
|
|
|
| 90 |
except Exception as e:
|
| 91 |
print(f"An error occurred during translation: {e}")
|
| 92 |
-
|
| 93 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 94 |
|
| 95 |
@app.get("/")
|
| 96 |
def root():
|
| 97 |
-
return {"message": "API is running 🚀"}
|
|
|
|
| 2 |
import torch
|
| 3 |
from fastapi import FastAPI, HTTPException
|
| 4 |
from pydantic import BaseModel
|
|
|
|
| 5 |
# Note: Keep the imports together for clarity
|
| 6 |
+
from transformers import NllbTokenizer, AutoModelForSeq2SeqLM, Seq2SeqTrainer, Seq2SeqTrainingArguments, DataCollatorForSeq2Seq
|
| 7 |
+
from normalize_bm_words import normalize_text
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 8 |
|
| 9 |
# =====================
|
| 10 |
# 1️⃣ Environment / Cache
|
|
|
|
| 26 |
# =====================
|
| 27 |
# Charger le modèle et le tokenizer NLLB
|
| 28 |
try:
|
| 29 |
+
model_name = "Gaoussin/bamalingua-4"
|
| 30 |
tokenizer = NllbTokenizer.from_pretrained(model_name)
|
| 31 |
# Move model to the selected device (CPU or GPU)
|
| 32 |
model = AutoModelForSeq2SeqLM.from_pretrained(model_name).to(device)
|
|
|
|
| 40 |
# =====================
|
| 41 |
app = FastAPI()
|
| 42 |
|
|
|
|
| 43 |
# Input schema
class TranslationRequest(BaseModel):
    """Request payload for the /translate endpoint.

    Language codes follow the NLLB / FLORES-200 convention,
    e.g. "bam_Latn" for Bambara and "fra_Latn" for French.
    """

    text: str
    src_lang: str  # e.g., "bam_Latn"
    tgt_lang: str  # e.g., "fra_Latn"
|
| 48 |
|
| 49 |
+
# Output schema: guarantees both fields appear in the response JSON.
class TranslationResponse(BaseModel):
    """Response payload for the /translate endpoint.

    Carries the translated text together with the application version
    identifier so clients can tell which deployment produced the result.
    """

    translation: str
    appVersionId: str
|
| 57 |
|
| 58 |
# =====================
|
| 59 |
# 5️⃣ Translation function - Restored to user's original logic
|
|
|
|
| 61 |
def translateTo(text, src, tgt):
    """Translate *text* from language *src* to *tgt*.

    Uses the module-level ``tokenizer``, ``model`` and ``device`` globals
    set up at import time. Returns the decoded translation string.
    """
    # Configure the tokenizer's language pair before encoding.
    tokenizer.src_lang = src
    tokenizer.tgt_lang = tgt
    print(tokenizer.src_lang, tokenizer.tgt_lang)

    # Encode the text and move the tensors onto the same device as the model.
    encoded = tokenizer(text, return_tensors="pt").to(device)

    # Generate the translation (default decoding, capped at 128 tokens).
    generated = model.generate(**encoded, max_length=128)

    # Drop special tokens and return the plain decoded string.
    return tokenizer.decode(generated[0], skip_special_tokens=True)
|
| 75 |
|
|
|
|
| 76 |
# =====================
|
| 77 |
# 6️⃣ API Endpoints - Applying the Response Model
|
| 78 |
# =====================
|
| 79 |
+
@app.post("/translate", response_model=TranslationResponse)
def translate(request: TranslationRequest):
    """POST /translate.

    Normalizes the incoming text, translates it with the NLLB model, and
    returns a payload matching TranslationResponse. On any failure a 500
    HTTPException is raised; raising it bypasses the response model and
    yields a standard JSON error body instead.
    """
    try:
        # Pre-process the raw text with the shared normalizer before it
        # reaches the model.
        text = normalize_text(request.text)
        result = translateTo(text, request.src_lang, request.tgt_lang)
        appVersionId = "App Version id = 2"

        # This dict must match the TranslationResponse schema.
        return {"translation": result, "appVersionId": appVersionId}

    except Exception as e:
        print(f"An error occurred during translation: {e}")
        # Chain the original exception (`from e`) so the real cause is
        # preserved in tracebacks/logs rather than being swallowed.
        raise HTTPException(
            status_code=500,
            detail=f"Translation failed: {str(e)}"
        ) from e
|
| 98 |
|
| 99 |
@app.get("/")
def root():
    """Health-check endpoint confirming the service is up."""
    return {"message": "API is running 🚀"}
|
normalize_bm_input.py
DELETED
|
@@ -1,80 +0,0 @@
|
|
| 1 |
-
import re

# De-contraction dictionary.
# Keys are the contracted surface forms (what gets replaced).
# Values are the expanded forms (what they are replaced with).
DE_CONTRACTIONS = {
    # Keys with apostrophes/special characters for multi-word expansion
    "k'a": "ka a",
    "a b'a": "a be a",
    "n'be": "ne be",
    "n'b'a": "ne be a",
    "b'a": "be a",
    "k'o": "ko o",  # Corrected key-value based on original request
    "b'i": "be i",
    "k'i": "ka i",
    "k'aw": "ka aw",

    # Single-word keys (no apostrophe) for single-word expansion
    "kɔkɔ": "kɔgɔ",
    "bɛ": "be"
}

# Hoisted invariants: the dictionary never changes at runtime, so the
# filtering, sorting and regex compilation below are done once at import
# time instead of on every call.
#
# Multi-word expansions (value contains a space), sorted with the longest
# contracted form first. This is CRUCIAL so longer forms (e.g. "a b'a")
# match before shorter ones contained within them. Patterns use word
# boundaries so "b'a" is not matched inside "b'adi".
_MULTI_WORD_PATTERNS = [
    (re.compile(r'\b' + re.escape(contracted) + r'\b'), expanded)
    for contracted, expanded in sorted(
        ((k, v) for k, v in DE_CONTRACTIONS.items() if ' ' in v),
        key=lambda item: len(item[0]),
        reverse=True,
    )
]

# Single-word expansions (value has no space), e.g. 'kɔkɔ' -> 'kɔgɔ'.
_SINGLE_WORD_EXPANSIONS = {k: v for k, v in DE_CONTRACTIONS.items() if ' ' not in v}

# Matches every whole "word" (a run of non-space characters bounded by
# word boundaries); used to drive the single-word replacement pass.
_WORD_RE = re.compile(r'\b\S+\b')


def normalize_bm_input(text: str) -> str:
    """De-contract (expand) contracted forms in *text*.

    The text is lowercased for consistent matching, multi-word expansions
    are applied first (longest contraction wins), then single-word
    spelling fixes, and finally the first character of the result is
    uppercased for presentation.
    """
    # 1. Lowercase for consistent matching against the dictionary keys.
    text = text.lower()

    # --- Part 1: multi-word expansions (precompiled, longest-first) ---
    for pattern, expanded_phrase in _MULTI_WORD_PATTERNS:
        text = pattern.sub(expanded_phrase, text)

    # --- Part 2: single-word expansions (e.g. 'kɔkɔ' -> 'kɔgɔ') ---
    def _replace_single_word(match):
        """Return the single-word expansion for the matched word, if any."""
        word = match.group(0)
        # .get() leaves words absent from the dictionary untouched.
        return _SINGLE_WORD_EXPANSIONS.get(word, word)

    text = _WORD_RE.sub(_replace_single_word, text)

    # 2. Capitalize only the first character of the result for presentation.
    return text[:1].upper() + text[1:]


# --- Example Usage ---

#input_text_4 = "k'a di a b'i fɛ kɔkɔ n'b'a fɔ. Bɛ jɛ."

#print(f"Original Text: {input_text_4}")
#normalized_4 = normalize_bm_input(input_text_4)
#print(f"Normalized Text: {normalized_4}\n")

# Expected Output: Ka a di a be i fɛ kɔgɔ ne be a fɔ. Be jɛ.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
normalize_bm_output.py
DELETED
|
@@ -1,67 +0,0 @@
|
|
| 1 |
-
import re

# Contraction table: space-separated phrases map to their apostrophe forms.
CONTRACTIONS = {
    # Multi-word contractions (keys are space-separated)
    "ka a": "k'a",
    "a be a": "a b'a",
    "be a": "b'a",
    "ko o": "k'o",
    "di i": "d'i",
    "be i": "b'i"
    # Example single-word contraction (currently disabled):
    #"kaa": "k'aa"
}


def normalize_bm_output(text: str) -> str:
    """Contract multi-word phrases (and any single-word entries) in *text*
    according to CONTRACTIONS, then capitalize the first character.
    """
    # Work on lowercase text so matching is consistent.
    text = text.lower()

    # --- Phase 1: multi-word phrases ---
    # Longest key first, so "a be a" is contracted before the shorter
    # "be a" it contains.
    phrase_entries = [(k, v) for k, v in CONTRACTIONS.items() if ' ' in k]
    phrase_entries.sort(key=lambda kv: len(kv[0]), reverse=True)

    for phrase, contracted in phrase_entries:
        # Word boundaries prevent partial matches inside longer words;
        # re.escape guards any special characters in the key.
        text = re.sub(r'\b' + re.escape(phrase) + r'\b', contracted, text,
                      flags=re.IGNORECASE)

    # --- Phase 2: single-word entries (none are defined at present) ---
    word_entries = {k: v for k, v in CONTRACTIONS.items() if ' ' not in k}

    def _contract_word(match):
        """Map a matched word through the single-word table, else keep it."""
        token = match.group(0)
        return word_entries.get(token, token)

    # r'\b\w+\b' visits every whole word; unknown words pass through.
    text = re.sub(r'\b\w+\b', _contract_word, text)

    # Presentation: uppercase only the leading character.
    return text[:1].upper() + text[1:]


# --- Example Usage with both types of contractions ---

#input_text_4 = "ka a di a be i fɛ kɔgɔ ne be a fɔ."

#print(f"Original Text: {input_text_4}")
#normalized_4 = normalize_bm_output(input_text_4)
#print(f"Normalized Text: {normalized_4}\n")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|