Spaces:

Gaoussin
/

bm-translator

Running

App Files Community

Gaoussin committed on Oct 26, 2025

Commit

8d78e99

verified ·

1 Parent(s): bd57210

Update main.py

Browse files

Files changed (1) hide show

main.py +53 -63

main.py CHANGED Viewed

@@ -1,88 +1,78 @@
 import os
-# 2️⃣ Optional: force cache to writable directory
 os.environ["HF_HOME"] = "/tmp/hf"
 os.environ["TRANSFORMERS_CACHE"] = "/tmp/hf"
 os.environ["HF_DATASETS_CACHE"] = "/tmp/hf"
 os.makedirs("/tmp/hf", exist_ok=True)
-from fastapi import FastAPI
-from pydantic import BaseModel
-#from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
-#from replacer import replace_words, replace_dict
-#from datasets import Dataset
-from transformers import MBartForConditionalGeneration, MBart50Tokenizer, DataCollatorForSeq2Seq, Seq2SeqTrainer, Seq2SeqTrainingArguments
-# -------------------------
-# 1️⃣ Get your HF token from Space Secrets
-# In your Space, go to Settings → Secrets → add HF_TOKEN
-#HF_TOKEN = os.environ.get("HF_TOKEN")
-#if HF_TOKEN is None:
-#    raise ValueError("HF_TOKEN not found. Please add it in your Space Secrets.")
-# -------------------------
-# -------------------------
-# 3️⃣ Load private model
-model_name = "Gaoussin/bamalingua-bm_ml-fr_XX"
-model = MBartForConditionalGeneration.from_pretrained(model_name)
-tokenizer = MBart50Tokenizer.from_pretrained("facebook/mbart-large-50")
 #####
-def fix_tokenizer(tokenizer, new_lang='bm_ml'):
-    """
-    Add a new language token to the tokenizer vocabulary
-    (this should be done each time after its initialization)
-    """
-    # Check if the language token already exists
-    if new_lang not in tokenizer.lang_code_to_id:
-        # Add the new language as an additional special token
-        tokenizer.add_special_tokens({'additional_special_tokens': [new_lang]})
-        # Update the internal language code mappings
-        # Note: This is a workaround as MBart50Tokenizer doesn't have a direct way to add lang codes
-        # The new token will be added at the end of the vocabulary
-        new_id = len(tokenizer) - 1
-        tokenizer.lang_code_to_id[new_lang] = new_id
-        tokenizer.id_to_lang_code[new_id] = new_lang
-        print(f"Added new language token '{new_lang}' with ID {new_id}")
-    else:
-        print(f"Language token '{new_lang}' already exists in tokenizer.")
-fix_tokenizer(tokenizer, new_lang='bm_ml')
-model.resize_token_embeddings(len(tokenizer))
-print("model resized")
-######
-tgt_lang = "bm_ml"
-# -------------------------
-# 4️⃣ FastAPI app
 app = FastAPI()
 class TranslationRequest(BaseModel):
     text: str
 @app.post("/translate")
 def translate(request: TranslationRequest):
-    #reverse_dict = {v: k for k, v in replace_dict.items()}
-    #text_for_ai = replace_words(request.text, reverse_dict)
-    inputs = tokenizer(
-    request.text,
-    return_tensors="pt",
-    max_length=128,
-    truncation=True)
-    outputs = model.generate(
-    **inputs,
-    forced_bos_token_id=tokenizer.lang_code_to_id[tgt_lang])
-    text2 = tokenizer.batch_decode(outputs, skip_special_tokens=True)[0]
-    #text_for_user = replace_words(text2, replace_dict)
-    return {"translation": text2[0].upper() + text2[1:]}
 @app.get("/")
 def root():
-    return {"message": "API is running"}

 import os
+import torch
+from fastapi import FastAPI
+from pydantic import BaseModel
+from transformers import MBartForConditionalGeneration, MBart50TokenizerFast
+# 1️⃣ Cache (optional)
 os.environ["HF_HOME"] = "/tmp/hf"
 os.environ["TRANSFORMERS_CACHE"] = "/tmp/hf"
 os.environ["HF_DATASETS_CACHE"] = "/tmp/hf"
 os.makedirs("/tmp/hf", exist_ok=True)
+# 2️⃣ HF TOKEN
+HF_TOKEN = os.environ.get("HF_TOKEN")
+if HF_TOKEN is None:
+    raise ValueError("HF_TOKEN not found. Please add it in your Space Secrets.")
+# 3️⃣ DEVICE
+device = "cuda" if torch.cuda.is_available() else "cpu"
+# 4️⃣ Load model + tokenizer (PRIVATE REPO)
+#model_name = "Gaoussin/bamalingua-bm-fr"
+#tokenizer = MBart50TokenizerFast.from_pretrained(model_name, token=HF_TOKEN)
+#model = MBartForConditionalGeneration.from_pretrained(model_name, token=HF_TOKEN).to(device)
+####
+# 3. Load tokenizer & add Bambara token
+# ========================================
+model_name = "my_tokenizer"
+# Load the tokenizer with a default language and suppress the error
+try:
+    tokenizer = MBart50Tokenizer.from_pretrained(model_name, src_lang="en_XX")
+except KeyError:
+    # If loading with en_XX fails, try without specifying src_lang and fix afterwards
+    tokenizer = MBart50Tokenizer.from_pretrained(model_name)
+# Add the new language as an additional special token and update mappings
+new_lang = 'bm_ml'
+if new_lang not in tokenizer.lang_code_to_id:
+    tokenizer.add_special_tokens({'additional_special_tokens': [new_lang]})
+    # Update the internal language code mappings
+    new_id = len(tokenizer) - 1
+    tokenizer.lang_code_to_id[new_lang] = new_id
+    tokenizer.id_to_lang_code[new_id] = new_lang
+    print(f"Added new language token '{new_lang}' with ID {new_id}")
+else:
+    print(f"Language token '{new_lang}' already exists in tokenizer.")
+# Load model
+model = MBartForConditionalGeneration.from_pretrained("Gaoussin/bamalingua-bm_ml-fr_XX")
+model.resize_token_embeddings(len(tokenizer))
 #####
+# 5️⃣ Translation function
+def translateTo(text, src_lang, tgt_lang):
+    tokenizer.src_lang = src_lang
+    inputs = tokenizer(text, return_tensors="pt").to(device)
+    tgt_id = tokenizer.lang_code_to_id[tgt_lang]
+    generated = model.generate(**inputs, forced_bos_token_id=tgt_id)
+    return tokenizer.decode(generated[0], skip_special_tokens=True)
+# 6️⃣ FastAPI
 app = FastAPI()
 class TranslationRequest(BaseModel):
     text: str
+    src_lang: str
+    tgt_lang: str
 @app.post("/translate")
 def translate(request: TranslationRequest):
+    output = translateTo(request.text, request.src_lang, request.tgt_lang)
+    return {"translation": output}
 @app.get("/")
 def root():
+    return {"message": "API is running ✅"}