"""Translation helpers for Nigerian languages.

Combines pretrained MarianMT checkpoints (for languages listed in
``MARIAN_MODELS``) with a small exact-phrase dictionary fallback
(``CUSTOM_DICT``) for languages that have no Marian checkpoint configured.
"""

from functools import lru_cache

import yaml
from transformers import MarianMTModel, MarianTokenizer, pipeline

# ---------------- Load config ----------------
CONFIG_FILE = "config.yaml"


def load_config():
    """Parse ``CONFIG_FILE`` as YAML and return its contents.

    Returns:
        The parsed document, or an empty dict when the file is empty
        (``yaml.safe_load`` yields ``None`` for an empty document).

    Raises:
        OSError: If ``CONFIG_FILE`` cannot be opened.
        yaml.YAMLError: If the file is not valid YAML.
    """
    with open(CONFIG_FILE, "r", encoding="utf-8") as f:
        return yaml.safe_load(f) or {}


# NOTE: evaluated at import time, so importing this module requires
# ``config.yaml`` to be readable from the current working directory.
CONFIG = load_config()

# ---------------- Marian models ----------------
# Per language: "to_en" is the language -> English checkpoint,
# "to_lang" is the English -> language checkpoint.
MARIAN_MODELS = {
    "yoruba": {
        "to_en": "Helsinki-NLP/opus-mt-yo-en",
        "to_lang": "Helsinki-NLP/opus-mt-en-yo",
    },
    "hausa": {
        "to_en": "Helsinki-NLP/opus-mt-ha-en",
        "to_lang": "Helsinki-NLP/opus-mt-en-ha",
    },
    "igbo": {
        "to_en": "Helsinki-NLP/opus-mt-ig-en",
        "to_lang": "Helsinki-NLP/opus-mt-en-ig",
    },
    "pidgin": {
        "to_en": "Helsinki-NLP/opus-mt-pcm-en",
        "to_lang": "Helsinki-NLP/opus-mt-en-pcm",
    },
}

# ---------------- Fallback dictionary ----------------
# Exact-phrase -> English lookups for languages without a Marian checkpoint.
# NOTE: "pidgin" is also a key of MARIAN_MODELS, so Translator.translate()
# always takes the Marian path for it and never consults this pidgin entry.
CUSTOM_DICT = {
    "esan": {"Koyo": "Greetings", "Wa gié": "Come here"},
    "tiv": {"M sugh u": "Good morning", "M gbee": "I am fine"},
    "calabar": {"Nsidibe": "Welcome", "Abadie": "How are you?"},
    "benin": {"Oba gha to kpere": "Long live the king", "Koyo": "Greetings"},
    "pidgin": {
        "How far": "How are you?",
        "Wetin dey happen": "What’s going on?",
        "Omo": "Kid / person (informal)",
    },
}

# Upper bound on simultaneously cached (tokenizer, model) pairs; one
# Nigerian<->Nigerian request needs two of them.
_MODEL_CACHE_SIZE = 4


@lru_cache(maxsize=_MODEL_CACHE_SIZE)
def _load_marian(model_name):
    """Load and memoize the tokenizer/model pair for *model_name*."""
    tokenizer = MarianTokenizer.from_pretrained(model_name)
    model = MarianMTModel.from_pretrained(model_name)
    return tokenizer, model


class Translator:
    """Translate between English and the configured Nigerian languages.

    Args:
        n2n_enabled: When true, Nigerian -> Nigerian requests are served by
            pivoting through English; otherwise a notice string is returned.
    """

    def __init__(self, n2n_enabled=False):
        self.n2n_enabled = n2n_enabled

    def translate_with_marian(self, model_name, text):
        """Translate *text* with the Marian checkpoint *model_name*.

        The tokenizer and model are cached per checkpoint name so repeated
        calls do not reload them.

        Returns:
            The decoded first generated sequence, special tokens removed.
        """
        tokenizer, model = _load_marian(model_name)
        inputs = tokenizer(text, return_tensors="pt")
        translated = model.generate(**inputs)
        return tokenizer.decode(translated[0], skip_special_tokens=True)

    def translate(self, text, input_lang, output_lang):
        """Translate *text* from *input_lang* to *output_lang*.

        Args:
            text: Source text; ``None`` or whitespace-only yields ``""``.
            input_lang: Source language name (e.g. ``"yoruba"``, ``"english"``).
            output_lang: Target language name.

        Returns:
            The translated string, the input unchanged when both languages
            are equal or a dictionary lookup misses, or a parenthesised
            notice string when the language pair is unsupported or disabled.
        """
        if not text or not text.strip():
            return ""

        # Identity translation never needs a model or a lookup.
        if output_lang == input_lang:
            return text

        # ---- Marian supported ----
        if input_lang in MARIAN_MODELS:
            if output_lang == "english":
                return self.translate_with_marian(
                    MARIAN_MODELS[input_lang]["to_en"], text
                )
            if output_lang in MARIAN_MODELS:
                if not self.n2n_enabled:
                    return f"(🚧 Nigerian↔Nigerian to {output_lang} not enabled)"
                # No direct checkpoints between these languages: pivot via English.
                en_text = self.translate_with_marian(
                    MARIAN_MODELS[input_lang]["to_en"], text
                )
                return self.translate_with_marian(
                    MARIAN_MODELS[output_lang]["to_lang"], en_text
                )
            # Previously this case fell through and returned None.
            return f"(⚠️ Translation {input_lang}→{output_lang} is not supported)"

        # English -> Marian language: use the configured "to_lang" checkpoint.
        if input_lang == "english" and output_lang in MARIAN_MODELS:
            return self.translate_with_marian(
                MARIAN_MODELS[output_lang]["to_lang"], text
            )

        # ---- Dictionary fallback ----
        if output_lang == "english":
            # Exact-phrase lookup, ignoring surrounding whitespace; unknown
            # phrases (or unknown languages) are returned unchanged.
            return CUSTOM_DICT.get(input_lang, {}).get(text.strip(), text)
        return f"(⚠️ Dictionary doesn't support {input_lang}→{output_lang})"


class CustomTranslator:
    """Translate with a locally stored Marian checkpoint.

    Args:
        model_dir: Directory passed to ``from_pretrained`` for both the
            tokenizer and the model.
    """

    def __init__(self, model_dir="./training/outputs/model"):
        self.tokenizer = MarianTokenizer.from_pretrained(model_dir)
        self.model = MarianMTModel.from_pretrained(model_dir)
        self.pipeline = pipeline(
            "translation", model=self.model, tokenizer=self.tokenizer
        )

    def translate(self, text):
        """Return the pipeline's translation of *text* (``""`` for blank input)."""
        # Same blank-input contract as Translator.translate.
        if not text or not text.strip():
            return ""
        return self.pipeline(text)[0]["translation_text"]