File size: 3,398 Bytes
924f487
c3a047c
924f487
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
c3a047c
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
import yaml
from transformers import MarianMTModel, MarianTokenizer, pipeline

# ---------------- Load config ----------------
CONFIG_FILE = "config.yaml"

def load_config():
    with open(CONFIG_FILE, "r", encoding="utf-8") as f:
        return yaml.safe_load(f)

CONFIG = load_config()

# ---------------- Marian models ----------------
MARIAN_MODELS = {
    "yoruba": {
        "to_en": "Helsinki-NLP/opus-mt-yo-en",
        "to_lang": "Helsinki-NLP/opus-mt-en-yo"
    },
    "hausa": {
        "to_en": "Helsinki-NLP/opus-mt-ha-en",
        "to_lang": "Helsinki-NLP/opus-mt-en-ha"
    },
    "igbo": {
        "to_en": "Helsinki-NLP/opus-mt-ig-en",
        "to_lang": "Helsinki-NLP/opus-mt-en-ig"
    },
    "pidgin": {
        "to_en": "Helsinki-NLP/opus-mt-pcm-en",
        "to_lang": "Helsinki-NLP/opus-mt-en-pcm"
    },
}

# ---------------- Fallback dictionary ----------------
CUSTOM_DICT = {
    "esan": {"Koyo": "Greetings", "Wa gié": "Come here"},
    "tiv": {"M sugh u": "Good morning", "M gbee": "I am fine"},
    "calabar": {"Nsidibe": "Welcome", "Abadie": "How are you?"},
    "benin": {"Oba gha to kpere": "Long live the king", "Koyo": "Greetings"},
    "pidgin": {
        "How far": "How are you?",
        "Wetin dey happen": "What’s going on?",
        "Omo": "Kid / person (informal)",
    },
}


class Translator:
    def __init__(self, n2n_enabled=False):
        self.n2n_enabled = n2n_enabled

    def translate_with_marian(self, model_name, text):
        tokenizer = MarianTokenizer.from_pretrained(model_name)
        model = MarianMTModel.from_pretrained(model_name)
        inputs = tokenizer(text, return_tensors="pt")
        translated = model.generate(**inputs)
        return tokenizer.decode(translated[0], skip_special_tokens=True)

    def translate(self, text, input_lang, output_lang):
        if not text.strip():
            return ""

        # ---- Marian supported ----
        if input_lang in MARIAN_MODELS:
            if output_lang == "english":
                return self.translate_with_marian(MARIAN_MODELS[input_lang]["to_en"], text)
            elif output_lang == input_lang:
                return text
            elif output_lang in MARIAN_MODELS:
                if self.n2n_enabled:
                    en_text = self.translate_with_marian(MARIAN_MODELS[input_lang]["to_en"], text)
                    return self.translate_with_marian(MARIAN_MODELS[output_lang]["to_lang"], en_text)
                else:
                    return f"(🚧 Nigerian↔Nigerian to {output_lang} not enabled)"
        # ---- Dictionary fallback ----
        else:
            if output_lang == "english":
                return CUSTOM_DICT.get(input_lang, {}).get(text, text)
            elif output_lang == input_lang:
                return text
            else:
                return f"(⚠️ Dictionary doesn't support {input_lang}{output_lang})"



class CustomTranslator:
    def __init__(self, model_dir="./training/outputs/model"):
        self.tokenizer = MarianTokenizer.from_pretrained(model_dir)
        self.model = MarianMTModel.from_pretrained(model_dir)
        self.pipeline = pipeline("translation", model=self.model, tokenizer=self.tokenizer)

    def translate(self, text):
        return self.pipeline(text)[0]["translation_text"]