# translator.py
"""Text translation helpers built on the NLLB-200 distilled 600M checkpoint.

Importing this module loads the tokenizer and the model once and keeps them
as module-level globals (``tokenizer`` and ``model``).  Two entry points are
provided:

* ``standard_translator`` - the caller names both source and target language.
* ``smart_auto_translator`` - the source language is guessed with
  ``langdetect`` and reported back alongside the translation.
"""

import torch
from langdetect import detect
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

MODEL_NAME = "facebook/nllb-200-distilled-600M"

# Code returned by get_nllb_code() when a language cannot be resolved.
_FALLBACK_CODE = "eng_Latn"

# Upper bound passed to model.generate() as max_length.
_MAX_GENERATION_LENGTH = 512

# ===============================
# LANGUAGE MAP
# ===============================

# Human-readable language name (lower case) -> NLLB-200 language code.
LANGUAGE_MAP = {
    # ===== EXISTING =====
    "english": "eng_Latn",
    "french": "fra_Latn",
    "german": "deu_Latn",
    "spanish": "spa_Latn",
    "hindi": "hin_Deva",
    "marathi": "mar_Deva",
    "tamil": "tam_Taml",
    "telugu": "tel_Telu",
    "kannada": "kan_Knda",
    "bengali": "ben_Beng",
    "urdu": "urd_Arab",
    "arabic": "arb_Arab",
    "persian": "pes_Arab",
    "japanese": "jpn_Jpan",
    "chinese": "zho_Hans",
    "korean": "kor_Hang",
    "russian": "rus_Cyrl",
    "italian": "ita_Latn",
    "portuguese": "por_Latn",
    "dutch": "nld_Latn",
    "swedish": "swe_Latn",
    "norwegian": "nob_Latn",
    "danish": "dan_Latn",
    "finnish": "fin_Latn",
    "polish": "pol_Latn",
    "czech": "ces_Latn",
    "slovak": "slk_Latn",
    "hungarian": "hun_Latn",
    "romanian": "ron_Latn",
    "bulgarian": "bul_Cyrl",
    "ukrainian": "ukr_Cyrl",
    "greek": "ell_Grek",
    "gujarati": "guj_Gujr",
    "punjabi": "pan_Guru",
    "malayalam": "mal_Mlym",
    "thai": "tha_Thai",
    "vietnamese": "vie_Latn",
    "indonesian": "ind_Latn",
    "turkish": "tur_Latn",
    "hebrew": "heb_Hebr",
    # ===== KURDISH (IMPORTANT) =====
    "kurdish_kurmanji": "kmr_Latn",  # Northern Kurdish (Latin)
    "kurdish_sorani": "ckb_Arab",  # Central Kurdish (Arabic)
    # ===== SOUTH ASIAN =====
    "nepali": "npi_Deva",
    "sinhala": "sin_Sinh",
    "odia": "ory_Orya",
    "assamese": "asm_Beng",
    "maithili": "mai_Deva",
    # The NLLB-200 checkpoints register Santali as sat_Beng; sat_Olck is the
    # later FLORES-200 spelling and is not a token the model knows.
    "santali": "sat_Beng",
    # ===== SOUTHEAST ASIA =====
    "malay": "zsm_Latn",
    "filipino": "tgl_Latn",
    "khmer": "khm_Khmr",
    "lao": "lao_Laoo",
    "burmese": "mya_Mymr",
    # ===== EAST ASIA =====
    "traditional_chinese": "zho_Hant",
    "mongolian": "khk_Cyrl",  # Halh Mongolian; NLLB has no mon_Cyrl
    # ===== CENTRAL ASIA =====
    "kazakh": "kaz_Cyrl",
    "uzbek": "uzn_Latn",
    "tajik": "tgk_Cyrl",
    "kyrgyz": "kir_Cyrl",
    "turkmen": "tuk_Latn",
    # ===== MIDDLE EAST =====
    "pashto": "pbt_Arab",
    "sindhi": "snd_Arab",
    # ===== AFRICAN =====
    "swahili": "swh_Latn",
    "amharic": "amh_Ethi",
    "yoruba": "yor_Latn",
    "igbo": "ibo_Latn",
    "hausa": "hau_Latn",
    "zulu": "zul_Latn",
    "xhosa": "xho_Latn",
    "somali": "som_Latn",
    "afrikaans": "afr_Latn",
    # ===== EUROPE EXTRA =====
    "estonian": "est_Latn",
    "latvian": "lvs_Latn",  # Standard Latvian; NLLB has no lav_Latn
    "lithuanian": "lit_Latn",
    "icelandic": "isl_Latn",
    "irish": "gle_Latn",
    "welsh": "cym_Latn",
    "albanian": "als_Latn",  # Tosk Albanian; NLLB has no sqi_Latn
    "serbian": "srp_Cyrl",
    "croatian": "hrv_Latn",
    "slovenian": "slv_Latn",
    "catalan": "cat_Latn",
    "macedonian": "mkd_Cyrl",
    # ===== OTHERS =====
    # NOTE(review): lat_Latn does not appear in the NLLB-200 language list;
    # confirm whether Latin should be offered at all.
    "latin": "lat_Latn",
    "esperanto": "epo_Latn",
}

# Language code (as typed by callers or returned by langdetect) -> key of
# LANGUAGE_MAP.
ISO_TO_LANGUAGE_KEY = {
    # ===== EXISTING =====
    "en": "english",
    "fr": "french",
    "de": "german",
    "es": "spanish",
    "hi": "hindi",
    "mr": "marathi",
    "ta": "tamil",
    "te": "telugu",
    "kn": "kannada",
    "bn": "bengali",
    "ur": "urdu",
    "ar": "arabic",
    "fa": "persian",
    "ja": "japanese",
    "zh": "chinese",
    # langdetect reports Chinese as zh-cn / zh-tw, never as bare "zh".
    "zh-cn": "chinese",
    "zh-tw": "traditional_chinese",
    "ko": "korean",
    "ru": "russian",
    "it": "italian",
    "pt": "portuguese",
    "nl": "dutch",
    "sv": "swedish",
    "no": "norwegian",
    "da": "danish",
    "fi": "finnish",
    "pl": "polish",
    "cs": "czech",
    "sk": "slovak",
    "hu": "hungarian",
    "ro": "romanian",
    "bg": "bulgarian",
    "uk": "ukrainian",
    "el": "greek",
    "gu": "gujarati",
    "pa": "punjabi",
    "ml": "malayalam",
    "th": "thai",
    "vi": "vietnamese",
    "id": "indonesian",
    "tr": "turkish",
    "he": "hebrew",
    # ===== KURDISH =====
    "ku": "kurdish_kurmanji",  # default kurdish
    "ckb": "kurdish_sorani",
    # ===== SOUTH ASIAN =====
    "ne": "nepali",
    "si": "sinhala",
    "or": "odia",
    "as": "assamese",
    "mai": "maithili",
    "sat": "santali",
    # ===== SOUTHEAST ASIA =====
    "ms": "malay",
    "tl": "filipino",
    "km": "khmer",
    "lo": "lao",
    "my": "burmese",
    # ===== EAST ASIA =====
    "mn": "mongolian",
    # ===== CENTRAL ASIA =====
    "kk": "kazakh",
    "uz": "uzbek",
    "tg": "tajik",
    "ky": "kyrgyz",
    "tk": "turkmen",
    # ===== MIDDLE EAST =====
    "ps": "pashto",
    "sd": "sindhi",
    # ===== AFRICAN =====
    "sw": "swahili",
    "am": "amharic",
    "yo": "yoruba",
    "ig": "igbo",
    "ha": "hausa",
    "zu": "zulu",
    "xh": "xhosa",
    "so": "somali",
    "af": "afrikaans",
    # ===== EUROPE EXTRA =====
    "et": "estonian",
    "lv": "latvian",
    "lt": "lithuanian",
    "is": "icelandic",
    "ga": "irish",
    "cy": "welsh",
    "sq": "albanian",
    "sr": "serbian",
    "hr": "croatian",
    "sl": "slovenian",
    "ca": "catalan",
    "mk": "macedonian",
    # ===== OTHERS =====
    "la": "latin",
    "eo": "esperanto",
}

# ===============================
# LOAD MODEL ONCE
# ===============================

print("Loading translation model...")

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)

print("Model loaded successfully.")


# ===============================
# HELPERS
# ===============================

def get_nllb_code(lang: str) -> str:
    """Resolve a language name or language code to an NLLB language code.

    Args:
        lang: Either a key of ``LANGUAGE_MAP`` (e.g. ``"french"``) or a key
            of ``ISO_TO_LANGUAGE_KEY`` (e.g. ``"fr"``).  Case and surrounding
            whitespace are ignored.

    Returns:
        The matching NLLB code, or ``"eng_Latn"`` when ``lang`` is not known.
    """
    key = lang.lower().strip()

    # Full language names take precedence over short codes.
    if key in LANGUAGE_MAP:
        return LANGUAGE_MAP[key]

    language_key = ISO_TO_LANGUAGE_KEY.get(key)
    return LANGUAGE_MAP.get(language_key, _FALLBACK_CODE)


def _translate_with_codes(text: str, src_code: str, tgt_code: str) -> str:
    """Run one tokenize/generate/decode pass between two NLLB codes."""
    # Nothing to translate; skip the model call instead of decoding whatever
    # it would generate for an empty prompt.
    if not text.strip():
        return ""

    # The tokenizer is shared module state: the source language has to be set
    # on it before the text is encoded.
    tokenizer.src_lang = src_code
    inputs = tokenizer(text, return_tensors="pt")

    with torch.no_grad():
        output = model.generate(
            **inputs,
            # Force the first generated token to be the target language tag.
            forced_bos_token_id=tokenizer.convert_tokens_to_ids(tgt_code),
            max_length=_MAX_GENERATION_LENGTH,
        )

    return tokenizer.decode(output[0], skip_special_tokens=True)


# ===============================
# MANUAL TRANSLATOR
# ===============================

def standard_translator(text: str, src_lang: str, tgt_lang: str) -> str:
    """Translate ``text`` from ``src_lang`` into ``tgt_lang``.

    Args:
        text: The text to translate.
        src_lang: Source language, as a name or code accepted by
            ``get_nllb_code``.
        tgt_lang: Target language, as a name or code accepted by
            ``get_nllb_code``.

    Returns:
        The translated text with special tokens removed, or ``""`` when
        ``text`` is empty or whitespace only.  Languages that cannot be
        resolved are treated as English (``get_nllb_code`` fallback).
    """
    return _translate_with_codes(
        text,
        get_nllb_code(src_lang),
        get_nllb_code(tgt_lang),
    )


# ===============================
# AUTO TRANSLATOR
# ===============================

def smart_auto_translator(text: str, target_lang: str) -> dict:
    """Detect the language of ``text`` and translate it into ``target_lang``.

    Args:
        text: The text to translate.
        target_lang: Target language, as a name or code accepted by
            ``get_nllb_code``.

    Returns:
        A dict with three keys:

        * ``"translated_text"`` - the translation.
        * ``"detected_iso"`` - the code returned by ``langdetect.detect``.
        * ``"detected_language"`` - the ``LANGUAGE_MAP`` key for that code,
          or the raw code when it is not in ``ISO_TO_LANGUAGE_KEY``.

    Raises:
        Whatever ``langdetect.detect`` raises for text it cannot classify;
        the exception is propagated unchanged.
    """
    detected_iso = detect(text)
    detected_lang_key = ISO_TO_LANGUAGE_KEY.get(detected_iso, detected_iso)

    translated_text = _translate_with_codes(
        text,
        get_nllb_code(detected_iso),
        get_nllb_code(target_lang),
    )

    return {
        "translated_text": translated_text,
        "detected_iso": detected_iso,
        "detected_language": detected_lang_key,
    }