cortex-translator / translator.py
junaid17's picture
Upload 4 files
7a60e77 verified
# translator.py
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import torch
from langdetect import detect
# Checkpoint identifier handed to `from_pretrained` for both the tokenizer
# and the seq2seq model created at import time further down in this module.
MODEL_NAME = "facebook/nllb-200-distilled-600M"
# ===============================
# LANGUAGE MAP
# ===============================
# Maps lower-case language names (the spelling callers pass in) to the
# language-code tokens understood by the NLLB tokenizer. Every value is later
# used as `tokenizer.src_lang` or converted to a forced BOS token id, so a
# code that the checkpoint does not know silently degrades to the unknown
# token instead of raising.
LANGUAGE_MAP = {
    # ===== EXISTING =====
    "english": "eng_Latn", "french": "fra_Latn", "german": "deu_Latn", "spanish": "spa_Latn",
    "hindi": "hin_Deva", "marathi": "mar_Deva", "tamil": "tam_Taml", "telugu": "tel_Telu",
    "kannada": "kan_Knda", "bengali": "ben_Beng", "urdu": "urd_Arab", "arabic": "arb_Arab",
    "persian": "pes_Arab", "japanese": "jpn_Jpan", "chinese": "zho_Hans", "korean": "kor_Hang",
    "russian": "rus_Cyrl", "italian": "ita_Latn", "portuguese": "por_Latn", "dutch": "nld_Latn",
    "swedish": "swe_Latn", "norwegian": "nob_Latn", "danish": "dan_Latn", "finnish": "fin_Latn",
    "polish": "pol_Latn", "czech": "ces_Latn", "slovak": "slk_Latn", "hungarian": "hun_Latn",
    "romanian": "ron_Latn", "bulgarian": "bul_Cyrl", "ukrainian": "ukr_Cyrl", "greek": "ell_Grek",
    "gujarati": "guj_Gujr", "punjabi": "pan_Guru", "malayalam": "mal_Mlym",
    "thai": "tha_Thai", "vietnamese": "vie_Latn", "indonesian": "ind_Latn",
    "turkish": "tur_Latn", "hebrew": "heb_Hebr",
    # ===== KURDISH (IMPORTANT) =====
    "kurdish_kurmanji": "kmr_Latn",  # Northern Kurdish (Latin)
    "kurdish_sorani": "ckb_Arab",  # Central Kurdish (Arabic)
    # ===== SOUTH ASIAN =====
    "nepali": "npi_Deva",
    "sinhala": "sin_Sinh",
    "odia": "ory_Orya",
    "assamese": "asm_Beng",
    "maithili": "mai_Deva",
    # The NLLB-200 checkpoint tags Santali as sat_Beng; sat_Olck is the
    # FLORES-200 dataset spelling and is not a tokenizer language token.
    "santali": "sat_Beng",
    # ===== SOUTHEAST ASIA =====
    "malay": "zsm_Latn",
    "filipino": "tgl_Latn",
    "khmer": "khm_Khmr",
    "lao": "lao_Laoo",
    "burmese": "mya_Mymr",
    # ===== EAST ASIA =====
    "traditional_chinese": "zho_Hant",
    "mongolian": "khk_Cyrl",  # Halh Mongolian; NLLB has no generic mon_Cyrl
    # ===== CENTRAL ASIA =====
    "kazakh": "kaz_Cyrl",
    "uzbek": "uzn_Latn",
    "tajik": "tgk_Cyrl",
    "kyrgyz": "kir_Cyrl",
    "turkmen": "tuk_Latn",
    # ===== MIDDLE EAST =====
    "pashto": "pbt_Arab",
    "sindhi": "snd_Arab",
    # ===== AFRICAN =====
    "swahili": "swh_Latn",
    "amharic": "amh_Ethi",
    "yoruba": "yor_Latn",
    "igbo": "ibo_Latn",
    "hausa": "hau_Latn",
    "zulu": "zul_Latn",
    "xhosa": "xho_Latn",
    "somali": "som_Latn",
    "afrikaans": "afr_Latn",
    # ===== EUROPE EXTRA =====
    "estonian": "est_Latn",
    "latvian": "lvs_Latn",  # Standard Latvian; NLLB has no generic lav_Latn
    "lithuanian": "lit_Latn",
    "icelandic": "isl_Latn",
    "irish": "gle_Latn",
    "welsh": "cym_Latn",
    "albanian": "als_Latn",  # Tosk Albanian; NLLB has no generic sqi_Latn
    "serbian": "srp_Cyrl",
    "croatian": "hrv_Latn",
    "slovenian": "slv_Latn",
    # ===== OTHERS =====
    # NOTE(review): Latin does not appear to be one of the NLLB-200
    # languages, so this code is probably not a tokenizer token. The entry is
    # kept so existing callers still resolve the name; confirm and remove or
    # reject explicitly.
    "latin": "lat_Latn",
    "esperanto": "epo_Latn",
}
# Maps short language codes (ISO 639 style, as produced by `langdetect` or
# typed by a caller) to the language-name keys of LANGUAGE_MAP. Keys are
# lower-case because lookups lower-case their input first.
ISO_TO_LANGUAGE_KEY = {
    # ===== EXISTING =====
    "en": "english", "fr": "french", "de": "german", "es": "spanish", "hi": "hindi",
    "mr": "marathi", "ta": "tamil", "te": "telugu", "kn": "kannada", "bn": "bengali",
    "ur": "urdu", "ar": "arabic", "fa": "persian", "ja": "japanese", "zh": "chinese",
    "ko": "korean", "ru": "russian", "it": "italian", "pt": "portuguese", "nl": "dutch",
    "sv": "swedish", "no": "norwegian", "da": "danish", "fi": "finnish", "pl": "polish",
    "cs": "czech", "sk": "slovak", "hu": "hungarian", "ro": "romanian", "bg": "bulgarian",
    "uk": "ukrainian", "el": "greek", "gu": "gujarati", "pa": "punjabi", "ml": "malayalam",
    "th": "thai", "vi": "vietnamese", "id": "indonesian", "tr": "turkish", "he": "hebrew",
    # ===== CHINESE VARIANTS =====
    # langdetect never reports plain "zh"; it reports these two tags. Without
    # them detected Chinese text fell through to the English fallback.
    "zh-cn": "chinese",
    "zh-tw": "traditional_chinese",
    # ===== KURDISH =====
    "ku": "kurdish_kurmanji",  # default kurdish
    "ckb": "kurdish_sorani",
    # ===== SOUTH ASIAN =====
    "ne": "nepali",
    "si": "sinhala",
    "or": "odia",
    "as": "assamese",
    "mai": "maithili",
    "sat": "santali",
    # ===== SOUTHEAST ASIA =====
    "ms": "malay",
    "tl": "filipino",
    "km": "khmer",
    "lo": "lao",
    "my": "burmese",
    # ===== EAST ASIA =====
    "mn": "mongolian",  # name existed in LANGUAGE_MAP but had no short code
    # ===== CENTRAL ASIA =====
    "kk": "kazakh",
    "uz": "uzbek",
    "tg": "tajik",
    "ky": "kyrgyz",
    "tk": "turkmen",
    # ===== MIDDLE EAST =====
    "ps": "pashto",
    "sd": "sindhi",
    # ===== AFRICAN =====
    "sw": "swahili",
    "am": "amharic",
    "yo": "yoruba",
    "ig": "igbo",
    "ha": "hausa",
    "zu": "zulu",
    "xh": "xhosa",
    "so": "somali",
    "af": "afrikaans",
    # ===== EUROPE EXTRA =====
    "et": "estonian",
    "lv": "latvian",
    "lt": "lithuanian",
    "is": "icelandic",
    "ga": "irish",
    "cy": "welsh",
    "sq": "albanian",
    "sr": "serbian",
    "hr": "croatian",
    "sl": "slovenian",
    # ===== OTHERS =====
    "la": "latin",
    "eo": "esperanto",
}
# ===============================
# LOAD MODEL ONCE
# ===============================
# These statements run at import time: the tokenizer and the seq2seq model are
# created once as module-level globals and reused by the translator functions
# defined below. The two prints bracket the load so progress shows on stdout.
print("Loading translation model...")
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)
print("Model loaded successfully.")
# ===============================
# HELPERS
# ===============================
def get_nllb_code(lang: str):
    """Resolve a caller-supplied language identifier to an NLLB language code.

    Matching is case-insensitive and ignores surrounding whitespace. The
    following forms are tried in order:

    1. A key of ``LANGUAGE_MAP`` (``"french"``).
    2. A key of ``ISO_TO_LANGUAGE_KEY`` (``"fr"``, ``"ckb"``).
    3. A language name written with spaces or hyphens instead of underscores
       (``"Kurdish Sorani"``, ``"traditional-chinese"``).
    4. An NLLB code that is already a value of ``LANGUAGE_MAP``
       (``"fra_Latn"``, in any letter case); its canonical spelling is
       returned.
    5. A region-tagged short code (``"pt-BR"``, ``"zh_CN"``), resolved through
       the part before the first separator.

    Args:
        lang: Language name, short ISO-style code, or NLLB code.

    Returns:
        The NLLB code string. ``"eng_Latn"`` is returned when nothing
        matches, so an unrecognised identifier is treated as English rather
        than raising.
    """
    key = lang.lower().strip()

    if key in LANGUAGE_MAP:
        return LANGUAGE_MAP[key]
    if key in ISO_TO_LANGUAGE_KEY:
        return LANGUAGE_MAP[ISO_TO_LANGUAGE_KEY[key]]

    # Treat spaces and hyphens as underscores so that multi-word names and
    # BCP-47 style tags share one spelling for the remaining checks.
    normalized = key.replace(" ", "_").replace("-", "_")
    if normalized in LANGUAGE_MAP:
        return LANGUAGE_MAP[normalized]

    # The caller may already hold an NLLB code; previously this fell through
    # to English because the lower-cased code is not a map key.
    for code in LANGUAGE_MAP.values():
        if code.lower() == normalized:
            return code

    # "pt_br" -> "pt": drop the region/script subtag and retry the short code.
    primary, separator, _ = normalized.partition("_")
    if separator and primary in ISO_TO_LANGUAGE_KEY:
        return LANGUAGE_MAP[ISO_TO_LANGUAGE_KEY[primary]]

    return "eng_Latn"  # fallback
# ===============================
# MANUAL TRANSLATOR
# ===============================
def standard_translator(
    text: str, src_lang: str, tgt_lang: str, *, max_length: int = 512
) -> str:
    """Translate ``text`` from an explicitly given source language.

    Args:
        text: Text to translate.
        src_lang: Source language identifier, resolved via ``get_nllb_code``.
        tgt_lang: Target language identifier, resolved via ``get_nllb_code``.
        max_length: Upper bound passed to ``model.generate`` for the length
            of the generated sequence. Defaults to the previously hard-coded
            value of 512.

    Returns:
        The decoded translation with special tokens removed. Empty or
        whitespace-only ``text`` is returned unchanged without running the
        model.
    """
    # Nothing to translate: skip the model rather than decode whatever it
    # generates for an input that contains only special tokens.
    if not text.strip():
        return text

    src_code = get_nllb_code(src_lang)
    tgt_code = get_nllb_code(tgt_lang)

    # The tokenizer is a shared module-level object; the source language is
    # set on it right before encoding.
    tokenizer.src_lang = src_code
    # Keep the input tensors on whatever device the model currently lives on,
    # so moving the module-level model to an accelerator does not break this.
    inputs = tokenizer(text, return_tensors="pt").to(model.device)

    with torch.no_grad():
        output = model.generate(
            **inputs,
            # Forcing the first generated token to the target language code
            # is how the target language is selected.
            forced_bos_token_id=tokenizer.convert_tokens_to_ids(tgt_code),
            max_length=max_length,
        )
    return tokenizer.decode(output[0], skip_special_tokens=True)
# ===============================
# AUTO TRANSLATOR
# ===============================
def smart_auto_translator(text: str, target_lang: str):
    """Detect the language of ``text`` and translate it to ``target_lang``.

    Args:
        text: Text to translate.
        target_lang: Target language identifier, resolved via
            ``get_nllb_code``.

    Returns:
        A dict with three keys:

        * ``"translated_text"``: the translation (``text`` itself when it is
          empty or whitespace-only).
        * ``"detected_iso"``: the code reported by ``langdetect.detect``, or
          ``"unknown"`` when detection was not possible.
        * ``"detected_language"``: the matching ``ISO_TO_LANGUAGE_KEY`` value,
          or the detected code itself when there is no mapping.
    """
    # Blank input has nothing to detect or translate.
    if not text.strip():
        return {
            "translated_text": text,
            "detected_iso": "unknown",
            "detected_language": "unknown",
        }

    # Imported here because the module-level import only brings in `detect`.
    from langdetect import LangDetectException

    try:
        detected_iso = detect(text)
    except LangDetectException:
        # Raised when the text has no usable features (digits, punctuation).
        # "unknown" resolves to the English fallback in get_nllb_code.
        detected_iso = "unknown"

    # langdetect reports some languages with a region tag (e.g. "zh-cn").
    # If the full tag is not mapped, look up the part before the hyphen so
    # the text is not treated as English.
    lookup_iso = detected_iso
    if lookup_iso not in ISO_TO_LANGUAGE_KEY:
        lookup_iso = lookup_iso.split("-", 1)[0]
    detected_lang_key = ISO_TO_LANGUAGE_KEY.get(lookup_iso, detected_iso)

    # Same tokenise / generate / decode pipeline as the manual translator.
    translated_text = standard_translator(text, lookup_iso, target_lang)

    return {
        "translated_text": translated_text,
        "detected_iso": detected_iso,
        "detected_language": detected_lang_key,
    }