AVoice-TTS / runtime /omnivoice /utils /lang_map.py
Hak5's picture
Add bundled AVoice runtime for HF-only inference
7496177 verified
#!/usr/bin/env python3
# Copyright 2026 Xiaomi Corp. (authors: Han Zhu)
#
# See ../../LICENSE for clarification regarding multiple authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Language name to ISO 639-3 code mapping.
Auto-generated from ``docs/lang_id_name_map.tsv``. Provides ``LANG_NAME_TO_ID``
(for resolving language names to codes) and ``LANG_IDS`` (the set of supported
ISO 639-3 codes). Used by ``OmniVoice.generate()`` to resolve user-provided
language names.
"""
# Auto-generated from docs/lang_id_name_map.tsv
# Maps lowercase language name -> language ID code.
#
# Keys are lowercase language names; some carry a parenthesised qualifier to
# tell apart languages that share a name (e.g. "basa (cameroon)" vs
# "basa (nigeria)"). Values are two- or three-letter language ID codes.
# NOTE(review): this table is generated, so edits presumably belong in the TSV
# rather than here -- confirm against the generator script.
LANG_NAME_TO_ID = {
"abadi": "kbt",
"abkhazian": "ab",
"abron": "abr",
"abua": "abn",
"adamawa fulfulde": "fub",
"adyghe": "ady",
"afade": "aal",
"afrikaans": "af",
"agwagwune": "yay",
"aja (benin)": "ajg",
"akebu": "keu",
"alago": "ala",
"albanian": "sq",
"algerian arabic": "arq",
"algerian saharan arabic": "aao",
"ambo-pasco quechua": "qva",
"ambonese malay": "abs",
"amdo tibetan": "adx",
"amharic": "am",
"anaang": "anw",
"angika": "anp",
"antankarana malagasy": "xmv",
"aragonese": "an",
"arbëreshë albanian": "aae",
"arequipa-la unión quechua": "qxu",
"armenian": "hy",
"ashe": "ahs",
"ashéninka perené": "prq",
"askopan": "eiv",
"assamese": "as",
"asturian": "ast",
"atayal": "tay",
"awak": "awo",
"ayacucho quechua": "quy",
"azerbaijani": "az",
"baatonum": "bba",
"bacama": "bcy",
"bade": "bde",
"bafia": "ksf",
"bafut": "bfd",
"bagirmi fulfulde": "fui",
"bago-kusuntu": "bqg",
"baharna arabic": "abv",
"bakoko": "bkh",
"balanta-ganja": "bjt",
"balti": "bft",
"bamenyam": "bce",
"bamun": "bax",
"bangwinji": "bsj",
"banjar": "bjn",
"bankon": "abb",
"baoulé": "bci",
"bara malagasy": "bhr",
"barok": "bjk",
"basa (cameroon)": "bas",
"basa (nigeria)": "bzw",
"bashkir": "ba",
"basque": "eu",
"batak mandailing": "btm",
"batanga": "bnm",
"bateri": "btv",
"bats": "bbl",
"bayot": "bda",
"bebele": "beb",
"belarusian": "be",
"bengali": "bn",
"betawi": "bew",
"bhili": "bhb",
"bhojpuri": "bho",
"bilur": "bxf",
"bima": "bhp",
"bodo": "brx",
"boghom": "bux",
"bokyi": "bky",
"bomu": "bmq",
"bondei": "bou",
"borgu fulfulde": "fue",
"bosnian": "bs",
"brahui": "brh",
"braj": "bra",
"breton": "br",
"buduma": "bdm",
"buginese": "bug",
"bukharic": "bhh",
"bulgarian": "bg",
"bulu (cameroon)": "bum",
"bundeli": "bns",
"bunun": "bnn",
"bura-pabir": "bwr",
"burak": "bys",
"burmese": "my",
"burushaski": "bsk",
"cacaloxtepec mixtec": "miu",
"cajatambo north lima quechua": "qvl",
"cakfem-mushere": "cky",
"cameroon pidgin": "wes",
"campidanese sardinian": "sro",
"cantonese": "yue",
"catalan": "ca",
"cebuano": "ceb",
"cen": "cen",
"central kurdish": "ckb",
"central nahuatl": "nhn",
"central pame": "pbs",
"central pashto": "pst",
"central puebla nahuatl": "ncx",
"central tarahumara": "tar",
"central yupik": "esu",
"central-eastern niger fulfulde": "fuq",
"chadian arabic": "shu",
"chichewa": "ny",
"chichicapan zapotec": "zpv",
"chiga": "cgg",
"chimalapa zoque": "zoh",
"chimborazo highland quichua": "qug",
"chinese": "zh",
"chiquián ancash quechua": "qxa",
"chitwania tharu": "the",
"chokwe": "cjk",
"chuvash": "cv",
"cibak": "ckl",
"coastal konjo": "kjc",
"copainalá zoque": "zoc",
"cornish": "kw",
"corongo ancash quechua": "qwa",
"croatian": "hr",
"cross river mbembe": "mfn",
"cuyamecalco mixtec": "xtu",
"czech": "cs",
"dadiya": "dbd",
"dagbani": "dag",
"dameli": "dml",
"danish": "da",
"dargwa": "dar",
"dazaga": "dzg",
"deccan": "dcc",
"degema": "deg",
"dera (nigeria)": "kna",
"dghwede": "dgh",
"dhatki": "mki",
"dhivehi": "dv",
"dhofari arabic": "adf",
"dijim-bwilim": "cfa",
"dogri": "dgo",
"domaaki": "dmk",
"dotyali": "dty",
"duala": "dua",
"dutch": "nl",
"dũya": "ldb",
"dyula": "dyu",
"eastern balochi": "bgp",
"eastern bolivian guaraní": "gui",
"eastern egyptian bedawi arabic": "avl",
"eastern krahn": "kqo",
"eastern mari": "mhr",
"eastern yiddish": "ydd",
"ebrié": "ebr",
"eggon": "ego",
"egyptian arabic": "arz",
"ejagham": "etu",
"eleme": "elm",
"eloyi": "afo",
"embu": "ebu",
"english": "en",
"erzya": "myv",
"esan": "ish",
"esperanto": "eo",
"estonian": "et",
"eton (cameroon)": "eto",
"ewondo": "ewo",
"extremaduran": "ext",
"fang (equatorial guinea)": "fan",
"fanti": "fat",
"farefare": "gur",
"fe'fe'": "fmp",
"filipino": "fil",
"filomena mata-coahuitlán totonac": "tlp",
"finnish": "fi",
"fipa": "fip",
"french": "fr",
"fulah": "ff",
"galician": "gl",
"gambian wolof": "wof",
"ganda": "lg",
"garhwali": "gbm",
"gawar-bati": "gwt",
"gawri": "gwc",
"gbagyi": "gbr",
"gbari": "gby",
"geji": "gyz",
"gen": "gej",
"georgian": "ka",
"german": "de",
"geser-gorom": "ges",
"gheg albanian": "aln",
"ghomálá'": "bbj",
"gidar": "gid",
"glavda": "glw",
"goan konkani": "gom",
"goaria": "gig",
"goemai": "ank",
"gola": "gol",
"greek": "el",
"guarani": "gn",
"guduf-gava": "gdf",
"guerrero amuzgo": "amu",
"gujarati": "gu",
"gujari": "gju",
"gulf arabic": "afb",
"gurgula": "ggg",
"gusii": "guz",
"gusilay": "gsl",
"gweno": "gwe",
"güilá zapotec": "ztu",
"hadothi": "hoj",
"hahon": "hah",
"haitian": "ht",
"hakha chin": "cnh",
"hakö": "hao",
"halia": "hla",
"hausa": "ha",
"hawaiian": "haw",
"hazaragi": "haz",
"hebrew": "he",
"hemba": "hem",
"herero": "hz",
"highland konjo": "kjk",
"hijazi arabic": "acw",
"hindi": "hi",
"huarijio": "var",
"huautla mazatec": "mau",
"huaxcaleca nahuatl": "nhq",
"huba": "hbb",
"huitepec mixtec": "mxs",
"hula": "hul",
"hungarian": "hu",
"hunjara-kaina ke": "hkk",
"hwana": "hwo",
"ibibio": "ibb",
"icelandic": "is",
"idakho-isukha-tiriki": "ida",
"idoma": "idu",
"igbo": "ig",
"igo": "ahl",
"ikposo": "kpo",
"ikwere": "ikw",
"imbabura highland quichua": "qvi",
"indonesian": "id",
"indus kohistani": "mvy",
"interlingua (international auxiliary language association)": "ia",
"inupiaq": "ik",
"irish": "ga",
"iron ossetic": "os",
"isekiri": "its",
"isoko": "iso",
"italian": "it",
"ito": "itw",
"itzá": "itz",
"ixtayutla mixtec": "vmj",
"izon": "ijc",
"jambi malay": "jax",
"japanese": "ja",
"jaqaru": "jqr",
"jauja wanca quechua": "qxw",
"jaunsari": "jns",
"javanese": "jv",
"jiba": "juo",
"jju": "kaj",
"judeo-moroccan arabic": "aju",
"juxtlahuaca mixtec": "vmc",
"kabardian": "kbd",
"kabras": "lkb",
"kabuverdianu": "kea",
"kabyle": "kab",
"kachi koli": "gjk",
"kairak": "ckr",
"kalabari": "ijn",
"kalasha": "kls",
"kalenjin": "kln",
"kalkoti": "xka",
"kamba": "kam",
"kamo": "kcq",
"kanauji": "bjj",
"kanembu": "kbl",
"kannada": "kn",
"karekare": "kai",
"kashmiri": "ks",
"kathoriya tharu": "tkt",
"kati": "bsh",
"kazakh": "kk",
"keiyo": "eyo",
"khams tibetan": "khg",
"khana": "ogo",
"khetrani": "xhe",
"khmer": "km",
"khowar": "khw",
"kinga": "zga",
"kinnauri": "kfk",
"kinyarwanda": "rw",
"kirghiz": "ky",
"kirya-konzəl": "fkk",
"kochila tharu": "thq",
"kohistani shina": "plk",
"kohumono": "bcs",
"kok borok": "trp",
"kol (papua new guinea)": "kol",
"kom (cameroon)": "bkm",
"koma": "kmy",
"konkani": "knn",
"konzo": "koo",
"korean": "ko",
"korwa": "kfp",
"kota (india)": "kfe",
"koti": "eko",
"kuanua": "ksd",
"kuanyama": "kj",
"kui (india)": "uki",
"kulung (nigeria)": "bbu",
"kuot": "kto",
"kushi": "kuh",
"kwambi": "kwm",
"kwasio": "nmg",
"lala-roba": "lla",
"lamang": "hia",
"lao": "lo",
"larike-wakasihu": "alo",
"lasi": "lss",
"latgalian": "ltg",
"latvian": "lv",
"levantine arabic": "apc",
"liana-seti": "ste",
"liberia kpelle": "xpe",
"liberian english": "lir",
"libyan arabic": "ayl",
"ligurian": "lij",
"lijili": "mgi",
"lingala": "ln",
"lithuanian": "lt",
"loarki": "lrk",
"logooli": "rag",
"logudorese sardinian": "src",
"loja highland quichua": "qvj",
"loloda": "loa",
"longuda": "lnu",
"loxicha zapotec": "ztp",
"luba-lulua": "lua",
"luo": "luo",
"lushai": "lus",
"luxembourgish": "lb",
"maasina fulfulde": "ffm",
"maba (chad)": "mde",
"macedo-romanian": "rup",
"macedonian": "mk",
"mada (cameroon)": "mxu",
"mafa": "maf",
"maithili": "mai",
"malay": "ms",
"malayalam": "ml",
"mali": "gcc",
"malinaltepec me'phaa": "tcf",
"maltese": "mt",
"mandara": "tbf",
"mandjak": "mfv",
"manggarai": "mqy",
"manipuri": "mni",
"mansoanka": "msw",
"manx": "gv",
"maori": "mi",
"marathi": "mr",
"marghi central": "mrt",
"marghi south": "mfm",
"maria (india)": "mrr",
"marwari (pakistan)": "mve",
"masana": "mcn",
"masikoro malagasy": "msh",
"matsés": "mcf",
"mazaltepec zapotec": "zpy",
"mazatlán mazatec": "vmz",
"mazatlán mixe": "mzl",
"mbe": "mfo",
"mbo (cameroon)": "mbo",
"mbum": "mdd",
"medumba": "byv",
"mekeo": "mek",
"meru": "mer",
"mesopotamian arabic": "acm",
"mewari": "mtr",
"min nan chinese": "nan",
"mingrelian": "xmf",
"mitlatongo mixtec": "vmm",
"miya": "mkf",
"mokpwe": "bri",
"moksha": "mdf",
"mom jango": "ver",
"mongolian": "mn",
"moroccan arabic": "ary",
"motu": "meu",
"mpiemo": "mcx",
"mpumpong": "mgg",
"mundang": "mua",
"mungaka": "mhk",
"musey": "mse",
"musgu": "mug",
"musi": "mui",
"naba": "mne",
"najdi arabic": "ars",
"nalik": "nal",
"nawdm": "nmz",
"ndonga": "ng",
"neapolitan": "nap",
"nepali": "npi",
"ngamo": "nbh",
"ngas": "anc",
"ngiemboon": "nnh",
"ngizim": "ngi",
"ngomba": "jgo",
"ngombale": "nla",
"nigerian fulfulde": "fuv",
"nigerian pidgin": "pcm",
"nimadi": "noe",
"nobiin": "fia",
"north mesopotamian arabic": "ayp",
"north moluccan malay": "max",
"northern betsimisaraka malagasy": "bmm",
"northern hindko": "hno",
"northern kurdish": "kmr",
"northern pame": "pmq",
"northern pashto": "pbu",
"northern uzbek": "uzn",
"northwest gbaya": "gya",
"norwegian": "no",
"norwegian bokmål": "nb",
"norwegian nynorsk": "nn",
"notsi": "ncf",
"nyankpa": "yes",
"nyungwe": "nyu",
"nzanyi": "nja",
"nüpode huitoto": "hux",
"occitan": "oc",
"od": "odk",
"odia": "ory",
"odual": "odu",
"omani arabic": "acx",
"orizaba nahuatl": "nlv",
"orma": "orc",
"ormuri": "oru",
"oromo": "om",
"pahari-potwari": "phr",
"paiwan": "pwn",
"panjabi": "pa",
"papuan malay": "pmy",
"parkari koli": "kvx",
"pedi": "nso",
"pero": "pip",
"persian": "fa",
"petats": "pex",
"phalura": "phl",
"piemontese": "pms",
"piya-kwonci": "piy",
"plateau malagasy": "plt",
"polish": "pl",
"poqomam": "poc",
"portuguese": "pt",
"pulaar": "fuc",
"pular": "fuf",
"puno quechua": "qxp",
"pushto": "ps",
"pökoot": "pko",
"qaqet": "byx",
"quiotepec chinantec": "chq",
"rana tharu": "thr",
"rangi": "lag",
"rapoisi": "kyx",
"ratahan": "rth",
"rayón zoque": "zor",
"romanian": "ro",
"romansh": "rm",
"rombo": "rof",
"rotokas": "roo",
"rukai": "dru",
"russian": "ru",
"sacapulteco": "quv",
"saidi arabic": "aec",
"sakalava malagasy": "skg",
"sakizaya": "szy",
"saleman": "sau",
"samba daka": "ccg",
"samba leko": "ndi",
"san felipe otlaltepec popoloca": "pow",
"san francisco del mar huave": "hue",
"san juan atzingo popoloca": "poe",
"san martín itunyoso triqui": "trq",
"san miguel el grande mixtec": "mig",
"sansi": "ssi",
"sanskrit": "sa",
"santa ana de tusi pasco quechua": "qxt",
"santa catarina albarradas zapotec": "ztn",
"santali": "sat",
"santiago del estero quichua": "qus",
"saposa": "sps",
"saraiki": "skr",
"sardinian": "sc",
"saya": "say",
"sediq": "trv",
"serbian": "sr",
"seri": "sei",
"shina": "scl",
"shona": "sn",
"siar-lak": "sjr",
"sibe": "nco",
"sicilian": "scn",
"sihuas ancash quechua": "qws",
"sikkimese": "sip",
"sinaugoro": "snc",
"sindhi": "sd",
"sindhi bhil": "sbn",
"sinhala": "si",
"sinicahua mixtec": "xti",
"sipacapense": "qum",
"siwai": "siw",
"slovak": "sk",
"slovenian": "sl",
"solos": "sol",
"somali": "so",
"soninke": "snk",
"south giziga": "giz",
"south ucayali ashéninka": "cpy",
"southeastern nochixtlán mixtec": "mxy",
"southern betsimisaraka malagasy": "bzc",
"southern pashto": "pbt",
"southern pastaza quechua": "qup",
"soyaltepec mazatec": "vmp",
"spanish": "es",
"standard arabic": "arb",
"standard moroccan tamazight": "zgh",
"sudanese arabic": "apd",
"sulka": "sua",
"svan": "sva",
"swahili": "sw",
"swedish": "sv",
"tae'": "rob",
"tahaggart tamahaq": "thv",
"taita": "dav",
"tajik": "tg",
"tamil": "ta",
"tandroy-mahafaly malagasy": "tdx",
"tangale": "tan",
"tanosy malagasy": "txy",
"tarok": "yer",
"tatar": "tt",
"tedaga": "tuq",
"telugu": "te",
"tem": "kdh",
"teop": "tio",
"tepeuxila cuicatec": "cux",
"tepinapa chinantec": "cte",
"tera": "ttr",
"terei": "buo",
"termanu": "twu",
"tesaka malagasy": "tkg",
"tetelcingo nahuatl": "nhg",
"teutila cuicatec": "cut",
"thai": "th",
"tibetan": "bo",
"tidaá mixtec": "mtx",
"tidore": "tvo",
"tigak": "tgc",
"tigre": "tig",
"tigrinya": "ti",
"tilquiapan zapotec": "zts",
"tinputz": "tpz",
"tlacoapa me'phaa": "tpl",
"tlacoatzintepec chinantec": "ctl",
"tlingit": "tli",
"toki pona": "tok",
"tomoip": "tqp",
"tondano": "tdn",
"tonsea": "txs",
"tooro": "ttj",
"torau": "ttu",
"torwali": "trw",
"tsimihety malagasy": "xmw",
"tsotso": "lto",
"tswana": "tn",
"tugen": "tuy",
"tuki": "bag",
"tula": "tul",
"tulu": "tcy",
"tunen": "tvu",
"tungag": "lcm",
"tunisian arabic": "aeb",
"tupuri": "tui",
"turkana": "tuv",
"turkish": "tr",
"turkmen": "tk",
"tututepec mixtec": "mtu",
"twi": "tw",
"ubaghara": "byc",
"uighur": "ug",
"ukrainian": "uk",
"umbundu": "umb",
"upper sorbian": "hsb",
"urdu": "ur",
"ushojo": "ush",
"uzbek": "uz",
"vai": "vai",
"vietnamese": "vi",
"votic": "vot",
"võro": "vro",
"waci gbe": "wci",
"wadiyara koli": "kxp",
"waja": "wja",
"wakhi": "wbl",
"wanga": "lwg",
"wapan": "juk",
"warji": "wji",
"welsh": "cy",
"wemale": "weo",
"western frisian": "fy",
"western highland purepecha": "pua",
"western juxtlahuaca mixtec": "jmx",
"western maninkakan": "mlq",
"western mari": "mrj",
"western niger fulfulde": "fuh",
"western panjabi": "pnb",
"wolof": "wo",
"wuzlam": "udl",
"xanaguía zapotec": "ztg",
"xhosa": "xh",
"yace": "ekr",
"yakut": "sah",
"yalahatan": "jal",
"yanahuanca pasco quechua": "qur",
"yangben": "yav",
"yaqui": "yaq",
"yauyos quechua": "qux",
"yekhee": "ets",
"yiddish": "yi",
"yidgha": "ydg",
"yoruba": "yo",
"yutanduchi mixtec": "mab",
"zacatlán-ahuacatlán-tepetzintla nahuatl": "nhi",
"zarma": "dje",
"zaza": "zza",
"zulu": "zu",
"ömie": "aom",
}
# Lookup sets derived once, at import time, from LANG_NAME_TO_ID.
# Iterating a dict yields its keys, so no explicit .keys() call is needed.
LANG_NAMES = set(LANG_NAME_TO_ID)
# Several names may share one code, so this set can be smaller than LANG_NAMES.
LANG_IDS = set(LANG_NAME_TO_ID.values())
# Exceptions where .title() doesn't match the canonical casing from the TSV.
# Keys are lowercase language names (as used in LANG_NAME_TO_ID); values are
# the display form to return instead of ``key.title()``.
_TITLE_EXCEPTIONS = {
    "fe'fe'": "Fe'fe'",
    "dũya": "Dũya",
    "santiago del estero quichua": "Santiago del Estero Quichua",
    "santa ana de tusi pasco quechua": "Santa Ana de Tusi Pasco Quechua",
    "malinaltepec me'phaa": "Malinaltepec Me'phaa",
    "tlacoapa me'phaa": "Tlacoapa Me'phaa",
}


def lang_display_name(name: str) -> str:
    """Return a display-friendly version of a language name.

    Uses ``str.title()`` for most names, with manual exceptions for cases
    where title-casing gives the wrong result: ``str.title()`` capitalises
    the letter after an apostrophe (``"fe'fe'"`` -> ``"Fe'Fe'"``) and
    capitalises small words such as "de"/"del" that should stay lowercase.

    Args:
        name: Language name, normally the lowercase form used as a key in
            ``LANG_NAME_TO_ID``. Other casings are accepted.

    Returns:
        The canonical display form if ``name`` (compared case-insensitively)
        is a known exception, otherwise ``name.title()``.
    """
    # Look the exception up by the lowercased name so the function is
    # idempotent: feeding an already-formatted display name back in returns
    # it unchanged instead of re-mangling it via .title().
    return _TITLE_EXCEPTIONS.get(name.lower(), name.title())