import collections import itertools import os import re try: import jamo except ImportError: jamo = None WIKIPRON_DIR = r"C:\Users\micha\Documents\Dev\wikipron\data\scrape\tsv" OUTPUT_DIR = r"" LANG_PATHS = {"french": "fra_latn_broad_filtered.tsv"} # Full list of languages # LANG_CODES = ['bulgarian', 'czech', 'french', 'german', 'mandarin_hani', 'polish', 'portuguese_brazil', # 'portuguese_portugal', 'russian', 'spanish_spain', 'spanish_latin_america', 'swedish', # 'tamil', 'thai', 'turkish', 'ukrainian', 'mandarin_hani_beijing', 'mandarin_hani_taiwan', 'mandarin_hani_standard', # 'korean_hangul', 'hausa', 'japanese', 'vietnamese_hanoi', 'vietnamese_hue', 'vietnamese_hochiminhcity', # 'serbocroatian_croatian', 'serbocroatian_serbian'] LANG_CODES = ["czech"] BAD_GRAPHEMES = { "english_us": { "%", "/", "@", "²", "à", "á", "â", "ä", "æ", "ç", "è", "é", "ê", "ë", "í", "î", "ï", "ñ", "ó", "ô", "õ", "ö", "ø", "ù", "ú", "ü", "ā", "ą", "č", "ē", "ę", "ğ", "ı", "ł", "ń", "ō", "ő", "œ", "ř", "ū", "ș", "ț", "ʼ", "ṭ", "₂", }, "english_uk": { "%", "/", "@", "²", "à", "á", "â", "ä", "æ", "ç", "è", "é", "ê", "ë", "í", "î", "ï", "ñ", "ó", "ô", "õ", "ö", "ø", "ù", "ú", "ü", "ā", "ą", "č", "ē", "ę", "ğ", "ı", "ł", "ń", "ō", "ő", "œ", "ř", "ū", "ș", "ț", "ʼ", "ṭ", "₂", "ã", "å", "û", "ī", "ž", ".", }, "polish": {"+", ".", "ü", "ö", "ø", "ƶ", "ñ", "ç", "à", "á", "è", "é", "í"}, "french": {".", "/", "º", "å", "æ", "ÿ", "ș"}, "japanese": {" ", "&", "+", "、", "〆", "〼", "〼", "=", "𫡤", "・", "×", "ゞ", "ゟ", "ゑ", "ゐ", "ヲ"}, "mandarin_hani_beijing": { "A", "B", "C", "D", "E", "G", "H", "I", "K", "M", "N", "O", "P", "Q", "S", "T", "U", "V", "X", "Y", "Z", "e", "p", "u", "·", "α", "β", "γ", "…", "⿰", "ㄅ", "ㄆ", "ㄇ", "ㄈ", "𰚼", "𰯼", "𫇦", }, "mandarin_hani_taiwan": { "A", "B", "C", "D", "E", "G", "H", "I", "K", "M", "N", "O", "P", "Q", "S", "T", "U", "V", "X", "Y", "Z", "e", "p", "u", "·", "α", "β", "γ", "…", "⿰", "ㄅ", "ㄆ", "ㄇ", "ㄈ", "𰚼", "𰯼", "𫇦", }, "mandarin_hani_standard": { "A", "B", "C", "D", "E", "G", "H", "I", "K", "M", "N", "O", "P", "Q", "S", "T", "U", "V", "X", "Y", "Z", "e", "p", "u", "·", "α", "β", "γ", "…", "⿰", "ㄅ", "ㄆ", "ㄇ", "ㄈ", "𰚼", "𰯼", "𫇦", }, "german": {"'", ".", "@", "à", "á", "ç", "è", "é", "ê", "ó", "ø", "œ", "í", "ë"}, "portuguese_brazil": {"'", "."}, "portuguese_portugal": {"'", "."}, "russian": {"'", ".", "/", "ѳ"}, "spanish_spain": {"'", ".", "ö", "ꝇ", "î", "ç"}, "spanish_latin_america": {"'", ".", "ö", "ꝇ", "î", "ç"}, "thai": {"…", "'", "/"}, "turkish": {"̇", "'"}, "tamil": {"ࢳ", "ࢳ", "ࢴ", "ࢴ", "ஃ"}, "vietnamese_hanoi": { "'", ".", ",", }, "vietnamese_hue": { "'", ".", ",", }, "vietnamese_hochiminhcity": { "'", ".", ",", }, } BAD_PHONES = { "english_uk": {"ɪː", "aː", "eː", "a", "o", "oː", "eː", "e"}, "english_us": {"ɒ", "aː", "a", "o", "oː", "eː", "e", "ɪː", "ɛː"}, "german": {"ʊɪ"}, "czech": {"ə"}, "spanish_latin_america": {"ɹ", "ɚ", "ʒ", "ə", "ɪ"}, "spanish_spain": {"ɹ", "ɚ", "ʒ", "ə", "ɪ"}, "mandarin_hani_taiwan": {"ai", "a", "ei", "o", "ə", "z̩", "ʐ̩"}, "mandarin_hani_standard": {"ai", "a", "ei", "o", "ə", "z̩", "ʐ̩"}, "mandarin_hani_beijing": {"ai", "a", "ei", "o", "ə", "z̩", "ʐ̩"}, } VOWELS = { "english_us": { "aɪ", "aʊ", "eɪ", "i", "iː", "oɪ", "oʊ", "u", "uː", "æ", "ɑ", "ɑː", "ɔ", "ɔɪ", "ɔː", "ə", "ɚ", "ɛ", "ɝ", "ɝː", "ɪ", "ʊ", "ʌ", }, "english_uk": { "aɪ", "aʊ", "eɪ", "i", "iː", "oɪ", "oʊ", "u", "uː", "æ", "ɑ", "ɑː", "ɔ", "ɔɪ", "ɔː", "ə", "ɚ", "ɛ", "ɝ", "ɝː", "ɪ", "ʊ", "ʌ", "aɪ", "aʊ", "eɪ", "i", "iː", "oɪ", "oʊ", "u", "uː", "æ", "ɑ", "ɑː", "ɒ", "ɔ", "ɔɪ", "ɔː", "ɛ", "ɛː", "ɜ", "ɜː", "ʊ", "ʌ", }, "vietnamese_hanoi": {"a", "aː", "e", "i", "o", "u", "ɔ", "ə", "əː", "ɛ", "ɨ", "ʊ", "ɪ"}, "vietnamese_hue": {"a", "aː", "e", "i", "o", "u", "ɔ", "ə", "əː", "ɛ", "ɨ", "ʊ", "ɪ"}, "vietnamese_hochiminhcity": { "a", "aː", "e", "i", "o", "u", "ɔ", "ə", "əː", "ɛ", "ɨ", "ʊ", "ɪ", }, "mandarin_hani": {"a", "e", "o", "i", "u", "y", "ə", "ɚ", "ɤ̃", "ʊ̃"}, "mandarin_hani_standard": {"a", "e", "o", "i", "u", "y", "ə", "ɚ", "ɤ̃", "ʊ̃"}, "mandarin_hani_taiwan": {"a", "e", "o", "i", "u", "y", "ə", "ɚ", "ɤ̃", "ʊ̃"}, "mandarin_hani_beijing": {"a", "e", "o", "i", "u", "y", "ə", "ɚ", "ɤ̃", "ʊ̃"}, "thai": { "a", "aː", "e", "eː", "i", "iː", "o", "oː", "u", "ə", "uː", "ɔ", "ɔː", "ɛ", "ɛː", "ɤ", "ɤː", "ɯ", "ɯː", }, "swedish": { "a", "aʊ", "aː", "e", "eː", "i", "iː", "o", "oː", "u", "uː", "y", "yʷ", "yː", "æ", "æː", "êː", "ø", "øː", "ø̀ː", "œ", "œː", "œ̞", "œ̞ː", "ɑ", "ɑː", "ɒː", "ɒ̀ː", "ɔ", "ə", "ɚ", "ɛ", "ɛɵ", "ɛː", "ɛ̂", "ɛ̄", "ɜ", "ɝ", "ɪ", "ɵ", "ɵː", "ɵ̄", "ɶ", "ɶː", "ʉ", "ʉː", "ʉ̂ː", "ʉ̟ː", "ʊ", "ʊː", "ʏ", "ỳː", "ỵː", }, } VOWEL_PATTERNS = {"swedish": re.compile(r"^[aeiɛøæuoʊêɔɪœɑʉɵɶ̂œ̞ː˧˩ɒyʏʉ̟ː˧˩əː˧˩˥]+$")} LANG_MAPPING = { "bulgarian": { "d̪": "d", "t̪": "t", "ɐ": "a", "æ": "a", "a̟": "a", "e": "ɛ", "ə": "ɤ", "o̝": "ɔ", "o̟": "ɔ", "u̟": "u", "ʉ": "u", "ʊ": "u", "ɤ̞": "ɤ", "ɤ̟": "ɤ", "lʲ": "ʎ", "l": "ɫ", "ɾ": "r", "iː": "i j", "s̪": "s", "n̪": "n", "ɾʲ": "rʲ", "nʲ": "ɲ", "ɡʲ": "ɟ", "kʲ": "c", }, "russian": {}, "czech": { "ɫ": "l", "ɾ": "r", "ɔ": "o", "ɔː": "oː", }, "serbocroatian_croatian": { "ʋ": "v", "ɕ": "ʃ", "ʑ": "ʒ", "ô": "o˦˨", "ôː": "oː˦˨", "ûː": "uː˦˨", "û": "u˦˨", "î": "i˦˨", "îː": "iː˦˨", "êː": "eː˦˨", "ê": "e˦˨", "âː": "aː˦˨", "â": "a˦˨", "r̂": "r̩˦˨", "r̂ː": "r̩ː˦˨", "řː": "r̩ː˨˦", "ř": "r̩˨˦", "ěː": "eː˨˦", "ě": "e˨˦", "ǎ": "a˨˦", "ǎː": "aː˨˦", "ǐː": "iː˨˦", "ǐ": "i˨˦", "ǒ": "o˨˦", "ǒː": "oː˨˦", "ǔː": "uː˨˦", "ǔ": "u˨˦", }, "serbocroatian_serbian": { "ʋ": "v", "ɕ": "ʃ", "ʑ": "ʒ", "ô": "o˦˨", "ôː": "oː˦˨", "ûː": "uː˦˨", "û": "u˦˨", "î": "i˦˨", "îː": "iː˦˨", "êː": "eː˦˨", "ê": "e˦˨", "âː": "aː˦˨", "â": "a˦˨", "r̂": "r̩˦˨", "r̂ː": "r̩ː˦˨", "řː": "r̩ː˨˦", "ř": "r̩˨˦", "ěː": "eː˨˦", "ě": "e˨˦", "ǎ": "a˨˦", "ǎː": "aː˨˦", "ǐː": "iː˨˦", "ǐ": "i˨˦", "ǒ": "o˨˦", "ǒː": "oː˨˦", "ǔː": "uː˨˦", "ǔ": "u˨˦", }, "french": {"r": "ʁ", "œ̃": "ɛ̃"}, "vietnamese_hanoi": { "k̟̚": "k̚", "ŋ̟": "ŋ", "ï": "ɨ", }, "german": { "b̥": "b", "d̥": "d", "ɡ̊": "ɡ", "r": "ʁ", "ŋ̍": "n̩", "ɱ̩": "n̩", "ŋ̩": "n̩", "ʀ": "ʁ", "χ": "x", "ʋ": "v", "ɘ": "ə", "i": "ɪ", "ø": "øː", "o": "ɔ", "u": "ʊ", "œː": "øː", "y": "ʏ", "e": "ɛ", "ɛː": "eː", "ɔː": "oː", "ɑː": "aː", "ɒː": "aː", }, "mandarin_hani": { "b̥": "p", "d̥": "t", "g̊": "k", "ɡ̊": "k", "ɖʐ̥": "ʈʂ", "dz̥": "ts", "dʑ̥": "tɕ", "ä": "a", "æ̃": "a", "ɤ": "o", "ɤ̃": "o", "ʊ̃": "o", "ɪ": "i", "ɻʷ": "ɻ", "ʊ": "u", "ɛ": "e", "ɑ": "a", "ɑ̃": "a", "ɔ": "o", "ɔː": "o", "⁵⁵": "˥", "⁵¹": "˥˩", "³⁵": "˧˥", "²¹⁴": "˨˩˦", }, "mandarin_hani_taiwan": { "b̥": "p", "d̥": "t", "g̊": "k", "ɡ̊": "k", "ɖʐ̥": "ʈʂ", "dz̥": "ts", "dʑ̥": "tɕ", "ä": "a", "æ̃": "a", "ɤ": "o", "ɤ̃": "o", "ʊ̃": "o", "ɪ": "i", "ɻʷ": "ɻ", "ʊ": "u", "ɛ": "e", "ɑ": "a", "ɑ̃": "a", "ɔ": "o", "ɔː": "o", "⁵⁵": "˥", "⁵¹": "˥˩", "³⁵": "˧˥", "²¹⁴": "˨˩˦", }, "mandarin_hani_beijing": { "b̥": "p", "d̥": "t", "g̊": "k", "ɡ̊": "k", "ɖʐ̥": "ʈʂ", "dz̥": "ts", "dʑ̥": "tɕ", "ä": "a", "æ̃": "a", "ɤ": "o", "ɤ̃": "o", "ʊ̃": "o", "ɪ": "i", "ɻʷ": "ɻ", "ʊ": "u", "ɛ": "e", "ɑ": "a", "ɑ̃": "a", "ɔ": "o", "ɔː": "o", "⁵⁵": "˥", "⁵¹": "˥˩", "³⁵": "˧˥", "²¹⁴": "˨˩˦", }, "mandarin_hani_standard": { "b̥": "p", "d̥": "t", "g̊": "k", "ɡ̊": "k", "ɖʐ̥": "ʈʂ", "dz̥": "ts", "dʑ̥": "tɕ", "ä": "a", "æ̃": "a", "ɤ": "o", "ɤ̃": "o", "ʊ̃": "o", "ɪ": "i", "ɻʷ": "ɻ", "ʊ": "u", "ɛ": "e", "ɑ": "a", "ɑ̃": "a", "ɔ": "o", "ɔː": "o", "⁵⁵": "˥", "⁵¹": "˥˩", "³⁵": "˧˥", "²¹⁴": "˨˩˦", }, "polish": { "s̪": "s", "r̥ː": "r", "r̥": "r", "ɫ": "l", "w̃": "n", }, "portuguese_brazil": { "ã": "ɐ̃", "ɫ": "l", "ʁ": "x", "ɹ": "x", "ɻ": "x", "χ": "x", "ɦ": "x", "h": "x", "r": "x", "ɪ": "i", "ʊ": "u", }, "portuguese_portugal": { "ã": "ɐ̃", "ɫ": "l", "r": "ʁ", }, "swedish": { "ɛ̄": "ɛ̂", "ɵ̄": "ɵ̂", "ɘ": "ɵ", "ə": "ɛ", "ʁ": "r", "ɾ": "r", "ɹ": "r", "v": "ʋ", "w": "ʋ", "ɜ": "ɛ", "æː": "ɛː", "ø": "øː", "æ": "ɛ", "ˇl": "l", "yʷ": "y", "œ̞ː": "øː", "œː": "øː", "œ̞": "œ", "ç": "ɕ", "bː": "b", # removing length in consonants "ɖː": "ɖ", "ɖˑ": "ɖ", "ˈt": "tʰ", "ˈk": "kʰ", "ˈp": "pʰ", "dː": "d", "jː": "j", "kː": "kʰ", "lː": "l", "mː": "m", "nː": "n", "fː": "f", "ɧː": "ɧ", "pː": "pʰ", "rː": "r", "sː": "s", "tˑ": "t", "tʰː": "tʰ", "pʰː": "pʰ", "kʰː": "kʰ", "tː": "tʰ", "ŋː": "ŋ", "ɲ": "ɳ", "ɕː": "ɕ", "ɡː": "ɡ", "ʈː": "ʈʰ", "ʈʰː": "ʈʰ", "ʂː": "ʈ", "ỵː": "yː", "ʉ̟̂": "ʉ̂", "ʉ̟ː": "ʉː", "ʉ̂": "ʉ̂ː", "ɒː": "ɑː", "aː": "ɑː", "ɑ": "ɑː", "e": "eː", "o": "oː", "u": "uː", "i": "iː", "y": "yː", "ɒ̀ː": "ɑ̀ː", "ʊː": "ʊ", "ʉ": "ʉː", "ɵː": "uː", "ɶː": "øː", }, "tamil": { "l̪": "l", "l̪ː": "lː", "r̥": "r", "ɾ̪": "ɾ", "h": "ɦ", "tʃ": "tɕ", "ɕ": "tɕ", "tʃː": "tɕː", }, "thai": { "cʰ": "tɕʰ", "c": "tɕ", "ɔ̌": "ɔ˩˩˦", "ǎː": "aː˩˩˦", "áː": "aː˦˥", "à": "a˨˩", "ì": "i˨˩", }, "ukrainian": { "ɫ": "l", "ʍ": "ʋ", "w": "ʋ", "v": "ʋ", #'e': 'ɛ', #'o': 'ɔ', "ɫː": "lː", }, "japanese": { "o̞": "o", "n̩": "n", "ä": "a", "ɡ̊": "ɡ", "ḁ": "a", "ẽ": "e", "m̩ː": "mː", "e̥": "e", "u͍": "ɯ", "ɯ̃ᵝ": "ɯ", "u͍ː": "ɯː", "w͍": "w", "y": "j", "r": "ɾ", "ɽ": "ɾ", "ɾ̥": "ɾ", "ɯᵝ": "ɯ", "ɯᵝː": "ɯː", "ɯ̟̃ᵝː": "ɯː", "ɯ̥ᵝ": "ɯ̥", "ʲkʲ": "kʲ", "nʲ": "ɲ", "tɕʲ": "tɕ", "ɕʲ": "ɕ", "ĩː": "iː", "õ̞ː": "oː", "i̥̥": "i̥", "e̞̊": "e", "ẽ̞ː": "eː", "ã̠ː": "aː", "õ̞": "o", "d̥": "d", "b̥": "b", "o̞ː": "oː", "e̞ː": "eː", "e̞": "e", "ẽ̞": "e", "ĩ": "i", "ɸ̥": "ɸ", "ɨ̃ᵝː": "ɨː", "ĩ̥": "i", "a̠ː": "aː", "a̠": "a", "o̞̊": "o", "dʑʲ": "dʑ", "ɾ̠": "ɾ", "ã̠": "a", "õ̥": "o", "dʲ": "dʑ", "tʲ": "tɕ", # 'ɯ̟ᵝ': 'ɯ', "ɰᵝ": "w", "ɰᵝː": "wː", # 'ɯ̟̊ᵝ': 'ɨ̥', # 'ɯ̟ᵝː': 'ɨː', # 'ɯ̟̃ᵝ': 'ɨ', # 'ɨ̥ᵝ': 'ɨ̥', # 'ɨᵝ': 'ɨ', # 'ɨ̃ᵝ': 'ɨ', # 'ɨᵝː': 'ɨː', "ɯ̟̊": "ɯ̥", "ɲ̟": "ɲ", "ŋʲ": "ɲ", "p̚ʲ": "p̚", "k̚ʲ": "k̚", "t̚ʲ": "t̚", }, "turkish": { "ɑ": "a", "ɑː": "a", "aː": "a", "iː": "i", "uː": "u", "ɛ": "e", "e̞": "e", "ɔ": "o", "ʊ": "u", "ʏ": "y", "β": "v", "o̞": "o", "ɪ": "i", "ø": "œ", "ɾ̝̊": "ɾ", }, "korean_hangul": { "a̠": "a", "e̞": "e", "e̞ː": "eː", "a̠ː": "a", "o̞": "o", "o̞ː": "oː", "ʌ̹": "ʌ", "ɘː": "ʌː", "ɦ": "h", "ɸʷ": "ɸ", "ʃʰ": "sʰ", }, "english_uk": { "ɝː": "ɜː", "əː": "ɜː", "æː": "æ", "ɝ": "ɜ", "ɚ": "ə", "ɫ": "l", "r": "ɹ", "ʍ": "w", }, "english_us": { "ɫ": "l", "r": "ɹ", "ʍ": "w", "æː": "æ", }, "spanish_spain": { "ɣ̞": "ɣ", "β̞": "β", "ð̞": "ð", "θ̬": "θ", "w̝": "w", "nʲ": "ɲ", "n̟": "n", "lʲ": "ʎ", "l̟": "l", "i̯": "j", "u̯": "w", "h": "x", "n̪": "n", "d": "d̪", }, "spanish_latin_america": { "ɣ̞": "ɣ", "β̞": "β", "ð̞": "ð", "w̝": "w", "nʲ": "ɲ", "lʲ": "ʎ", "i̯": "j", "u̯": "w", "n̪": "n", "l̪": "l", "l̟": "l", "h": "x", "n̟": "n", "d": "d̪", }, } GLOBAL_REMAPPING = { "õ": "õ", # Fix glyphs to use diacritics "ẽ": "ẽ", "ũ": "ũ", "ĩ": "ĩ", "ã": "ã", } def read_source(lang): graphemes = set() phones = set() dictionary = [] path = os.path.join(WIKIPRON_DIR, LANG_PATHS[lang]) with open(path, "r", encoding="utf8") as f: for line in f: line = line.strip() if not line: continue if "\t" in line: line = line.split("\t") word = line[0] pronunciation = line[1].split() else: line = line.split() word = line[0] pronunciation = line[1:] word = word.lower() if lang in BAD_GRAPHEMES: if any(x in BAD_GRAPHEMES[lang] for x in word): print(word) continue graphemes.update(word) phones.update(pronunciation) dictionary.append((word, pronunciation)) return dictionary, graphemes, phones def save_dictionary(dictionary, lang): deduplication = set() final_phones = collections.Counter() path = os.path.join(OUTPUT_DIR, f"{lang}_mfa.dict") with open(path, "w", encoding="utf8") as f: for w, p in sorted(dictionary): final_phones.update(p) p = " ".join(p) if (w, p) in deduplication: continue f.write("{}\t{}\n".format(w, p)) deduplication.add((w, p)) print("Final phones:", sorted(final_phones)) print("Final phone counts:", sorted(final_phones.items(), key=lambda x: -x[1])) def convert_language_specific(word, phones, lang): new_pron = [] if lang == "swedish": for i, p in enumerate(phones): for k, v in LANG_MAPPING[lang].items(): if p == k: phones[i] = v break for i, p in enumerate(phones): if p == "¹": found_first = False found_second = False for j in range(i + 1, len(phones)): if VOWEL_PATTERNS[lang].match(phones[j]): if not found_first: phones[j] += "˥˧" # Falling tone found_first = True elif not found_second: phones[j] += "˩" # Low tone found_second = True else: break continue elif p == "²": found_first = False found_second = False for j in range(i + 1, len(phones)): if phones[j] in VOWELS[lang]: if not found_first: phones[j] += "˧˩" # Falling tone found_first = True elif not found_second: phones[j] += "˥˩" # Falling tone found_second = True else: break continue new_pron.append(p) phones = new_pron new_pron = [] for i, p in enumerate(phones): if lang == "english_us": if lang in LANG_MAPPING: for k, v in LANG_MAPPING[lang].items(): if p == k: p = v break if p == "ʒ" and len(new_pron) and new_pron[-1] == "d": # fix up affricates being split new_pron[-1] = "dʒ" continue elif ( p == "ʃ" and len(new_pron) and new_pron[-1] == "t" ): # fix up affricates being split new_pron[-1] = "tʃ" continue elif p in ["ɪ", "j"] and len(new_pron) and new_pron[-1] in {"e", "ɔ", "o"}: new_pron[-1] += "ɪ" continue elif p in {"ʊ", "u"} and len(new_pron) and new_pron[-1] in {"a", "ɑ", "ʌ"}: new_pron[-1] = "aʊ" continue elif p in ["ɪ", "j"] and len(new_pron) and new_pron[-1] in {"a", "ɑ", "ʌ"}: new_pron[-1] = "aɪ" continue elif p in {"ʊ", "u"} and len(new_pron) and new_pron[-1] in {"ə", "o", "ɔ"}: new_pron[-1] = "oʊ" continue elif p in {"ɹ", "ɚ"} and len(new_pron) and new_pron[-1] in {"o", "ɔ"}: new_pron[-1] = "ɔ" p = "ɹ" elif p in {"ɹ", "ɚ"} and len(new_pron) and new_pron[-1] in {"i", "ɪː", "ɪ"}: new_pron[-1] = "ɪ" p = "ɹ" elif p in {"ɹ", "ɚ"} and len(new_pron) and new_pron[-1] in {"u", "ʊ"}: new_pron[-1] = "ʊ" p = "ɹ" elif p in {"ɹ", "ɚ"} and len(new_pron) and new_pron[-1] in {"e", "ɛ", "ɛː", "æ", "æː"}: new_pron[-1] = "ɛ" p = "ɹ" elif p == "ɹ" and len(new_pron) and new_pron[-1] in ["ɜ", "ɜː"]: new_pron[-1] = "ɝ" continue elif ( p == "ɹ" and len(new_pron) > 1 and new_pron[-1] == "ə" and new_pron[-2] in {"ɪ", "i", "ɪː"} ): new_pron[-1] = "ɹ" new_pron[-2] = "ɪ" continue elif ( p == "ɹ" and len(new_pron) > 1 and new_pron[-1] == "ə" and new_pron[-2] in {"ʊ", "u"} ): new_pron[-1] = "ɹ" new_pron[-2] = "ʊ" continue elif ( p == "ɹ" and len(new_pron) > 1 and new_pron[-1] == "ə" and new_pron[-2] in {"e", "ɛ", "ɛː"} ): new_pron[-1] = "ɹ" new_pron[-2] = "ɛ" continue elif p == "w" and len(new_pron) and new_pron[-1] == "h": # get rid of h w sequences new_pron[-1] = "w" continue elif p in {"k", "ɡ"} and len(new_pron) and new_pron[-1] == "n": new_pron[-1] = "ŋ" continue elif p in {"ɜ", "ɜː"} and (i == len(phones) - 1 or phones[i + 1] != "ɹ"): p = "ɝ" elif p == "ɪ" and i == len(phones) - 1: p = "i" elif ( p == "l" and len(new_pron) and new_pron[-1] == "ə" and i == len(phones) - 1 ): # final syllabic l's new_pron[-1] = "l̩" continue elif ( p == "m" and len(new_pron) and new_pron[-1] == "ə" and i == len(phones) - 1 ): # final syllabic m's new_pron[-1] = "m̩" continue elif ( p == "n" and len(new_pron) and new_pron[-1] == "ə" and i == len(phones) - 1 ): # final syllabic n's new_pron[-1] = "n̩" continue elif ( p == "ɹ" and len(new_pron) and new_pron[-1] == "ə" and i == len(phones) - 1 ): # final syllabic r's new_pron[-1] = "ɚ" continue elif p == "ŋ" and i == len(phones) - 1 and new_pron[-1] == "i": new_pron[-1] = "ɪ" continue elif lang == "english_uk": if lang in LANG_MAPPING: for k, v in LANG_MAPPING[lang].items(): if p == k: p = v break if p == "ɹ" and i == len(phones) - 1: continue elif p == "ɪ" and i == len(phones) - 1: p = "i" elif ( p in {"l", "m", "n"} and i == len(phones) - 1 and len(new_pron) and new_pron[-1] in {"ə", "əː"} ): new_pron[-1] = p + "̩" continue elif p == "ɪ" and len(new_pron) and new_pron[-1] in {"e", "a", "ɔ", "o"}: new_pron[-1] = new_pron[-1] + p continue elif p == "ʊ" and len(new_pron) and new_pron[-1] in {"e", "a"}: new_pron[-1] = new_pron[-1] + p continue elif p in {"ʊ", "u"} and len(new_pron) and new_pron[-1] in {"ə", "o", "ɔ"}: new_pron[-1] = "oʊ" continue elif ( p == "ʒ" and len(new_pron) and new_pron[-1] == "d" ): # fix up affricates being split new_pron[-1] = "dʒ" continue elif ( p == "ʃ" and len(new_pron) and new_pron[-1] == "t" ): # fix up affricates being split new_pron[-1] = "tʃ" continue elif p == "w" and len(new_pron) and new_pron[-1] == "h": # get rid of h w sequences new_pron[-1] = "w" continue elif p in {"k", "ɡ"} and len(new_pron) and new_pron[-1] == "n": new_pron[-1] = "ŋ" continue elif p == "ə" and len(new_pron) and new_pron[-1] == "ɛ": new_pron[-1] = "ɛː" continue elif p == "ŋ" and i == len(phones) - 1 and new_pron[-1] == "i": new_pron[-1] = "ɪ" continue elif ( p == "ɹ" and len(new_pron) > 2 and new_pron[-1] == "ə" and new_pron[-2] in {"e", "ɛ", "ʊ", "ɪ", "ɪː", "ɛː"} ): new_pron[-1] = p continue elif lang == "bulgarian": if p in {"s", "ʃ", "sʲ"} and len(new_pron) and new_pron[-1] == "t": new_pron[-1] += p continue elif p == "ʒ" and len(new_pron) and new_pron[-1] == "d": new_pron[-1] = "dʒ" continue elif p in {"ɡ", "k"} and len(new_pron) and new_pron[-1] in {"n"}: new_pron[-1] = "ŋ" elif p in {"v", "f"} and len(new_pron) and new_pron[-1] in {"n"}: new_pron[-1] = "ɱ" elif lang == "czech": if p in {"u", "ʊ"} and len(new_pron) and new_pron[-1] in {"o", "ɔ"}: new_pron[-1] = "ow" continue elif p in ["u", "ʊ"] and len(new_pron) and new_pron[-1] in {"a"}: new_pron[-1] = "aw" continue elif p in {"u", "ʊ"} and len(new_pron) and new_pron[-1] in {"e", "ɛ"}: new_pron[-1] = "ew" continue elif p in {"ʃ", "s"} and len(new_pron) and new_pron[-1] in {"t"}: new_pron[-1] += p continue elif p in {"ʒ"} and len(new_pron) and new_pron[-1] in {"d"}: new_pron[-1] += p continue elif p == "ʊ": p = "u" elif p == "e": p = "ɛ" elif lang.startswith("serbocroatian"): if p in {"ɕ", "ʂ", "ʃ"} and len(new_pron) and new_pron[-1] == "t": new_pron[-1] += p continue elif p in {"ʑ", "ʐ", "ʒ"} and len(new_pron) and new_pron[-1] == "d": new_pron[-1] += p continue elif lang == "german": if lang in LANG_MAPPING: for k, v in LANG_MAPPING[lang].items(): if p == k: p = v break if p in {"ʏ", "ɪ"} and len(new_pron) and new_pron[-1] in {"o", "ɔ"}: new_pron[-1] = "ɔʏ" continue elif p == "ɪ" and len(new_pron) and new_pron[-1] == "a": new_pron[-1] = "aɪ" continue elif p == "ɪ" and len(new_pron) and new_pron[-1] == "ʊ": new_pron[-1] = "ʊɪ" continue elif p == "ʊ" and len(new_pron) and new_pron[-1] == "a": new_pron[-1] = "aʊ" continue elif p == "e" and len(new_pron) and new_pron[-1] == "ɐ": new_pron[-1] = "ɐ" continue elif p == "ʔ": continue elif p in {"tʰ", "kʰ", "pʰ"} and i == len(phones) - 1: p = p[0] elif ( p in {"tʰ", "kʰ", "pʰ"} and len(new_pron) and new_pron[-1] in {"s", "ts", "ʃ", "tʃ"} ): p = p[0] elif p in {"t", "k", "p"} and i == 0: p += "ʰ" elif p in {"s", "ʃ"} and i == 1 and new_pron[-1] in {"tʰ"}: new_pron[-1] = "t" + p continue elif ( p in {"v", "s", "x", "ʁ", "l", "j"} and len(new_pron) and new_pron[-1] in {"tʰ", "kʰ", "pʰ"} ): new_pron[-1] = new_pron[-1][0] elif p == "s" and len(new_pron) and new_pron[-1] == "t": if "z" in word or "c" in word: new_pron[-1] = "ts" continue elif p == "õ": new_pron.append("ɔ") new_pron.append("n") continue elif p == "ɛ̃": new_pron.append("eː") new_pron.append("n") continue elif lang.startswith("mandarin_hani"): vowel_pattern = re.compile(r"^[ayeiouəɚʊɤ̃]+[²³⁰¹⁴⁵]*$") for k, v in LANG_MAPPING[lang].items(): if p == k: p = v break if p in {"²", "³", "¹", "⁰", "⁴", "⁵", "⁻", "⁽", "⁾"} and len(new_pron): index = -1 for j in range(len(new_pron) - 1, -1, -1): if vowel_pattern.match(new_pron[j]) or "̩" in new_pron[j]: index = j break if new_pron[index].endswith("²¹⁴"): continue new_pron[index] += p continue elif p.startswith("ˀ"): new_pron.append("ʔ") if p[1] in LANG_MAPPING[lang]: new_pron.append(LANG_MAPPING[lang][p[1]]) else: new_pron.append(p[1]) continue elif ( any(p.startswith(x) for x in VOWELS[lang]) and len(new_pron) and re.match(r"^[ayeiouəɚʊɤ̃]+$", new_pron[-1]) ): new_pron[-1] += p continue elif lang == "portuguese_brazil": for k, v in LANG_MAPPING[lang].items(): if p == k: p = v break if p == "w̃" and len(new_pron) and new_pron[-1] in {"ɐ̃", "õ"}: new_pron[-1] += p continue elif p == "j̃" and len(new_pron) and new_pron[-1] in {"ɐ̃", "õ", "ẽ", "ũ"}: new_pron[-1] += p continue elif ( p == "j" and len(new_pron) and new_pron[-1] in {"a", "ɛ", "e", "ɐ", "ɔ", "o", "u"} ): new_pron[-1] += p continue elif p == "w" and len(new_pron) and new_pron[-1] in {"a", "ɛ", "e", "ɐ", "i"}: new_pron[-1] += p continue elif lang == "portuguese_portugal": for k, v in LANG_MAPPING[lang].items(): if p == k: p = v break if p == "w̃" and len(new_pron) and new_pron[-1] in {"ɐ̃", "õ"}: new_pron[-1] += p continue elif p == "j̃" and len(new_pron) and new_pron[-1] in {"ɐ̃", "õ", "ẽ", "ũ"}: new_pron[-1] += p continue elif ( p == "j" and len(new_pron) and new_pron[-1] in {"a", "ɛ", "e", "ɐ", "ɔ", "o", "u"} ): new_pron[-1] += p continue elif p == "w" and len(new_pron) and new_pron[-1] in {"a", "ɛ", "e", "ɐ", "i"}: new_pron[-1] += p continue elif lang == "swedish": if p == "ʒ" and len(new_pron) and new_pron[-1] == "d": new_pron[-1] += p continue elif ( p in {"k", "kʰ", "ɡ"} and len(new_pron) and new_pron[-1] in {"m", "n", "ɳ", "ɲ", "ŋ"} ): new_pron[-1] = "ŋ" elif ( p in {"t", "tʰ", "d"} and len(new_pron) and new_pron[-1] in {"m", "n", "ɳ", "ɲ", "ŋ"} ): new_pron[-1] = "n" elif ( p in {"p", "pʰ", "b"} and len(new_pron) and new_pron[-1] in {"m", "n", "ɳ", "ɲ", "ŋ"} ): new_pron[-1] = "m" elif ( p in {"ʈ", "ʈʰ", "ɖ", "ʂ"} and len(new_pron) and new_pron[-1] in {"m", "n", "ɳ", "ɲ", "ŋ"} ): new_pron[-1] = "ɳ" elif p == "s" and len(new_pron) and new_pron[-1] == "r" and "rr" not in word: new_pron[-1] = "ʂ" continue elif p in {"t", "ʈ"} and len(new_pron) and new_pron[-1] == "r" and "rr" not in word: new_pron[-1] = "ʈ" continue elif p in {"d", "ɖ"} and len(new_pron) and new_pron[-1] == "r" and "rr" not in word: new_pron[-1] = "ɖ" continue elif ( p in {"n", "ɳ"} == "n" and len(new_pron) and new_pron[-1] == "r" and "rr" not in word ): new_pron[-1] = "ɳ" continue elif p in {"l", "ɭ"} and len(new_pron) and new_pron[-1] == "r" and "rr" not in word: new_pron[-1] = "ɭ" continue elif p in {"tʰ", "ʈʰ"} and len(new_pron) and new_pron[-1] == "r" and "rr" not in word: new_pron[-1] = "ʈʰ" continue elif p in {"s", "ʂ"} and len(new_pron) and new_pron[-1] == "r" and "rr" not in word: new_pron[-1] = "ʂ" continue elif p == "aʊ": new_pron.append("a") new_pron.append("ʊ") continue elif p in {"r", "n", "l", "t", "k", "ɡ"} and len(new_pron) and new_pron[-1] == "ə": new_pron[-1] = "ɛ" elif p in {"t", "k", "p", "ʈ"} and not len(new_pron): p += "ʰ" elif ( not VOWEL_PATTERNS[lang].match(p) and len(new_pron) and new_pron[-1] in {"tʰ", "kʰ", "pʰ", "ʈʰ"} ): print(new_pron[-1], p) new_pron[-1] = new_pron[-1][0] elif ( p in {"tʰ", "kʰ", "pʰ", "ʈʰ"} and len(new_pron) and (new_pron[-1] in {"ʂ", "s"} or i == len(phones) - 1) ): p = p[0] elif p == "ə" and i == len(phones) - 1: p = "e" elif p in {"r"} and len(new_pron) and new_pron[-1] == "ɜ": new_pron[-1] = "æː" elif p == "ɜ" and i == len(phones) - 1: p = "e" elif lang == "tamil": if p in {"ʊ", "ɪ"} and len(new_pron) and new_pron[-1] == "a": new_pron[-1] += p continue elif lang in ["spanish_spain", "spanish_latin_america"]: if p in {"n", "m", "ɲ"} and len(new_pron) and new_pron[-1] in {"n", "m", "ɲ"}: new_pron[-1] = p continue if p in {"s", "z"} and len(new_pron) and new_pron[-1] in {"s", "z"}: new_pron[-1] = p continue if p in {"x", "k", "ɡ"} and len(new_pron) and new_pron[-1] == "n": new_pron[-1] = "ŋ" elif ( p in {"ɟʝ", "ʝ", "j", "tʃ", "i", "ĩ"} and len(new_pron) and new_pron[-1] in {"n"} ): new_pron[-1] = "ɲ" elif p in {"j", "i", "ĩ", "e", "ẽ"} and len(new_pron) and new_pron[-1] in {"k"}: new_pron[-1] = "c" elif p in {"j", "i", "ĩ", "e", "ẽ"} and len(new_pron) and new_pron[-1] in {"x"}: new_pron[-1] = "ç" elif p in {"j", "i", "ĩ", "e", "ẽ"} and len(new_pron) and new_pron[-1] in {"ɡ"}: new_pron[-1] = "ɟ" elif p in {"j", "i", "ĩ", "e", "ẽ"} and len(new_pron) and new_pron[-1] in {"ɣ"}: new_pron[-1] = "ʝ" elif ( p in {"ɟʝ", "ʝ", "j", "tʃ", "i", "ĩ"} and len(new_pron) and new_pron[-1] in {"l"} ): new_pron[-1] = "ʎ" elif ( p in { "β", "b", "p", } and len(new_pron) and new_pron[-1] == "n" ): new_pron[-1] = "m" elif ( p in { "f", "v", } and len(new_pron) and new_pron[-1] in {"n", "m", "n̪"} ): new_pron[-1] = "ɱ" elif lang == "thai": if p in {"a"} and len(new_pron) and new_pron[-1] in {"i", "iː", "ɯ", "ɯː", "u", "uː"}: new_pron[-1] += p continue elif lang == "turkish": if p == "ʃ" and len(new_pron) and new_pron[-1] in {"t"}: new_pron[-1] += p continue elif p == "ʒ" and len(new_pron) and new_pron[-1] in {"d"}: new_pron[-1] += p continue elif p in {"e", "i", "œ", "y"} and len(new_pron) and new_pron[-1] in {"k"}: new_pron[-1] = "c" elif p in {"e", "i", "œ", "y"} and len(new_pron) and new_pron[-1] in {"ɡ"}: new_pron[-1] = "ɟ" elif p in {"a", "ɯ", "o", "u"} and len(new_pron) and new_pron[-1] in {"l"}: new_pron[-1] = "ɫ" elif p in {"i", "e", "œ", "y"} and len(new_pron) and new_pron[-1] in {"ɫ"}: new_pron[-1] = "l" elif lang == "portuguese_brazil": if p == "ʃ" and len(new_pron) and new_pron[-1] in {"t"}: new_pron[-1] += p continue elif p == "ʒ" and len(new_pron) and new_pron[-1] in {"d"}: new_pron[-1] += p continue elif lang == "russian": voiced_set = { "v", "bʲ", "b", "bː", "d", "dz", "dzʲ", "dʐ", "dʲ", "dʲː", "dː", "v", "vʲ", "vʲː", "vː", "z", "zʲ", "zʲː", "zː", "ɡ", "ɡʲ", "ɡː", "ɣ", "ʐ", "ʐː", "ʑː", } if p in {"ʔ"}: continue elif lang == "japanese": for k, v in LANG_MAPPING[lang].items(): if p == k: p = v break if p in {".", "˕", "}", "˦˨˦", "˥", "˨˩", "˧", "꜔", "˩", "ʔ", "%", "˩˥"}: continue elif p in { "̥", "̥̥", } and len(new_pron): new_pron[-1] += "̥" continue elif p in {"ᵝ̥"} and len(new_pron): if "̥" not in new_pron[-1] and "ː" not in new_pron[-1]: new_pron[-1] += "̥" continue elif p in {"ː̥"} and len(new_pron): new_pron[-1] += "ː" continue elif ( p in {"j"} and len(new_pron) and new_pron[-1] in {"ɾ", "p", "m", "b", "k", "t", "d", "ç", "ɡ"} ): new_pron[-1] += "ʲ" continue elif p in {"h"} and len(new_pron) and new_pron[-1] in {"c"}: new_pron[-1] = "tɕ" continue elif p in {"p", "pʲ"} and len(new_pron) and new_pron[-1] in {"p̚"}: new_pron[-1] = p + "ː" continue elif p in {"b"} and len(new_pron) and new_pron[-1] in {"b̚"}: new_pron[-1] = p + "ː" continue elif p in {"ɾ", "ɾʲ"} and len(new_pron) and new_pron[-1] in {"ɾ̚"}: new_pron[-1] = p + "ː" continue elif p in {"k", "kʲ"} and len(new_pron) and new_pron[-1] in {"k̚"}: new_pron[-1] = p + "ː" continue elif p in {"t", "tʲ"} and len(new_pron) and new_pron[-1] in {"ʔ̥", "t̚"}: new_pron[-1] = p + "ː" continue elif p in {"tɕ", "ts"} and len(new_pron) and new_pron[-1] in {"t̚"}: new_pron[-1] = p + "ː" continue elif p in {"ɡ"} and len(new_pron) and new_pron[-1] in {"ɡ̚"}: new_pron[-1] = p + "ː" continue elif p in {"d", "dz", "ʑ", "dʑ"} and len(new_pron) and new_pron[-1] in {"d̚"}: new_pron[-1] = p + "ː" continue elif p in {"i", "iː", "i̥"} and len(new_pron) and "ʲ" in new_pron[-1]: if len(new_pron) > 2 and "ʲ" in new_pron[-2]: new_pron[-2] = new_pron[-2].replace("ʲ", "") new_pron[-1] = new_pron[-1].replace("ʲ", "") elif p in {"i"} and len(new_pron) and new_pron[-1] == "n": new_pron[-1] = "ɲ" elif False and p in {"k", "ɡ"} and len(new_pron) and new_pron[-1] == "ɲ": new_pron[-1] = "ŋ" elif False and p in {"t", "d"} and len(new_pron) and new_pron[-1] == "ɲ": new_pron[-1] = "n" elif p in {"dz", "dʑ"} and len(new_pron) and new_pron[-1] not in {"ɲ", "n"}: p = p[1] elif p in { "ɯ̟̃ᵝ", "ɯ̟̊ᵝ", "ɯ̟ᵝː", "ɯ̟ᵝ", "ɨ̥ᵝ", "ɨᵝ", "ɨ̃ᵝ", "ɨᵝː", "ɨ̥", "ɨ̥ː", "ɯ̥ː", "ɯ̥", }: if len(new_pron) and new_pron[-1] in { "t", "tː", "s", "sː", "z", "zː", "ɲː", "ɲ", "ç", "çː", "n", "nː", "ts", "tsː", "ɕ", "tɕ", "tɕː", "ʑ", "ɕː", "ʑː", "ɡʲ", "ɡʲː", "kʲ", "kʲː", "bʲ", "bʲː", "pʲ", "pʲː", "mʲ", "mʲː", "ɾʲː", "ɾʲ", "j", }: new_p = "ɨ" else: new_p = "ɯ" if "̥" in p or "̊" in p: new_p += "̥" if "ː" in p: new_p += "ː" p = new_p if len(new_pron) and new_pron[-1] == "n": new_pron[-1] = "ɲ" elif lang == "korean_hangul": for k, v in LANG_MAPPING[lang].items(): if p == k: p = v break # if p in {'e', 'ɛː', 'ɛ', 'a', 'o', 'u', 'ʌ'} and len(new_pron) and new_pron[-1] in {'j'}: # new_pron[-1] += p # continue # elif p in {'i'} and len(new_pron) and new_pron[-1] in {'w', 'ɥ'}: # new_pron[-1] = 'ɥi' # continue # elif p in {'e', 'ɛː', 'ɛ', 'a', 'o', 'i', 'ʌ'} and len(new_pron) and new_pron[-1] in {'w'}: # new_pron[-1] += p # continue # elif p in {'i'} and len(new_pron) and new_pron[-1] in {'ɰ'}: # new_pron[-1] += p # continue if p == "t͈" and "ᄄ" not in jamo.h2j(word): if len(new_pron) and "̚" in new_pron[-1]: p = "tʰ" else: p = "t" elif p == "tɕ͈" and "ᄍ" not in jamo.h2j(word): if len(new_pron) and "̚" in new_pron[-1]: p = "tɕʰ" else: p = "tɕ" elif p == "k͈" and "ᄁ" not in jamo.h2j(word): if len(new_pron) and "̚" in new_pron[-1]: p = "kʰ" else: p = "k" elif p == "p͈" and "ᄈ" not in jamo.h2j(word): if len(new_pron) and "̚" in new_pron[-1]: p = "pʰ" else: p = "p" elif p == "s͈" and "ᄊ" not in jamo.h2j(word): if len(new_pron) and "̚" in new_pron[-1]: p = "sʰ" else: p = "s" elif p == "x" and len(new_pron) and new_pron[-1] == "k": new_pron[-1] += "ʰ" continue elif lang in ["vietnamese_hanoi", "vietnamese_hue", "vietnamese_hochiminhcity"]: vowel_pattern = re.compile(rf'^[{"".join(VOWELS[lang])}]+$') tone_pattern = re.compile(r"^[˥˩˦˥˨˧˨˩˩˩ˀ˦]+$") if p in {"j", "w"} and len(new_pron) and vowel_pattern.match(new_pron[-1]): new_pron[-1] += p continue elif vowel_pattern.match(p) and len(new_pron) and vowel_pattern.match(new_pron[-1]): new_pron[-1] += p continue elif p in {"ɗ", "ɓ"} and len(new_pron) and new_pron[-1] == "ʔ": new_pron[-1] = p continue elif ( p == "ʔ" and len(new_pron) and tone_pattern.match(new_pron[-1]) and not (i < len(phones) - 1 and phones[i + 1] in {"ɗ", "ɓ"}) ): new_pron[-1] += "ˀ" continue if lang in LANG_MAPPING: for k, v in LANG_MAPPING[lang].items(): if p == k: p = v break if not p: continue new_pron.append(p) tone_mapping = { "⁰": "", "¹": "˩", "²": "˨", "³": "˧", "⁴": "˦", "⁵": "˥", "˧": "˧", "˨˩": "˨˩", "˥˩": "˥˩", "˦˥": "˦˥", "˩˩˦": "˩˩˦", } if lang == "thai": phones = new_pron new_pron = [] tone_symbols = {"˥˩", "˦˥", "˧", "˨˩", "˩˩˦"} vowel_set = {x for x in VOWELS[lang]} vowel_set |= {x + y for x, y in itertools.product(VOWELS[lang], VOWELS[lang])} vowel_set |= { x + y + z for x, y, z in itertools.product(VOWELS[lang], VOWELS[lang], VOWELS[lang]) } for i, p in enumerate(phones): if p in tone_symbols: for j in range(len(new_pron) - 1, 0, -1): if new_pron[j] in vowel_set and new_pron[j] not in {"w", "j"}: new_pron[j] += tone_mapping[p] break else: new_pron.append(p) # split off tone for G2P # for i, p in enumerate(new_pron): # for tone in tone_mapping: # if p.endswith(tone): # new_pron[i] = p.replace(tone, ' ') + tone elif lang in ["vietnamese_hanoi", "vietnamese_hue", "vietnamese_hochiminhcity"]: phones = new_pron new_pron = [] vowel_pattern = re.compile(rf'^[{"".join(VOWELS[lang])}]+[wj]?$') tone_symbols = { "˦ˀ˥", "˧˦", "˧˧", "˧˨", "˧˩", "˨˩", "˦˧˥", "˦˩", "˧˧", "˧˨", "˨˩", "˨˩˦", "˦˥", "˨˩˨", } tone_pattern = re.compile(r"^[˥˩˦˥˨˧˨˩˩˩ˀ˦]+$") for i, p in enumerate(phones): if tone_pattern.match(p): for j in range(len(new_pron) - 1, 0, -1): if vowel_pattern.match(new_pron[j]): new_pron[j] += p break else: new_pron.append(p) elif lang.startswith("mandarin_hani"): mapping = { "²¹⁴": "˨˩˦", "⁵⁵": "˥˥", "³⁵": "˧˥", "⁵¹": "˥˩", "⁰": "", "¹": "˩", "²": "˨", "³": "˨", "⁴": "˦", "⁵": "˥", } tone_symbols = {"²", "³", "¹", "⁴", "⁵", "⁰"} for i, p in enumerate(new_pron): if any(x in p for x in tone_symbols): for k, v in mapping.items(): if k in new_pron[i]: new_pron[i] = new_pron[i].replace(k, v) # if any(x in new_pron[i] for x in tone_symbols): # return None elif lang == "swedish": for i, p in enumerate(new_pron): if p == "êː": new_pron[i] = "eː˧˩" elif p == "â": new_pron[i] = "a˧˩" elif p == "ɛ̂": new_pron[i] = "ɛ˧˩" elif p == "ɑ̂ː": new_pron[i] = "ɑː˧˩" elif p == "ûː": new_pron[i] = "uː˧˩" elif p == "ʉ̂ː": new_pron[i] = "ʉː˧˩" elif p == "ɵ̂": new_pron[i] = "ɵ˧˩" elif p == "ʉ̂ː": new_pron[i] = "ʉː˧˩" elif p == "ʉ̟ː˥˩": new_pron[i] = "ʉː˥˩" elif p == "ǎ": new_pron[i] = "a˥˧" elif p == "ʉ̟ː˧˩": new_pron[i] = "ʉː˧˩" elif p == "ø̀ː": new_pron[i] = "øː˩" elif p == "ɑ̀ː": new_pron[i] = "ɑː˩" elif p == "ỳː": new_pron[i] = "yː˩" elif p == "ỳː˧˩": new_pron[i] = "yː˧˩" elif lang == "hausa": phone_mapping = { "á": "a", "áː": "aː", "é": "e", "éː": "eː", "í": "i", "íː": "iː", "ó": "o", "óː": "oː", "úː": "uː", "à": "a", "àː": "aː", "è": "e", "èː": "eː", "ì": "i", "ìː": "iː", "ò": "o", "òː": "oː", "ùː": "uː", "â": "a", "âː": "aː", "ê": "e", "êː": "eː", "î": "i", "îː": "iː", "ô": "o", "ôː": "oː", "ûː": "uː", } for i, p in enumerate(new_pron): if p in {"á", "áː", "é", "éː", "í", "íː", "ó", "óː", "úː"} or "́" in p: # High tone if p in phone_mapping: new_pron[i] = phone_mapping[p] else: new_pron[i] = p.replace("́", "") new_pron[i] += "˥" elif p in {"à", "àː", "è", "èː", "ì", "ìː", "ò", "òː", "ùː"} or "̀" in p: # Low tone if p in phone_mapping: new_pron[i] = phone_mapping[p] else: new_pron[i] = p.replace("̀", "") new_pron[i] += "˩" elif ( p in { "â", "âː", "ê", "êː", "î", "îː", "ôː", "ûː", } or "̂" in p ): # Falling tone if p in phone_mapping: new_pron[i] = phone_mapping[p] else: new_pron[i] = p.replace("̂", "") new_pron[i] += "˥˦" return new_pron def convert_second_round(word, phones, lang): if lang not in ["english_us", "english_uk"]: return phones new_pron = [] stressed_vowels = {} if lang == "english_uk": stressed_vowels = { "aɪ", "aʊ", "eɪ", "i", "iː", "oɪ", "oʊ", "u", "uː", "æ", "ɑ", "ɑː", "ɒ", "ɔ", "ɔɪ", "ɔː", "ɛ", "ɛː", "ɜ", "ɜː", "ʊ", "ʌ", } elif lang == "english_us": stressed_vowels = { "aɪ", "aʊ", "eɪ", "i", "iː", "oɪ", "oʊ", "u", "uː", "æ", "ɑ", "ɑː", "ɔ", "ɔɪ", "ɔː", "ɛ", "ɝ", "ɝː", "ʊ", "ʌ", } all_syllabics = {"ɪ", "ə", "ɚ", "n̩", "m̩", "l̩", "ɫ̩"} | VOWELS[lang] for i, p in enumerate(phones): if lang in ["english_us", "english_uk"]: if ( p == "l" and 2 < i < len(phones) - 1 and new_pron[-1] == "ə" and phones[i + 1] not in all_syllabics ): new_pron[-1] = "ɫ̩" continue elif ( p == "m" and 2 < i < len(phones) - 1 and new_pron[-1] == "ə" and phones[i + 1] not in all_syllabics ): new_pron[-1] = "m̩" continue elif ( p == "n" and 2 < i < len(phones) - 1 and new_pron[-1] == "ə" and phones[i + 1] not in all_syllabics ): new_pron[-1] = "n̩" continue elif p == "l̩" and 1 < i < len(phones) - 1 and phones[i + 1] in all_syllabics: new_pron.append("ə") p = "l" elif p == "l̩": p = "ɫ̩" elif p == "l" and i == len(phones) - 1: p = "ɫ" elif p == "l" and 1 < i < len(phones) - 1 and phones[i + 1] not in all_syllabics: p = "ɫ" elif ( p in {"t", "p", "k"} and i == 0 and i < len(phones) - 1 and phones[i + 1] in stressed_vowels | {"ɪ", "ə", "ɚ"} ): p += "ʰ" elif ( p == "ə" and 1 < i == len(phones) - 2 and phones[i - 1] in {"d", "t"} and phones[i + 1] == "d" ): p = "ɪ" elif ( p == "ə" and 1 < i == len(phones) - 2 and phones[i - 1] in {"s", "z", "ʃ", "ʒ", "tʃ", "dʒ"} and phones[i + 1] == "z" ): p = "ɪ" if lang == "english_us": if ( p == "ɹ" and 2 < i < len(phones) - 1 and new_pron[-1] == "ə" and phones[i + 1] not in all_syllabics ): new_pron[-1] = "ɚ" continue elif ( p in {"d", "t"} and 1 < i < len(phones) - 1 and phones[i - 1] in all_syllabics and phones[i + 1] in {"n̩", "m̩", "l̩", "ɚ", "ə", "ɫ̩"} ): p = "ɾ" elif ( p in {"t", "d"} and 1 < i < len(phones) - 2 and phones[i - 1] in all_syllabics and phones[i + 1] == "ɪ" and phones[i + 2] == "d" ): p = "ɾ" elif ( p in {"t", "d"} and i > 1 and i == len(phones) - 2 and phones[i - 1] in all_syllabics and phones[i + 1] == "i" ): p = "ɾ" elif ( p in {"t", "d"} and i > 1 and i == len(phones) - 3 and phones[i - 1] in all_syllabics and phones[i + 1] in {"i", "ɪ"} and phones[i + 2] == "z" ): p = "ɾ" elif ( p in {"t", "d"} and i > 1 and i == len(phones) - 3 and phones[i - 1] in all_syllabics and phones[i + 1] == "ɪ" and phones[i + 2] == "ŋ" ): p = "ɾ" elif ( p in {"t", "p", "k"} and i == 0 and i < len(phones) - 1 and phones[i + 1] in stressed_vowels | {"ɪ", "ə", "ɚ"} ): p += "ʰ" elif ( p in {"t", "p", "k"} and i > 0 and phones[i - 1] not in {"s", "ʃ"} and i < len(phones) - 1 and phones[i + 1] in stressed_vowels ): p += "ʰ" elif p == "l̩" and 1 < i < len(phones) - 1 and phones[i + 1] in all_syllabics: new_pron.append("ə") p = "l" elif p == "l̩": p = "ɫ̩" elif p == "l" and i == len(phones) - 1: p = "ɫ" elif ( p == "l" and 1 < i < len(phones) - 1 and phones[i + 1] not in {"ɪ", "ə", "ɚ", "n̩", "m̩", "l̩", "ɫ̩"} | VOWELS[lang] ): p = "ɫ" elif ( p == "ə" and 1 < i == len(phones) - 2 and phones[i - 1] in {"d", "t", "ɾ"} and phones[i + 1] == "d" ): p = "ɪ" elif ( p == "ə" and 1 < i == len(phones) - 2 and phones[i - 1] in {"s", "z", "ʃ", "ʒ", "tʃ", "dʒ"} and phones[i + 1] == "z" ): p = "ɪ" elif ( p in {"t", "p", "k"} and i > 0 and phones[i - 1] not in {"s", "ʃ"} and i < len(phones) - 1 and phones[i + 1] in stressed_vowels ): p += "ʰ" elif lang == "english_uk": if p not in VOWELS[lang] and len(new_pron) and new_pron[-1] == "ɹ": new_pron[-1] = p continue elif ( p in {"t", "p", "k"} and i > 0 and phones[i - 1] not in {"s", "ʃ"} and i < len(phones) - 1 and phones[i + 1] in stressed_vowels ): p += "ʰ" new_pron.append(p) return new_pron def fix_pronunciations(dictionary, lang): filtered_dictionary = [] for word, pronunciation in dictionary: if lang == "polish": if "ü" in word: continue for i, p in enumerate(pronunciation): if p in LANG_MAPPING[lang]: continue if p in GLOBAL_REMAPPING: pronunciation[i] = GLOBAL_REMAPPING[p] elif "̯" in p: pronunciation[i] = p.replace("̯", "") elif "͡" in p: pronunciation[i] = p.replace("͡", "") elif "‿" in p: pronunciation[i] = p.replace("‿", "") elif "͜" in p: pronunciation[i] = p.replace("͜", "") elif "g" in p: pronunciation[i] = p.replace("g", "ɡ") # Language specific conversions new_pron = convert_language_specific(word, pronunciation, lang) new_pron = convert_second_round(word, new_pron, lang) if new_pron is None: continue if (word, new_pron) not in filtered_dictionary: filtered_dictionary.append((word, new_pron)) return filtered_dictionary def process_language(lang): print("Processing", lang) if lang == "japanese": dictionary, input_graphemes, input_phones = read_source(lang + "_hiragana") d, g, p = read_source(lang + "_katakana") dictionary.extend(d) input_graphemes.update(g) input_phones.update(p) word_set = {x[0] for x in dictionary} d, g, p = read_source(lang) dictionary.extend([x for x in d if x[0] not in word_set]) input_graphemes.update(g) input_phones.update(p) else: dictionary, input_graphemes, input_phones = read_source(lang) print("Input graphemes", sorted(input_graphemes)) print("Input phones", sorted(input_phones)) filtered = fix_pronunciations(dictionary, lang) save_dictionary(filtered, lang) if __name__ == "__main__": for code in LANG_CODES: process_language(code)