# MFA/scripts/dictionary_processing/clean_wikipron.py
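# Clean WikiPron scrape TSVs into MFA-style pronunciation dictionaries:
# filter out words with bad graphemes, remap phones per language, and write
# "<lang>_mfa.dict" files.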
import collections
import itertools
import os
import re
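# jamo is only needed to decompose Hangul when processing "korean_hangul";
# fall back to None so the other languages still run without it.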
try:
import jamo
except ImportError:
jamo = None
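# Local paths: WIKIPRON_DIR points at the WikiPron "data/scrape/tsv" folder;
# an empty OUTPUT_DIR writes the .dict files into the current working directory.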
WIKIPRON_DIR = r"C:\Users\micha\Documents\Dev\wikipron\data\scrape\tsv"
OUTPUT_DIR = r""
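# Every language listed in LANG_CODES needs a TSV filename here
# (currently only "french" is mapped).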
LANG_PATHS = {"french": "fra_latn_broad_filtered.tsv"}
# Full list of languages
# LANG_CODES = ['bulgarian', 'czech', 'french', 'german', 'mandarin_hani', 'polish', 'portuguese_brazil',
# 'portuguese_portugal', 'russian', 'spanish_spain', 'spanish_latin_america', 'swedish',
# 'tamil', 'thai', 'turkish', 'ukrainian', 'mandarin_hani_beijing', 'mandarin_hani_taiwan', 'mandarin_hani_standard',
# 'korean_hangul', 'hausa', 'japanese', 'vietnamese_hanoi', 'vietnamese_hue', 'vietnamese_hochiminhcity',
# 'serbocroatian_croatian', 'serbocroatian_serbian']
LANG_CODES = ["czech"]
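# Words containing any of these characters are skipped when reading the source TSV.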
BAD_GRAPHEMES = {
"english_us": {
"%",
"/",
"@",
"²",
"à",
"á",
"â",
"ä",
"æ",
"ç",
"è",
"é",
"ê",
"ë",
"í",
"î",
"ï",
"ñ",
"ó",
"ô",
"õ",
"ö",
"ø",
"ù",
"ú",
"ü",
"ā",
"ą",
"č",
"ē",
"ę",
"ğ",
"ı",
"ł",
"ń",
"ō",
"ő",
"œ",
"ř",
"ū",
"ș",
"ț",
"ʼ",
"ṭ",
"₂",
},
"english_uk": {
"%",
"/",
"@",
"²",
"à",
"á",
"â",
"ä",
"æ",
"ç",
"è",
"é",
"ê",
"ë",
"í",
"î",
"ï",
"ñ",
"ó",
"ô",
"õ",
"ö",
"ø",
"ù",
"ú",
"ü",
"ā",
"ą",
"č",
"ē",
"ę",
"ğ",
"ı",
"ł",
"ń",
"ō",
"ő",
"œ",
"ř",
"ū",
"ș",
"ț",
"ʼ",
"ṭ",
"₂",
"ã",
"å",
"û",
"ī",
"ž",
".",
},
"polish": {"+", ".", "ü", "ö", "ø", "ƶ", "ñ", "ç", "à", "á", "è", "é", "í"},
"french": {".", "/", "º", "å", "æ", "ÿ", "ș"},
"japanese": {" ", "&", "+", "、", "〆", "〼", "〼", "=", "𫡤", "・", "×", "ゞ", "ゟ", "ゑ", "ゐ", "ヲ"},
"mandarin_hani_beijing": {
"A",
"B",
"C",
"D",
"E",
"G",
"H",
"I",
"K",
"M",
"N",
"O",
"P",
"Q",
"S",
"T",
"U",
"V",
"X",
"Y",
"Z",
"e",
"p",
"u",
"·",
"α",
"β",
"γ",
"…",
"⿰",
"ㄅ",
"ㄆ",
"ㄇ",
"ㄈ",
"𰚼",
"𰯼",
"𫇦",
},
"mandarin_hani_taiwan": {
"A",
"B",
"C",
"D",
"E",
"G",
"H",
"I",
"K",
"M",
"N",
"O",
"P",
"Q",
"S",
"T",
"U",
"V",
"X",
"Y",
"Z",
"e",
"p",
"u",
"·",
"α",
"β",
"γ",
"…",
"⿰",
"ㄅ",
"ㄆ",
"ㄇ",
"ㄈ",
"𰚼",
"𰯼",
"𫇦",
},
"mandarin_hani_standard": {
"A",
"B",
"C",
"D",
"E",
"G",
"H",
"I",
"K",
"M",
"N",
"O",
"P",
"Q",
"S",
"T",
"U",
"V",
"X",
"Y",
"Z",
"e",
"p",
"u",
"·",
"α",
"β",
"γ",
"…",
"⿰",
"ㄅ",
"ㄆ",
"ㄇ",
"ㄈ",
"𰚼",
"𰯼",
"𫇦",
},
"german": {"'", ".", "@", "à", "á", "ç", "è", "é", "ê", "ó", "ø", "œ", "í", "ë"},
"portuguese_brazil": {"'", "."},
"portuguese_portugal": {"'", "."},
"russian": {"'", ".", "/", "ѳ"},
"spanish_spain": {"'", ".", "ö", "ꝇ", "î", "ç"},
"spanish_latin_america": {"'", ".", "ö", "ꝇ", "î", "ç"},
"thai": {"…", "'", "/"},
"turkish": {"̇", "'"},
"tamil": {"ࢳ", "ࢳ", "ࢴ", "ࢴ", "ஃ"},
"vietnamese_hanoi": {
"'",
".",
",",
},
"vietnamese_hue": {
"'",
".",
",",
},
"vietnamese_hochiminhcity": {
"'",
".",
",",
},
}
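# Phones considered invalid per dialect (defined for reference; not applied anywhere below).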
BAD_PHONES = {
"english_uk": {"ɪː", "aː", "eː", "a", "o", "oː", "eː", "e"},
"english_us": {"ɒ", "aː", "a", "o", "oː", "eː", "e", "ɪː", "ɛː"},
"german": {"ʊɪ"},
"czech": {"ə"},
"spanish_latin_america": {"ɹ", "ɚ", "ʒ", "ə", "ɪ"},
"spanish_spain": {"ɹ", "ɚ", "ʒ", "ə", "ɪ"},
"mandarin_hani_taiwan": {"ai", "a", "ei", "o", "ə", "z̩", "ʐ̩"},
"mandarin_hani_standard": {"ai", "a", "ei", "o", "ə", "z̩", "ʐ̩"},
"mandarin_hani_beijing": {"ai", "a", "ei", "o", "ə", "z̩", "ʐ̩"},
}
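# Vowel inventories used below for tone placement, diphthong merging, and
# syllabic-consonant handling.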
VOWELS = {
"english_us": {
"aɪ",
"aʊ",
"eɪ",
"i",
"iː",
"oɪ",
"oʊ",
"u",
"uː",
"æ",
"ɑ",
"ɑː",
"ɔ",
"ɔɪ",
"ɔː",
"ə",
"ɚ",
"ɛ",
"ɝ",
"ɝː",
"ɪ",
"ʊ",
"ʌ",
},
"english_uk": {
"aɪ",
"aʊ",
"eɪ",
"i",
"iː",
"oɪ",
"oʊ",
"u",
"uː",
"æ",
"ɑ",
"ɑː",
"ɔ",
"ɔɪ",
"ɔː",
"ə",
"ɚ",
"ɛ",
"ɝ",
"ɝː",
"ɪ",
"ʊ",
"ʌ",
"aɪ",
"aʊ",
"eɪ",
"i",
"iː",
"oɪ",
"oʊ",
"u",
"uː",
"æ",
"ɑ",
"ɑː",
"ɒ",
"ɔ",
"ɔɪ",
"ɔː",
"ɛ",
"ɛː",
"ɜ",
"ɜː",
"ʊ",
"ʌ",
},
"vietnamese_hanoi": {"a", "aː", "e", "i", "o", "u", "ɔ", "ə", "əː", "ɛ", "ɨ", "ʊ", "ɪ"},
"vietnamese_hue": {"a", "aː", "e", "i", "o", "u", "ɔ", "ə", "əː", "ɛ", "ɨ", "ʊ", "ɪ"},
"vietnamese_hochiminhcity": {
"a",
"aː",
"e",
"i",
"o",
"u",
"ɔ",
"ə",
"əː",
"ɛ",
"ɨ",
"ʊ",
"ɪ",
},
"mandarin_hani": {"a", "e", "o", "i", "u", "y", "ə", "ɚ", "ɤ̃", "ʊ̃"},
"mandarin_hani_standard": {"a", "e", "o", "i", "u", "y", "ə", "ɚ", "ɤ̃", "ʊ̃"},
"mandarin_hani_taiwan": {"a", "e", "o", "i", "u", "y", "ə", "ɚ", "ɤ̃", "ʊ̃"},
"mandarin_hani_beijing": {"a", "e", "o", "i", "u", "y", "ə", "ɚ", "ɤ̃", "ʊ̃"},
"thai": {
"a",
"aː",
"e",
"eː",
"i",
"iː",
"o",
"oː",
"u",
"ə",
"uː",
"ɔ",
"ɔː",
"ɛ",
"ɛː",
"ɤ",
"ɤː",
"ɯ",
"ɯː",
},
"swedish": {
"a",
"aʊ",
"aː",
"e",
"eː",
"i",
"iː",
"o",
"oː",
"u",
"uː",
"y",
"yʷ",
"yː",
"æ",
"æː",
"êː",
"ø",
"øː",
"ø̀ː",
"œ",
"œː",
"œ̞",
"œ̞ː",
"ɑ",
"ɑː",
"ɒː",
"ɒ̀ː",
"ɔ",
"ə",
"ɚ",
"ɛ",
"ɛɵ",
"ɛː",
"ɛ̂",
"ɛ̄",
"ɜ",
"ɝ",
"ɪ",
"ɵ",
"ɵː",
"ɵ̄",
"ɶ",
"ɶː",
"ʉ",
"ʉː",
"ʉ̂ː",
"ʉ̟ː",
"ʊ",
"ʊː",
"ʏ",
"ỳː",
"ỵː",
},
}
VOWEL_PATTERNS = {"swedish": re.compile(r"^[aeiɛøæuoʊêɔɪœɑʉɵɶ̂œ̞ː˧˩ɒyʏʉ̟ː˧˩əː˧˩˥]+$")}
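# Per-language phone substitutions (WikiPron phone -> target MFA phone) applied during cleaning.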
LANG_MAPPING = {
"bulgarian": {
"d̪": "d",
"t̪": "t",
"ɐ": "a",
"æ": "a",
"a̟": "a",
"e": "ɛ",
"ə": "ɤ",
"o̝": "ɔ",
"o̟": "ɔ",
"u̟": "u",
"ʉ": "u",
"ʊ": "u",
"ɤ̞": "ɤ",
"ɤ̟": "ɤ",
"lʲ": "ʎ",
"l": "ɫ",
"ɾ": "r",
"iː": "i j",
"s̪": "s",
"n̪": "n",
"ɾʲ": "rʲ",
"nʲ": "ɲ",
"ɡʲ": "ɟ",
"kʲ": "c",
},
"russian": {},
"czech": {
"ɫ": "l",
"ɾ": "r",
"ɔ": "o",
"ɔː": "oː",
},
"serbocroatian_croatian": {
"ʋ": "v",
"ɕ": "ʃ",
"ʑ": "ʒ",
"ô": "o˦˨",
"ôː": "oː˦˨",
"ûː": "uː˦˨",
"û": "u˦˨",
"î": "i˦˨",
"îː": "iː˦˨",
"êː": "eː˦˨",
"ê": "e˦˨",
"âː": "aː˦˨",
"â": "a˦˨",
"r̂": "r̩˦˨",
"r̂ː": "r̩ː˦˨",
"řː": "r̩ː˨˦",
"ř": "r̩˨˦",
"ěː": "eː˨˦",
"ě": "e˨˦",
"ǎ": "a˨˦",
"ǎː": "aː˨˦",
"ǐː": "iː˨˦",
"ǐ": "i˨˦",
"ǒ": "o˨˦",
"ǒː": "oː˨˦",
"ǔː": "uː˨˦",
"ǔ": "u˨˦",
},
"serbocroatian_serbian": {
"ʋ": "v",
"ɕ": "ʃ",
"ʑ": "ʒ",
"ô": "o˦˨",
"ôː": "oː˦˨",
"ûː": "uː˦˨",
"û": "u˦˨",
"î": "i˦˨",
"îː": "iː˦˨",
"êː": "eː˦˨",
"ê": "e˦˨",
"âː": "aː˦˨",
"â": "a˦˨",
"r̂": "r̩˦˨",
"r̂ː": "r̩ː˦˨",
"řː": "r̩ː˨˦",
"ř": "r̩˨˦",
"ěː": "eː˨˦",
"ě": "e˨˦",
"ǎ": "a˨˦",
"ǎː": "aː˨˦",
"ǐː": "iː˨˦",
"ǐ": "i˨˦",
"ǒ": "o˨˦",
"ǒː": "oː˨˦",
"ǔː": "uː˨˦",
"ǔ": "u˨˦",
},
"french": {"r": "ʁ", "œ̃": "ɛ̃"},
"vietnamese_hanoi": {
"k̟̚": "k̚",
"ŋ̟": "ŋ",
"ï": "ɨ",
},
"german": {
"b̥": "b",
"d̥": "d",
"ɡ̊": "ɡ",
"r": "ʁ",
"ŋ̍": "n̩",
"ɱ̩": "n̩",
"ŋ̩": "n̩",
"ʀ": "ʁ",
"χ": "x",
"ʋ": "v",
"ɘ": "ə",
"i": "ɪ",
"ø": "øː",
"o": "ɔ",
"u": "ʊ",
"œː": "øː",
"y": "ʏ",
"e": "ɛ",
"ɛː": "eː",
"ɔː": "oː",
"ɑː": "aː",
"ɒː": "aː",
},
"mandarin_hani": {
"b̥": "p",
"d̥": "t",
"g̊": "k",
"ɡ̊": "k",
"ɖʐ̥": "ʈʂ",
"dz̥": "ts",
"dʑ̥": "tɕ",
"ä": "a",
"æ̃": "a",
"ɤ": "o",
"ɤ̃": "o",
"ʊ̃": "o",
"ɪ": "i",
"ɻʷ": "ɻ",
"ʊ": "u",
"ɛ": "e",
"ɑ": "a",
"ɑ̃": "a",
"ɔ": "o",
"ɔː": "o",
"⁵⁵": "˥",
"⁵¹": "˥˩",
"³⁵": "˧˥",
"²¹⁴": "˨˩˦",
},
"mandarin_hani_taiwan": {
"b̥": "p",
"d̥": "t",
"g̊": "k",
"ɡ̊": "k",
"ɖʐ̥": "ʈʂ",
"dz̥": "ts",
"dʑ̥": "tɕ",
"ä": "a",
"æ̃": "a",
"ɤ": "o",
"ɤ̃": "o",
"ʊ̃": "o",
"ɪ": "i",
"ɻʷ": "ɻ",
"ʊ": "u",
"ɛ": "e",
"ɑ": "a",
"ɑ̃": "a",
"ɔ": "o",
"ɔː": "o",
"⁵⁵": "˥",
"⁵¹": "˥˩",
"³⁵": "˧˥",
"²¹⁴": "˨˩˦",
},
"mandarin_hani_beijing": {
"b̥": "p",
"d̥": "t",
"g̊": "k",
"ɡ̊": "k",
"ɖʐ̥": "ʈʂ",
"dz̥": "ts",
"dʑ̥": "tɕ",
"ä": "a",
"æ̃": "a",
"ɤ": "o",
"ɤ̃": "o",
"ʊ̃": "o",
"ɪ": "i",
"ɻʷ": "ɻ",
"ʊ": "u",
"ɛ": "e",
"ɑ": "a",
"ɑ̃": "a",
"ɔ": "o",
"ɔː": "o",
"⁵⁵": "˥",
"⁵¹": "˥˩",
"³⁵": "˧˥",
"²¹⁴": "˨˩˦",
},
"mandarin_hani_standard": {
"b̥": "p",
"d̥": "t",
"g̊": "k",
"ɡ̊": "k",
"ɖʐ̥": "ʈʂ",
"dz̥": "ts",
"dʑ̥": "tɕ",
"ä": "a",
"æ̃": "a",
"ɤ": "o",
"ɤ̃": "o",
"ʊ̃": "o",
"ɪ": "i",
"ɻʷ": "ɻ",
"ʊ": "u",
"ɛ": "e",
"ɑ": "a",
"ɑ̃": "a",
"ɔ": "o",
"ɔː": "o",
"⁵⁵": "˥",
"⁵¹": "˥˩",
"³⁵": "˧˥",
"²¹⁴": "˨˩˦",
},
"polish": {
"s̪": "s",
"r̥ː": "r",
"r̥": "r",
"ɫ": "l",
"w̃": "n",
},
"portuguese_brazil": {
"ã": "ɐ̃",
"ɫ": "l",
"ʁ": "x",
"ɹ": "x",
"ɻ": "x",
"χ": "x",
"ɦ": "x",
"h": "x",
"r": "x",
"ɪ": "i",
"ʊ": "u",
},
"portuguese_portugal": {
"ã": "ɐ̃",
"ɫ": "l",
"r": "ʁ",
},
"swedish": {
"ɛ̄": "ɛ̂",
"ɵ̄": "ɵ̂",
"ɘ": "ɵ",
"ə": "ɛ",
"ʁ": "r",
"ɾ": "r",
"ɹ": "r",
"v": "ʋ",
"w": "ʋ",
"ɜ": "ɛ",
"æː": "ɛː",
"ø": "øː",
"æ": "ɛ",
"ˇl": "l",
"yʷ": "y",
"œ̞ː": "øː",
"œː": "øː",
"œ̞": "œ",
"ç": "ɕ",
"bː": "b", # removing length in consonants
"ɖː": "ɖ",
"ɖˑ": "ɖ",
"ˈt": "tʰ",
"ˈk": "kʰ",
"ˈp": "pʰ",
"dː": "d",
"jː": "j",
"kː": "kʰ",
"lː": "l",
"mː": "m",
"nː": "n",
"fː": "f",
"ɧː": "ɧ",
"pː": "pʰ",
"rː": "r",
"sː": "s",
"tˑ": "t",
"tʰː": "tʰ",
"pʰː": "pʰ",
"kʰː": "kʰ",
"tː": "tʰ",
"ŋː": "ŋ",
"ɲ": "ɳ",
"ɕː": "ɕ",
"ɡː": "ɡ",
"ʈː": "ʈʰ",
"ʈʰː": "ʈʰ",
"ʂː": "ʈ",
"ỵː": "yː",
"ʉ̟̂": "ʉ̂",
"ʉ̟ː": "ʉː",
"ʉ̂": "ʉ̂ː",
"ɒː": "ɑː",
"aː": "ɑː",
"ɑ": "ɑː",
"e": "eː",
"o": "oː",
"u": "uː",
"i": "iː",
"y": "yː",
"ɒ̀ː": "ɑ̀ː",
"ʊː": "ʊ",
"ʉ": "ʉː",
"ɵː": "uː",
"ɶː": "øː",
},
"tamil": {
"l̪": "l",
"l̪ː": "lː",
"r̥": "r",
"ɾ̪": "ɾ",
"h": "ɦ",
"tʃ": "tɕ",
"ɕ": "tɕ",
"tʃː": "tɕː",
},
"thai": {
"cʰ": "tɕʰ",
"c": "tɕ",
"ɔ̌": "ɔ˩˩˦",
"ǎː": "aː˩˩˦",
"áː": "aː˦˥",
"à": "a˨˩",
"ì": "i˨˩",
},
"ukrainian": {
"ɫ": "l",
"ʍ": "ʋ",
"w": "ʋ",
"v": "ʋ",
#'e': 'ɛ',
#'o': 'ɔ',
"ɫː": "lː",
},
"japanese": {
"o̞": "o",
"n̩": "n",
"ä": "a",
"ɡ̊": "ɡ",
"ḁ": "a",
"ẽ": "e",
"m̩ː": "mː",
"e̥": "e",
"u͍": "ɯ",
"ɯ̃ᵝ": "ɯ",
"u͍ː": "ɯː",
"w͍": "w",
"y": "j",
"r": "ɾ",
"ɽ": "ɾ",
"ɾ̥": "ɾ",
"ɯᵝ": "ɯ",
"ɯᵝː": "ɯː",
"ɯ̟̃ᵝː": "ɯː",
"ɯ̥ᵝ": "ɯ̥",
"ʲkʲ": "kʲ",
"nʲ": "ɲ",
"tɕʲ": "tɕ",
"ɕʲ": "ɕ",
"ĩː": "iː",
"õ̞ː": "oː",
"i̥̥": "i̥",
"e̞̊": "e",
"ẽ̞ː": "eː",
"ã̠ː": "aː",
"õ̞": "o",
"d̥": "d",
"b̥": "b",
"o̞ː": "oː",
"e̞ː": "eː",
"e̞": "e",
"ẽ̞": "e",
"ĩ": "i",
"ɸ̥": "ɸ",
"ɨ̃ᵝː": "ɨː",
"ĩ̥": "i",
"a̠ː": "aː",
"a̠": "a",
"o̞̊": "o",
"dʑʲ": "dʑ",
"ɾ̠": "ɾ",
"ã̠": "a",
"õ̥": "o",
"dʲ": "dʑ",
"tʲ": "tɕ",
# 'ɯ̟ᵝ': 'ɯ',
"ɰᵝ": "w",
"ɰᵝː": "wː",
# 'ɯ̟̊ᵝ': 'ɨ̥',
# 'ɯ̟ᵝː': 'ɨː',
# 'ɯ̟̃ᵝ': 'ɨ',
# 'ɨ̥ᵝ': 'ɨ̥',
# 'ɨᵝ': 'ɨ',
# 'ɨ̃ᵝ': 'ɨ',
# 'ɨᵝː': 'ɨː',
"ɯ̟̊": "ɯ̥",
"ɲ̟": "ɲ",
"ŋʲ": "ɲ",
"p̚ʲ": "p̚",
"k̚ʲ": "k̚",
"t̚ʲ": "t̚",
},
"turkish": {
"ɑ": "a",
"ɑː": "a",
"aː": "a",
"iː": "i",
"uː": "u",
"ɛ": "e",
"e̞": "e",
"ɔ": "o",
"ʊ": "u",
"ʏ": "y",
"β": "v",
"o̞": "o",
"ɪ": "i",
"ø": "œ",
"ɾ̝̊": "ɾ",
},
"korean_hangul": {
"a̠": "a",
"e̞": "e",
"e̞ː": "eː",
"a̠ː": "a",
"o̞": "o",
"o̞ː": "oː",
"ʌ̹": "ʌ",
"ɘː": "ʌː",
"ɦ": "h",
"ɸʷ": "ɸ",
"ʃʰ": "sʰ",
},
"english_uk": {
"ɝː": "ɜː",
"əː": "ɜː",
"æː": "æ",
"ɝ": "ɜ",
"ɚ": "ə",
"ɫ": "l",
"r": "ɹ",
"ʍ": "w",
},
"english_us": {
"ɫ": "l",
"r": "ɹ",
"ʍ": "w",
"æː": "æ",
},
"spanish_spain": {
"ɣ̞": "ɣ",
"β̞": "β",
"ð̞": "ð",
"θ̬": "θ",
"w̝": "w",
"nʲ": "ɲ",
"n̟": "n",
"lʲ": "ʎ",
"l̟": "l",
"i̯": "j",
"u̯": "w",
"h": "x",
"n̪": "n",
"d": "d̪",
},
"spanish_latin_america": {
"ɣ̞": "ɣ",
"β̞": "β",
"ð̞": "ð",
"w̝": "w",
"nʲ": "ɲ",
"lʲ": "ʎ",
"i̯": "j",
"u̯": "w",
"n̪": "n",
"l̪": "l",
"l̟": "l",
"h": "x",
"n̟": "n",
"d": "d̪",
},
}
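# Applied to every language: the keys are precomposed nasal-vowel glyphs and the values
# are the decomposed vowel + combining tilde, so phones compare consistently.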
GLOBAL_REMAPPING = {
"õ": "õ", # Fix glyphs to use diacritics
"ẽ": "ẽ",
"ũ": "ũ",
"ĩ": "ĩ",
"ã": "ã",
}
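# Read one WikiPron TSV for `lang`: returns the (word, phones) entries plus the sets of
# graphemes and phones seen, skipping (and printing) words that contain banned graphemes.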
def read_source(lang):
graphemes = set()
phones = set()
dictionary = []
path = os.path.join(WIKIPRON_DIR, LANG_PATHS[lang])
with open(path, "r", encoding="utf8") as f:
for line in f:
line = line.strip()
if not line:
continue
if "\t" in line:
line = line.split("\t")
word = line[0]
pronunciation = line[1].split()
else:
line = line.split()
word = line[0]
pronunciation = line[1:]
word = word.lower()
if lang in BAD_GRAPHEMES:
if any(x in BAD_GRAPHEMES[lang] for x in word):
print(word)
continue
graphemes.update(word)
phones.update(pronunciation)
dictionary.append((word, pronunciation))
return dictionary, graphemes, phones
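# Write the cleaned entries to "<lang>_mfa.dict" (tab-separated), dropping duplicate
# (word, pronunciation) pairs and reporting the final phone inventory and counts.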
def save_dictionary(dictionary, lang):
deduplication = set()
final_phones = collections.Counter()
path = os.path.join(OUTPUT_DIR, f"{lang}_mfa.dict")
with open(path, "w", encoding="utf8") as f:
for w, p in sorted(dictionary):
final_phones.update(p)
p = " ".join(p)
if (w, p) in deduplication:
continue
f.write("{}\t{}\n".format(w, p))
deduplication.add((w, p))
print("Final phones:", sorted(final_phones))
print("Final phone counts:", sorted(final_phones.items(), key=lambda x: -x[1]))
def convert_language_specific(word, phones, lang):
new_pron = []
if lang == "swedish":
for i, p in enumerate(phones):
for k, v in LANG_MAPPING[lang].items():
if p == k:
phones[i] = v
break
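        # "¹"/"²" mark Swedish pitch accents; realize them as tone contours on the
        # following one or two vowels.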
for i, p in enumerate(phones):
if p == "¹":
found_first = False
found_second = False
for j in range(i + 1, len(phones)):
if VOWEL_PATTERNS[lang].match(phones[j]):
if not found_first:
phones[j] += "˥˧" # Falling tone
found_first = True
elif not found_second:
phones[j] += "˩" # Low tone
found_second = True
else:
break
continue
elif p == "²":
found_first = False
found_second = False
for j in range(i + 1, len(phones)):
if phones[j] in VOWELS[lang]:
if not found_first:
phones[j] += "˧˩" # Falling tone
found_first = True
elif not found_second:
phones[j] += "˥˩" # Falling tone
found_second = True
else:
break
continue
new_pron.append(p)
phones = new_pron
new_pron = []
for i, p in enumerate(phones):
if lang == "english_us":
if lang in LANG_MAPPING:
for k, v in LANG_MAPPING[lang].items():
if p == k:
p = v
break
if p == "ʒ" and len(new_pron) and new_pron[-1] == "d": # fix up affricates being split
new_pron[-1] = "dʒ"
continue
elif (
p == "ʃ" and len(new_pron) and new_pron[-1] == "t"
): # fix up affricates being split
new_pron[-1] = "tʃ"
continue
elif p in ["ɪ", "j"] and len(new_pron) and new_pron[-1] in {"e", "ɔ", "o"}:
new_pron[-1] += "ɪ"
continue
elif p in {"ʊ", "u"} and len(new_pron) and new_pron[-1] in {"a", "ɑ", "ʌ"}:
new_pron[-1] = "aʊ"
continue
elif p in ["ɪ", "j"] and len(new_pron) and new_pron[-1] in {"a", "ɑ", "ʌ"}:
new_pron[-1] = "aɪ"
continue
elif p in {"ʊ", "u"} and len(new_pron) and new_pron[-1] in {"ə", "o", "ɔ"}:
new_pron[-1] = "oʊ"
continue
elif p in {"ɹ", "ɚ"} and len(new_pron) and new_pron[-1] in {"o", "ɔ"}:
new_pron[-1] = "ɔ"
p = "ɹ"
elif p in {"ɹ", "ɚ"} and len(new_pron) and new_pron[-1] in {"i", "ɪː", "ɪ"}:
new_pron[-1] = "ɪ"
p = "ɹ"
elif p in {"ɹ", "ɚ"} and len(new_pron) and new_pron[-1] in {"u", "ʊ"}:
new_pron[-1] = "ʊ"
p = "ɹ"
elif p in {"ɹ", "ɚ"} and len(new_pron) and new_pron[-1] in {"e", "ɛ", "ɛː", "æ", "æː"}:
new_pron[-1] = "ɛ"
p = "ɹ"
elif p == "ɹ" and len(new_pron) and new_pron[-1] in ["ɜ", "ɜː"]:
new_pron[-1] = "ɝ"
continue
elif (
p == "ɹ"
and len(new_pron) > 1
and new_pron[-1] == "ə"
and new_pron[-2] in {"ɪ", "i", "ɪː"}
):
new_pron[-1] = "ɹ"
new_pron[-2] = "ɪ"
continue
elif (
p == "ɹ"
and len(new_pron) > 1
and new_pron[-1] == "ə"
and new_pron[-2] in {"ʊ", "u"}
):
new_pron[-1] = "ɹ"
new_pron[-2] = "ʊ"
continue
elif (
p == "ɹ"
and len(new_pron) > 1
and new_pron[-1] == "ə"
and new_pron[-2] in {"e", "ɛ", "ɛː"}
):
new_pron[-1] = "ɹ"
new_pron[-2] = "ɛ"
continue
elif p == "w" and len(new_pron) and new_pron[-1] == "h": # get rid of h w sequences
new_pron[-1] = "w"
continue
elif p in {"k", "ɡ"} and len(new_pron) and new_pron[-1] == "n":
new_pron[-1] = "ŋ"
continue
elif p in {"ɜ", "ɜː"} and (i == len(phones) - 1 or phones[i + 1] != "ɹ"):
p = "ɝ"
elif p == "ɪ" and i == len(phones) - 1:
p = "i"
elif (
p == "l" and len(new_pron) and new_pron[-1] == "ə" and i == len(phones) - 1
): # final syllabic l's
new_pron[-1] = "l̩"
continue
elif (
p == "m" and len(new_pron) and new_pron[-1] == "ə" and i == len(phones) - 1
): # final syllabic m's
new_pron[-1] = "m̩"
continue
elif (
p == "n" and len(new_pron) and new_pron[-1] == "ə" and i == len(phones) - 1
): # final syllabic n's
new_pron[-1] = "n̩"
continue
elif (
p == "ɹ" and len(new_pron) and new_pron[-1] == "ə" and i == len(phones) - 1
): # final syllabic r's
new_pron[-1] = "ɚ"
continue
elif p == "ŋ" and i == len(phones) - 1 and new_pron[-1] == "i":
new_pron[-1] = "ɪ"
continue
elif lang == "english_uk":
if lang in LANG_MAPPING:
for k, v in LANG_MAPPING[lang].items():
if p == k:
p = v
break
if p == "ɹ" and i == len(phones) - 1:
continue
elif p == "ɪ" and i == len(phones) - 1:
p = "i"
elif (
p in {"l", "m", "n"}
and i == len(phones) - 1
and len(new_pron)
and new_pron[-1] in {"ə", "əː"}
):
new_pron[-1] = p + "̩"
continue
elif p == "ɪ" and len(new_pron) and new_pron[-1] in {"e", "a", "ɔ", "o"}:
new_pron[-1] = new_pron[-1] + p
continue
elif p == "ʊ" and len(new_pron) and new_pron[-1] in {"e", "a"}:
new_pron[-1] = new_pron[-1] + p
continue
elif p in {"ʊ", "u"} and len(new_pron) and new_pron[-1] in {"ə", "o", "ɔ"}:
new_pron[-1] = "oʊ"
continue
elif (
p == "ʒ" and len(new_pron) and new_pron[-1] == "d"
): # fix up affricates being split
new_pron[-1] = "dʒ"
continue
elif (
p == "ʃ" and len(new_pron) and new_pron[-1] == "t"
): # fix up affricates being split
new_pron[-1] = "tʃ"
continue
elif p == "w" and len(new_pron) and new_pron[-1] == "h": # get rid of h w sequences
new_pron[-1] = "w"
continue
elif p in {"k", "ɡ"} and len(new_pron) and new_pron[-1] == "n":
new_pron[-1] = "ŋ"
continue
elif p == "ə" and len(new_pron) and new_pron[-1] == "ɛ":
new_pron[-1] = "ɛː"
continue
elif p == "ŋ" and i == len(phones) - 1 and new_pron[-1] == "i":
new_pron[-1] = "ɪ"
continue
elif (
p == "ɹ"
and len(new_pron) > 2
and new_pron[-1] == "ə"
and new_pron[-2] in {"e", "ɛ", "ʊ", "ɪ", "ɪː", "ɛː"}
):
new_pron[-1] = p
continue
elif lang == "bulgarian":
if p in {"s", "ʃ", "sʲ"} and len(new_pron) and new_pron[-1] == "t":
new_pron[-1] += p
continue
elif p == "ʒ" and len(new_pron) and new_pron[-1] == "d":
new_pron[-1] = "dʒ"
continue
elif p in {"ɡ", "k"} and len(new_pron) and new_pron[-1] in {"n"}:
new_pron[-1] = "ŋ"
elif p in {"v", "f"} and len(new_pron) and new_pron[-1] in {"n"}:
new_pron[-1] = "ɱ"
elif lang == "czech":
if p in {"u", "ʊ"} and len(new_pron) and new_pron[-1] in {"o", "ɔ"}:
new_pron[-1] = "ow"
continue
elif p in ["u", "ʊ"] and len(new_pron) and new_pron[-1] in {"a"}:
new_pron[-1] = "aw"
continue
elif p in {"u", "ʊ"} and len(new_pron) and new_pron[-1] in {"e", "ɛ"}:
new_pron[-1] = "ew"
continue
elif p in {"ʃ", "s"} and len(new_pron) and new_pron[-1] in {"t"}:
new_pron[-1] += p
continue
elif p in {"ʒ"} and len(new_pron) and new_pron[-1] in {"d"}:
new_pron[-1] += p
continue
elif p == "ʊ":
p = "u"
elif p == "e":
p = "ɛ"
elif lang.startswith("serbocroatian"):
if p in {"ɕ", "ʂ", "ʃ"} and len(new_pron) and new_pron[-1] == "t":
new_pron[-1] += p
continue
elif p in {"ʑ", "ʐ", "ʒ"} and len(new_pron) and new_pron[-1] == "d":
new_pron[-1] += p
continue
elif lang == "german":
if lang in LANG_MAPPING:
for k, v in LANG_MAPPING[lang].items():
if p == k:
p = v
break
if p in {"ʏ", "ɪ"} and len(new_pron) and new_pron[-1] in {"o", "ɔ"}:
new_pron[-1] = "ɔʏ"
continue
elif p == "ɪ" and len(new_pron) and new_pron[-1] == "a":
new_pron[-1] = "aɪ"
continue
elif p == "ɪ" and len(new_pron) and new_pron[-1] == "ʊ":
new_pron[-1] = "ʊɪ"
continue
elif p == "ʊ" and len(new_pron) and new_pron[-1] == "a":
new_pron[-1] = "aʊ"
continue
elif p == "e" and len(new_pron) and new_pron[-1] == "ɐ":
new_pron[-1] = "ɐ"
continue
elif p == "ʔ":
continue
elif p in {"tʰ", "kʰ", "pʰ"} and i == len(phones) - 1:
p = p[0]
elif (
p in {"tʰ", "kʰ", "pʰ"}
and len(new_pron)
and new_pron[-1] in {"s", "ts", "ʃ", "tʃ"}
):
p = p[0]
elif p in {"t", "k", "p"} and i == 0:
p += "ʰ"
elif p in {"s", "ʃ"} and i == 1 and new_pron[-1] in {"tʰ"}:
new_pron[-1] = "t" + p
continue
elif (
p in {"v", "s", "x", "ʁ", "l", "j"}
and len(new_pron)
and new_pron[-1] in {"tʰ", "kʰ", "pʰ"}
):
new_pron[-1] = new_pron[-1][0]
elif p == "s" and len(new_pron) and new_pron[-1] == "t":
if "z" in word or "c" in word:
new_pron[-1] = "ts"
continue
elif p == "õ":
new_pron.append("ɔ")
new_pron.append("n")
continue
elif p == "ɛ̃":
new_pron.append("eː")
new_pron.append("n")
continue
elif lang.startswith("mandarin_hani"):
vowel_pattern = re.compile(r"^[ayeiouəɚʊɤ̃]+[²³⁰¹⁴⁵]*$")
for k, v in LANG_MAPPING[lang].items():
if p == k:
p = v
break
if p in {"²", "³", "¹", "⁰", "⁴", "⁵", "⁻", "⁽", "⁾"} and len(new_pron):
index = -1
for j in range(len(new_pron) - 1, -1, -1):
if vowel_pattern.match(new_pron[j]) or "̩" in new_pron[j]:
index = j
break
if new_pron[index].endswith("²¹⁴"):
continue
new_pron[index] += p
continue
elif p.startswith("ˀ"):
new_pron.append("ʔ")
if p[1] in LANG_MAPPING[lang]:
new_pron.append(LANG_MAPPING[lang][p[1]])
else:
new_pron.append(p[1])
continue
elif (
any(p.startswith(x) for x in VOWELS[lang])
and len(new_pron)
and re.match(r"^[ayeiouəɚʊɤ̃]+$", new_pron[-1])
):
new_pron[-1] += p
continue
elif lang == "portuguese_brazil":
for k, v in LANG_MAPPING[lang].items():
if p == k:
p = v
break
if p == "w̃" and len(new_pron) and new_pron[-1] in {"ɐ̃", "õ"}:
new_pron[-1] += p
continue
elif p == "j̃" and len(new_pron) and new_pron[-1] in {"ɐ̃", "õ", "ẽ", "ũ"}:
new_pron[-1] += p
continue
elif (
p == "j" and len(new_pron) and new_pron[-1] in {"a", "ɛ", "e", "ɐ", "ɔ", "o", "u"}
):
new_pron[-1] += p
continue
elif p == "w" and len(new_pron) and new_pron[-1] in {"a", "ɛ", "e", "ɐ", "i"}:
new_pron[-1] += p
continue
elif lang == "portuguese_portugal":
for k, v in LANG_MAPPING[lang].items():
if p == k:
p = v
break
if p == "w̃" and len(new_pron) and new_pron[-1] in {"ɐ̃", "õ"}:
new_pron[-1] += p
continue
elif p == "j̃" and len(new_pron) and new_pron[-1] in {"ɐ̃", "õ", "ẽ", "ũ"}:
new_pron[-1] += p
continue
elif (
p == "j" and len(new_pron) and new_pron[-1] in {"a", "ɛ", "e", "ɐ", "ɔ", "o", "u"}
):
new_pron[-1] += p
continue
elif p == "w" and len(new_pron) and new_pron[-1] in {"a", "ɛ", "e", "ɐ", "i"}:
new_pron[-1] += p
continue
elif lang == "swedish":
if p == "ʒ" and len(new_pron) and new_pron[-1] == "d":
new_pron[-1] += p
continue
elif (
p in {"k", "kʰ", "ɡ"}
and len(new_pron)
and new_pron[-1] in {"m", "n", "ɳ", "ɲ", "ŋ"}
):
new_pron[-1] = "ŋ"
elif (
p in {"t", "tʰ", "d"}
and len(new_pron)
and new_pron[-1] in {"m", "n", "ɳ", "ɲ", "ŋ"}
):
new_pron[-1] = "n"
elif (
p in {"p", "pʰ", "b"}
and len(new_pron)
and new_pron[-1] in {"m", "n", "ɳ", "ɲ", "ŋ"}
):
new_pron[-1] = "m"
elif (
p in {"ʈ", "ʈʰ", "ɖ", "ʂ"}
and len(new_pron)
and new_pron[-1] in {"m", "n", "ɳ", "ɲ", "ŋ"}
):
new_pron[-1] = "ɳ"
elif p == "s" and len(new_pron) and new_pron[-1] == "r" and "rr" not in word:
new_pron[-1] = "ʂ"
continue
elif p in {"t", "ʈ"} and len(new_pron) and new_pron[-1] == "r" and "rr" not in word:
new_pron[-1] = "ʈ"
continue
elif p in {"d", "ɖ"} and len(new_pron) and new_pron[-1] == "r" and "rr" not in word:
new_pron[-1] = "ɖ"
continue
elif (
p in {"n", "ɳ"} == "n"
and len(new_pron)
and new_pron[-1] == "r"
and "rr" not in word
):
new_pron[-1] = "ɳ"
continue
elif p in {"l", "ɭ"} and len(new_pron) and new_pron[-1] == "r" and "rr" not in word:
new_pron[-1] = "ɭ"
continue
elif p in {"tʰ", "ʈʰ"} and len(new_pron) and new_pron[-1] == "r" and "rr" not in word:
new_pron[-1] = "ʈʰ"
continue
elif p in {"s", "ʂ"} and len(new_pron) and new_pron[-1] == "r" and "rr" not in word:
new_pron[-1] = "ʂ"
continue
elif p == "aʊ":
new_pron.append("a")
new_pron.append("ʊ")
continue
elif p in {"r", "n", "l", "t", "k", "ɡ"} and len(new_pron) and new_pron[-1] == "ə":
new_pron[-1] = "ɛ"
elif p in {"t", "k", "p", "ʈ"} and not len(new_pron):
p += "ʰ"
elif (
not VOWEL_PATTERNS[lang].match(p)
and len(new_pron)
and new_pron[-1] in {"tʰ", "kʰ", "pʰ", "ʈʰ"}
):
print(new_pron[-1], p)
new_pron[-1] = new_pron[-1][0]
elif (
p in {"tʰ", "kʰ", "pʰ", "ʈʰ"}
and len(new_pron)
and (new_pron[-1] in {"ʂ", "s"} or i == len(phones) - 1)
):
p = p[0]
elif p == "ə" and i == len(phones) - 1:
p = "e"
elif p in {"r"} and len(new_pron) and new_pron[-1] == "ɜ":
new_pron[-1] = "æː"
elif p == "ɜ" and i == len(phones) - 1:
p = "e"
elif lang == "tamil":
if p in {"ʊ", "ɪ"} and len(new_pron) and new_pron[-1] == "a":
new_pron[-1] += p
continue
elif lang in ["spanish_spain", "spanish_latin_america"]:
if p in {"n", "m", "ɲ"} and len(new_pron) and new_pron[-1] in {"n", "m", "ɲ"}:
new_pron[-1] = p
continue
if p in {"s", "z"} and len(new_pron) and new_pron[-1] in {"s", "z"}:
new_pron[-1] = p
continue
if p in {"x", "k", "ɡ"} and len(new_pron) and new_pron[-1] == "n":
new_pron[-1] = "ŋ"
elif (
p in {"ɟʝ", "ʝ", "j", "tʃ", "i", "ĩ"} and len(new_pron) and new_pron[-1] in {"n"}
):
new_pron[-1] = "ɲ"
elif p in {"j", "i", "ĩ", "e", "ẽ"} and len(new_pron) and new_pron[-1] in {"k"}:
new_pron[-1] = "c"
elif p in {"j", "i", "ĩ", "e", "ẽ"} and len(new_pron) and new_pron[-1] in {"x"}:
new_pron[-1] = "ç"
elif p in {"j", "i", "ĩ", "e", "ẽ"} and len(new_pron) and new_pron[-1] in {"ɡ"}:
new_pron[-1] = "ɟ"
elif p in {"j", "i", "ĩ", "e", "ẽ"} and len(new_pron) and new_pron[-1] in {"ɣ"}:
new_pron[-1] = "ʝ"
elif (
p in {"ɟʝ", "ʝ", "j", "tʃ", "i", "ĩ"} and len(new_pron) and new_pron[-1] in {"l"}
):
new_pron[-1] = "ʎ"
elif (
p
in {
"β",
"b",
"p",
}
and len(new_pron)
and new_pron[-1] == "n"
):
new_pron[-1] = "m"
elif (
p
in {
"f",
"v",
}
and len(new_pron)
and new_pron[-1] in {"n", "m", "n̪"}
):
new_pron[-1] = "ɱ"
elif lang == "thai":
if p in {"a"} and len(new_pron) and new_pron[-1] in {"i", "iː", "ɯ", "ɯː", "u", "uː"}:
new_pron[-1] += p
continue
elif lang == "turkish":
if p == "ʃ" and len(new_pron) and new_pron[-1] in {"t"}:
new_pron[-1] += p
continue
elif p == "ʒ" and len(new_pron) and new_pron[-1] in {"d"}:
new_pron[-1] += p
continue
elif p in {"e", "i", "œ", "y"} and len(new_pron) and new_pron[-1] in {"k"}:
new_pron[-1] = "c"
elif p in {"e", "i", "œ", "y"} and len(new_pron) and new_pron[-1] in {"ɡ"}:
new_pron[-1] = "ɟ"
elif p in {"a", "ɯ", "o", "u"} and len(new_pron) and new_pron[-1] in {"l"}:
new_pron[-1] = "ɫ"
elif p in {"i", "e", "œ", "y"} and len(new_pron) and new_pron[-1] in {"ɫ"}:
new_pron[-1] = "l"
elif lang == "portuguese_brazil":
if p == "ʃ" and len(new_pron) and new_pron[-1] in {"t"}:
new_pron[-1] += p
continue
elif p == "ʒ" and len(new_pron) and new_pron[-1] in {"d"}:
new_pron[-1] += p
continue
elif lang == "russian":
voiced_set = {
"v",
"bʲ",
"b",
"bː",
"d",
"dz",
"dzʲ",
"dʐ",
"dʲ",
"dʲː",
"dː",
"v",
"vʲ",
"vʲː",
"vː",
"z",
"zʲ",
"zʲː",
"zː",
"ɡ",
"ɡʲ",
"ɡː",
"ɣ",
"ʐ",
"ʐː",
"ʑː",
}
if p in {"ʔ"}:
continue
elif lang == "japanese":
for k, v in LANG_MAPPING[lang].items():
if p == k:
p = v
break
if p in {".", "˕", "}", "˦˨˦", "˥", "˨˩", "˧", "꜔", "˩", "ʔ", "%", "˩˥"}:
continue
elif p in {
"̥",
"̥̥",
} and len(new_pron):
new_pron[-1] += "̥"
continue
elif p in {"ᵝ̥"} and len(new_pron):
if "̥" not in new_pron[-1] and "ː" not in new_pron[-1]:
new_pron[-1] += "̥"
continue
elif p in {"ː̥"} and len(new_pron):
new_pron[-1] += "ː"
continue
elif (
p in {"j"}
and len(new_pron)
and new_pron[-1] in {"ɾ", "p", "m", "b", "k", "t", "d", "ç", "ɡ"}
):
new_pron[-1] += "ʲ"
continue
elif p in {"h"} and len(new_pron) and new_pron[-1] in {"c"}:
new_pron[-1] = "tɕ"
continue
elif p in {"p", "pʲ"} and len(new_pron) and new_pron[-1] in {"p̚"}:
new_pron[-1] = p + "ː"
continue
elif p in {"b"} and len(new_pron) and new_pron[-1] in {"b̚"}:
new_pron[-1] = p + "ː"
continue
elif p in {"ɾ", "ɾʲ"} and len(new_pron) and new_pron[-1] in {"ɾ̚"}:
new_pron[-1] = p + "ː"
continue
elif p in {"k", "kʲ"} and len(new_pron) and new_pron[-1] in {"k̚"}:
new_pron[-1] = p + "ː"
continue
elif p in {"t", "tʲ"} and len(new_pron) and new_pron[-1] in {"ʔ̥", "t̚"}:
new_pron[-1] = p + "ː"
continue
elif p in {"tɕ", "ts"} and len(new_pron) and new_pron[-1] in {"t̚"}:
new_pron[-1] = p + "ː"
continue
elif p in {"ɡ"} and len(new_pron) and new_pron[-1] in {"ɡ̚"}:
new_pron[-1] = p + "ː"
continue
elif p in {"d", "dz", "ʑ", "dʑ"} and len(new_pron) and new_pron[-1] in {"d̚"}:
new_pron[-1] = p + "ː"
continue
elif p in {"i", "iː", "i̥"} and len(new_pron) and "ʲ" in new_pron[-1]:
if len(new_pron) > 2 and "ʲ" in new_pron[-2]:
new_pron[-2] = new_pron[-2].replace("ʲ", "")
new_pron[-1] = new_pron[-1].replace("ʲ", "")
elif p in {"i"} and len(new_pron) and new_pron[-1] == "n":
new_pron[-1] = "ɲ"
elif False and p in {"k", "ɡ"} and len(new_pron) and new_pron[-1] == "ɲ":
new_pron[-1] = "ŋ"
elif False and p in {"t", "d"} and len(new_pron) and new_pron[-1] == "ɲ":
new_pron[-1] = "n"
elif p in {"dz", "dʑ"} and len(new_pron) and new_pron[-1] not in {"ɲ", "n"}:
p = p[1]
elif p in {
"ɯ̟̃ᵝ",
"ɯ̟̊ᵝ",
"ɯ̟ᵝː",
"ɯ̟ᵝ",
"ɨ̥ᵝ",
"ɨᵝ",
"ɨ̃ᵝ",
"ɨᵝː",
"ɨ̥",
"ɨ̥ː",
"ɯ̥ː",
"ɯ̥",
}:
if len(new_pron) and new_pron[-1] in {
"t",
"tː",
"s",
"sː",
"z",
"zː",
"ɲː",
"ɲ",
"ç",
"çː",
"n",
"nː",
"ts",
"tsː",
"ɕ",
"tɕ",
"tɕː",
"ʑ",
"ɕː",
"ʑː",
"ɡʲ",
"ɡʲː",
"kʲ",
"kʲː",
"bʲ",
"bʲː",
"pʲ",
"pʲː",
"mʲ",
"mʲː",
"ɾʲː",
"ɾʲ",
"j",
}:
new_p = "ɨ"
else:
new_p = "ɯ"
if "̥" in p or "̊" in p:
new_p += "̥"
if "ː" in p:
new_p += "ː"
p = new_p
if len(new_pron) and new_pron[-1] == "n":
new_pron[-1] = "ɲ"
elif lang == "korean_hangul":
for k, v in LANG_MAPPING[lang].items():
if p == k:
p = v
break
# if p in {'e', 'ɛː', 'ɛ', 'a', 'o', 'u', 'ʌ'} and len(new_pron) and new_pron[-1] in {'j'}:
# new_pron[-1] += p
# continue
# elif p in {'i'} and len(new_pron) and new_pron[-1] in {'w', 'ɥ'}:
# new_pron[-1] = 'ɥi'
# continue
# elif p in {'e', 'ɛː', 'ɛ', 'a', 'o', 'i', 'ʌ'} and len(new_pron) and new_pron[-1] in {'w'}:
# new_pron[-1] += p
# continue
# elif p in {'i'} and len(new_pron) and new_pron[-1] in {'ɰ'}:
# new_pron[-1] += p
# continue
if p == "t͈" and "ᄄ" not in jamo.h2j(word):
if len(new_pron) and "̚" in new_pron[-1]:
p = "tʰ"
else:
p = "t"
elif p == "tɕ͈" and "ᄍ" not in jamo.h2j(word):
if len(new_pron) and "̚" in new_pron[-1]:
p = "tɕʰ"
else:
p = "tɕ"
elif p == "k͈" and "ᄁ" not in jamo.h2j(word):
if len(new_pron) and "̚" in new_pron[-1]:
p = "kʰ"
else:
p = "k"
elif p == "p͈" and "ᄈ" not in jamo.h2j(word):
if len(new_pron) and "̚" in new_pron[-1]:
p = "pʰ"
else:
p = "p"
elif p == "s͈" and "ᄊ" not in jamo.h2j(word):
if len(new_pron) and "̚" in new_pron[-1]:
p = "sʰ"
else:
p = "s"
elif p == "x" and len(new_pron) and new_pron[-1] == "k":
new_pron[-1] += "ʰ"
continue
elif lang in ["vietnamese_hanoi", "vietnamese_hue", "vietnamese_hochiminhcity"]:
vowel_pattern = re.compile(rf'^[{"".join(VOWELS[lang])}]+$')
tone_pattern = re.compile(r"^[˥˩˦˥˨˧˨˩˩˩ˀ˦]+$")
if p in {"j", "w"} and len(new_pron) and vowel_pattern.match(new_pron[-1]):
new_pron[-1] += p
continue
elif vowel_pattern.match(p) and len(new_pron) and vowel_pattern.match(new_pron[-1]):
new_pron[-1] += p
continue
elif p in {"ɗ", "ɓ"} and len(new_pron) and new_pron[-1] == "ʔ":
new_pron[-1] = p
continue
elif (
p == "ʔ"
and len(new_pron)
and tone_pattern.match(new_pron[-1])
and not (i < len(phones) - 1 and phones[i + 1] in {"ɗ", "ɓ"})
):
new_pron[-1] += "ˀ"
continue
if lang in LANG_MAPPING:
for k, v in LANG_MAPPING[lang].items():
if p == k:
p = v
break
if not p:
continue
new_pron.append(p)
tone_mapping = {
"⁰": "",
"¹": "˩",
"²": "˨",
"³": "˧",
"⁴": "˦",
"⁵": "˥",
"˧": "˧",
"˨˩": "˨˩",
"˥˩": "˥˩",
"˦˥": "˦˥",
"˩˩˦": "˩˩˦",
}
if lang == "thai":
phones = new_pron
new_pron = []
tone_symbols = {"˥˩", "˦˥", "˧", "˨˩", "˩˩˦"}
vowel_set = {x for x in VOWELS[lang]}
vowel_set |= {x + y for x, y in itertools.product(VOWELS[lang], VOWELS[lang])}
vowel_set |= {
x + y + z for x, y, z in itertools.product(VOWELS[lang], VOWELS[lang], VOWELS[lang])
}
for i, p in enumerate(phones):
if p in tone_symbols:
                for j in range(len(new_pron) - 1, -1, -1):
if new_pron[j] in vowel_set and new_pron[j] not in {"w", "j"}:
new_pron[j] += tone_mapping[p]
break
else:
new_pron.append(p)
# split off tone for G2P
# for i, p in enumerate(new_pron):
# for tone in tone_mapping:
# if p.endswith(tone):
# new_pron[i] = p.replace(tone, ' ') + tone
elif lang in ["vietnamese_hanoi", "vietnamese_hue", "vietnamese_hochiminhcity"]:
phones = new_pron
new_pron = []
vowel_pattern = re.compile(rf'^[{"".join(VOWELS[lang])}]+[wj]?$')
tone_symbols = {
"˦ˀ˥",
"˧˦",
"˧˧",
"˧˨",
"˧˩",
"˨˩",
"˦˧˥",
"˦˩",
"˧˧",
"˧˨",
"˨˩",
"˨˩˦",
"˦˥",
"˨˩˨",
}
tone_pattern = re.compile(r"^[˥˩˦˥˨˧˨˩˩˩ˀ˦]+$")
for i, p in enumerate(phones):
if tone_pattern.match(p):
                for j in range(len(new_pron) - 1, -1, -1):
if vowel_pattern.match(new_pron[j]):
new_pron[j] += p
break
else:
new_pron.append(p)
elif lang.startswith("mandarin_hani"):
mapping = {
"²¹⁴": "˨˩˦",
"⁵⁵": "˥˥",
"³⁵": "˧˥",
"⁵¹": "˥˩",
"⁰": "",
"¹": "˩",
"²": "˨",
"³": "˨",
"⁴": "˦",
"⁵": "˥",
}
tone_symbols = {"²", "³", "¹", "⁴", "⁵", "⁰"}
for i, p in enumerate(new_pron):
if any(x in p for x in tone_symbols):
for k, v in mapping.items():
if k in new_pron[i]:
new_pron[i] = new_pron[i].replace(k, v)
# if any(x in new_pron[i] for x in tone_symbols):
# return None
elif lang == "swedish":
for i, p in enumerate(new_pron):
if p == "êː":
new_pron[i] = "eː˧˩"
elif p == "â":
new_pron[i] = "a˧˩"
elif p == "ɛ̂":
new_pron[i] = "ɛ˧˩"
elif p == "ɑ̂ː":
new_pron[i] = "ɑː˧˩"
elif p == "ûː":
new_pron[i] = "uː˧˩"
elif p == "ʉ̂ː":
new_pron[i] = "ʉː˧˩"
elif p == "ɵ̂":
new_pron[i] = "ɵ˧˩"
elif p == "ʉ̂ː":
new_pron[i] = "ʉː˧˩"
elif p == "ʉ̟ː˥˩":
new_pron[i] = "ʉː˥˩"
elif p == "ǎ":
new_pron[i] = "a˥˧"
elif p == "ʉ̟ː˧˩":
new_pron[i] = "ʉː˧˩"
elif p == "ø̀ː":
new_pron[i] = "øː˩"
elif p == "ɑ̀ː":
new_pron[i] = "ɑː˩"
elif p == "ỳː":
new_pron[i] = "yː˩"
elif p == "ỳː˧˩":
new_pron[i] = "yː˧˩"
elif lang == "hausa":
phone_mapping = {
"á": "a",
"áː": "aː",
"é": "e",
"éː": "eː",
"í": "i",
"íː": "iː",
"ó": "o",
"óː": "oː",
"úː": "uː",
"à": "a",
"àː": "aː",
"è": "e",
"èː": "eː",
"ì": "i",
"ìː": "iː",
"ò": "o",
"òː": "oː",
"ùː": "uː",
"â": "a",
"âː": "aː",
"ê": "e",
"êː": "eː",
"î": "i",
"îː": "iː",
"ô": "o",
"ôː": "oː",
"ûː": "uː",
}
for i, p in enumerate(new_pron):
if p in {"á", "áː", "é", "éː", "í", "íː", "ó", "óː", "úː"} or "́" in p: # High tone
if p in phone_mapping:
new_pron[i] = phone_mapping[p]
else:
new_pron[i] = p.replace("́", "")
new_pron[i] += "˥"
elif p in {"à", "àː", "è", "èː", "ì", "ìː", "ò", "òː", "ùː"} or "̀" in p: # Low tone
if p in phone_mapping:
new_pron[i] = phone_mapping[p]
else:
new_pron[i] = p.replace("̀", "")
new_pron[i] += "˩"
elif (
p
in {
"â",
"âː",
"ê",
"êː",
"î",
"îː",
"ôː",
"ûː",
}
or "̂" in p
): # Falling tone
if p in phone_mapping:
new_pron[i] = phone_mapping[p]
else:
new_pron[i] = p.replace("̂", "")
new_pron[i] += "˥˦"
return new_pron
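# Second pass, English only: syllabic consonants, dark /l/, aspiration of voiceless
# stops, and (US English) flapping of /t d/ between syllabic sounds.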
def convert_second_round(word, phones, lang):
if lang not in ["english_us", "english_uk"]:
return phones
new_pron = []
    stressed_vowels = set()
if lang == "english_uk":
stressed_vowels = {
"aɪ",
"aʊ",
"eɪ",
"i",
"iː",
"oɪ",
"oʊ",
"u",
"uː",
"æ",
"ɑ",
"ɑː",
"ɒ",
"ɔ",
"ɔɪ",
"ɔː",
"ɛ",
"ɛː",
"ɜ",
"ɜː",
"ʊ",
"ʌ",
}
elif lang == "english_us":
stressed_vowels = {
"aɪ",
"aʊ",
"eɪ",
"i",
"iː",
"oɪ",
"oʊ",
"u",
"uː",
"æ",
"ɑ",
"ɑː",
"ɔ",
"ɔɪ",
"ɔː",
"ɛ",
"ɝ",
"ɝː",
"ʊ",
"ʌ",
}
all_syllabics = {"ɪ", "ə", "ɚ", "n̩", "m̩", "l̩", "ɫ̩"} | VOWELS[lang]
for i, p in enumerate(phones):
if lang in ["english_us", "english_uk"]:
if (
p == "l"
and 2 < i < len(phones) - 1
and new_pron[-1] == "ə"
and phones[i + 1] not in all_syllabics
):
new_pron[-1] = "ɫ̩"
continue
elif (
p == "m"
and 2 < i < len(phones) - 1
and new_pron[-1] == "ə"
and phones[i + 1] not in all_syllabics
):
new_pron[-1] = "m̩"
continue
elif (
p == "n"
and 2 < i < len(phones) - 1
and new_pron[-1] == "ə"
and phones[i + 1] not in all_syllabics
):
new_pron[-1] = "n̩"
continue
elif p == "l̩" and 1 < i < len(phones) - 1 and phones[i + 1] in all_syllabics:
new_pron.append("ə")
p = "l"
elif p == "l̩":
p = "ɫ̩"
elif p == "l" and i == len(phones) - 1:
p = "ɫ"
elif p == "l" and 1 < i < len(phones) - 1 and phones[i + 1] not in all_syllabics:
p = "ɫ"
elif (
p in {"t", "p", "k"}
and i == 0
and i < len(phones) - 1
and phones[i + 1] in stressed_vowels | {"ɪ", "ə", "ɚ"}
):
p += "ʰ"
elif (
p == "ə"
and 1 < i == len(phones) - 2
and phones[i - 1] in {"d", "t"}
and phones[i + 1] == "d"
):
p = "ɪ"
elif (
p == "ə"
and 1 < i == len(phones) - 2
and phones[i - 1] in {"s", "z", "ʃ", "ʒ", "tʃ", "dʒ"}
and phones[i + 1] == "z"
):
p = "ɪ"
if lang == "english_us":
if (
p == "ɹ"
and 2 < i < len(phones) - 1
and new_pron[-1] == "ə"
and phones[i + 1] not in all_syllabics
):
new_pron[-1] = "ɚ"
continue
elif (
p in {"d", "t"}
and 1 < i < len(phones) - 1
and phones[i - 1] in all_syllabics
and phones[i + 1] in {"n̩", "m̩", "l̩", "ɚ", "ə", "ɫ̩"}
):
p = "ɾ"
elif (
p in {"t", "d"}
and 1 < i < len(phones) - 2
and phones[i - 1] in all_syllabics
and phones[i + 1] == "ɪ"
and phones[i + 2] == "d"
):
p = "ɾ"
elif (
p in {"t", "d"}
and i > 1
and i == len(phones) - 2
and phones[i - 1] in all_syllabics
and phones[i + 1] == "i"
):
p = "ɾ"
elif (
p in {"t", "d"}
and i > 1
and i == len(phones) - 3
and phones[i - 1] in all_syllabics
and phones[i + 1] in {"i", "ɪ"}
and phones[i + 2] == "z"
):
p = "ɾ"
elif (
p in {"t", "d"}
and i > 1
and i == len(phones) - 3
and phones[i - 1] in all_syllabics
and phones[i + 1] == "ɪ"
and phones[i + 2] == "ŋ"
):
p = "ɾ"
elif (
p in {"t", "p", "k"}
and i == 0
and i < len(phones) - 1
and phones[i + 1] in stressed_vowels | {"ɪ", "ə", "ɚ"}
):
p += "ʰ"
elif (
p in {"t", "p", "k"}
and i > 0
and phones[i - 1] not in {"s", "ʃ"}
and i < len(phones) - 1
and phones[i + 1] in stressed_vowels
):
p += "ʰ"
elif p == "l̩" and 1 < i < len(phones) - 1 and phones[i + 1] in all_syllabics:
new_pron.append("ə")
p = "l"
elif p == "l̩":
p = "ɫ̩"
elif p == "l" and i == len(phones) - 1:
p = "ɫ"
elif (
p == "l"
and 1 < i < len(phones) - 1
and phones[i + 1] not in {"ɪ", "ə", "ɚ", "n̩", "m̩", "l̩", "ɫ̩"} | VOWELS[lang]
):
p = "ɫ"
elif (
p == "ə"
and 1 < i == len(phones) - 2
and phones[i - 1] in {"d", "t", "ɾ"}
and phones[i + 1] == "d"
):
p = "ɪ"
elif (
p == "ə"
and 1 < i == len(phones) - 2
and phones[i - 1] in {"s", "z", "ʃ", "ʒ", "tʃ", "dʒ"}
and phones[i + 1] == "z"
):
p = "ɪ"
elif (
p in {"t", "p", "k"}
and i > 0
and phones[i - 1] not in {"s", "ʃ"}
and i < len(phones) - 1
and phones[i + 1] in stressed_vowels
):
p += "ʰ"
elif lang == "english_uk":
if p not in VOWELS[lang] and len(new_pron) and new_pron[-1] == "ɹ":
new_pron[-1] = p
continue
elif (
p in {"t", "p", "k"}
and i > 0
and phones[i - 1] not in {"s", "ʃ"}
and i < len(phones) - 1
and phones[i + 1] in stressed_vowels
):
p += "ʰ"
new_pron.append(p)
return new_pron
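# Normalize phones (apply GLOBAL_REMAPPING, strip tie bars and non-syllabic/linking
# marks, map "g" to IPA "ɡ"), then run both conversion passes and deduplicate entries.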
def fix_pronunciations(dictionary, lang):
filtered_dictionary = []
for word, pronunciation in dictionary:
if lang == "polish":
if "ü" in word:
continue
for i, p in enumerate(pronunciation):
if p in LANG_MAPPING[lang]:
continue
if p in GLOBAL_REMAPPING:
pronunciation[i] = GLOBAL_REMAPPING[p]
elif "̯" in p:
pronunciation[i] = p.replace("̯", "")
elif "͡" in p:
pronunciation[i] = p.replace("͡", "")
elif "‿" in p:
pronunciation[i] = p.replace("‿", "")
elif "͜" in p:
pronunciation[i] = p.replace("͜", "")
elif "g" in p:
pronunciation[i] = p.replace("g", "ɡ")
# Language specific conversions
new_pron = convert_language_specific(word, pronunciation, lang)
new_pron = convert_second_round(word, new_pron, lang)
if new_pron is None:
continue
if (word, new_pron) not in filtered_dictionary:
filtered_dictionary.append((word, new_pron))
return filtered_dictionary
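# Full pipeline for one language: read the scrape(s) (Japanese merges the hiragana,
# katakana, and plain japanese lists), report input inventories, clean, and save the dictionary.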
def process_language(lang):
print("Processing", lang)
if lang == "japanese":
dictionary, input_graphemes, input_phones = read_source(lang + "_hiragana")
d, g, p = read_source(lang + "_katakana")
dictionary.extend(d)
input_graphemes.update(g)
input_phones.update(p)
word_set = {x[0] for x in dictionary}
d, g, p = read_source(lang)
dictionary.extend([x for x in d if x[0] not in word_set])
input_graphemes.update(g)
input_phones.update(p)
else:
dictionary, input_graphemes, input_phones = read_source(lang)
print("Input graphemes", sorted(input_graphemes))
print("Input phones", sorted(input_phones))
filtered = fix_pronunciations(dictionary, lang)
save_dictionary(filtered, lang)
if __name__ == "__main__":
for code in LANG_CODES:
process_language(code)