import collections
import itertools
import os
import re

try:
    import jamo
except ImportError:
    jamo = None

WIKIPRON_DIR = r"C:\Users\micha\Documents\Dev\wikipron\data\scrape\tsv"
OUTPUT_DIR = r""

LANG_PATHS = {"french": "fra_latn_broad_filtered.tsv"}

# Full list of languages
# LANG_CODES = ['bulgarian', 'czech', 'french', 'german', 'mandarin_hani', 'polish', 'portuguese_brazil',
#             'portuguese_portugal', 'russian', 'spanish_spain', 'spanish_latin_america', 'swedish',
#              'tamil', 'thai', 'turkish', 'ukrainian', 'mandarin_hani_beijing', 'mandarin_hani_taiwan', 'mandarin_hani_standard',
#              'korean_hangul', 'hausa', 'japanese', 'vietnamese_hanoi', 'vietnamese_hue', 'vietnamese_hochiminhcity',
#              'serbocroatian_croatian', 'serbocroatian_serbian']
LANG_CODES = ["czech"]

BAD_GRAPHEMES = {
    "english_us": {
        "%",
        "/",
        "@",
        "²",
        "à",
        "á",
        "â",
        "ä",
        "æ",
        "ç",
        "è",
        "é",
        "ê",
        "ë",
        "í",
        "î",
        "ï",
        "ñ",
        "ó",
        "ô",
        "õ",
        "ö",
        "ø",
        "ù",
        "ú",
        "ü",
        "ā",
        "ą",
        "č",
        "ē",
        "ę",
        "ğ",
        "ı",
        "ł",
        "ń",
        "ō",
        "ő",
        "œ",
        "ř",
        "ū",
        "ș",
        "ț",
        "ʼ",
        "ṭ",
        "₂",
    },
    "english_uk": {
        "%",
        "/",
        "@",
        "²",
        "à",
        "á",
        "â",
        "ä",
        "æ",
        "ç",
        "è",
        "é",
        "ê",
        "ë",
        "í",
        "î",
        "ï",
        "ñ",
        "ó",
        "ô",
        "õ",
        "ö",
        "ø",
        "ù",
        "ú",
        "ü",
        "ā",
        "ą",
        "č",
        "ē",
        "ę",
        "ğ",
        "ı",
        "ł",
        "ń",
        "ō",
        "ő",
        "œ",
        "ř",
        "ū",
        "ș",
        "ț",
        "ʼ",
        "ṭ",
        "₂",
        "ã",
        "å",
        "û",
        "ī",
        "ž",
        ".",
    },
    "polish": {"+", ".", "ü", "ö", "ø", "ƶ", "ñ", "ç", "à", "á", "è", "é", "í"},
    "french": {".", "/", "º", "å", "æ", "ÿ", "ș"},
    "japanese": {" ", "&", "+", "、", "〆", "〼", "〼", "＝", "𫡤", "・", "×", "ゞ", "ゟ", "ゑ", "ゐ", "ヲ"},
    "mandarin_hani_beijing": {
        "A",
        "B",
        "C",
        "D",
        "E",
        "G",
        "H",
        "I",
        "K",
        "M",
        "N",
        "O",
        "P",
        "Q",
        "S",
        "T",
        "U",
        "V",
        "X",
        "Y",
        "Z",
        "e",
        "p",
        "u",
        "·",
        "α",
        "β",
        "γ",
        "…",
        "⿰",
        "ㄅ",
        "ㄆ",
        "ㄇ",
        "ㄈ",
        "𰚼",
        "𰯼",
        "𫇦",
    },
    "mandarin_hani_taiwan": {
        "A",
        "B",
        "C",
        "D",
        "E",
        "G",
        "H",
        "I",
        "K",
        "M",
        "N",
        "O",
        "P",
        "Q",
        "S",
        "T",
        "U",
        "V",
        "X",
        "Y",
        "Z",
        "e",
        "p",
        "u",
        "·",
        "α",
        "β",
        "γ",
        "…",
        "⿰",
        "ㄅ",
        "ㄆ",
        "ㄇ",
        "ㄈ",
        "𰚼",
        "𰯼",
        "𫇦",
    },
    "mandarin_hani_standard": {
        "A",
        "B",
        "C",
        "D",
        "E",
        "G",
        "H",
        "I",
        "K",
        "M",
        "N",
        "O",
        "P",
        "Q",
        "S",
        "T",
        "U",
        "V",
        "X",
        "Y",
        "Z",
        "e",
        "p",
        "u",
        "·",
        "α",
        "β",
        "γ",
        "…",
        "⿰",
        "ㄅ",
        "ㄆ",
        "ㄇ",
        "ㄈ",
        "𰚼",
        "𰯼",
        "𫇦",
    },
    "german": {"'", ".", "@", "à", "á", "ç", "è", "é", "ê", "ó", "ø", "œ", "í", "ë"},
    "portuguese_brazil": {"'", "."},
    "portuguese_portugal": {"'", "."},
    "russian": {"'", ".", "/", "ѳ"},
    "spanish_spain": {"'", ".", "ö", "ꝇ", "î", "ç"},
    "spanish_latin_america": {"'", ".", "ö", "ꝇ", "î", "ç"},
    "thai": {"…", "'", "/"},
    "turkish": {"̇", "'"},
    "tamil": {"ࢳ", "ࢳ", "ࢴ", "ࢴ", "ஃ"},
    "vietnamese_hanoi": {
        "'",
        ".",
        ",",
    },
    "vietnamese_hue": {
        "'",
        ".",
        ",",
    },
    "vietnamese_hochiminhcity": {
        "'",
        ".",
        ",",
    },
}


BAD_PHONES = {
    "english_uk": {"ɪː", "aː", "eː", "a", "o", "oː", "eː", "e"},
    "english_us": {"ɒ", "aː", "a", "o", "oː", "eː", "e", "ɪː", "ɛː"},
    "german": {"ʊɪ"},
    "czech": {"ə"},
    "spanish_latin_america": {"ɹ", "ɚ", "ʒ", "ə", "ɪ"},
    "spanish_spain": {"ɹ", "ɚ", "ʒ", "ə", "ɪ"},
    "mandarin_hani_taiwan": {"ai", "a", "ei", "o", "ə", "z̩", "ʐ̩"},
    "mandarin_hani_standard": {"ai", "a", "ei", "o", "ə", "z̩", "ʐ̩"},
    "mandarin_hani_beijing": {"ai", "a", "ei", "o", "ə", "z̩", "ʐ̩"},
}

VOWELS = {
    "english_us": {
        "aɪ",
        "aʊ",
        "eɪ",
        "i",
        "iː",
        "oɪ",
        "oʊ",
        "u",
        "uː",
        "æ",
        "ɑ",
        "ɑː",
        "ɔ",
        "ɔɪ",
        "ɔː",
        "ə",
        "ɚ",
        "ɛ",
        "ɝ",
        "ɝː",
        "ɪ",
        "ʊ",
        "ʌ",
    },
    "english_uk": {
        "aɪ",
        "aʊ",
        "eɪ",
        "i",
        "iː",
        "oɪ",
        "oʊ",
        "u",
        "uː",
        "æ",
        "ɑ",
        "ɑː",
        "ɔ",
        "ɔɪ",
        "ɔː",
        "ə",
        "ɚ",
        "ɛ",
        "ɝ",
        "ɝː",
        "ɪ",
        "ʊ",
        "ʌ",
        "aɪ",
        "aʊ",
        "eɪ",
        "i",
        "iː",
        "oɪ",
        "oʊ",
        "u",
        "uː",
        "æ",
        "ɑ",
        "ɑː",
        "ɒ",
        "ɔ",
        "ɔɪ",
        "ɔː",
        "ɛ",
        "ɛː",
        "ɜ",
        "ɜː",
        "ʊ",
        "ʌ",
    },
    "vietnamese_hanoi": {"a", "aː", "e", "i", "o", "u", "ɔ", "ə", "əː", "ɛ", "ɨ", "ʊ", "ɪ"},
    "vietnamese_hue": {"a", "aː", "e", "i", "o", "u", "ɔ", "ə", "əː", "ɛ", "ɨ", "ʊ", "ɪ"},
    "vietnamese_hochiminhcity": {
        "a",
        "aː",
        "e",
        "i",
        "o",
        "u",
        "ɔ",
        "ə",
        "əː",
        "ɛ",
        "ɨ",
        "ʊ",
        "ɪ",
    },
    "mandarin_hani": {"a", "e", "o", "i", "u", "y", "ə", "ɚ", "ɤ̃", "ʊ̃"},
    "mandarin_hani_standard": {"a", "e", "o", "i", "u", "y", "ə", "ɚ", "ɤ̃", "ʊ̃"},
    "mandarin_hani_taiwan": {"a", "e", "o", "i", "u", "y", "ə", "ɚ", "ɤ̃", "ʊ̃"},
    "mandarin_hani_beijing": {"a", "e", "o", "i", "u", "y", "ə", "ɚ", "ɤ̃", "ʊ̃"},
    "thai": {
        "a",
        "aː",
        "e",
        "eː",
        "i",
        "iː",
        "o",
        "oː",
        "u",
        "ə",
        "uː",
        "ɔ",
        "ɔː",
        "ɛ",
        "ɛː",
        "ɤ",
        "ɤː",
        "ɯ",
        "ɯː",
    },
    "swedish": {
        "a",
        "aʊ",
        "aː",
        "e",
        "eː",
        "i",
        "iː",
        "o",
        "oː",
        "u",
        "uː",
        "y",
        "yʷ",
        "yː",
        "æ",
        "æː",
        "êː",
        "ø",
        "øː",
        "ø̀ː",
        "œ",
        "œː",
        "œ̞",
        "œ̞ː",
        "ɑ",
        "ɑː",
        "ɒː",
        "ɒ̀ː",
        "ɔ",
        "ə",
        "ɚ",
        "ɛ",
        "ɛɵ",
        "ɛː",
        "ɛ̂",
        "ɛ̄",
        "ɜ",
        "ɝ",
        "ɪ",
        "ɵ",
        "ɵː",
        "ɵ̄",
        "ɶ",
        "ɶː",
        "ʉ",
        "ʉː",
        "ʉ̂ː",
        "ʉ̟ː",
        "ʊ",
        "ʊː",
        "ʏ",
        "ỳː",
        "ỵː",
    },
}

VOWEL_PATTERNS = {"swedish": re.compile(r"^[aeiɛøæuoʊêɔɪœɑʉɵɶ̂œ̞ː˧˩ɒyʏʉ̟ː˧˩əː˧˩˥]+$")}


LANG_MAPPING = {
    "bulgarian": {
        "d̪": "d",
        "t̪": "t",
        "ɐ": "a",
        "æ": "a",
        "a̟": "a",
        "e": "ɛ",
        "ə": "ɤ",
        "o̝": "ɔ",
        "o̟": "ɔ",
        "u̟": "u",
        "ʉ": "u",
        "ʊ": "u",
        "ɤ̞": "ɤ",
        "ɤ̟": "ɤ",
        "lʲ": "ʎ",
        "l": "ɫ",
        "ɾ": "r",
        "iː": "i j",
        "s̪": "s",
        "n̪": "n",
        "ɾʲ": "rʲ",
        "nʲ": "ɲ",
        "ɡʲ": "ɟ",
        "kʲ": "c",
    },
    "russian": {},
    "czech": {
        "ɫ": "l",
        "ɾ": "r",
        "ɔ": "o",
        "ɔː": "oː",
    },
    "serbocroatian_croatian": {
        "ʋ": "v",
        "ɕ": "ʃ",
        "ʑ": "ʒ",
        "ô": "o˦˨",
        "ôː": "oː˦˨",
        "ûː": "uː˦˨",
        "û": "u˦˨",
        "î": "i˦˨",
        "îː": "iː˦˨",
        "êː": "eː˦˨",
        "ê": "e˦˨",
        "âː": "aː˦˨",
        "â": "a˦˨",
        "r̂": "r̩˦˨",
        "r̂ː": "r̩ː˦˨",
        "řː": "r̩ː˨˦",
        "ř": "r̩˨˦",
        "ěː": "eː˨˦",
        "ě": "e˨˦",
        "ǎ": "a˨˦",
        "ǎː": "aː˨˦",
        "ǐː": "iː˨˦",
        "ǐ": "i˨˦",
        "ǒ": "o˨˦",
        "ǒː": "oː˨˦",
        "ǔː": "uː˨˦",
        "ǔ": "u˨˦",
    },
    "serbocroatian_serbian": {
        "ʋ": "v",
        "ɕ": "ʃ",
        "ʑ": "ʒ",
        "ô": "o˦˨",
        "ôː": "oː˦˨",
        "ûː": "uː˦˨",
        "û": "u˦˨",
        "î": "i˦˨",
        "îː": "iː˦˨",
        "êː": "eː˦˨",
        "ê": "e˦˨",
        "âː": "aː˦˨",
        "â": "a˦˨",
        "r̂": "r̩˦˨",
        "r̂ː": "r̩ː˦˨",
        "řː": "r̩ː˨˦",
        "ř": "r̩˨˦",
        "ěː": "eː˨˦",
        "ě": "e˨˦",
        "ǎ": "a˨˦",
        "ǎː": "aː˨˦",
        "ǐː": "iː˨˦",
        "ǐ": "i˨˦",
        "ǒ": "o˨˦",
        "ǒː": "oː˨˦",
        "ǔː": "uː˨˦",
        "ǔ": "u˨˦",
    },
    "french": {"r": "ʁ", "œ̃": "ɛ̃"},
    "vietnamese_hanoi": {
        "k̟̚": "k̚",
        "ŋ̟": "ŋ",
        "ï": "ɨ",
    },
    "german": {
        "b̥": "b",
        "d̥": "d",
        "ɡ̊": "ɡ",
        "r": "ʁ",
        "ŋ̍": "n̩",
        "ɱ̩": "n̩",
        "ŋ̩": "n̩",
        "ʀ": "ʁ",
        "χ": "x",
        "ʋ": "v",
        "ɘ": "ə",
        "i": "ɪ",
        "ø": "øː",
        "o": "ɔ",
        "u": "ʊ",
        "œː": "øː",
        "y": "ʏ",
        "e": "ɛ",
        "ɛː": "eː",
        "ɔː": "oː",
        "ɑː": "aː",
        "ɒː": "aː",
    },
    "mandarin_hani": {
        "b̥": "p",
        "d̥": "t",
        "g̊": "k",
        "ɡ̊": "k",
        "ɖʐ̥": "ʈʂ",
        "dz̥": "ts",
        "dʑ̥": "tɕ",
        "ä": "a",
        "æ̃": "a",
        "ɤ": "o",
        "ɤ̃": "o",
        "ʊ̃": "o",
        "ɪ": "i",
        "ɻʷ": "ɻ",
        "ʊ": "u",
        "ɛ": "e",
        "ɑ": "a",
        "ɑ̃": "a",
        "ɔ": "o",
        "ɔː": "o",
        "⁵⁵": "˥",
        "⁵¹": "˥˩",
        "³⁵": "˧˥",
        "²¹⁴": "˨˩˦",
    },
    "mandarin_hani_taiwan": {
        "b̥": "p",
        "d̥": "t",
        "g̊": "k",
        "ɡ̊": "k",
        "ɖʐ̥": "ʈʂ",
        "dz̥": "ts",
        "dʑ̥": "tɕ",
        "ä": "a",
        "æ̃": "a",
        "ɤ": "o",
        "ɤ̃": "o",
        "ʊ̃": "o",
        "ɪ": "i",
        "ɻʷ": "ɻ",
        "ʊ": "u",
        "ɛ": "e",
        "ɑ": "a",
        "ɑ̃": "a",
        "ɔ": "o",
        "ɔː": "o",
        "⁵⁵": "˥",
        "⁵¹": "˥˩",
        "³⁵": "˧˥",
        "²¹⁴": "˨˩˦",
    },
    "mandarin_hani_beijing": {
        "b̥": "p",
        "d̥": "t",
        "g̊": "k",
        "ɡ̊": "k",
        "ɖʐ̥": "ʈʂ",
        "dz̥": "ts",
        "dʑ̥": "tɕ",
        "ä": "a",
        "æ̃": "a",
        "ɤ": "o",
        "ɤ̃": "o",
        "ʊ̃": "o",
        "ɪ": "i",
        "ɻʷ": "ɻ",
        "ʊ": "u",
        "ɛ": "e",
        "ɑ": "a",
        "ɑ̃": "a",
        "ɔ": "o",
        "ɔː": "o",
        "⁵⁵": "˥",
        "⁵¹": "˥˩",
        "³⁵": "˧˥",
        "²¹⁴": "˨˩˦",
    },
    "mandarin_hani_standard": {
        "b̥": "p",
        "d̥": "t",
        "g̊": "k",
        "ɡ̊": "k",
        "ɖʐ̥": "ʈʂ",
        "dz̥": "ts",
        "dʑ̥": "tɕ",
        "ä": "a",
        "æ̃": "a",
        "ɤ": "o",
        "ɤ̃": "o",
        "ʊ̃": "o",
        "ɪ": "i",
        "ɻʷ": "ɻ",
        "ʊ": "u",
        "ɛ": "e",
        "ɑ": "a",
        "ɑ̃": "a",
        "ɔ": "o",
        "ɔː": "o",
        "⁵⁵": "˥",
        "⁵¹": "˥˩",
        "³⁵": "˧˥",
        "²¹⁴": "˨˩˦",
    },
    "polish": {
        "s̪": "s",
        "r̥ː": "r",
        "r̥": "r",
        "ɫ": "l",
        "w̃": "n",
    },
    "portuguese_brazil": {
        "ã": "ɐ̃",
        "ɫ": "l",
        "ʁ": "x",
        "ɹ": "x",
        "ɻ": "x",
        "χ": "x",
        "ɦ": "x",
        "h": "x",
        "r": "x",
        "ɪ": "i",
        "ʊ": "u",
    },
    "portuguese_portugal": {
        "ã": "ɐ̃",
        "ɫ": "l",
        "r": "ʁ",
    },
    "swedish": {
        "ɛ̄": "ɛ̂",
        "ɵ̄": "ɵ̂",
        "ɘ": "ɵ",
        "ə": "ɛ",
        "ʁ": "r",
        "ɾ": "r",
        "ɹ": "r",
        "v": "ʋ",
        "w": "ʋ",
        "ɜ": "ɛ",
        "æː": "ɛː",
        "ø": "øː",
        "æ": "ɛ",
        "ˇl": "l",
        "yʷ": "y",
        "œ̞ː": "øː",
        "œː": "øː",
        "œ̞": "œ",
        "ç": "ɕ",
        "bː": "b",  # removing length in consonants
        "ɖː": "ɖ",
        "ɖˑ": "ɖ",
        "ˈt": "tʰ",
        "ˈk": "kʰ",
        "ˈp": "pʰ",
        "dː": "d",
        "jː": "j",
        "kː": "kʰ",
        "lː": "l",
        "mː": "m",
        "nː": "n",
        "fː": "f",
        "ɧː": "ɧ",
        "pː": "pʰ",
        "rː": "r",
        "sː": "s",
        "tˑ": "t",
        "tʰː": "tʰ",
        "pʰː": "pʰ",
        "kʰː": "kʰ",
        "tː": "tʰ",
        "ŋː": "ŋ",
        "ɲ": "ɳ",
        "ɕː": "ɕ",
        "ɡː": "ɡ",
        "ʈː": "ʈʰ",
        "ʈʰː": "ʈʰ",
        "ʂː": "ʈ",
        "ỵː": "yː",
        "ʉ̟̂": "ʉ̂",
        "ʉ̟ː": "ʉː",
        "ʉ̂": "ʉ̂ː",
        "ɒː": "ɑː",
        "aː": "ɑː",
        "ɑ": "ɑː",
        "e": "eː",
        "o": "oː",
        "u": "uː",
        "i": "iː",
        "y": "yː",
        "ɒ̀ː": "ɑ̀ː",
        "ʊː": "ʊ",
        "ʉ": "ʉː",
        "ɵː": "uː",
        "ɶː": "øː",
    },
    "tamil": {
        "l̪": "l",
        "l̪ː": "lː",
        "r̥": "r",
        "ɾ̪": "ɾ",
        "h": "ɦ",
        "tʃ": "tɕ",
        "ɕ": "tɕ",
        "tʃː": "tɕː",
    },
    "thai": {
        "cʰ": "tɕʰ",
        "c": "tɕ",
        "ɔ̌": "ɔ˩˩˦",
        "ǎː": "aː˩˩˦",
        "áː": "aː˦˥",
        "à": "a˨˩",
        "ì": "i˨˩",
    },
    "ukrainian": {
        "ɫ": "l",
        "ʍ": "ʋ",
        "w": "ʋ",
        "v": "ʋ",
        #'e': 'ɛ',
        #'o': 'ɔ',
        "ɫː": "lː",
    },
    "japanese": {
        "o̞": "o",
        "n̩": "n",
        "ä": "a",
        "ɡ̊": "ɡ",
        "ḁ": "a",
        "ẽ": "e",
        "m̩ː": "mː",
        "e̥": "e",
        "u͍": "ɯ",
        "ɯ̃ᵝ": "ɯ",
        "u͍ː": "ɯː",
        "w͍": "w",
        "y": "j",
        "r": "ɾ",
        "ɽ": "ɾ",
        "ɾ̥": "ɾ",
        "ɯᵝ": "ɯ",
        "ɯᵝː": "ɯː",
        "ɯ̟̃ᵝː": "ɯː",
        "ɯ̥ᵝ": "ɯ̥",
        "ʲkʲ": "kʲ",
        "nʲ": "ɲ",
        "tɕʲ": "tɕ",
        "ɕʲ": "ɕ",
        "ĩː": "iː",
        "õ̞ː": "oː",
        "i̥̥": "i̥",
        "e̞̊": "e",
        "ẽ̞ː": "eː",
        "ã̠ː": "aː",
        "õ̞": "o",
        "d̥": "d",
        "b̥": "b",
        "o̞ː": "oː",
        "e̞ː": "eː",
        "e̞": "e",
        "ẽ̞": "e",
        "ĩ": "i",
        "ɸ̥": "ɸ",
        "ɨ̃ᵝː": "ɨː",
        "ĩ̥": "i",
        "a̠ː": "aː",
        "a̠": "a",
        "o̞̊": "o",
        "dʑʲ": "dʑ",
        "ɾ̠": "ɾ",
        "ã̠": "a",
        "õ̥": "o",
        "dʲ": "dʑ",
        "tʲ": "tɕ",
        # 'ɯ̟ᵝ': 'ɯ',
        "ɰᵝ": "w",
        "ɰᵝː": "wː",
        # 'ɯ̟̊ᵝ': 'ɨ̥',
        # 'ɯ̟ᵝː': 'ɨː',
        # 'ɯ̟̃ᵝ': 'ɨ',
        # 'ɨ̥ᵝ': 'ɨ̥',
        # 'ɨᵝ': 'ɨ',
        # 'ɨ̃ᵝ': 'ɨ',
        # 'ɨᵝː': 'ɨː',
        "ɯ̟̊": "ɯ̥",
        "ɲ̟": "ɲ",
        "ŋʲ": "ɲ",
        "p̚ʲ": "p̚",
        "k̚ʲ": "k̚",
        "t̚ʲ": "t̚",
    },
    "turkish": {
        "ɑ": "a",
        "ɑː": "a",
        "aː": "a",
        "iː": "i",
        "uː": "u",
        "ɛ": "e",
        "e̞": "e",
        "ɔ": "o",
        "ʊ": "u",
        "ʏ": "y",
        "β": "v",
        "o̞": "o",
        "ɪ": "i",
        "ø": "œ",
        "ɾ̝̊": "ɾ",
    },
    "korean_hangul": {
        "a̠": "a",
        "e̞": "e",
        "e̞ː": "eː",
        "a̠ː": "a",
        "o̞": "o",
        "o̞ː": "oː",
        "ʌ̹": "ʌ",
        "ɘː": "ʌː",
        "ɦ": "h",
        "ɸʷ": "ɸ",
        "ʃʰ": "sʰ",
    },
    "english_uk": {
        "ɝː": "ɜː",
        "əː": "ɜː",
        "æː": "æ",
        "ɝ": "ɜ",
        "ɚ": "ə",
        "ɫ": "l",
        "r": "ɹ",
        "ʍ": "w",
    },
    "english_us": {
        "ɫ": "l",
        "r": "ɹ",
        "ʍ": "w",
        "æː": "æ",
    },
    "spanish_spain": {
        "ɣ̞": "ɣ",
        "β̞": "β",
        "ð̞": "ð",
        "θ̬": "θ",
        "w̝": "w",
        "nʲ": "ɲ",
        "n̟": "n",
        "lʲ": "ʎ",
        "l̟": "l",
        "i̯": "j",
        "u̯": "w",
        "h": "x",
        "n̪": "n",
        "d": "d̪",
    },
    "spanish_latin_america": {
        "ɣ̞": "ɣ",
        "β̞": "β",
        "ð̞": "ð",
        "w̝": "w",
        "nʲ": "ɲ",
        "lʲ": "ʎ",
        "i̯": "j",
        "u̯": "w",
        "n̪": "n",
        "l̪": "l",
        "l̟": "l",
        "h": "x",
        "n̟": "n",
        "d": "d̪",
    },
}

GLOBAL_REMAPPING = {
    "õ": "õ",  # Fix glyphs to use diacritics
    "ẽ": "ẽ",
    "ũ": "ũ",
    "ĩ": "ĩ",
    "ã": "ã",
}


def read_source(lang):
    graphemes = set()
    phones = set()
    dictionary = []
    path = os.path.join(WIKIPRON_DIR, LANG_PATHS[lang])
    with open(path, "r", encoding="utf8") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            if "\t" in line:
                line = line.split("\t")
                word = line[0]
                pronunciation = line[1].split()
            else:
                line = line.split()
                word = line[0]
                pronunciation = line[1:]
            word = word.lower()
            if lang in BAD_GRAPHEMES:
                if any(x in BAD_GRAPHEMES[lang] for x in word):
                    print(word)
                    continue
            graphemes.update(word)
            phones.update(pronunciation)
            dictionary.append((word, pronunciation))
    return dictionary, graphemes, phones


def save_dictionary(dictionary, lang):
    deduplication = set()
    final_phones = collections.Counter()
    path = os.path.join(OUTPUT_DIR, f"{lang}_mfa.dict")
    with open(path, "w", encoding="utf8") as f:
        for w, p in sorted(dictionary):
            final_phones.update(p)
            p = " ".join(p)
            if (w, p) in deduplication:
                continue
            f.write("{}\t{}\n".format(w, p))
            deduplication.add((w, p))
    print("Final phones:", sorted(final_phones))
    print("Final phone counts:", sorted(final_phones.items(), key=lambda x: -x[1]))


def convert_language_specific(word, phones, lang):
    new_pron = []
    if lang == "swedish":
        for i, p in enumerate(phones):
            for k, v in LANG_MAPPING[lang].items():
                if p == k:
                    phones[i] = v
                    break

        for i, p in enumerate(phones):
            if p == "¹":
                found_first = False
                found_second = False
                for j in range(i + 1, len(phones)):
                    if VOWEL_PATTERNS[lang].match(phones[j]):
                        if not found_first:
                            phones[j] += "˥˧"  # Falling tone
                            found_first = True
                        elif not found_second:
                            phones[j] += "˩"  # Low tone
                            found_second = True
                        else:
                            break
                continue
            elif p == "²":
                found_first = False
                found_second = False
                for j in range(i + 1, len(phones)):
                    if phones[j] in VOWELS[lang]:
                        if not found_first:
                            phones[j] += "˧˩"  # Falling tone
                            found_first = True
                        elif not found_second:
                            phones[j] += "˥˩"  # Falling tone
                            found_second = True
                        else:
                            break
                continue
            new_pron.append(p)
        phones = new_pron
    new_pron = []
    for i, p in enumerate(phones):
        if lang == "english_us":
            if lang in LANG_MAPPING:
                for k, v in LANG_MAPPING[lang].items():
                    if p == k:
                        p = v
                        break

            if p == "ʒ" and len(new_pron) and new_pron[-1] == "d":  # fix up affricates being split
                new_pron[-1] = "dʒ"
                continue
            elif (
                p == "ʃ" and len(new_pron) and new_pron[-1] == "t"
            ):  # fix up affricates being split
                new_pron[-1] = "tʃ"
                continue
            elif p in ["ɪ", "j"] and len(new_pron) and new_pron[-1] in {"e", "ɔ", "o"}:
                new_pron[-1] += "ɪ"
                continue
            elif p in {"ʊ", "u"} and len(new_pron) and new_pron[-1] in {"a", "ɑ", "ʌ"}:
                new_pron[-1] = "aʊ"
                continue
            elif p in ["ɪ", "j"] and len(new_pron) and new_pron[-1] in {"a", "ɑ", "ʌ"}:
                new_pron[-1] = "aɪ"
                continue
            elif p in {"ʊ", "u"} and len(new_pron) and new_pron[-1] in {"ə", "o", "ɔ"}:
                new_pron[-1] = "oʊ"
                continue
            elif p in {"ɹ", "ɚ"} and len(new_pron) and new_pron[-1] in {"o", "ɔ"}:
                new_pron[-1] = "ɔ"
                p = "ɹ"
            elif p in {"ɹ", "ɚ"} and len(new_pron) and new_pron[-1] in {"i", "ɪː", "ɪ"}:
                new_pron[-1] = "ɪ"
                p = "ɹ"
            elif p in {"ɹ", "ɚ"} and len(new_pron) and new_pron[-1] in {"u", "ʊ"}:
                new_pron[-1] = "ʊ"
                p = "ɹ"
            elif p in {"ɹ", "ɚ"} and len(new_pron) and new_pron[-1] in {"e", "ɛ", "ɛː", "æ", "æː"}:
                new_pron[-1] = "ɛ"
                p = "ɹ"
            elif p == "ɹ" and len(new_pron) and new_pron[-1] in ["ɜ", "ɜː"]:
                new_pron[-1] = "ɝ"
                continue
            elif (
                p == "ɹ"
                and len(new_pron) > 1
                and new_pron[-1] == "ə"
                and new_pron[-2] in {"ɪ", "i", "ɪː"}
            ):
                new_pron[-1] = "ɹ"
                new_pron[-2] = "ɪ"
                continue
            elif (
                p == "ɹ"
                and len(new_pron) > 1
                and new_pron[-1] == "ə"
                and new_pron[-2] in {"ʊ", "u"}
            ):
                new_pron[-1] = "ɹ"
                new_pron[-2] = "ʊ"
                continue
            elif (
                p == "ɹ"
                and len(new_pron) > 1
                and new_pron[-1] == "ə"
                and new_pron[-2] in {"e", "ɛ", "ɛː"}
            ):
                new_pron[-1] = "ɹ"
                new_pron[-2] = "ɛ"
                continue
            elif p == "w" and len(new_pron) and new_pron[-1] == "h":  # get rid of h w sequences
                new_pron[-1] = "w"
                continue
            elif p in {"k", "ɡ"} and len(new_pron) and new_pron[-1] == "n":
                new_pron[-1] = "ŋ"
                continue
            elif p in {"ɜ", "ɜː"} and (i == len(phones) - 1 or phones[i + 1] != "ɹ"):
                p = "ɝ"
            elif p == "ɪ" and i == len(phones) - 1:
                p = "i"
            elif (
                p == "l" and len(new_pron) and new_pron[-1] == "ə" and i == len(phones) - 1
            ):  # final syllabic l's
                new_pron[-1] = "l̩"
                continue
            elif (
                p == "m" and len(new_pron) and new_pron[-1] == "ə" and i == len(phones) - 1
            ):  # final syllabic m's
                new_pron[-1] = "m̩"
                continue
            elif (
                p == "n" and len(new_pron) and new_pron[-1] == "ə" and i == len(phones) - 1
            ):  # final syllabic n's
                new_pron[-1] = "n̩"
                continue
            elif (
                p == "ɹ" and len(new_pron) and new_pron[-1] == "ə" and i == len(phones) - 1
            ):  # final syllabic r's
                new_pron[-1] = "ɚ"
                continue
            elif p == "ŋ" and i == len(phones) - 1 and new_pron[-1] == "i":
                new_pron[-1] = "ɪ"
                continue
        elif lang == "english_uk":
            if lang in LANG_MAPPING:
                for k, v in LANG_MAPPING[lang].items():
                    if p == k:
                        p = v
                        break
            if p == "ɹ" and i == len(phones) - 1:
                continue
            elif p == "ɪ" and i == len(phones) - 1:
                p = "i"
            elif (
                p in {"l", "m", "n"}
                and i == len(phones) - 1
                and len(new_pron)
                and new_pron[-1] in {"ə", "əː"}
            ):
                new_pron[-1] = p + "̩"
                continue
            elif p == "ɪ" and len(new_pron) and new_pron[-1] in {"e", "a", "ɔ", "o"}:
                new_pron[-1] = new_pron[-1] + p
                continue
            elif p == "ʊ" and len(new_pron) and new_pron[-1] in {"e", "a"}:
                new_pron[-1] = new_pron[-1] + p
                continue
            elif p in {"ʊ", "u"} and len(new_pron) and new_pron[-1] in {"ə", "o", "ɔ"}:
                new_pron[-1] = "oʊ"
                continue
            elif (
                p == "ʒ" and len(new_pron) and new_pron[-1] == "d"
            ):  # fix up affricates being split
                new_pron[-1] = "dʒ"
                continue
            elif (
                p == "ʃ" and len(new_pron) and new_pron[-1] == "t"
            ):  # fix up affricates being split
                new_pron[-1] = "tʃ"
                continue
            elif p == "w" and len(new_pron) and new_pron[-1] == "h":  # get rid of h w sequences
                new_pron[-1] = "w"
                continue
            elif p in {"k", "ɡ"} and len(new_pron) and new_pron[-1] == "n":
                new_pron[-1] = "ŋ"
                continue
            elif p == "ə" and len(new_pron) and new_pron[-1] == "ɛ":
                new_pron[-1] = "ɛː"
                continue
            elif p == "ŋ" and i == len(phones) - 1 and new_pron[-1] == "i":
                new_pron[-1] = "ɪ"
                continue
            elif (
                p == "ɹ"
                and len(new_pron) > 2
                and new_pron[-1] == "ə"
                and new_pron[-2] in {"e", "ɛ", "ʊ", "ɪ", "ɪː", "ɛː"}
            ):
                new_pron[-1] = p
                continue
        elif lang == "bulgarian":
            if p in {"s", "ʃ", "sʲ"} and len(new_pron) and new_pron[-1] == "t":
                new_pron[-1] += p
                continue
            elif p == "ʒ" and len(new_pron) and new_pron[-1] == "d":
                new_pron[-1] = "dʒ"
                continue
            elif p in {"ɡ", "k"} and len(new_pron) and new_pron[-1] in {"n"}:
                new_pron[-1] = "ŋ"
            elif p in {"v", "f"} and len(new_pron) and new_pron[-1] in {"n"}:
                new_pron[-1] = "ɱ"
        elif lang == "czech":
            if p in {"u", "ʊ"} and len(new_pron) and new_pron[-1] in {"o", "ɔ"}:
                new_pron[-1] = "ow"
                continue
            elif p in ["u", "ʊ"] and len(new_pron) and new_pron[-1] in {"a"}:
                new_pron[-1] = "aw"
                continue
            elif p in {"u", "ʊ"} and len(new_pron) and new_pron[-1] in {"e", "ɛ"}:
                new_pron[-1] = "ew"
                continue
            elif p in {"ʃ", "s"} and len(new_pron) and new_pron[-1] in {"t"}:
                new_pron[-1] += p
                continue
            elif p in {"ʒ"} and len(new_pron) and new_pron[-1] in {"d"}:
                new_pron[-1] += p
                continue
            elif p == "ʊ":
                p = "u"
            elif p == "e":
                p = "ɛ"
        elif lang.startswith("serbocroatian"):
            if p in {"ɕ", "ʂ", "ʃ"} and len(new_pron) and new_pron[-1] == "t":
                new_pron[-1] += p
                continue
            elif p in {"ʑ", "ʐ", "ʒ"} and len(new_pron) and new_pron[-1] == "d":
                new_pron[-1] += p
                continue
        elif lang == "german":
            if lang in LANG_MAPPING:
                for k, v in LANG_MAPPING[lang].items():
                    if p == k:
                        p = v
                        break
            if p in {"ʏ", "ɪ"} and len(new_pron) and new_pron[-1] in {"o", "ɔ"}:
                new_pron[-1] = "ɔʏ"
                continue
            elif p == "ɪ" and len(new_pron) and new_pron[-1] == "a":
                new_pron[-1] = "aɪ"
                continue
            elif p == "ɪ" and len(new_pron) and new_pron[-1] == "ʊ":
                new_pron[-1] = "ʊɪ"
                continue
            elif p == "ʊ" and len(new_pron) and new_pron[-1] == "a":
                new_pron[-1] = "aʊ"
                continue
            elif p == "e" and len(new_pron) and new_pron[-1] == "ɐ":
                new_pron[-1] = "ɐ"
                continue
            elif p == "ʔ":
                continue
            elif p in {"tʰ", "kʰ", "pʰ"} and i == len(phones) - 1:
                p = p[0]
            elif (
                p in {"tʰ", "kʰ", "pʰ"}
                and len(new_pron)
                and new_pron[-1] in {"s", "ts", "ʃ", "tʃ"}
            ):
                p = p[0]
            elif p in {"t", "k", "p"} and i == 0:
                p += "ʰ"
            elif p in {"s", "ʃ"} and i == 1 and new_pron[-1] in {"tʰ"}:
                new_pron[-1] = "t" + p
                continue
            elif (
                p in {"v", "s", "x", "ʁ", "l", "j"}
                and len(new_pron)
                and new_pron[-1] in {"tʰ", "kʰ", "pʰ"}
            ):
                new_pron[-1] = new_pron[-1][0]
            elif p == "s" and len(new_pron) and new_pron[-1] == "t":
                if "z" in word or "c" in word:
                    new_pron[-1] = "ts"
                    continue
            elif p == "õ":
                new_pron.append("ɔ")
                new_pron.append("n")
                continue
            elif p == "ɛ̃":
                new_pron.append("eː")
                new_pron.append("n")
                continue
        elif lang.startswith("mandarin_hani"):
            vowel_pattern = re.compile(r"^[ayeiouəɚʊɤ̃]+[²³⁰¹⁴⁵]*$")
            for k, v in LANG_MAPPING[lang].items():
                if p == k:
                    p = v
                    break
            if p in {"²", "³", "¹", "⁰", "⁴", "⁵", "⁻", "⁽", "⁾"} and len(new_pron):
                index = -1
                for j in range(len(new_pron) - 1, -1, -1):
                    if vowel_pattern.match(new_pron[j]) or "̩" in new_pron[j]:
                        index = j
                        break
                if new_pron[index].endswith("²¹⁴"):
                    continue
                new_pron[index] += p
                continue
            elif p.startswith("ˀ"):
                new_pron.append("ʔ")
                if p[1] in LANG_MAPPING[lang]:
                    new_pron.append(LANG_MAPPING[lang][p[1]])
                else:
                    new_pron.append(p[1])
                continue
            elif (
                any(p.startswith(x) for x in VOWELS[lang])
                and len(new_pron)
                and re.match(r"^[ayeiouəɚʊɤ̃]+$", new_pron[-1])
            ):
                new_pron[-1] += p
                continue
        elif lang == "portuguese_brazil":
            for k, v in LANG_MAPPING[lang].items():
                if p == k:
                    p = v
                    break
            if p == "w̃" and len(new_pron) and new_pron[-1] in {"ɐ̃", "õ"}:
                new_pron[-1] += p
                continue
            elif p == "j̃" and len(new_pron) and new_pron[-1] in {"ɐ̃", "õ", "ẽ", "ũ"}:
                new_pron[-1] += p
                continue
            elif (
                p == "j" and len(new_pron) and new_pron[-1] in {"a", "ɛ", "e", "ɐ", "ɔ", "o", "u"}
            ):
                new_pron[-1] += p
                continue
            elif p == "w" and len(new_pron) and new_pron[-1] in {"a", "ɛ", "e", "ɐ", "i"}:
                new_pron[-1] += p
                continue
        elif lang == "portuguese_portugal":
            for k, v in LANG_MAPPING[lang].items():
                if p == k:
                    p = v
                    break
            if p == "w̃" and len(new_pron) and new_pron[-1] in {"ɐ̃", "õ"}:
                new_pron[-1] += p
                continue
            elif p == "j̃" and len(new_pron) and new_pron[-1] in {"ɐ̃", "õ", "ẽ", "ũ"}:
                new_pron[-1] += p
                continue
            elif (
                p == "j" and len(new_pron) and new_pron[-1] in {"a", "ɛ", "e", "ɐ", "ɔ", "o", "u"}
            ):
                new_pron[-1] += p
                continue
            elif p == "w" and len(new_pron) and new_pron[-1] in {"a", "ɛ", "e", "ɐ", "i"}:
                new_pron[-1] += p
                continue
        elif lang == "swedish":
            if p == "ʒ" and len(new_pron) and new_pron[-1] == "d":
                new_pron[-1] += p
                continue
            elif (
                p in {"k", "kʰ", "ɡ"}
                and len(new_pron)
                and new_pron[-1] in {"m", "n", "ɳ", "ɲ", "ŋ"}
            ):
                new_pron[-1] = "ŋ"
            elif (
                p in {"t", "tʰ", "d"}
                and len(new_pron)
                and new_pron[-1] in {"m", "n", "ɳ", "ɲ", "ŋ"}
            ):
                new_pron[-1] = "n"
            elif (
                p in {"p", "pʰ", "b"}
                and len(new_pron)
                and new_pron[-1] in {"m", "n", "ɳ", "ɲ", "ŋ"}
            ):
                new_pron[-1] = "m"
            elif (
                p in {"ʈ", "ʈʰ", "ɖ", "ʂ"}
                and len(new_pron)
                and new_pron[-1] in {"m", "n", "ɳ", "ɲ", "ŋ"}
            ):
                new_pron[-1] = "ɳ"
            elif p == "s" and len(new_pron) and new_pron[-1] == "r" and "rr" not in word:
                new_pron[-1] = "ʂ"
                continue
            elif p in {"t", "ʈ"} and len(new_pron) and new_pron[-1] == "r" and "rr" not in word:
                new_pron[-1] = "ʈ"
                continue
            elif p in {"d", "ɖ"} and len(new_pron) and new_pron[-1] == "r" and "rr" not in word:
                new_pron[-1] = "ɖ"
                continue
            elif (
                p in {"n", "ɳ"} == "n"
                and len(new_pron)
                and new_pron[-1] == "r"
                and "rr" not in word
            ):
                new_pron[-1] = "ɳ"
                continue
            elif p in {"l", "ɭ"} and len(new_pron) and new_pron[-1] == "r" and "rr" not in word:
                new_pron[-1] = "ɭ"
                continue
            elif p in {"tʰ", "ʈʰ"} and len(new_pron) and new_pron[-1] == "r" and "rr" not in word:
                new_pron[-1] = "ʈʰ"
                continue
            elif p in {"s", "ʂ"} and len(new_pron) and new_pron[-1] == "r" and "rr" not in word:
                new_pron[-1] = "ʂ"
                continue
            elif p == "aʊ":
                new_pron.append("a")
                new_pron.append("ʊ")
                continue
            elif p in {"r", "n", "l", "t", "k", "ɡ"} and len(new_pron) and new_pron[-1] == "ə":
                new_pron[-1] = "ɛ"
            elif p in {"t", "k", "p", "ʈ"} and not len(new_pron):
                p += "ʰ"
            elif (
                not VOWEL_PATTERNS[lang].match(p)
                and len(new_pron)
                and new_pron[-1] in {"tʰ", "kʰ", "pʰ", "ʈʰ"}
            ):
                print(new_pron[-1], p)
                new_pron[-1] = new_pron[-1][0]
            elif (
                p in {"tʰ", "kʰ", "pʰ", "ʈʰ"}
                and len(new_pron)
                and (new_pron[-1] in {"ʂ", "s"} or i == len(phones) - 1)
            ):
                p = p[0]
            elif p == "ə" and i == len(phones) - 1:
                p = "e"
            elif p in {"r"} and len(new_pron) and new_pron[-1] == "ɜ":
                new_pron[-1] = "æː"
            elif p == "ɜ" and i == len(phones) - 1:
                p = "e"
        elif lang == "tamil":
            if p in {"ʊ", "ɪ"} and len(new_pron) and new_pron[-1] == "a":
                new_pron[-1] += p
                continue
        elif lang in ["spanish_spain", "spanish_latin_america"]:
            if p in {"n", "m", "ɲ"} and len(new_pron) and new_pron[-1] in {"n", "m", "ɲ"}:
                new_pron[-1] = p
                continue
            if p in {"s", "z"} and len(new_pron) and new_pron[-1] in {"s", "z"}:
                new_pron[-1] = p
                continue
            if p in {"x", "k", "ɡ"} and len(new_pron) and new_pron[-1] == "n":
                new_pron[-1] = "ŋ"
            elif (
                p in {"ɟʝ", "ʝ", "j", "tʃ", "i", "ĩ"} and len(new_pron) and new_pron[-1] in {"n"}
            ):
                new_pron[-1] = "ɲ"
            elif p in {"j", "i", "ĩ", "e", "ẽ"} and len(new_pron) and new_pron[-1] in {"k"}:
                new_pron[-1] = "c"
            elif p in {"j", "i", "ĩ", "e", "ẽ"} and len(new_pron) and new_pron[-1] in {"x"}:
                new_pron[-1] = "ç"
            elif p in {"j", "i", "ĩ", "e", "ẽ"} and len(new_pron) and new_pron[-1] in {"ɡ"}:
                new_pron[-1] = "ɟ"
            elif p in {"j", "i", "ĩ", "e", "ẽ"} and len(new_pron) and new_pron[-1] in {"ɣ"}:
                new_pron[-1] = "ʝ"
            elif (
                p in {"ɟʝ", "ʝ", "j", "tʃ", "i", "ĩ"} and len(new_pron) and new_pron[-1] in {"l"}
            ):
                new_pron[-1] = "ʎ"
            elif (
                p
                in {
                    "β",
                    "b",
                    "p",
                }
                and len(new_pron)
                and new_pron[-1] == "n"
            ):
                new_pron[-1] = "m"
            elif (
                p
                in {
                    "f",
                    "v",
                }
                and len(new_pron)
                and new_pron[-1] in {"n", "m", "n̪"}
            ):
                new_pron[-1] = "ɱ"
        elif lang == "thai":
            if p in {"a"} and len(new_pron) and new_pron[-1] in {"i", "iː", "ɯ", "ɯː", "u", "uː"}:
                new_pron[-1] += p
                continue
        elif lang == "turkish":
            if p == "ʃ" and len(new_pron) and new_pron[-1] in {"t"}:
                new_pron[-1] += p
                continue
            elif p == "ʒ" and len(new_pron) and new_pron[-1] in {"d"}:
                new_pron[-1] += p
                continue
            elif p in {"e", "i", "œ", "y"} and len(new_pron) and new_pron[-1] in {"k"}:
                new_pron[-1] = "c"
            elif p in {"e", "i", "œ", "y"} and len(new_pron) and new_pron[-1] in {"ɡ"}:
                new_pron[-1] = "ɟ"
            elif p in {"a", "ɯ", "o", "u"} and len(new_pron) and new_pron[-1] in {"l"}:
                new_pron[-1] = "ɫ"
            elif p in {"i", "e", "œ", "y"} and len(new_pron) and new_pron[-1] in {"ɫ"}:
                new_pron[-1] = "l"
        elif lang == "portuguese_brazil":
            if p == "ʃ" and len(new_pron) and new_pron[-1] in {"t"}:
                new_pron[-1] += p
                continue
            elif p == "ʒ" and len(new_pron) and new_pron[-1] in {"d"}:
                new_pron[-1] += p
                continue
        elif lang == "russian":
            voiced_set = {
                "v",
                "bʲ",
                "b",
                "bː",
                "d",
                "dz",
                "dzʲ",
                "dʐ",
                "dʲ",
                "dʲː",
                "dː",
                "v",
                "vʲ",
                "vʲː",
                "vː",
                "z",
                "zʲ",
                "zʲː",
                "zː",
                "ɡ",
                "ɡʲ",
                "ɡː",
                "ɣ",
                "ʐ",
                "ʐː",
                "ʑː",
            }
            if p in {"ʔ"}:
                continue

        elif lang == "japanese":
            for k, v in LANG_MAPPING[lang].items():
                if p == k:
                    p = v
                    break
            if p in {".", "˕", "}", "˦˨˦", "˥", "˨˩", "˧", "꜔", "˩", "ʔ", "%", "˩˥"}:
                continue
            elif p in {
                "̥",
                "̥̥",
            } and len(new_pron):
                new_pron[-1] += "̥"
                continue
            elif p in {"ᵝ̥"} and len(new_pron):
                if "̥" not in new_pron[-1] and "ː" not in new_pron[-1]:
                    new_pron[-1] += "̥"
                continue
            elif p in {"ː̥"} and len(new_pron):
                new_pron[-1] += "ː"
                continue
            elif (
                p in {"j"}
                and len(new_pron)
                and new_pron[-1] in {"ɾ", "p", "m", "b", "k", "t", "d", "ç", "ɡ"}
            ):
                new_pron[-1] += "ʲ"
                continue
            elif p in {"h"} and len(new_pron) and new_pron[-1] in {"c"}:
                new_pron[-1] = "tɕ"
                continue
            elif p in {"p", "pʲ"} and len(new_pron) and new_pron[-1] in {"p̚"}:
                new_pron[-1] = p + "ː"
                continue
            elif p in {"b"} and len(new_pron) and new_pron[-1] in {"b̚"}:
                new_pron[-1] = p + "ː"
                continue
            elif p in {"ɾ", "ɾʲ"} and len(new_pron) and new_pron[-1] in {"ɾ̚"}:
                new_pron[-1] = p + "ː"
                continue
            elif p in {"k", "kʲ"} and len(new_pron) and new_pron[-1] in {"k̚"}:
                new_pron[-1] = p + "ː"
                continue
            elif p in {"t", "tʲ"} and len(new_pron) and new_pron[-1] in {"ʔ̥", "t̚"}:
                new_pron[-1] = p + "ː"
                continue
            elif p in {"tɕ", "ts"} and len(new_pron) and new_pron[-1] in {"t̚"}:
                new_pron[-1] = p + "ː"
                continue
            elif p in {"ɡ"} and len(new_pron) and new_pron[-1] in {"ɡ̚"}:
                new_pron[-1] = p + "ː"
                continue
            elif p in {"d", "dz", "ʑ", "dʑ"} and len(new_pron) and new_pron[-1] in {"d̚"}:
                new_pron[-1] = p + "ː"
                continue
            elif p in {"i", "iː", "i̥"} and len(new_pron) and "ʲ" in new_pron[-1]:
                if len(new_pron) > 2 and "ʲ" in new_pron[-2]:
                    new_pron[-2] = new_pron[-2].replace("ʲ", "")

                new_pron[-1] = new_pron[-1].replace("ʲ", "")
            elif p in {"i"} and len(new_pron) and new_pron[-1] == "n":
                new_pron[-1] = "ɲ"
            elif False and p in {"k", "ɡ"} and len(new_pron) and new_pron[-1] == "ɲ":
                new_pron[-1] = "ŋ"
            elif False and p in {"t", "d"} and len(new_pron) and new_pron[-1] == "ɲ":
                new_pron[-1] = "n"
            elif p in {"dz", "dʑ"} and len(new_pron) and new_pron[-1] not in {"ɲ", "n"}:
                p = p[1]
            elif p in {
                "ɯ̟̃ᵝ",
                "ɯ̟̊ᵝ",
                "ɯ̟ᵝː",
                "ɯ̟ᵝ",
                "ɨ̥ᵝ",
                "ɨᵝ",
                "ɨ̃ᵝ",
                "ɨᵝː",
                "ɨ̥",
                "ɨ̥ː",
                "ɯ̥ː",
                "ɯ̥",
            }:
                if len(new_pron) and new_pron[-1] in {
                    "t",
                    "tː",
                    "s",
                    "sː",
                    "z",
                    "zː",
                    "ɲː",
                    "ɲ",
                    "ç",
                    "çː",
                    "n",
                    "nː",
                    "ts",
                    "tsː",
                    "ɕ",
                    "tɕ",
                    "tɕː",
                    "ʑ",
                    "ɕː",
                    "ʑː",
                    "ɡʲ",
                    "ɡʲː",
                    "kʲ",
                    "kʲː",
                    "bʲ",
                    "bʲː",
                    "pʲ",
                    "pʲː",
                    "mʲ",
                    "mʲː",
                    "ɾʲː",
                    "ɾʲ",
                    "j",
                }:
                    new_p = "ɨ"
                else:
                    new_p = "ɯ"
                if "̥" in p or "̊" in p:
                    new_p += "̥"
                if "ː" in p:
                    new_p += "ː"
                p = new_p
                if len(new_pron) and new_pron[-1] == "n":
                    new_pron[-1] = "ɲ"

        elif lang == "korean_hangul":
            for k, v in LANG_MAPPING[lang].items():
                if p == k:
                    p = v
                    break
            # if p in {'e', 'ɛː', 'ɛ', 'a', 'o', 'u', 'ʌ'} and len(new_pron) and new_pron[-1] in {'j'}:
            #    new_pron[-1] += p
            #    continue
            # elif p in {'i'} and len(new_pron) and new_pron[-1] in {'w', 'ɥ'}:
            #    new_pron[-1] = 'ɥi'
            #    continue
            # elif p in {'e', 'ɛː', 'ɛ', 'a', 'o', 'i', 'ʌ'} and len(new_pron) and new_pron[-1] in {'w'}:
            #    new_pron[-1] += p
            #    continue
            # elif p in {'i'} and len(new_pron) and new_pron[-1] in {'ɰ'}:
            #    new_pron[-1] += p
            #    continue
            if p == "t͈" and "ᄄ" not in jamo.h2j(word):
                if len(new_pron) and "̚" in new_pron[-1]:
                    p = "tʰ"
                else:
                    p = "t"
            elif p == "tɕ͈" and "ᄍ" not in jamo.h2j(word):
                if len(new_pron) and "̚" in new_pron[-1]:
                    p = "tɕʰ"
                else:
                    p = "tɕ"
            elif p == "k͈" and "ᄁ" not in jamo.h2j(word):
                if len(new_pron) and "̚" in new_pron[-1]:
                    p = "kʰ"
                else:
                    p = "k"
            elif p == "p͈" and "ᄈ" not in jamo.h2j(word):
                if len(new_pron) and "̚" in new_pron[-1]:
                    p = "pʰ"
                else:
                    p = "p"
            elif p == "s͈" and "ᄊ" not in jamo.h2j(word):
                if len(new_pron) and "̚" in new_pron[-1]:
                    p = "sʰ"
                else:
                    p = "s"
            elif p == "x" and len(new_pron) and new_pron[-1] == "k":
                new_pron[-1] += "ʰ"
                continue

        elif lang in ["vietnamese_hanoi", "vietnamese_hue", "vietnamese_hochiminhcity"]:
            vowel_pattern = re.compile(rf'^[{"".join(VOWELS[lang])}]+$')
            tone_pattern = re.compile(r"^[˥˩˦˥˨˧˨˩˩˩ˀ˦]+$")
            if p in {"j", "w"} and len(new_pron) and vowel_pattern.match(new_pron[-1]):
                new_pron[-1] += p
                continue
            elif vowel_pattern.match(p) and len(new_pron) and vowel_pattern.match(new_pron[-1]):
                new_pron[-1] += p
                continue
            elif p in {"ɗ", "ɓ"} and len(new_pron) and new_pron[-1] == "ʔ":
                new_pron[-1] = p
                continue
            elif (
                p == "ʔ"
                and len(new_pron)
                and tone_pattern.match(new_pron[-1])
                and not (i < len(phones) - 1 and phones[i + 1] in {"ɗ", "ɓ"})
            ):
                new_pron[-1] += "ˀ"
                continue
        if lang in LANG_MAPPING:
            for k, v in LANG_MAPPING[lang].items():
                if p == k:
                    p = v
                    break
        if not p:
            continue
        new_pron.append(p)
    tone_mapping = {
        "⁰": "",
        "¹": "˩",
        "²": "˨",
        "³": "˧",
        "⁴": "˦",
        "⁵": "˥",
        "˧": "˧",
        "˨˩": "˨˩",
        "˥˩": "˥˩",
        "˦˥": "˦˥",
        "˩˩˦": "˩˩˦",
    }
    if lang == "thai":
        phones = new_pron
        new_pron = []
        tone_symbols = {"˥˩", "˦˥", "˧", "˨˩", "˩˩˦"}
        vowel_set = {x for x in VOWELS[lang]}
        vowel_set |= {x + y for x, y in itertools.product(VOWELS[lang], VOWELS[lang])}
        vowel_set |= {
            x + y + z for x, y, z in itertools.product(VOWELS[lang], VOWELS[lang], VOWELS[lang])
        }
        for i, p in enumerate(phones):
            if p in tone_symbols:
                for j in range(len(new_pron) - 1, 0, -1):
                    if new_pron[j] in vowel_set and new_pron[j] not in {"w", "j"}:
                        new_pron[j] += tone_mapping[p]
                        break
            else:
                new_pron.append(p)
        # split off tone for G2P
        # for i, p in enumerate(new_pron):
        #    for tone in tone_mapping:
        #        if p.endswith(tone):
        #            new_pron[i] = p.replace(tone, ' ') + tone

    elif lang in ["vietnamese_hanoi", "vietnamese_hue", "vietnamese_hochiminhcity"]:
        phones = new_pron
        new_pron = []
        vowel_pattern = re.compile(rf'^[{"".join(VOWELS[lang])}]+[wj]?$')
        tone_symbols = {
            "˦ˀ˥",
            "˧˦",
            "˧˧",
            "˧˨",
            "˧˩",
            "˨˩",
            "˦˧˥",
            "˦˩",
            "˧˧",
            "˧˨",
            "˨˩",
            "˨˩˦",
            "˦˥",
            "˨˩˨",
        }
        tone_pattern = re.compile(r"^[˥˩˦˥˨˧˨˩˩˩ˀ˦]+$")
        for i, p in enumerate(phones):
            if tone_pattern.match(p):
                for j in range(len(new_pron) - 1, 0, -1):
                    if vowel_pattern.match(new_pron[j]):
                        new_pron[j] += p
                        break
            else:
                new_pron.append(p)
    elif lang.startswith("mandarin_hani"):
        mapping = {
            "²¹⁴": "˨˩˦",
            "⁵⁵": "˥˥",
            "³⁵": "˧˥",
            "⁵¹": "˥˩",
            "⁰": "",
            "¹": "˩",
            "²": "˨",
            "³": "˨",
            "⁴": "˦",
            "⁵": "˥",
        }
        tone_symbols = {"²", "³", "¹", "⁴", "⁵", "⁰"}
        for i, p in enumerate(new_pron):
            if any(x in p for x in tone_symbols):
                for k, v in mapping.items():
                    if k in new_pron[i]:
                        new_pron[i] = new_pron[i].replace(k, v)
            # if any(x in new_pron[i] for x in tone_symbols):
            #    return None
    elif lang == "swedish":
        for i, p in enumerate(new_pron):
            if p == "êː":
                new_pron[i] = "eː˧˩"
            elif p == "â":
                new_pron[i] = "a˧˩"
            elif p == "ɛ̂":
                new_pron[i] = "ɛ˧˩"
            elif p == "ɑ̂ː":
                new_pron[i] = "ɑː˧˩"
            elif p == "ûː":
                new_pron[i] = "uː˧˩"
            elif p == "ʉ̂ː":
                new_pron[i] = "ʉː˧˩"
            elif p == "ɵ̂":
                new_pron[i] = "ɵ˧˩"
            elif p == "ʉ̂ː":
                new_pron[i] = "ʉː˧˩"
            elif p == "ʉ̟ː˥˩":
                new_pron[i] = "ʉː˥˩"
            elif p == "ǎ":
                new_pron[i] = "a˥˧"
            elif p == "ʉ̟ː˧˩":
                new_pron[i] = "ʉː˧˩"
            elif p == "ø̀ː":
                new_pron[i] = "øː˩"
            elif p == "ɑ̀ː":
                new_pron[i] = "ɑː˩"
            elif p == "ỳː":
                new_pron[i] = "yː˩"
            elif p == "ỳː˧˩":
                new_pron[i] = "yː˧˩"

    elif lang == "hausa":
        phone_mapping = {
            "á": "a",
            "áː": "aː",
            "é": "e",
            "éː": "eː",
            "í": "i",
            "íː": "iː",
            "ó": "o",
            "óː": "oː",
            "úː": "uː",
            "à": "a",
            "àː": "aː",
            "è": "e",
            "èː": "eː",
            "ì": "i",
            "ìː": "iː",
            "ò": "o",
            "òː": "oː",
            "ùː": "uː",
            "â": "a",
            "âː": "aː",
            "ê": "e",
            "êː": "eː",
            "î": "i",
            "îː": "iː",
            "ô": "o",
            "ôː": "oː",
            "ûː": "uː",
        }
        for i, p in enumerate(new_pron):
            if p in {"á", "áː", "é", "éː", "í", "íː", "ó", "óː", "úː"} or "́" in p:  # High tone
                if p in phone_mapping:
                    new_pron[i] = phone_mapping[p]
                else:
                    new_pron[i] = p.replace("́", "")
                new_pron[i] += "˥"
            elif p in {"à", "àː", "è", "èː", "ì", "ìː", "ò", "òː", "ùː"} or "̀" in p:  # Low tone
                if p in phone_mapping:
                    new_pron[i] = phone_mapping[p]
                else:
                    new_pron[i] = p.replace("̀", "")
                new_pron[i] += "˩"
            elif (
                p
                in {
                    "â",
                    "âː",
                    "ê",
                    "êː",
                    "î",
                    "îː",
                    "ôː",
                    "ûː",
                }
                or "̂" in p
            ):  # Falling tone
                if p in phone_mapping:
                    new_pron[i] = phone_mapping[p]
                else:
                    new_pron[i] = p.replace("̂", "")
                new_pron[i] += "˥˦"

    return new_pron


def convert_second_round(word, phones, lang):
    if lang not in ["english_us", "english_uk"]:
        return phones
    new_pron = []
    stressed_vowels = {}
    if lang == "english_uk":
        stressed_vowels = {
            "aɪ",
            "aʊ",
            "eɪ",
            "i",
            "iː",
            "oɪ",
            "oʊ",
            "u",
            "uː",
            "æ",
            "ɑ",
            "ɑː",
            "ɒ",
            "ɔ",
            "ɔɪ",
            "ɔː",
            "ɛ",
            "ɛː",
            "ɜ",
            "ɜː",
            "ʊ",
            "ʌ",
        }
    elif lang == "english_us":
        stressed_vowels = {
            "aɪ",
            "aʊ",
            "eɪ",
            "i",
            "iː",
            "oɪ",
            "oʊ",
            "u",
            "uː",
            "æ",
            "ɑ",
            "ɑː",
            "ɔ",
            "ɔɪ",
            "ɔː",
            "ɛ",
            "ɝ",
            "ɝː",
            "ʊ",
            "ʌ",
        }
    all_syllabics = {"ɪ", "ə", "ɚ", "n̩", "m̩", "l̩", "ɫ̩"} | VOWELS[lang]
    for i, p in enumerate(phones):
        if lang in ["english_us", "english_uk"]:
            if (
                p == "l"
                and 2 < i < len(phones) - 1
                and new_pron[-1] == "ə"
                and phones[i + 1] not in all_syllabics
            ):
                new_pron[-1] = "ɫ̩"
                continue
            elif (
                p == "m"
                and 2 < i < len(phones) - 1
                and new_pron[-1] == "ə"
                and phones[i + 1] not in all_syllabics
            ):
                new_pron[-1] = "m̩"
                continue
            elif (
                p == "n"
                and 2 < i < len(phones) - 1
                and new_pron[-1] == "ə"
                and phones[i + 1] not in all_syllabics
            ):
                new_pron[-1] = "n̩"
                continue
            elif p == "l̩" and 1 < i < len(phones) - 1 and phones[i + 1] in all_syllabics:
                new_pron.append("ə")
                p = "l"
            elif p == "l̩":
                p = "ɫ̩"
            elif p == "l" and i == len(phones) - 1:
                p = "ɫ"
            elif p == "l" and 1 < i < len(phones) - 1 and phones[i + 1] not in all_syllabics:
                p = "ɫ"
            elif (
                p in {"t", "p", "k"}
                and i == 0
                and i < len(phones) - 1
                and phones[i + 1] in stressed_vowels | {"ɪ", "ə", "ɚ"}
            ):
                p += "ʰ"
            elif (
                p == "ə"
                and 1 < i == len(phones) - 2
                and phones[i - 1] in {"d", "t"}
                and phones[i + 1] == "d"
            ):
                p = "ɪ"
            elif (
                p == "ə"
                and 1 < i == len(phones) - 2
                and phones[i - 1] in {"s", "z", "ʃ", "ʒ", "tʃ", "dʒ"}
                and phones[i + 1] == "z"
            ):
                p = "ɪ"
        if lang == "english_us":
            if (
                p == "ɹ"
                and 2 < i < len(phones) - 1
                and new_pron[-1] == "ə"
                and phones[i + 1] not in all_syllabics
            ):
                new_pron[-1] = "ɚ"
                continue
            elif (
                p in {"d", "t"}
                and 1 < i < len(phones) - 1
                and phones[i - 1] in all_syllabics
                and phones[i + 1] in {"n̩", "m̩", "l̩", "ɚ", "ə", "ɫ̩"}
            ):
                p = "ɾ"
            elif (
                p in {"t", "d"}
                and 1 < i < len(phones) - 2
                and phones[i - 1] in all_syllabics
                and phones[i + 1] == "ɪ"
                and phones[i + 2] == "d"
            ):
                p = "ɾ"
            elif (
                p in {"t", "d"}
                and i > 1
                and i == len(phones) - 2
                and phones[i - 1] in all_syllabics
                and phones[i + 1] == "i"
            ):
                p = "ɾ"
            elif (
                p in {"t", "d"}
                and i > 1
                and i == len(phones) - 3
                and phones[i - 1] in all_syllabics
                and phones[i + 1] in {"i", "ɪ"}
                and phones[i + 2] == "z"
            ):
                p = "ɾ"
            elif (
                p in {"t", "d"}
                and i > 1
                and i == len(phones) - 3
                and phones[i - 1] in all_syllabics
                and phones[i + 1] == "ɪ"
                and phones[i + 2] == "ŋ"
            ):
                p = "ɾ"
            elif (
                p in {"t", "p", "k"}
                and i == 0
                and i < len(phones) - 1
                and phones[i + 1] in stressed_vowels | {"ɪ", "ə", "ɚ"}
            ):
                p += "ʰ"
            elif (
                p in {"t", "p", "k"}
                and i > 0
                and phones[i - 1] not in {"s", "ʃ"}
                and i < len(phones) - 1
                and phones[i + 1] in stressed_vowels
            ):
                p += "ʰ"
            elif p == "l̩" and 1 < i < len(phones) - 1 and phones[i + 1] in all_syllabics:
                new_pron.append("ə")
                p = "l"
            elif p == "l̩":
                p = "ɫ̩"
            elif p == "l" and i == len(phones) - 1:
                p = "ɫ"
            elif (
                p == "l"
                and 1 < i < len(phones) - 1
                and phones[i + 1] not in {"ɪ", "ə", "ɚ", "n̩", "m̩", "l̩", "ɫ̩"} | VOWELS[lang]
            ):
                p = "ɫ"
            elif (
                p == "ə"
                and 1 < i == len(phones) - 2
                and phones[i - 1] in {"d", "t", "ɾ"}
                and phones[i + 1] == "d"
            ):
                p = "ɪ"
            elif (
                p == "ə"
                and 1 < i == len(phones) - 2
                and phones[i - 1] in {"s", "z", "ʃ", "ʒ", "tʃ", "dʒ"}
                and phones[i + 1] == "z"
            ):
                p = "ɪ"
            elif (
                p in {"t", "p", "k"}
                and i > 0
                and phones[i - 1] not in {"s", "ʃ"}
                and i < len(phones) - 1
                and phones[i + 1] in stressed_vowels
            ):
                p += "ʰ"
        elif lang == "english_uk":
            if p not in VOWELS[lang] and len(new_pron) and new_pron[-1] == "ɹ":
                new_pron[-1] = p
                continue
            elif (
                p in {"t", "p", "k"}
                and i > 0
                and phones[i - 1] not in {"s", "ʃ"}
                and i < len(phones) - 1
                and phones[i + 1] in stressed_vowels
            ):
                p += "ʰ"
        new_pron.append(p)
    return new_pron


def fix_pronunciations(dictionary, lang):
    filtered_dictionary = []
    for word, pronunciation in dictionary:
        if lang == "polish":
            if "ü" in word:
                continue
        for i, p in enumerate(pronunciation):
            if p in LANG_MAPPING[lang]:
                continue
            if p in GLOBAL_REMAPPING:
                pronunciation[i] = GLOBAL_REMAPPING[p]
            elif "̯" in p:
                pronunciation[i] = p.replace("̯", "")
            elif "͡" in p:
                pronunciation[i] = p.replace("͡", "")
            elif "‿" in p:
                pronunciation[i] = p.replace("‿", "")
            elif "͜" in p:
                pronunciation[i] = p.replace("͜", "")
            elif "g" in p:
                pronunciation[i] = p.replace("g", "ɡ")
        # Language specific conversions
        new_pron = convert_language_specific(word, pronunciation, lang)
        new_pron = convert_second_round(word, new_pron, lang)
        if new_pron is None:
            continue
        if (word, new_pron) not in filtered_dictionary:
            filtered_dictionary.append((word, new_pron))
    return filtered_dictionary


def process_language(lang):
    print("Processing", lang)
    if lang == "japanese":
        dictionary, input_graphemes, input_phones = read_source(lang + "_hiragana")
        d, g, p = read_source(lang + "_katakana")
        dictionary.extend(d)
        input_graphemes.update(g)
        input_phones.update(p)
        word_set = {x[0] for x in dictionary}
        d, g, p = read_source(lang)
        dictionary.extend([x for x in d if x[0] not in word_set])
        input_graphemes.update(g)
        input_phones.update(p)

    else:
        dictionary, input_graphemes, input_phones = read_source(lang)

    print("Input graphemes", sorted(input_graphemes))
    print("Input phones", sorted(input_phones))
    filtered = fix_pronunciations(dictionary, lang)
    save_dictionary(filtered, lang)


if __name__ == "__main__":
    for code in LANG_CODES:
        process_language(code)