Spaces:
Running
on
CPU Upgrade
Running
on
CPU Upgrade
| import re | |
| ############################################################### | |
| # X-SAMPA | |
| _xsampa2ipa = { | |
| k:re.sub(r'◌','',v) for (k,v) in { | |
| '#':'#', | |
| '=':'◌̩', | |
| '>':'◌ʼ', | |
| '`':'◌˞', | |
| '~':'◌̃', | |
| 'a':'a', | |
| 'b':'b', | |
| 'b_<':'ɓ', | |
| 'c':'c', | |
| 'd':'d', | |
| 'd`':'ɖ', | |
| 'd_<':'ɗ', | |
| 'e':'e', | |
| 'f':'f', | |
| 'g':'ɡ', | |
| 'g_<':'ɠ', | |
| 'h':'h', | |
| 'h\\':'ɦ', | |
| 'i':'i', | |
| 'j':'j', | |
| 'j\\':'ʝ', | |
| 'k':'k', | |
| 'l':'l', | |
| 'l`':'ɭ', | |
| 'l\\':'ɺ', | |
| 'm':'m', | |
| 'n':'n', | |
| 'n_d':'nd', | |
| 'n`':'ɳ', | |
| 'o':'o', | |
| 'p':'p', | |
| 'p\\':'ɸ', | |
| 'p_<':'ɓ̥', | |
| 'q':'q', | |
| 'r':'r', | |
| 'r`':'ɽ', | |
| 'r\\':'ɹ', | |
| 'r\\`':'ɻ', | |
| 's':'s', | |
| 's`':'ʂ', | |
| 's\\':'ɕ', | |
| 't':'t', | |
| 't`':'ʈ', | |
| 'u':'u', | |
| 'v':'v', | |
| 'v\\':'ʋ', | |
| 'w':'w', | |
| 'x':'x', | |
| 'x\\':'ɧ', | |
| 'y':'y', | |
| 'z':'z', | |
| 'z`':'ʐ', | |
| 'z\\':'ʑ', | |
| 'A':'ɑ', | |
| 'B':'β', | |
| 'B\\':'ʙ', | |
| 'C':'ç', | |
| 'D':'ð', | |
| 'E':'ɛ', | |
| 'F':'ɱ', | |
| 'G':'ɣ', | |
| 'G\\':'ɢ', | |
| 'G\\_<':'ʛ', | |
| 'H':'ɥ', | |
| 'H\\':'ʜ', | |
| 'I':'ɪ', | |
| 'I\\':'ɪ̈ ', | |
| 'J':'ɲ', | |
| 'J\\':'ɟ', | |
| 'J\\_<':'ʄ', | |
| 'K':'ɬ', | |
| 'K\\':'ɮ', | |
| 'L':'ʎ', | |
| 'L\\':'ʟ', | |
| 'M':'ɯ', | |
| 'M\\':'ɰ', | |
| 'N':'ŋ', | |
| 'N_g':'ŋɡ', | |
| 'N\\':'ɴ', | |
| 'O':'ɔ', | |
| 'O\\':'ʘ', | |
| 'P':'ʋ', | |
| 'Q':'ɒ', | |
| 'R':'ʁ', | |
| 'R\\':'ʀ', | |
| 'S':'ʃ', | |
| 'T':'θ', | |
| 'U':'ʊ', | |
| 'U\\':'ʊ̈ ', | |
| 'V':'ʌ', | |
| 'W':'ʍ', | |
| 'X':'χ', | |
| 'X\\':'ħ', | |
| 'Y':'ʏ', | |
| 'Z':'ʒ', | |
| '.':'.', | |
| '"':'ˈ', | |
| '%':'ˌ', | |
| '\'':'ʲ', | |
| ':':'ː', | |
| ':\\':'ˑ', | |
| '-':'', | |
| '@':'ə', | |
| '@\\':'ɘ', | |
| '{':'æ', | |
| '}':'ʉ', | |
| '1':'ɨ', | |
| '2':'ø', | |
| '3':'ɜ', | |
| '3\\':'ɞ', | |
| '4':'ɾ', | |
| '5':'ɫ', | |
| '6':'ɐ', | |
| '7':'ɤ', | |
| '8':'ɵ', | |
| '9':'œ', | |
| '&':'ɶ', | |
| '?':'ʔ', | |
| '?\\':'ʕ', | |
| '*':'', | |
| '/':'', | |
| '<\\':'ʢ', | |
| '>\\':'ʡ', | |
| '^':'ꜛ', | |
| '!':'ꜜ', | |
| '!\\':'ǃ', | |
| '|':'|', | |
| '|\\':'ǀ', | |
| '||':'‖', | |
| '|\\|\\':'ǁ', | |
| '=\\':'ǂ', | |
| '-\\':'‿' | |
| }.items() | |
| } | |
| _xsampa_vowels=set('aeiouyAEIOUYQV@123}{6789&')|set(('I\\','U\\','@\\','3\\')) | |
| _xdiacritics2ipa = { | |
| k:re.sub(r'◌','',v) for (k,v) in { | |
| '"':'◌̈', | |
| '+':'◌̟', | |
| '-':'◌̠', | |
| '/':'◌̌', | |
| '0':'◌̥', | |
| '=':'◌̩', | |
| '>':'◌ʼ', | |
| '?\\':'◌ˤ', | |
| '\\':'◌̂', | |
| '^':'◌̯', | |
| '}':'◌̚', | |
| '`':'◌˞', | |
| '~':'◌̃', | |
| 'A':'◌̘', | |
| 'a':'◌̺', | |
| 'B':'◌̏', | |
| 'B_L':'◌᷅', | |
| 'c':'◌̜', | |
| 'd':'◌̪', | |
| 'e':'◌̴', | |
| 'F':'◌̂', | |
| 'G':'◌ˠ', | |
| 'H':'◌́', | |
| 'H_T':'◌᷄', | |
| 'h':'◌ʰ', | |
| 'j':'◌ʲ', | |
| 'k':'◌̰', | |
| 'L':'◌̀', | |
| 'l':'◌ˡ', | |
| 'M':'◌̄', | |
| 'm':'◌̻', | |
| 'N':'◌̼', | |
| 'n':'◌ⁿ', | |
| 'O':'◌̹', | |
| 'o':'◌̞', | |
| 'q':'◌̙', | |
| 'R':'◌̌', | |
| 'R_F':'◌᷈', | |
| 'r':'◌̝', | |
| 'T':'◌̋', | |
| 't':'◌̤', | |
| 'v':'◌̬', | |
| 'w':'◌ʷ', | |
| 'X':'◌̆', | |
| 'x':'◌̽', | |
| '1':'˥', | |
| '2':'˦', | |
| '3':'˧', | |
| '4':'˨', | |
| '5':'˩', | |
| }.items() | |
| } | |
# Combined table: every entry of _xsampa2ipa, plus one entry per diacritic
# whose key is the diacritic code prefixed with '_'.
_xsampa_and_diac2ipa = {
    **_xsampa2ipa,
    **{'_' + code: mark for code, mark in _xdiacritics2ipa.items()},
}
# Inverse lookup (IPA -> X-SAMPA).  When several codes share one IPA string,
# the code that appears last in the combined table is the one kept.
_ipa2xsampa = dict(zip(_xsampa_and_diac2ipa.values(), _xsampa_and_diac2ipa.keys()))
| ################################################################## | |
| # Language-dependent tone numbers | |
| _tone2ipa = { | |
| 'arz' : { '0':'', '1':'ˈ', '2':'ˌ' }, | |
| 'eng' : { '0':'', '1':'ˈ', '2':'ˌ' }, | |
| 'yue' : { '0':'', '1':'˥', '2':'˧˥', '3':'˧', '4':'˨˩', '5':'˩˧', '6':'˨' }, | |
| 'lao' : { '0':'', '1':'˧', '2':'˥˧', '3':'˧˩', '4':'˥', '5':'˩˧', '6':'˩' }, | |
| 'cmn' : { '0':'', '1':'˥', '2':'˧˥', '3':'˨˩˦', '4':'˥˩', '5':'' }, | |
| 'spa' : { '0':'', '1':'ˈ', '2':'ˌ' }, | |
| 'vie' : { '0':'', '1':'˧', '2':'˨˩h', '3':'˧˥', '4':'˨˩˨', '5':'˧ʔ˥', '6':'˧˨ʔ' }, | |
| } | |
| ##################################################################### | |
| # DISC, the code used by CELEX, | |
| # is a kind of modified X-SAMPA, | |
| # modified to include a lot of one-character shortcuts for phones | |
| # that would require two characters in X-SAMPA. | |
| # Some of the one-character shortcuts are language-dependent, | |
| # in the sense that the same ASCII character is re-used for different IPA | |
| # symbols in different languages. | |
| # The language-independent table, below, includes only the symbols that | |
| # are not part of X-SAMPA. | |
| _disc2ipa = { | |
| k:re.sub(r'◌','',v) for (k,v) in { | |
| '_':'dʒ', | |
| 'a':'aː', | |
| 'b':'b', | |
| 'c':'æ◌̃', | |
| 'd':'d', | |
| 'e':'eː', | |
| 'f':'f', | |
| 'g':'ɡ', | |
| 'h':'h', | |
| 'i':'iː', | |
| 'j':'j', | |
| 'k':'k', | |
| 'l':'l', | |
| 'm':'m', | |
| 'n':'n', | |
| 'o':'oː', | |
| 'p':'p', | |
| 'q':'ɑ◌̃ː', | |
| 'r':'r', | |
| 's':'s', | |
| 't':'t', | |
| 'u':'uː', | |
| 'v':'v', | |
| 'w':'w', | |
| 'x':'x', | |
| 'y':'y', | |
| 'y':'yː', | |
| 'A':'ɑ', | |
| 'B':'au', | |
| 'C':'ŋ◌̩', | |
| 'D':'ð', | |
| 'E':'ɛ', | |
| 'F':'m◌̩', | |
| 'G':'ɣ', | |
| 'H':'n◌̩', | |
| 'I':'ɪ', | |
| 'J':'ɲ', | |
| 'K':'ɛɪ', | |
| 'L':'œɪ', | |
| 'M':'ɯ', | |
| 'N':'ŋ', | |
| 'O':'ɔ', | |
| 'P':'ʋ', | |
| 'P':'l◌̩', | |
| 'Q':'ɒ', | |
| 'R':'ɜ◌˞', | |
| 'S':'ʃ', | |
| 'T':'θ', | |
| 'U':'ʊ', | |
| 'V':'ʌ', | |
| 'W':'ai', | |
| 'X':'ɔy', | |
| 'Y':'ʏ', | |
| 'Z':'ʒ', | |
| '0':'æ◌̃ː', | |
| '1':'eɪ', | |
| '2':'aɪ', | |
| '3':'ɜː', | |
| '4':'ɔɪ', | |
| '5':'əʊ', | |
| '6':'aʊ', | |
| '7':'ɪə', | |
| '8':'ɛə', | |
| '9':'ʊə', | |
| '|':'øː', | |
| '!':'iːː', | |
| '(':'yːː', | |
| ')':'ɛː', | |
| '*':'œː', | |
| '<':'ɒː', | |
| '+':'pf', | |
| '=':'ts', | |
| '-':'.', | |
| '#':'ɑː', | |
| '$':'ɔː', | |
| '&':'a', | |
| '^':'œ◌̃', | |
| '~':'ɔ◌̃ː', | |
| "'":'ˈ', | |
| '@':'ə', | |
| '{':'æ', | |
| '}':'ʉ', | |
| }.items() | |
| } | |
# DISC vowel codes: every X-SAMPA vowel code plus the DISC-only
# single-character codes listed here.
_disc_vowels = _xsampa_vowels.union('|!()*KL#$WBX^46cq~CFHPR5789')

# Language-specific DISC tables: copies of the shared table with one override each.
_disc2ipa_dutch = {**_disc2ipa, 'w': 'ʋ'}
_disc2ipa_english = {**_disc2ipa, 'r': 'ɻ'}

# Inverse lookup (IPA -> DISC), built from the shared table and then extended
# with '#' (mapped to the empty string) and the two language-specific symbols.
_ipa2disc = dict(zip(_disc2ipa.values(), _disc2ipa.keys()))
_ipa2disc.update({'#': '', 'ʋ': 'w', 'ɻ': 'r'})
#######################################################################
# Callhome phone codes are completely language-dependent.
# I know of three: Egyptian Arabic, Mandarin, and Spanish
# Each per-language table ends with that language's entries from _tone2ipa.
# NOTE(review): 'g' maps to ASCII 'g' here, while the X-SAMPA, DISC and
# ARPABET tables use 'ɡ' (U+0261) -- confirm whether that is intended.
_callhome2ipa = {
    'arz': {
        'C': 'ʔ',
        'b': 'b',
        't': 't',
        'g': 'g',
        'H': 'ħ',
        'x': 'x',
        'd': 'd',
        'r': 'ɾ',
        'z': 'z',
        's': 's',
        '$': 'ʃ',
        'S': 'sˤ',
        'D': 'dˤ',
        'T': 'tˤ',
        'Z': 'ðˤ',
        'c': 'ʕ',
        'G': 'ɣ',
        'f': 'f',
        'q': 'ʔ',
        'Q': 'q',
        'k': 'k',
        'l': 'l',
        'm': 'm',
        'n': 'n',
        'h': 'h',
        'w': 'w',
        'y': 'j',
        'v': 'v',
        'j': 'dʒ',
        '@': 'æ',
        'a': 'a',
        'B': 'a',
        'i': 'i',
        'u': 'u',
        '%': 'æː',
        'A': 'aː',
        'I': 'iː',
        'O': 'oː',
        'U': 'uː',
        'E': 'eː',
        'ay': 'aj',
        'aw': 'aw',
        **_tone2ipa['arz'],
    },
}
# Per-language sets of Callhome codes that denote vowels.
_callhome_vowels = {
    'arz': {*'@aBiu%AIOUE', 'ay', 'aw'},
}
# Callhome Mandarin phone codes, followed by the Mandarin tone digits
# taken from _tone2ipa.
_callhome2ipa['cmn'] = {
    'b': 'p',
    'p': 'pʰ',
    'm': 'm',
    'd': 't',
    't': 'tʰ',
    'l': 'l',
    'n': 'n',
    'g': 'k',
    'k': 'kʰ',
    'h': 'h',
    'N': 'ŋ',
    'z': 'ts',
    'c': 'tsʰ',
    's': 's',
    'j': 'tɕ',
    'q': 'tɕʰ',
    'x': 'ɕ',
    'r': 'ɻ',
    'Z': 'ʈʂ',
    'C': 'ʈʂʰ',
    'S': 'ʂ',
    'f': 'f',
    'y': 'j',
    'w': 'w',
    'W': 'ɥ',
    'i': 'i',
    'I': 'ɨ',
    '%': 'ɯ',
    'e': 'e',
    'E': 'ɛ',
    'U': 'y',
    '&': 'ə',
    'a': 'ɑ',
    '@': 'a',
    'o': 'o',
    '>': 'ɔ',
    'u': 'u',
    'R': 'ɚ',
    **_tone2ipa['cmn'],
}
# Callhome Mandarin codes that denote vowels.
_callhome_vowels['cmn'] = {*'iI%eEU&a@o>uR'}
# Callhome Spanish phone codes, followed by the Spanish stress digits
# taken from _tone2ipa.
_callhome2ipa['spa'] = {
    'a': 'a',
    'i': 'i',
    'e': 'e',
    'o': 'o',
    'u': 'u',
    'h': 'h',
    'p': 'p',
    'b': 'b',
    'B': 'β',
    'f': 'f',
    'v': 'v',
    'l': 'l',
    'm': 'm',
    'w': 'w',
    't': 't',
    'd': 'd',
    'D': 'ð',
    's': 's',
    'S': 'ʃ',
    'C': 'tʃ',
    'J': 'dʒ',
    'n': 'n',
    'y': 'j',
    'r': 'ɾ',
    'R': 'r',
    'x': 'x',
    'N': 'ɲ',
    'k': 'k',
    'g': 'g',
    'G': 'ɣ',
    '9': 'ŋ',
    'z': 'z',
    **_tone2ipa['spa'],
}
# Callhome Spanish codes that denote vowels.
_callhome_vowels['spa'] = {*'aieou'}
# Per-language inverse lookup (IPA -> Callhome).  When several codes share one
# IPA string, the code listed last in the forward table is the one kept.
_ipa2callhome = {
    lang: dict(zip(table.values(), table.keys()))
    for lang, table in _callhome2ipa.items()
}
# Special cases, e.g., define best choice for ambiguous mappings:
# in the 'arz' table both 'a' and 'B' map to IPA 'a'; prefer 'a'.
_ipa2callhome['arz']['a'] = 'a'
| ######################################################################## | |
| # ARPABET was invented for English. | |
| # The standard dictionary written in ARPABET is the CMU dictionary. | |
| # TIMIT is written in a variant of ARPABET that includes a couple | |
| # of non-standard allophones, and most significantly, includes | |
| # separate symbols for the closure and release portions of each stop. | |
| _arpabet2ipa = { | |
| 'AA':'ɑ', | |
| 'AE':'æ', | |
| 'AH':'ʌ', | |
| 'AH0':'ə', | |
| 'AO':'ɔ', | |
| 'AW':'aʊ', | |
| 'AY':'aɪ', | |
| 'EH':'ɛ', | |
| 'ER':'ɝ', | |
| 'ER0':'ɚ', | |
| 'EY':'eɪ', | |
| 'IH':'ɪ', | |
| 'IH0':'ɨ', | |
| 'IY':'i', | |
| 'OW':'oʊ', | |
| 'OY':'ɔɪ', | |
| 'UH':'ʊ', | |
| 'UW':'u', | |
| 'B':'b', | |
| 'CH':'tʃ', | |
| 'D':'d', | |
| 'DH':'ð', | |
| 'EL':'l̩ ', | |
| 'EM':'m̩', | |
| 'EN':'n̩', | |
| 'F':'f', | |
| 'G':'ɡ', | |
| 'HH':'h', | |
| 'JH':'dʒ', | |
| 'K':'k', | |
| 'L':'l', | |
| 'M':'m', | |
| 'N':'n', | |
| 'NG':'ŋ', | |
| 'P':'p', | |
| 'Q':'ʔ', | |
| 'R':'ɹ', | |
| 'S':'s', | |
| 'SH':'ʃ', | |
| 'T':'t', | |
| 'TH':'θ', | |
| 'V':'v', | |
| 'W':'w', | |
| 'WH':'ʍ', | |
| 'Y':'j', | |
| 'Z':'z', | |
| 'ZH':'ʒ' | |
| } | |
# Add the English stress labels ('0', '1', '2') to the ARPABET table.
for_stress = _tone2ipa['eng']
_arpabet2ipa.update(for_stress)
del for_stress
# ARPABET vowels: every code whose first letter is A, E, I, O or U.
# Note that this also selects 'EL', 'EM' and 'EN'.
_arpabet_vowels = {code for code in _arpabet2ipa if code[0] in 'AEIOU'}
# Inverse lookup (IPA -> ARPABET); for shared IPA strings the last code wins.
_ipa2arpabet = dict(zip(_arpabet2ipa.values(), _arpabet2ipa.keys()))
# Per-language inverse of _tone2ipa (IPA tone/stress string -> digit).
_ipa2tone = {
    lang: dict(zip(tones.values(), tones.keys()))
    for lang, tones in _tone2ipa.items()
}
# TIMIT table: the ARPABET table plus TIMIT-specific codes.  For each stop the
# closure code (e.g. 'BCL') carries the IPA symbol and the release code
# (e.g. 'B') is overridden to the empty string; 'EPI', 'H#' and 'PAU' also map
# to the empty string.
_timit2ipa = {
    **_arpabet2ipa,
    'AX': 'ə',
    'AX-H': 'ə̥',
    'AXR': 'ɚ',
    'B': '',
    'BCL': 'b',
    'D': '',
    'DCL': 'd',
    'DX': 'ɾ',
    'ENG': 'ŋ̍',
    'EPI': '',
    'G': '',
    'GCL': 'g',
    'HV': 'ɦ',
    'H#': '',
    'IX': 'ɨ',
    'KCL': 'k',
    'K': '',
    'NX': 'ɾ̃',
    'P': '',
    'PAU': '',
    'PCL': 'p',
    'T': '',
    'TCL': 't',
    'UX': 'ʉ',
}
| ####################################################################### | |
| # IPA | |
| _ipa_vowels = set('aeiouyɑɒɛɪɔʘʊʌʏəɘæʉɨøɜɞɐɤɵœɶ')|set(('ɪ̈','ʊ̈')) | |
| _ipa_consonants = set('bɓcdɖɗfɡɠhɦjʝklɭɺmnɳpɸqrɽɹɻsʂɕtʈvʋwxɧzʐʑβʙçðɱɣɢʛɥʜɲɟʄɬɮʎʟɯɰŋɴʋɒʁʀʃθʍχħʒɾɫʔʕʢʡꜛꜜǃ|ǀ‖ǁǂ') | |
| _ipa_diacritics = set(re.sub(r'◌','','◌̈◌̟◌̠◌̌◌̥◌̩◌◌◌̂◌̯◌̚◌◌̃◌̘◌̺◌̏◌◌̜◌̪◌̴◌̂◌◌́◌◌◌◌̰◌̀◌◌̄◌̻◌̼◌◌̹◌̞◌̙◌̌◌◌̝◌̋◌̤◌̬◌◌̆◌̽ːʰˀʷʱʼʲˤ')) | |
| _ipa_stressmarkers = set("ˈˌ") | |
| _ipa_tonecharacters = set('˥˦˧˨˩˥˧') | |
| # A bit of recursion to generate all tones of up to 4 components | |
| _ipa_tones = _ipa_tonecharacters.copy() | |
| _ipa_tones |= set(x+y for x in _ipa_tones for y in _ipa_tones) | |
| _ipa_tones |= set(x+y for x in _ipa_tones for y in _ipa_tones) | |
# Every IPA symbol known to these tables: vowels, consonants and diacritics.
_ipa_symbols = _ipa_vowels.union(_ipa_consonants, _ipa_diacritics)