|
|
|
|
| import re
|
| from jamo import h2j, j2hcj
|
| import ko_pron
|
| from g2pk2 import G2p
|
|
|
| from text.symbols2 import symbols
|
|
|
|
|
# Space-separated Korean counter words. number_to_hangul() reads the number in
# front of a word that starts with one of these using native-Korean numerals
# (hangul_number(..., sino=False)); every other number is read as Sino-Korean.
# The literal was restored from a mis-decoded (UTF-8 read as a Thai code page)
# copy of this string.
_korean_classifiers = '군데 권 개 그루 닢 대 두 마리 모 모금 뭇 발 발짝 방 번 벌 보루 살 수 술 시 쌈 움큼 정 짝 채 척 첩 축 켤레 톨 통'
|
|
|
|
|
# (compiled pattern, replacement) pairs used by divide_hangul(): each compound
# vowel in Hangul Compatibility Jamo form is rewritten as its two component
# vowels. The jamo literals were restored from a mis-decoded copy of this table.
_hangul_divided = [(re.compile(compound), parts) for compound, parts in [
    ('ㅘ', 'ㅗㅏ'),
    ('ㅙ', 'ㅗㅐ'),
    ('ㅚ', 'ㅗㅣ'),
    ('ㅝ', 'ㅜㅓ'),
    ('ㅞ', 'ㅜㅔ'),
    ('ㅟ', 'ㅜㅣ'),
    ('ㅢ', 'ㅡㅣ'),
    ('ㅑ', 'ㅣㅏ'),
    ('ㅒ', 'ㅣㅐ'),
    ('ㅕ', 'ㅣㅓ'),
    ('ㅖ', 'ㅣㅔ'),
    ('ㅛ', 'ㅣㅗ'),
    ('ㅠ', 'ㅣㅜ'),
]]
|
|
|
|
|
# (compiled pattern, replacement) pairs used by latin_to_hangul(): each Latin
# letter, matched case-insensitively, is replaced by the Hangul spelling of the
# letter's name. The Hangul literals were restored from a mis-decoded copy of
# this table.
_latin_to_hangul = [(re.compile(letter, re.IGNORECASE), name) for letter, name in [
    ('a', '에이'),
    ('b', '비'),
    ('c', '시'),
    ('d', '디'),
    ('e', '이'),
    ('f', '에프'),
    ('g', '지'),
    ('h', '에이치'),
    ('i', '아이'),
    ('j', '제이'),
    ('k', '케이'),
    ('l', '엘'),
    ('m', '엠'),
    ('n', '엔'),
    ('o', '오'),
    ('p', '피'),
    ('q', '큐'),
    ('r', '아르'),
    ('s', '에스'),
    ('t', '티'),
    ('u', '유'),
    ('v', '브이'),
    ('w', '더블유'),
    ('x', '엑스'),
    ('y', '와이'),
    ('z', '제트'),
]]
|
|
|
|
|
# (compiled pattern, replacement) pairs applied in order by korean_to_lazy_ipa()
# to simplify the IPA string: affricates become single symbols, several
# phonemes are swapped for simpler ones, and a few combining diacritics are
# either turned into ASCII markers ('#', '=') or dropped.
# The IPA literals were restored from a mis-decoded copy of this table; the tie
# bar (U+0361) is written as an escape so it stays visible in the source.
_ipa_to_lazy_ipa = [(re.compile(ipa, re.IGNORECASE), lazy) for ipa, lazy in [
    ('t\u0361ɕ', 'ʧ'),
    ('d\u0361ʑ', 'ʥ'),
    ('ɲ', 'n^'),
    ('ɕ', 'ʃ'),
    ('ʷ', 'w'),
    ('ɭ', 'l`'),
    ('ʎ', 'ɾ'),
    ('ɣ', 'ŋ'),
    ('ɰ', 'ɯ'),
    ('ʝ', 'j'),
    ('ʌ', 'ə'),
    ('ɡ', 'g'),
    ('\u031a', '#'),
    ('\u0348', '='),
    ('\u031e', ''),
    ('\u0320', ''),
    ('\u0339', ''),
]]
|
|
|
|
|
# The jamo run "ㅇㅡㄹ" or "ㄹㅡㄹ", one ASCII space, then an initial "ㄹ".
_G2PK2_EUL_RIEUL = re.compile('([ㅇㄹ]ㅡㄹ) ㄹ')


def fix_g2pk2_error(text):
    '''Rewrite "ㅇㅡㄹ ㄹ" / "ㄹㅡㄹ ㄹ" as "ㅇㅡㄹ ㄴ" / "ㄹㅡㄹ ㄴ".

    Scans *text* left to right and, wherever one of the two three-jamo runs is
    followed by a space and "ㄹ", replaces that "ㄹ" with "ㄴ". Matches do not
    overlap; all other characters are copied unchanged. Judging by the name
    this corrects a g2pk2 output error, but nothing here shows which one.

    Args:
        text: String of Hangul Compatibility Jamo, e.g. from divide_hangul().

    Returns:
        The corrected string; the input itself when nothing matches.
    '''
    # Same result as a manual five-character sliding window, done in one pass.
    return _G2PK2_EUL_RIEUL.sub(r'\1 ㄴ', text)
|
|
|
|
|
def latin_to_hangul(text):
    '''Apply every (pattern, replacement) pair of ``_latin_to_hangul`` to *text*.

    The pairs are applied one after another in table order, each over the
    result of the previous one.

    Returns:
        The text after all substitutions.
    '''
    converted = text
    for pattern, hangul in _latin_to_hangul:
        converted = pattern.sub(hangul, converted)
    return converted
|
|
|
|
|
def divide_hangul(text):
    '''Decompose *text* with jamo's ``h2j``/``j2hcj``, then split compound vowels.

    After the jamo conversion, every (pattern, replacement) pair of
    ``_hangul_divided`` is applied in table order.

    Returns:
        The decomposed text.
    '''
    jamo_text = j2hcj(h2j(text))
    for pattern, parts in _hangul_divided:
        jamo_text = pattern.sub(parts, jamo_text)
    return jamo_text
|
|
|
|
|
def hangul_number(num, sino=True):
    '''Spell out a string of decimal digits in Hangul.

    Reference https://github.com/Kyubyong/g2pK

    Args:
        num: The number as a string of digits; commas are ignored.
        sino: If True, read every position with Sino-Korean numerals
            (일, 이, 삼, ...). If False, read the ones and tens positions with
            native-Korean forms (한, 두, ... / 열, 스물, ...); hundreds and
            above still use the Sino-Korean reading.

    Returns:
        The spelled-out number, e.g. '백이십삼' for '123'.

    Raises:
        ValueError: If a digit beyond the 16th position (above 천조) has to be
            read; no unit name is defined for it.
    '''
    num = re.sub(',', '', num)

    if num == '0':
        return '영'
    if not sino and num == '20':
        return '스무'

    digits = '123456789'
    digit2name = dict(zip(digits, '일이삼사오육칠팔구'))
    digit2mod = dict(zip(digits, '한 두 세 네 다섯 여섯 일곱 여덟 아홉'.split()))
    digit2dec = dict(zip(digits, '열 스물 서른 마흔 쉰 예순 일흔 여든 아흔'.split()))

    # Unit name for positions 2..15 (position 0 is the ones digit).
    units = '백천만십백천억십백천조십백천'

    spelledout = []
    for index, digit in enumerate(num):
        pos = len(num) - index - 1

        # A zero is silent, except at a group boundary (만, 억, 조) whose group
        # has a non-zero digit: there the bare unit name must still be emitted.
        if digit == '0' and (pos % 4 != 0 or not ''.join(spelledout[-3:])):
            spelledout.append('')
            continue

        if pos == 0:
            name = (digit2name if sino else digit2mod).get(digit, '')
        elif pos == 1:
            if sino:
                # 10 is read '십', not '일십'.
                name = (digit2name.get(digit, '') + '십').replace('일십', '십')
            else:
                name = digit2dec.get(digit, '')
        elif pos <= 15:
            unit = units[pos - 2]
            name = digit2name.get(digit, '') + unit
            if pos <= 7:
                # Below 억 a leading '일' is dropped: '백', '천', '만', '십만', ...
                name = name.replace('일' + unit, unit)
        else:
            raise ValueError(
                f'hangul_number supports at most 16 digits, got {len(num)}')
        spelledout.append(name)
    return ''.join(spelledout)
|
|
|
|
|
def number_to_hangul(text):
    '''Replace the digits in *text* with their Hangul reading.

    Reference https://github.com/Kyubyong/g2pK

    A number directly followed by Hangul is spelled out as a whole with
    hangul_number(): with native-Korean numerals when the first one or two
    syllables of the following word occur in ``_korean_classifiers``, and with
    Sino-Korean numerals otherwise. Digits left over afterwards are read one
    at a time.

    Returns:
        The text with all ASCII digits replaced.
    '''
    def _spell(match):
        num, classifier = match.groups()
        native = (classifier[:2] in _korean_classifiers
                  or classifier[0] in _korean_classifiers)
        return hangul_number(num, sino=not native) + classifier

    # Each match is substituted in place, so a short token such as "1개" can
    # never be rewritten inside a longer one such as "11개".
    text = re.sub(r'(\d[\d,]*)([\uac00-\ud71f]+)', _spell, text)

    # Remaining digits are read digit by digit.
    return text.translate(str.maketrans('0123456789', '영일이삼사오육칠팔구'))
|
|
|
|
|
def korean_to_lazy_ipa(text):
    '''Convert *text* to the simplified IPA notation of ``_ipa_to_lazy_ipa``.

    Latin letters and numbers are first turned into Hangul. Each run of Hangul
    syllables is then replaced by the part of ``ko_pron.romanise(..., 'ipa')``
    before the first '] ~ [' separator, and finally every pair of
    ``_ipa_to_lazy_ipa`` is applied in table order.

    Returns:
        The converted text.
    '''
    def _first_reading(match):
        return ko_pron.romanise(match.group(0), 'ipa').split('] ~ [')[0]

    hangul = number_to_hangul(latin_to_hangul(text))
    ipa = re.sub('[\uac00-\ud7af]+', _first_reading, hangul)
    for pattern, lazy in _ipa_to_lazy_ipa:
        ipa = pattern.sub(lazy, ipa)
    return ipa
|
|
|
# Single module-level g2pk2 converter, created once at import time and called
# by korean_to_ipa() and g2p().
_g2p = G2p()
|
def korean_to_ipa(text):
    '''Convert *text* to IPA.

    Latin letters and numbers are turned into Hangul, the module-level g2pk2
    converter and fix_g2pk2_error() are applied, and the result goes through
    korean_to_lazy_ipa(). The two single-character affricate symbols that step
    produces are then expanded again: 'ʧ' -> 'tʃ' and 'ʥ' -> 'dʑ'.

    Returns:
        The IPA string.
    '''
    text = latin_to_hangul(text)
    text = number_to_hangul(text)
    text = _g2p(text)
    text = fix_g2pk2_error(text)
    text = korean_to_lazy_ipa(text)
    return text.replace('ʧ', 'tʃ').replace('ʥ', 'dʑ')
|
|
|
def post_replace_ph(ph):
    '''Map one token onto an entry of ``symbols``.

    Full-width and CJK punctuation, newline, the middle dot, '...' and the
    space character are first normalised through a fixed table. The result is
    returned if it is contained in ``symbols``; anything else becomes the
    placeholder '停'.

    Args:
        ph: The token to normalise (g2p() passes single characters).

    Returns:
        A member of ``symbols``, or '停'.
    '''
    rep_map = {
        "：": ",",
        "；": ",",
        "，": ",",
        "。": ".",
        "！": "!",
        "？": "?",
        "\n": ".",
        "·": ",",
        "、": ",",
        "...": "…",
        " ": "空",
    }
    ph = rep_map.get(ph, ph)
    return ph if ph in symbols else "停"
|
|
|
def g2p(text):
    '''Convert *text* to a list of symbols, one per output character.

    Steps: spell out Latin letters in Hangul, run the module-level g2pk2
    converter, decompose the result with divide_hangul(), apply
    fix_g2pk2_error(), append '.' when the string ends in a character from
    U+3131..U+3163, and map every character through post_replace_ph().

    Returns:
        A list with one post_replace_ph() result per character.
    '''
    pronounced = _g2p(latin_to_hangul(text))
    jamo_text = fix_g2pk2_error(divide_hangul(pronounced))
    # Close an utterance that ends in a jamo with a full stop.
    jamo_text = re.sub(r'([\u3131-\u3163])$', r'\1.', jamo_text)
    return [post_replace_ph(ch) for ch in jamo_text]
|
|
|