| import re |
| import os |
| import ast |
| import json |
| from jamo import hangul_to_jamo, h2j, j2h, hcj_to_jamo, is_hcj |
| from jamo.jamo import _jamo_char_to_hcj |
| import codecs |
|
|
# --- Special symbols -------------------------------------------------------
# NOTE(review): PAD / EOS are presumably the padding and end-of-sequence
# markers used by the consumer of this table -- confirm against callers.
PAD = '_'
EOS = '~'
PUNC = '!\'(),-.:;?'
SPACE = ' '
# Multi-character tokens appended after all single-character symbols.
# NOTE(review): these look like forced-aligner silence labels -- confirm.
_SILENCES = ['sp', 'spn', 'sil']


# --- Conjoining Hangul jamo, one string per syllable position ---------------
# Each range's upper bound is exclusive, so the strings cover
# U+1100..U+1112 (19 leads), U+1161..U+1175 (21 vowels) and
# U+11A8..U+11C2 (27 tails).
JAMO_LEADS = "".join(map(chr, range(0x1100, 0x1113)))
JAMO_VOWELS = "".join(map(chr, range(0x1161, 0x1176)))
JAMO_TAILS = "".join(map(chr, range(0x11A8, 0x11C3)))


# --- Symbol inventory and lookup tables -------------------------------------
# Every single-character symbol accepted in text: jamo, punctuation, space.
VALID_CHARS = "".join((JAMO_LEADS, JAMO_VOWELS, JAMO_TAILS, PUNC, SPACE))
# Ordered inventory: PAD first (id 0), EOS second (id 1), then each valid
# character, then the silence tokens. A symbol's id is its list position.
ALL_SYMBOLS = [*PAD, *EOS, *VALID_CHARS, *_SILENCES]
# symbol -> id
s_to_i = dict(zip(ALL_SYMBOLS, range(len(ALL_SYMBOLS))))

# Alias of ALL_SYMBOLS (the very same list object, not a copy).
KOR_SYMBOLS = ALL_SYMBOLS


# symbol -> id and id -> symbol tables keyed on the Korean symbol list.
Kchar_to_id = dict(zip(KOR_SYMBOLS, range(len(KOR_SYMBOLS))))
id_to_Kchar = dict(enumerate(KOR_SYMBOLS))
|
|