Spaces:
Build error
Build error
| # Copyright (c) 2024 Amphion. | |
| # | |
| # This source code is licensed under the MIT license found in the | |
| # LICENSE file in the root directory of this source tree. | |
| import re | |
| """ | |
| Text cleaning (normalization) utilities for Korean input. | |
| """ | |
# Loanwords and acronyms that are read as a single unit.
_ENGLISH_WORDS = {
    "KOREA": "코리아",
    "IDOL": "아이돌",
    "IT": "아이티",
    "IQ": "아이큐",
    "UP": "업",
    "DOWN": "다운",
    "PC": "피씨",
    "CCTV": "씨씨티비",
    "SNS": "에스엔에스",
    "AI": "에이아이",
    "CEO": "씨이오",
}

# Hangul spellings of the individual letter names A-Z.
_ENGLISH_LETTERS = {
    "A": "에이",
    "B": "비",
    "C": "씨",
    "D": "디",
    "E": "이",
    "F": "에프",
    "G": "지",
    "H": "에이치",
    "I": "아이",
    "J": "제이",
    "K": "케이",
    "L": "엘",
    "M": "엠",
    "N": "엔",
    "O": "오",
    "P": "피",
    "Q": "큐",
    "R": "알",
    "S": "에스",
    "T": "티",
    "U": "유",
    "V": "브이",
    "W": "더블유",
    "X": "엑스",
    "Y": "와이",
    "Z": "제트",
}

# Upper-case English token -> Hangul reading. Keys are upper-case only, so a
# lookup with a lower- or mixed-case token will not match.
english_dictionary = {**_ENGLISH_WORDS, **_ENGLISH_LETTERS}
def normalize(text):
    """Clean a single string before it is handed to the tokenizer.

    Steps, in order: trim surrounding whitespace, delete every character
    that falls in the CJK radical/ideograph ranges of the pattern below,
    transliterate known English tokens via ``normalize_english``, and
    finally lowercase whatever remains.

    Args:
        text: The raw input string.

    Returns:
        The cleaned, lowercased string.
    """
    without_ideographs = re.sub(
        "[⺀-⺙⺛-⻳⼀-⿕々〇〡-〩〸-〺〻㐀-䶵一-鿃豈-鶴侮-頻並-龎]",
        "",
        text.strip(),
    )
    # Lowercasing happens last so the upper-case dictionary keys used by
    # normalize_english() can still match.
    return normalize_english(without_ideographs).lower()
def normalize_english(text):
    """Replace known English tokens in ``text`` with their Hangul reading.

    Every maximal run of ASCII letters is looked up, exactly as written, in
    ``english_dictionary``; runs without an entry are left unchanged.

    Args:
        text: The string to process.

    Returns:
        The string with matching tokens substituted.
    """

    def _transliterate(match):
        token = match.group()
        # Fall back to the token itself when there is no dictionary entry.
        return english_dictionary.get(token, token)

    return re.sub("([A-Za-z]+)", _transliterate, text)
def korean_to_ipa(text, text_tokenizer):
    """Normalize Korean text and pass it to ``text_tokenizer``.

    Args:
        text: A single string, or an iterable of strings for batch use.
        text_tokenizer: Callable invoked exactly once with the normalized
            input: a ``str`` when ``text`` is a string, otherwise a ``list``
            of normalized strings.

    Returns:
        Whatever ``text_tokenizer`` returns (presumably IPA phonemes, going
        by this function's name; confirm against the tokenizer in use).
    """
    if isinstance(text, str):
        return text_tokenizer(normalize(text))
    # Build a fresh list rather than assigning into ``text``: the caller's
    # sequence is left untouched, and tuples or generators work as well.
    return text_tokenizer([normalize(item) for item in text])