| import re | |
def japanese_cleaners(text):
    """Convert *text* with ``japanese_to_romaji_with_accent`` and terminate it.

    The converted string gets a trailing ``'.'`` when it is empty or when its
    last character is an ASCII letter; otherwise it is returned as converted.
    """
    # Resolved at call time rather than at module import.
    from text.japanese import japanese_to_romaji_with_accent

    converted = japanese_to_romaji_with_accent(text)
    needs_period = not converted or re.match('[A-Za-z]', converted[-1])
    if needs_period:
        converted += '.'
    return converted
def japanese_cleaners2(text):
    """Apply ``japanese_cleaners`` plus extra symbol normalisation.

    Before cleaning, a run of three middle dots becomes an ellipsis and any
    remaining middle dot becomes a space. After cleaning, ``ts`` is rewritten
    to ``ʦ``, three ASCII periods become an ellipsis, round/square/curly
    brackets are removed and ``*`` is turned into a space.
    """
    prepared = text.replace('・・・', '…').replace('・', ' ')
    cleaned = japanese_cleaners(prepared)

    # Applied strictly in this order: later removals must not be able to
    # feed the earlier 'ts' / '...' rewrites.
    substitutions = (
        ('ts', 'ʦ'),
        ('...', '…'),
        ('(', ''),
        (')', ''),
        ('[', ''),
        (']', ''),
        ('*', ' '),
        ('{', ''),
        ('}', ''),
    )
    for old, new in substitutions:
        cleaned = cleaned.replace(old, new)
    return cleaned
def ko2kata(text):
    """Return *text* unchanged.

    Currently an identity function: no conversion is performed here.
    NOTE(review): the name suggests a Korean-to-katakana conversion was
    intended -- confirm whether an implementation is still pending.
    """
    return text
def en2kata(text):
    """Return *text* unchanged.

    Currently an identity function: no conversion is performed here.
    NOTE(review): the name suggests an English-to-katakana conversion was
    intended -- confirm whether an implementation is still pending.
    """
    return text
def jke_cleaners(text):
    """Clean text that contains ``[JA]``, ``[KO]`` and ``[EN]`` tagged spans.

    Each ``[JA]...[JA]`` span is replaced by its inner text, each
    ``[KO]...[KO]`` span by ``ko2kata(inner)`` and each ``[EN]...[EN]`` span
    by ``en2kata(inner)``; every replacement is followed by a single space.
    The result is passed through ``japanese_cleaners2``, its final character
    is dropped, and a ``'.'`` is appended when the remaining text is empty or
    does not already end in one of ``. , ! ? - … ~``.

    Args:
        text: Input string, optionally containing tagged spans.

    Returns:
        The cleaned string; never empty.
    """
    # Collect every tagged span from the untouched input, before any
    # replacement below can alter what the patterns would match.
    japanese_texts = re.findall(r'\[JA\].*?\[JA\]', text)
    korean_texts = re.findall(r'\[KO\].*?\[KO\]', text)
    english_texts = re.findall(r'\[EN\].*?\[EN\]', text)

    # (spans, converter) in processing order; Japanese spans are only
    # unwrapped, with no conversion applied at this stage.
    tagged_groups = (
        (japanese_texts, None),
        (korean_texts, ko2kata),
        (english_texts, en2kata),
    )
    for spans, convert in tagged_groups:
        for span in spans:
            # Both the opening and closing tag are 4 characters long.
            inner = span[4:-4]
            cleaned_text = inner if convert is None else convert(inner)
            # Replace only the first occurrence so repeated identical spans
            # are consumed one at a time.
            text = text.replace(span, cleaned_text + ' ', 1)

    text = japanese_cleaners2(text)
    # Drop the final character of the cleaned text.
    # NOTE(review): presumably this removes the trailing character left by
    # the tag replacement / japanese_cleaners -- confirm the intent.
    text = text[:-1]
    # Guard the empty case: indexing text[-1] on '' would raise IndexError
    # (e.g. when the cleaned text was a single character).
    if not text or re.match(r'[^\.,!\?\-…~]', text[-1]):
        text += '.'
    return text