File size: 1,161 Bytes
ae054e9 94aabbd ae054e9 94aabbd ae054e9 94aabbd ae054e9 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 |
import re
from tokens import *
def format_input(text_ja: str) -> str:
    """Swap each trigger emoji in the input for its placeholder token.

    Every value of ``TRIGGER_EMOJI_DICT`` (an emoji) that occurs in
    ``text_ja`` is replaced by the key it is stored under (its token).

    Args:
        text_ja: Text to preprocess (presumably the Japanese source text,
            judging by the parameter name).

    Returns:
        The text with all occurrences of the listed emoji replaced by
        their tokens.
    """
    for token, emoji in TRIGGER_EMOJI_DICT.items():
        # Literal replacement on purpose: re.sub would compile the emoji as
        # a regex (an emoji that starts with a metacharacter such as "*"
        # raises re.error) and would expand backslash escapes in the token.
        text_ja = text_ja.replace(emoji, token)
    return text_ja
def text_to_placeholder_tokens(text_ja: str) -> tuple[str, dict]:
    """Replace pattern hits with placeholder tokens and record what was hit.

    For every entry of ``REPL_DICT`` the entry's ``"pattern"`` is searched
    in the current text with ``re.findall``; each hit is then replaced by
    the entry's ``"token"``. Entries are processed in ``REPL_DICT`` order,
    so later patterns see the placeholders inserted by earlier ones.

    Args:
        text_ja: Text to process.

    Returns:
        A ``(text, repls)`` pair: the text with placeholders inserted, and
        a dict mapping each ``REPL_DICT`` key to the list returned by
        ``re.findall`` for that entry (in order of appearance).
    """
    repls = {}
    for entity_type, spec in REPL_DICT.items():
        token = spec["token"]
        refs = re.findall(spec["pattern"], text_ja)
        repls[entity_type] = refs
        for ref in refs:
            # Exactly one literal replacement per recorded hit. Replacing
            # *all* occurrences would also rewrite the start of a later,
            # longer hit (e.g. "@a" inside "@ab"), leaving the number and
            # order of placeholders out of step with `refs`. str.replace
            # also inserts the token verbatim instead of treating it as a
            # regex replacement template.
            text_ja = text_ja.replace(ref, token, 1)
    return text_ja, repls
def placeholder_tokens_to_text(text_en: str, repls: dict[str, list[str]]) -> str:
    """Put recorded entities back in place of their placeholder tokens.

    For every entry of ``REPL_DICT`` (in its iteration order), the entities
    listed under the same key in ``repls`` are consumed in order; each one
    replaces the first remaining occurrence of that entry's ``"token"``.

    Args:
        text_en: Text containing placeholder tokens (presumably the
            translated English text, judging by the parameter name).
        repls: Mapping from ``REPL_DICT`` key to the entities to restore.

    Returns:
        The text with placeholders replaced by the recorded entities.

    Raises:
        KeyError: If ``repls`` has no entry for one of the ``REPL_DICT`` keys.
    """
    for entity_type, spec in REPL_DICT.items():
        entities = repls[entity_type]
        if not entities:
            continue
        token = spec["token"]
        for entity in entities:
            # Literal replacement: the entity is arbitrary source text, and
            # as a re.sub replacement template any backslash in it would be
            # parsed as an escape or group reference (re.error or wrong
            # output). The token is likewise matched verbatim, not as regex.
            text_en = text_en.replace(token, entity, 1)
    return text_en
def format_output(text_en: str, sub_emoji: bool = True) -> str:
    """Tidy bracket spacing and optionally turn tokens back into emoji.

    First, for each ``【 … 】`` span that has a whitespace character right
    after ``【`` and right before ``】``, those two whitespace characters
    are removed, together with at most one whitespace character following
    ``】``. Then, if ``sub_emoji`` is true, every key of
    ``TRIGGER_EMOJI_DICT`` (a token) found in the text is replaced by its
    value (an emoji).

    Args:
        text_en: Text to post-process.
        sub_emoji: Whether to replace tokens with their emoji.

    Returns:
        The post-processed text.
    """
    text_en = re.sub(r'【\s(.+?)\s】\s?', r'【\1】', text_en)
    if sub_emoji:
        for token, emoji in TRIGGER_EMOJI_DICT.items():
            # Literal replacement: the token must not be compiled as a
            # regex (brackets or other metacharacters in it would change
            # what is matched), and the emoji must not be parsed as a
            # replacement template.
            text_en = text_en.replace(token, emoji)
    return text_en
|