File size: 1,161 Bytes
ae054e9
94aabbd
ae054e9
 
 
94aabbd
 
ae054e9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
94aabbd
 
ae054e9
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
import re
from typing import Optional

from tokens import *


def format_input(
    text_ja: str, *, emoji_dict: Optional[dict[str, str]] = None
) -> str:
    """Replace every trigger emoji in *text_ja* with its placeholder token.

    Args:
        text_ja: Input text that may contain trigger emoji.
        emoji_dict: Optional ``{token: emoji}`` mapping to use instead of the
            module-level ``TRIGGER_EMOJI_DICT``.

    Returns:
        The text with each occurrence of an emoji replaced by its token.
        Mappings are applied in the dictionary's iteration order.
    """
    if emoji_dict is None:
        emoji_dict = TRIGGER_EMOJI_DICT

    for token, emoji in emoji_dict.items():
        # Literal replacement on purpose: the emoji is plain text, not a
        # regex, and the token must not be parsed as a replacement template
        # (``re.sub`` would choke on e.g. "*" or interpret backslashes).
        text_ja = text_ja.replace(emoji, token)

    return text_ja


def _mask_matches(pattern, token: str, text: str) -> tuple[str, list[str]]:
    """Swap each match of *pattern* in *text* for *token* in a single pass.

    Returns the rewritten text and the replaced substrings in order of
    appearance.
    """
    captured: list[str] = []

    def swap(match: re.Match) -> str:
        # Follow the ``re.findall`` convention: a pattern with exactly one
        # capture group designates that group as the entity; otherwise the
        # whole match is the entity.
        group = 1 if match.re.groups == 1 else 0
        entity = match.group(group)
        if not entity:
            # Empty match or non-participating group: nothing to protect.
            return match.group(0)
        captured.append(entity)
        whole = match.group(0)
        begin, end = match.span(group)
        shift = match.start()
        # Returned strings from a callback are inserted verbatim, so the
        # token is never interpreted as a replacement template.
        return whole[: begin - shift] + token + whole[end - shift:]

    return re.sub(pattern, swap, text), captured


def text_to_placeholder_tokens(
    text_ja: str, *, repl_dict: Optional[dict[str, dict]] = None
) -> tuple[str, dict]:
    """Replace pattern-matched entities in *text_ja* with placeholder tokens.

    Each entry of the replacement table supplies a ``"pattern"`` and a
    ``"token"``. Every match is replaced at its own position, so the number
    of inserted tokens always equals the number of recorded entities, and a
    match that is a prefix of another match cannot corrupt it.

    Args:
        text_ja: Input text to scan.
        repl_dict: Optional ``{entity_type: {"pattern": ..., "token": ...}}``
            table to use instead of the module-level ``REPL_DICT``.

    Returns:
        A ``(text, repls)`` tuple: the rewritten text and a dict mapping each
        entity type to the list of replaced substrings, in order of
        appearance (an empty list when nothing matched).
    """
    if repl_dict is None:
        repl_dict = REPL_DICT

    repls: dict[str, list[str]] = {}
    for entity_type, spec in repl_dict.items():
        text_ja, repls[entity_type] = _mask_matches(
            spec["pattern"], spec["token"], text_ja
        )

    return text_ja, repls


def _restore_tokens(text: str, token: str, entities: list[str]) -> str:
    """Replace successive occurrences of *token* in *text* with *entities*."""
    remaining = iter(entities)
    # The token is escaped so it is matched literally, and the callback
    # returns each entity verbatim (no backslash / group-reference parsing).
    # Once the entities run out, surplus tokens are left untouched.
    return re.sub(
        re.escape(token),
        lambda match: next(remaining, match.group(0)),
        text,
    )


def placeholder_tokens_to_text(
    text_en: str,
    repls: dict[str, list[str]],
    *,
    repl_dict: Optional[dict[str, dict]] = None,
) -> str:
    """Put previously extracted entities back in place of their tokens.

    For each entity type, the first token occurrence receives the first
    entity, the second occurrence the second entity, and so on. Entities
    are inserted literally, so backslashes in them are preserved.

    Args:
        text_en: Text containing placeholder tokens.
        repls: Mapping of entity type to the ordered list of entities to
            restore. Entity types absent from this mapping are skipped.
        repl_dict: Optional ``{entity_type: {"token": ...}}`` table to use
            instead of the module-level ``REPL_DICT``.

    Returns:
        The text with tokens replaced by their entities. Tokens without a
        corresponding entity remain; entities without a token are dropped.
    """
    if repl_dict is None:
        repl_dict = REPL_DICT

    for entity_type, spec in repl_dict.items():
        entities = repls.get(entity_type)
        if not entities:
            continue
        text_en = _restore_tokens(text_en, spec["token"], entities)

    return text_en


def format_output(
    text_en: str,
    sub_emoji: bool = True,
    *,
    emoji_dict: Optional[dict[str, str]] = None,
) -> str:
    """Tidy bracketed spans and optionally turn tokens back into emoji.

    A ``【 ... 】`` span loses the single whitespace character just inside
    each bracket and one optional whitespace character after the closing
    bracket.

    Args:
        text_en: Text to post-process.
        sub_emoji: When true, replace every token with its emoji.
        emoji_dict: Optional ``{token: emoji}`` mapping to use instead of the
            module-level ``TRIGGER_EMOJI_DICT``. Only consulted when
            *sub_emoji* is true.

    Returns:
        The post-processed text.
    """
    text_en = re.sub(r'【\s(.+?)\s】\s?', r'【\1】', text_en)

    if sub_emoji:
        if emoji_dict is None:
            emoji_dict = TRIGGER_EMOJI_DICT
        for token, emoji in emoji_dict.items():
            # Literal replacement on purpose: the token is plain text, not a
            # regex, and the emoji must not be parsed as a replacement
            # template (backslashes would be interpreted or rejected).
            text_en = text_en.replace(token, emoji)

    return text_en