File size: 5,875 Bytes
5669b22
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
import re
import unicodedata
from loguru import logger
from ..translate.translate_interface import TranslateInterface


def tts_filter(
    text: str,
    remove_special_char: bool,
    ignore_brackets: bool,
    ignore_parentheses: bool,
    ignore_asterisks: bool,
    ignore_angle_brackets: bool,
    translator: TranslateInterface | None = None,
) -> str:
    """
    Prepare text for speech synthesis by running the enabled clean-up stages.

    Only the string handed to the TTS engine is changed; subtitles and the
    LLM's memory are not touched by anything done here.

    The stages run in a fixed order: asterisks, brackets, parentheses, angle
    brackets, special characters, and finally translation. Each stage is
    best-effort: if it raises, the problem is logged and the text produced
    by the previous stages is carried forward unchanged.

    Args:
        text (str): The text to filter.
        remove_special_char (bool): Whether to remove special characters.
        ignore_brackets (bool): Whether to drop text within square brackets.
        ignore_parentheses (bool): Whether to drop text within parentheses.
        ignore_asterisks (bool): Whether to drop text within asterisks.
        ignore_angle_brackets (bool): Whether to drop text within angle brackets.
        translator (TranslateInterface, optional):
            Translator applied after all filters. When it is falsy the
            translation stage is skipped. Defaults to None.

    Returns:
        str: The filtered (and possibly translated) text.
    """
    # (enabled?, wording used in the warning, filter to run) -- kept in the
    # order the stages must be applied.
    stages = (
        (ignore_asterisks, "ignoring asterisks", filter_asterisks),
        (ignore_brackets, "ignoring brackets", filter_brackets),
        (ignore_parentheses, "ignoring parentheses", filter_parentheses),
        (ignore_angle_brackets, "ignoring angle brackets", filter_angle_brackets),
        (remove_special_char, "removing special characters", remove_special_characters),
    )

    for enabled, action, apply_stage in stages:
        if not enabled:
            continue
        try:
            text = apply_stage(text)
        except Exception as err:
            # A failing filter must not prevent audio generation.
            logger.warning(f"Error {action}: {err}")
            logger.warning(f"Text: {text}")
            logger.warning("Skipping...")

    if translator:
        try:
            logger.info("Translating...")
            text = translator.translate(text)
            logger.info(f"Translated: {text}")
        except Exception as err:
            # Fall back to the untranslated text rather than failing outright.
            logger.critical(f"Error translating: {err}")
            logger.critical(f"Text: {text}")
            logger.warning("Skipping...")

    logger.debug(f"Filtered text: {text}")
    return text


def remove_special_characters(text: str) -> str:
    """
    Keep only letters, numbers, punctuation and whitespace.

    The input is NFKC-normalized first, so compatibility forms (for example
    full-width letters) come back in their normalized form. A character then
    survives when its Unicode general category starts with ``L``, ``N`` or
    ``P``, or when ``str.isspace()`` accepts it. Everything else is dropped,
    which includes symbols (category ``S``) and combining marks (category
    ``M``) that are still separate after normalization.

    Args:
        text (str): The text to filter.

    Returns:
        str: The normalized text with all other characters removed.
    """
    kept_major_classes = ("L", "N", "P")
    survivors = []
    for ch in unicodedata.normalize("NFKC", text):
        if unicodedata.category(ch).startswith(kept_major_classes) or ch.isspace():
            survivors.append(ch)
    return "".join(survivors)


def _filter_nested(text: str, left: str, right: str) -> str:
    """
    Remove every span enclosed by a delimiter pair, including nested spans.

    The text is scanned once while a nesting counter is maintained; only
    characters seen at nesting level zero are kept, and the delimiter
    characters themselves are never kept. Consequences of that scheme:

    * an opener that is never closed drops everything after it;
    * a closer with no matching opener is dropped on its own.

    Afterwards each run of whitespace is collapsed to a single space and the
    result is stripped at both ends.

    Args:
        text (str): The text to filter.
        left (str): The left symbol (e.g. '[' or '(').
        right (str): The right symbol (e.g. ']' or ')').

    Returns:
        str: The filtered text. An empty input is returned as-is.

    Raises:
        TypeError: If ``text`` is not a string.
    """
    if not isinstance(text, str):
        raise TypeError("Input must be a string")
    if not text:
        return text

    kept = []
    nesting = 0
    for ch in text:
        if ch == left:
            nesting += 1
            continue
        if ch == right:
            # Clamp at zero so a stray closer cannot push the level negative.
            nesting = max(nesting - 1, 0)
            continue
        if nesting == 0:
            kept.append(ch)

    # Removing spans tends to leave doubled or dangling whitespace behind.
    return re.sub(r"\s+", " ", "".join(kept)).strip()


def filter_brackets(text: str) -> str:
    """
    Drop every square-bracketed span, nested ones included, from the text.

    Thin wrapper that delegates to ``_filter_nested`` with ``[`` and ``]``
    as the delimiter pair.

    Args:
        text (str): The text to filter.

    Returns:
        str: The filtered text.
    """
    opening, closing = "[", "]"
    return _filter_nested(text, opening, closing)


def filter_parentheses(text: str) -> str:
    """
    Drop every parenthesized span, nested ones included, from the text.

    Thin wrapper that delegates to ``_filter_nested`` with ``(`` and ``)``
    as the delimiter pair.

    Args:
        text (str): The text to filter.

    Returns:
        str: The filtered text.
    """
    opening, closing = "(", ")"
    return _filter_nested(text, opening, closing)


def filter_angle_brackets(text: str) -> str:
    """
    Drop every angle-bracketed span, nested ones included, from the text.

    Thin wrapper that delegates to ``_filter_nested`` with ``<`` and ``>``
    as the delimiter pair.

    Args:
        text (str): The text to filter.

    Returns:
        str: The filtered text.
    """
    opening, closing = "<", ">"
    return _filter_nested(text, opening, closing)


def filter_asterisks(text: str) -> str:
    """
    Strip spans wrapped in asterisk runs (``*x*``, ``**x**``, ``***x***``, ...).

    A span is a run of one or more asterisks, any stretch of non-asterisk
    characters, and a closing run of one or more asterisks. Because ``.``
    does not match a newline here, a span cannot cross a line break. An
    asterisk with no partner later on the same line is left in place.
    After removal, whitespace runs are collapsed to single spaces and the
    result is stripped.

    Args:
        text: The input string.

    Returns:
        The string with asterisk-enclosed text removed.
    """
    # Opening run, lazily matched non-asterisk body, closing run.
    asterisk_span = re.compile(r"\*{1,}((?!\*).)*?\*{1,}")
    without_spans = asterisk_span.sub("", text)

    # Tidy the gaps left where spans used to be.
    return re.sub(r"\s+", " ", without_spans).strip()