File size: 5,875 Bytes
5669b22 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 | import re
import unicodedata
from loguru import logger
from ..translate.translate_interface import TranslateInterface
def tts_filter(
    text: str,
    remove_special_char: bool,
    ignore_brackets: bool,
    ignore_parentheses: bool,
    ignore_asterisks: bool,
    ignore_angle_brackets: bool,
    translator: TranslateInterface | None = None,
) -> str:
    """
    Prepare text for speech synthesis by running the enabled clean-up steps.

    The steps always run in this order: asterisk spans, square brackets,
    parentheses, angle brackets, special characters, and finally translation.
    Any step that raises is logged and skipped, so the text continues through
    the remaining steps exactly as it was before the failing one.

    According to the original author's note, the result is meant to feed the
    TTS engine only (subtitles and the LLM's memory are not affected); that
    wiring lives outside this function.

    Args:
        text (str): The text to filter.
        remove_special_char (bool): Whether to remove special characters.
        ignore_brackets (bool): Whether to drop text inside ``[...]``.
        ignore_parentheses (bool): Whether to drop text inside ``(...)``.
        ignore_asterisks (bool): Whether to drop text wrapped in asterisks.
        ignore_angle_brackets (bool): Whether to drop text inside ``<...>``.
        translator (TranslateInterface, optional):
            Translator applied after all filters. Skipped when falsy.
            Defaults to None.

    Returns:
        str: The text after every enabled step has been applied.
    """
    # Each entry: (switch, step to run, log prefix used when the step fails).
    # Tuple order is the execution order.
    cleanup_steps = (
        (ignore_asterisks, filter_asterisks, "Error ignoring asterisks: "),
        (ignore_brackets, filter_brackets, "Error ignoring brackets: "),
        (ignore_parentheses, filter_parentheses, "Error ignoring parentheses: "),
        (
            ignore_angle_brackets,
            filter_angle_brackets,
            "Error ignoring angle brackets: ",
        ),
        (
            remove_special_char,
            remove_special_characters,
            "Error removing special characters: ",
        ),
    )

    for enabled, apply_step, failure_prefix in cleanup_steps:
        if not enabled:
            continue
        try:
            text = apply_step(text)
        except Exception as exc:
            # Best effort: keep the text from before this step and move on.
            logger.warning(f"{failure_prefix}{exc}")
            logger.warning(f"Text: {text}")
            logger.warning("Skipping...")

    if translator:
        try:
            logger.info("Translating...")
            text = translator.translate(text)
            logger.info(f"Translated: {text}")
        except Exception as exc:
            # A failed translation falls back to the untranslated text.
            logger.critical(f"Error translating: {exc}")
            logger.critical(f"Text: {text}")
            logger.warning("Skipping...")

    logger.debug(f"Filtered text: {text}")
    return text
def remove_special_characters(text: str) -> str:
    """
    Keep only letters, numbers, punctuation and whitespace.

    The input is NFKC-normalized first, so compatibility forms (for example
    full-width letters) are folded before filtering. Every character whose
    Unicode general category does not start with ``L``, ``N`` or ``P`` and
    which is not whitespace is dropped; this covers symbols and also
    combining marks (category ``M``).

    Args:
        text (str): The text to filter.

    Returns:
        str: The normalized text with all other characters removed.
    """
    kept_major_classes = "LNP"  # Letter, Number, Punctuation
    folded = unicodedata.normalize("NFKC", text)
    return "".join(
        ch
        for ch in folded
        if unicodedata.category(ch)[0] in kept_major_classes or ch.isspace()
    )
def _filter_nested(text: str, left: str, right: str) -> str:
"""
Generic function to handle nested symbols.
Args:
text (str): The text to filter.
left (str): The left symbol (e.g. '[' or '(').
right (str): The right symbol (e.g. ']' or ')').
Returns:
str: The filtered text.
"""
if not isinstance(text, str):
raise TypeError("Input must be a string")
if not text:
return text
result = []
depth = 0
for char in text:
if char == left:
depth += 1
elif char == right:
if depth > 0:
depth -= 1
else:
if depth == 0:
result.append(char)
filtered_text = "".join(result)
filtered_text = re.sub(r"\s+", " ", filtered_text).strip()
return filtered_text
def filter_brackets(text: str) -> str:
    """
    Drop every ``[...]`` span from the text, including nested ones.

    Delegates to ``_filter_nested`` with square brackets as the delimiters.

    Args:
        text (str): The text to filter.

    Returns:
        str: The text without bracketed content.
    """
    return _filter_nested(text, left="[", right="]")
def filter_parentheses(text: str) -> str:
    """
    Drop every ``(...)`` span from the text, including nested ones.

    Delegates to ``_filter_nested`` with parentheses as the delimiters.

    Args:
        text (str): The text to filter.

    Returns:
        str: The text without parenthesized content.
    """
    return _filter_nested(text, left="(", right=")")
def filter_angle_brackets(text: str) -> str:
    """
    Drop every ``<...>`` span from the text, including nested ones.

    Delegates to ``_filter_nested`` with angle brackets as the delimiters.

    Args:
        text (str): The text to filter.

    Returns:
        str: The text without angle-bracketed content.
    """
    return _filter_nested(text, left="<", right=">")
def filter_asterisks(text: str) -> str:
    """
    Remove spans wrapped in asterisks, whatever the run length (*, **, ***).

    A span is a run of one or more asterisks, then characters that are neither
    an asterisk nor a newline, then another run of asterisks. After removal,
    runs of whitespace are collapsed to one space and the ends are trimmed.

    Args:
        text: The input string.

    Returns:
        The string with asterisk-enclosed text removed.
    """
    # Opening run of '*', non-'*' content on one line, closing run of '*'.
    starred_span = r"\*{1,}((?!\*).)*?\*{1,}"
    whitespace_run = r"\s+"

    without_spans = re.sub(starred_span, "", text)
    return re.sub(whitespace_run, " ", without_spans).strip()
|