# s1-mini / fish_speech/text/spliter.py
# Repository page header (not code): PoTaTo721 - "Update to V1.5" - b2eb230 - 3.96 kB
import re
import string
from fish_speech.text.clean import clean_text
def utf_8_len(text: str):
    """Return how many bytes *text* occupies once encoded as UTF-8."""
    encoded = text.encode("utf-8")
    return len(encoded)
def break_text(texts, length, splits: set):
    """Yield pieces of each text, cutting after every character in *splits*.

    A text whose UTF-8 size is at most *length* bytes is passed through
    untouched. A longer text is cut right after each split character, so the
    split character stays at the end of the piece it terminates. Any trailing
    remainder without a split character is yielded last.
    """
    for piece in texts:
        if utf_8_len(piece) > length:
            begin = 0
            for idx, ch in enumerate(piece):
                if ch in splits:
                    # Include the split character in the piece it closes.
                    yield piece[begin : idx + 1]
                    begin = idx + 1
            if begin < len(piece):
                yield piece[begin:]
        else:
            yield piece
def break_text_by_length(texts, length):
    """Yield pieces of each text that fit within *length* UTF-8 bytes.

    A text that already fits is yielded unchanged. A longer text is cut
    between characters so that each piece stays within the limit. The check
    happens *before* a character is added, so a multi-byte character can no
    longer push a piece past *length*. The only piece that may exceed the
    limit is a single character that is itself wider than *length*, because a
    character is never split.

    Args:
        texts: Iterable of strings.
        length: Maximum piece size in UTF-8 bytes.

    Yields:
        The pieces, in order; concatenating them restores the input texts.
    """
    for text in texts:
        if len(text.encode("utf-8")) <= length:
            yield text
            continue

        start = 0  # index of the first character of the current piece
        used = 0  # UTF-8 bytes accumulated in the current piece
        for idx, char in enumerate(text):
            width = len(char.encode("utf-8"))
            # Close the current piece if this character would overflow it.
            # `used` guards against emitting an empty piece when a single
            # character is wider than the limit.
            if used and used + width > length:
                yield text[start:idx]
                start, used = idx, 0
            used += width
        if used:
            yield text[start:]
def add_cleaned(curr, segments):
    """Append the stripped form of *curr* to *segments* if it has content.

    Surrounding whitespace is removed first. The result is appended only when
    at least one character is neither whitespace nor ASCII punctuation (as
    listed in ``string.punctuation``); otherwise *segments* is left unchanged.
    """
    stripped = curr.strip()
    has_content = any(
        not ch.isspace() and ch not in string.punctuation for ch in stripped
    )
    if has_content:
        segments.append(stripped)
def protect_float(text):
    """Mask decimal points inside numbers so they are not treated as periods.

    Every ``digits.digits`` run is rewritten, e.g. ``3.14`` -> ``<3_f_14>``.
    """
    decimal_number = re.compile(r"(\d+)\.(\d+)")
    return decimal_number.sub(r"<\1_f_\2>", text)
def unprotect_float(text):
    """Restore numbers masked as ``<3_f_14>`` to their ``3.14`` form."""
    masked_number = re.compile(r"<(\d+)_f_(\d+)>")
    return masked_number.sub(r"\1.\2", text)
def split_text(text, length):
    """Split *text* into segments that aim to stay within *length* UTF-8 bytes.

    The text is first passed through ``clean_text`` (imported from
    ``fish_speech.text.clean``; its normalization rules are not visible in
    this module). It is then broken into progressively smaller pieces, and
    the pieces are greedily merged back into segments whose UTF-8 size does
    not exceed *length*.

    Args:
        text: The input string.
        length: Target maximum segment size, measured in UTF-8 bytes.

    Returns:
        A list of segments with surrounding whitespace stripped. Segments
        that consist only of whitespace and ASCII punctuation are dropped.
    """
    text = clean_text(text)

    # Full-width CJK punctuation is written as escapes so that the literals
    # survive re-encoding of this file and cannot be confused with ASCII:
    #   U+3002 ideographic full stop, U+FF01 full-width "!",
    #   U+FF1F full-width "?",        U+FF0C full-width ",".
    sentence_marks = {".", "!", "?", "\u3002", "\uff01", "\uff1f"}
    comma_marks = {",", "\uff0c"}

    # Break the text into pieces. Every stage only touches texts that are
    # still longer than `length`:
    # 1. Split after sentence-ending punctuation; decimal points inside
    #    numbers are masked first so "3.14" is not cut in two.
    # 2. Split after commas.
    # 3. Split after spaces.
    # 4. Split between arbitrary characters to fit the length.
    texts = [text]
    texts = map(protect_float, texts)
    texts = break_text(texts, length, sentence_marks)
    texts = map(unprotect_float, texts)
    texts = break_text(texts, length, comma_marks)
    texts = break_text(texts, length, {" "})
    texts = list(break_text_by_length(texts, length))

    # Then, greedily merge consecutive pieces into segments of size <= length.
    segments = []
    curr = ""
    for piece in texts:
        if utf_8_len(curr) + utf_8_len(piece) <= length:
            curr += piece
        else:
            # The next piece does not fit: flush what has been collected.
            add_cleaned(curr, segments)
            curr = piece

    if curr:
        add_cleaned(curr, segments)

    return segments
if __name__ == "__main__":
    # Self-test for split_text. Each case is
    # (input text, byte limit, expected segments).
    cases = [
        (
            "This is a test sentence. This is another test sentence. And a third one.",
            50,
            [
                "This is a test sentence.",
                "This is another test sentence. And a third one.",
            ],
        ),
        # A decimal number must not be split at its decimal point.
        ("a,aaaaaa3.14", 10, ["a,", "aaaaaa3.14"]),
        # Whitespace-only input produces no segments.
        (" ", 10, []),
        ("a", 10, ["a"]),
        (
            "This is a test sentence with only commas, and no dots, and no exclamation marks, and no question marks, and no newlines.",
            50,
            [
                "This is a test sentence with only commas,",
                "and no dots, and no exclamation marks,",
                "and no question marks, and no newlines.",
            ],
        ),
        # First half split at " ", second half split at ","
        (
            "This is a test sentence This is a test sentence This is a test sentence. This is a test sentence, This is a test sentence, This is a test sentence.",
            50,
            [
                "This is a test sentence This is a test sentence",
                "This is a test sentence. This is a test sentence,",
                "This is a test sentence, This is a test sentence.",
            ],
        ),
        # Chinese text: the limit is in UTF-8 bytes (3 per CJK character).
        (
            "这是一段很长的中文文本,而且没有句号,也没有感叹号,也没有问号,也没有换行符。",
            50,
            [
                "这是一段很长的中文文本,",
                "而且没有句号,也没有感叹号,",
                "也没有问号,也没有换行符.",
            ],
        ),
    ]

    for text, limit, expected in cases:
        actual = split_text(text, limit)
        # Report the offending case instead of a bare AssertionError.
        assert actual == expected, (text, limit, actual, expected)