| import re |
| import string |
|
|
| from fish_speech.text.clean import clean_text |
|
|
|
|
def utf_8_len(text: str):
    """Return the number of bytes ``text`` occupies when encoded as UTF-8."""
    encoded = text.encode("utf-8")
    return len(encoded)
|
|
|
|
def break_text(texts, length, splits: set):
    """Cut over-long texts immediately after every delimiter character.

    Args:
        texts: Iterable of strings to process, in order.
        length: Size threshold in UTF-8 bytes. A text at or below this
            size is yielded untouched.
        splits: Characters after which an over-long text is cut. The
            delimiter stays attached to the end of the piece it closes.

    Yields:
        Either the unchanged text, or its consecutive pieces. Pieces are
        not size-checked here, so one may still exceed ``length`` when no
        delimiter occurs inside it.
    """
    for text in texts:
        if utf_8_len(text) <= length:
            yield text
            continue

        piece_start = 0
        # `end` is one past the current character, so the slice below
        # includes the delimiter itself.
        for end, char in enumerate(text, start=1):
            if char in splits:
                yield text[piece_start:end]
                piece_start = end

        # Whatever follows the last delimiter (if anything) is the tail.
        tail = text[piece_start:]
        if tail:
            yield tail
|
|
|
|
def break_text_by_length(texts, length):
    """Hard-split over-long texts into chunks that fit a UTF-8 byte budget.

    Texts whose UTF-8 encoding is at most ``length`` bytes are yielded
    unchanged. Longer texts are cut between characters so that no chunk
    exceeds ``length`` bytes. The only exception is a single character that
    is by itself wider than ``length``: characters are never split, so such
    a character is emitted alone.

    Args:
        texts: Iterable of strings to process, in order.
        length: Maximum chunk size, in UTF-8 encoded bytes.

    Yields:
        The unchanged text, or its consecutive chunks. Concatenating the
        chunks of a text reproduces that text exactly.
    """
    for text in texts:
        if len(text.encode("utf-8")) <= length:
            yield text
            continue

        chunk_start = 0
        chunk_bytes = 0
        for index, char in enumerate(text):
            # Measure one character at a time and keep a running total, so
            # the growing chunk is never re-encoded.
            char_bytes = len(char.encode("utf-8"))

            # Close the chunk *before* adding a character that would push
            # it over budget; checking afterwards lets a multi-byte
            # character overshoot ``length``. The `index > chunk_start`
            # guard prevents emitting an empty chunk when one character
            # alone is wider than the budget.
            if chunk_bytes + char_bytes > length and index > chunk_start:
                yield text[chunk_start:index]
                chunk_start = index
                chunk_bytes = 0

            chunk_bytes += char_bytes

        if chunk_start < len(text):
            yield text[chunk_start:]
|
|
|
|
def add_cleaned(curr, segments):
    """Append ``curr`` to ``segments`` if it carries real content.

    The candidate is stripped of surrounding whitespace first. It is
    appended (in stripped form) only when at least one of its characters is
    neither whitespace nor a member of ``string.punctuation``.

    Args:
        curr: Candidate segment text.
        segments: List that receives the stripped segment; mutated in place.
    """
    stripped = curr.strip()
    if not stripped:
        return

    has_content = any(
        not ch.isspace() and ch not in string.punctuation for ch in stripped
    )
    if has_content:
        segments.append(stripped)
|
|
|
|
def protect_float(text):
    """Hide the dot of every "digits.digits" run behind a placeholder.

    Each match is rewritten as "<digits_f_digits>", so the result no
    longer contains a "." between those two digit groups.

    Args:
        text: Input string.

    Returns:
        The string with every match replaced by its placeholder.
    """
    float_pattern = re.compile(r"(\d+)\.(\d+)")
    placeholder = r"<\1_f_\2>"
    return float_pattern.sub(placeholder, text)
|
|
|
|
def unprotect_float(text):
    """Turn every "<digits_f_digits>" placeholder back into "digits.digits".

    Args:
        text: Input string, possibly containing placeholders.

    Returns:
        The string with each placeholder replaced by the two digit groups
        joined with a ".".
    """
    placeholder_pattern = re.compile(r"<(\d+)_f_(\d+)>")
    restored = r"\1.\2"
    return placeholder_pattern.sub(restored, text)
|
|
|
|
def split_text(text, length):
    """Split ``text`` into segments sized against a UTF-8 byte budget.

    The text is first passed through ``clean_text``. It is then broken into
    progressively finer pieces, each stage only touching pieces still
    larger than ``length`` bytes:

    1. after sentence-ending marks (with "digits.digits" runs protected so
       their dot is not treated as a sentence end),
    2. after commas,
    3. after spaces,
    4. by raw size via ``break_text_by_length``.

    Finally, adjacent pieces are greedily re-joined while the combined size
    stays within ``length`` bytes, and each joined segment is handed to
    ``add_cleaned``, which strips it and drops segments made only of
    whitespace and ASCII punctuation.

    Args:
        text: Raw input text.
        length: Segment size budget, in UTF-8 encoded bytes.

    Returns:
        List of segment strings, in order.
    """
    cleaned = clean_text(text)

    # NOTE(review): "!" and "?" (and "," below) each appear twice in these
    # sets. The duplicates are harmless but may originally have been
    # full-width variants; confirm against the upstream source.
    sentence_marks = {".", "!", "?", "。", "!", "?"}
    clause_marks = {",", ","}

    # Stage 1-4: progressively finer splitting.
    pieces = [protect_float(cleaned)]
    pieces = break_text(pieces, length, sentence_marks)
    pieces = (unprotect_float(piece) for piece in pieces)
    pieces = break_text(pieces, length, clause_marks)
    pieces = break_text(pieces, length, {" "})
    pieces = break_text_by_length(pieces, length)

    # Greedy merge: keep appending pieces to the buffer until the next one
    # would push it over budget, then flush and start a new buffer.
    segments = []
    buffer = ""
    for piece in pieces:
        if utf_8_len(buffer) + utf_8_len(piece) > length:
            add_cleaned(buffer, segments)
            buffer = piece
        else:
            buffer += piece

    if buffer:
        add_cleaned(buffer, segments)

    return segments
|
|
|
|
if __name__ == "__main__":
    # Ad-hoc self-checks, run only when this file is executed as a script.
    # Each entry is (input text, byte budget, expected segments).
    cases = [
        # Sentences are merged back together while they fit the budget.
        (
            "This is a test sentence. This is another test sentence. And a third one.",
            50,
            [
                "This is a test sentence.",
                "This is another test sentence. And a third one.",
            ],
        ),
        # The dot inside a number must not act as a sentence boundary.
        ("a,aaaaaa3.14", 10, ["a,", "aaaaaa3.14"]),
        # Whitespace-only input produces no segments.
        (" ", 10, []),
        ("a", 10, ["a"]),
        # No sentence-ending marks until the very end: falls back to commas.
        (
            "This is a test sentence with only commas, and no dots, and no exclamation marks, and no question marks, and no newlines.",
            50,
            [
                "This is a test sentence with only commas,",
                "and no dots, and no exclamation marks,",
                "and no question marks, and no newlines.",
            ],
        ),
        # A long run without punctuation: falls back to spaces.
        (
            "This is a test sentence This is a test sentence This is a test sentence. This is a test sentence, This is a test sentence, This is a test sentence.",
            50,
            [
                "This is a test sentence This is a test sentence",
                "This is a test sentence. This is a test sentence,",
                "This is a test sentence, This is a test sentence.",
            ],
        ),
        # Multi-byte (Chinese) text measured in UTF-8 bytes.
        (
            "这是一段很长的中文文本,而且没有句号,也没有感叹号,也没有问号,也没有换行符。",
            50,
            [
                "这是一段很长的中文文本,",
                "而且没有句号,也没有感叹号,",
                "也没有问号,也没有换行符.",
            ],
        ),
    ]

    for source, budget, expected in cases:
        assert split_text(source, budget) == expected
|
|