s1-mini

Runtime error

App Files Files Community

s1-mini / fish_speech /text /spliter.py

Stardust-minus

Upload folder using huggingface_hub

a26769d verified 11 months ago

raw

history blame contribute delete

3.83 kB

	import re
	import string

	from fish_speech.text.clean import clean_text


	def utf_8_len(text: str):
	    """Return the number of bytes ``text`` occupies once encoded as UTF-8."""
	    encoded = bytes(text, "utf-8")
	    return len(encoded)


	def break_text(texts, length, splits: set):
	    """Cut over-long texts right after each delimiter character.

	    Args:
	        texts: Iterable of strings to process, in order.
	        length: Size limit in UTF-8 bytes; a text at or under it is passed
	            through untouched.
	        splits: Characters after which an over-long text is cut. The
	            delimiter stays at the end of the piece it terminates.

	    Yields:
	        The pieces in their original order. Joining the pieces produced
	        for one text gives that text back.
	    """
	    for piece in texts:
	        if utf_8_len(piece) <= length:
	            yield piece
	            continue

	        # Walk the text once and slice out each delimiter-terminated run.
	        start = 0
	        for idx, ch in enumerate(piece):
	            if ch in splits:
	                yield piece[start : idx + 1]
	                start = idx + 1

	        # Whatever follows the last delimiter (if anything) is the tail.
	        tail = piece[start:]
	        if tail:
	            yield tail


	def break_text_by_length(texts, length):
	    """Hard-split over-long texts into chunks of at most ``length`` bytes.

	    Texts whose UTF-8 size is already within ``length`` are yielded
	    unchanged. Longer texts are cut between characters, so a multi-byte
	    character is never split in half.

	    Args:
	        texts: Iterable of strings to process, in order.
	        length: Maximum chunk size, measured in UTF-8 bytes.

	    Yields:
	        The chunks in their original order. Joining the chunks produced
	        for one text gives that text back. Every chunk fits in ``length``
	        bytes, except that a single character wider than ``length`` is
	        emitted on its own because it cannot be divided further.
	    """
	    for text in texts:
	        if len(text.encode("utf-8")) <= length:
	            yield text
	            continue

	        chunk = []
	        chunk_bytes = 0
	        for char in text:
	            char_bytes = len(char.encode("utf-8"))

	            # Flush *before* appending: checking after the append lets a
	            # multi-byte character push the chunk past the byte limit.
	            if chunk and chunk_bytes + char_bytes > length:
	                yield "".join(chunk)
	                chunk = []
	                chunk_bytes = 0

	            chunk.append(char)
	            chunk_bytes += char_bytes

	        if chunk:
	            yield "".join(chunk)


	def add_cleaned(curr, segments):
	    """Append the stripped form of ``curr`` to ``segments`` if it has content.

	    A candidate is skipped when, after stripping, it is empty or consists
	    only of whitespace and ``string.punctuation`` characters.

	    Args:
	        curr: Candidate segment text.
	        segments: List that is extended in place.
	    """
	    stripped = curr.strip()
	    if not stripped:
	        return

	    # Keep the segment as soon as one character is neither whitespace nor
	    # ASCII punctuation.
	    has_content = any(
	        not ch.isspace() and ch not in string.punctuation for ch in stripped
	    )
	    if has_content:
	        segments.append(stripped)


	def protect_float(text):
	    """Rewrite decimal numbers such as 3.14 as <3_f_14>.

	    The placeholder contains no ".", so a later split on "." cannot cut
	    through the number.
	    """
	    float_pattern = re.compile(r"(\d+)\.(\d+)")
	    placeholder = r"<\1_f_\2>"
	    return float_pattern.sub(placeholder, text)


	def unprotect_float(text):
	    """Turn placeholders such as <3_f_14> back into 3.14."""
	    placeholder_pattern = re.compile(r"<(\d+)_f_(\d+)>")
	    restored = r"\1.\2"
	    return placeholder_pattern.sub(restored, text)


	def split_text(text, length):
	    """Split ``text`` into segments of roughly ``length`` UTF-8 bytes.

	    The text is first passed through ``clean_text``. It is then broken
	    down in stages, each stage only touching pieces that are still longer
	    than ``length``:

	    1. after sentence-ending marks (decimal numbers are shielded so their
	       "." is not treated as a sentence end),
	    2. after commas,
	    3. after spaces,
	    4. at arbitrary character positions.

	    Finally, neighbouring pieces are glued back together for as long as
	    the combined size stays within ``length``.

	    Args:
	        text: The input text.
	        length: Size limit in UTF-8 bytes.

	    Returns:
	        A list of stripped segments; candidates made up only of whitespace
	        and ASCII punctuation are left out.
	    """
	    cleaned = clean_text(text)

	    sentence_marks = {".", "!", "?", "。", "！", "？"}
	    comma_marks = {",", "，"}
	    space_marks = {" "}

	    # Stage 1: sentence boundaries, with floats shielded from the "." split.
	    pieces = map(protect_float, [cleaned])
	    pieces = break_text(pieces, length, sentence_marks)
	    pieces = map(unprotect_float, pieces)

	    # Stages 2-4: progressively finer cuts for pieces that are still too long.
	    pieces = break_text(pieces, length, comma_marks)
	    pieces = break_text(pieces, length, space_marks)
	    pieces = list(break_text_by_length(pieces, length))

	    # Merge consecutive pieces while the running total fits the limit.
	    segments = []
	    pending = ""
	    for piece in pieces:
	        if utf_8_len(pending) + utf_8_len(piece) > length:
	            add_cleaned(pending, segments)
	            pending = piece
	        else:
	            pending += piece

	    if pending:
	        add_cleaned(pending, segments)

	    return segments


	if __name__ == "__main__":
	    # Self-check for split_text. Each case is
	    # (input text, byte limit, expected segments); cases run in order and
	    # the first mismatch raises AssertionError.
	    cases = [
	        (
	            "This is a test sentence. This is another test sentence. And a third one.",
	            50,
	            [
	                "This is a test sentence.",
	                "This is another test sentence. And a third one.",
	            ],
	        ),
	        # A float must survive intact even though it contains a ".".
	        ("a,aaaaaa3.14", 10, ["a,", "aaaaaa3.14"]),
	        # Whitespace-only input produces no segments.
	        (" ", 10, []),
	        ("a", 10, ["a"]),
	        # No sentence marks at all: the commas are the split points.
	        (
	            "This is a test sentence with only commas, and no dots, and no exclamation marks, and no question marks, and no newlines.",
	            50,
	            [
	                "This is a test sentence with only commas,",
	                "and no dots, and no exclamation marks,",
	                "and no question marks, and no newlines.",
	            ],
	        ),
	        # First half split at " ", second half split at ","
	        (
	            "This is a test sentence This is a test sentence This is a test sentence. This is a test sentence, This is a test sentence, This is a test sentence.",
	            50,
	            [
	                "This is a test sentence This is a test sentence",
	                "This is a test sentence. This is a test sentence,",
	                "This is a test sentence, This is a test sentence.",
	            ],
	        ),
	        # Multi-byte text. The expected output ends in an ASCII "." while the
	        # input ends in "。" -- presumably clean_text normalises it; verify
	        # against clean_text.
	        (
	            "这是一段很长的中文文本,而且没有句号,也没有感叹号,也没有问号,也没有换行符。",
	            50,
	            [
	                "这是一段很长的中文文本,",
	                "而且没有句号,也没有感叹号,",
	                "也没有问号,也没有换行符.",
	            ],
	        ),
	    ]

	    for sample, limit, expected in cases:
	        assert split_text(sample, limit) == expected