# NOTE: notebook-UI residue ("Spaces: / Running / Running") was captured here
# during extraction; it is not part of the program and is commented out.
| # pip install sentencex | |
| from sentencex import segment | |
| import re | |
| import uuid | |
| import os | |
# Map human-readable language names to the codes passed to the sentence
# segmenter (and embedded in TTS file names). Mostly ISO 639-1 two-letter
# codes; a few entries deviate, e.g. 'aka' (Akan) and 'ckb' (Kurdish/Sorani)
# — presumably what the downstream segmentation library expects; verify.
LANGUAGE_CODE = {
    'Akan': 'aka', 'Albanian': 'sq', 'Amharic': 'am', 'Arabic': 'ar', 'Armenian': 'hy',
    'Assamese': 'as', 'Azerbaijani': 'az', 'Basque': 'eu', 'Bashkir': 'ba', 'Bengali': 'bn',
    'Bosnian': 'bs', 'Bulgarian': 'bg', 'Burmese': 'my', 'Catalan': 'ca', 'Chinese': 'zh',
    'Croatian': 'hr', 'Czech': 'cs', 'Danish': 'da', 'Dutch': 'nl', 'English': 'en',
    'Estonian': 'et', 'Faroese': 'fo', 'Finnish': 'fi', 'French': 'fr', 'Galician': 'gl',
    'Georgian': 'ka', 'German': 'de', 'Greek': 'el', 'Gujarati': 'gu', 'Haitian Creole': 'ht',
    'Hausa': 'ha', 'Hebrew': 'he', 'Hindi': 'hi', 'Hungarian': 'hu', 'Icelandic': 'is',
    'Indonesian': 'id', 'Italian': 'it', 'Japanese': 'ja', 'Kannada': 'kn', 'Kazakh': 'kk',
    'Korean': 'ko', 'Kurdish': 'ckb', 'Kyrgyz': 'ky', 'Lao': 'lo', 'Lithuanian': 'lt',
    'Luxembourgish': 'lb', 'Macedonian': 'mk', 'Malay': 'ms', 'Malayalam': 'ml', 'Maltese': 'mt',
    'Maori': 'mi', 'Marathi': 'mr', 'Mongolian': 'mn', 'Nepali': 'ne', 'Norwegian': 'no',
    'Norwegian Nynorsk': 'nn', 'Pashto': 'ps', 'Persian': 'fa', 'Polish': 'pl', 'Portuguese': 'pt',
    'Punjabi': 'pa', 'Romanian': 'ro', 'Russian': 'ru', 'Serbian': 'sr', 'Sinhala': 'si',
    'Slovak': 'sk', 'Slovenian': 'sl', 'Somali': 'so', 'Spanish': 'es', 'Sundanese': 'su',
    'Swahili': 'sw', 'Swedish': 'sv', 'Tamil': 'ta', 'Telugu': 'te', 'Thai': 'th',
    'Turkish': 'tr', 'Ukrainian': 'uk', 'Urdu': 'ur', 'Uzbek': 'uz', 'Vietnamese': 'vi',
    'Welsh': 'cy', 'Yiddish': 'yi', 'Yoruba': 'yo', 'Zulu': 'zu'
}
# ==================================================
# CONSTANTS
# ==================================================
QUOTE_SPACE = "\uFFFF"  # invisible placeholder for spaces inside protected quotes
PUNCT_RE = re.compile(r'[.,;:!?]')  # punctuation that marks natural split points
# ==================================================
# CLEAN TEXT (KEEP PUNCTUATION)
# ==================================================
def clean_text(text):
    """Strip markdown markers and normalize typographic punctuation.

    Removes bold/italic/heading markers and the ellipsis character,
    converts curly quotes to straight ASCII quotes, and collapses every
    run of whitespace into a single space.

    NOTE(review): the original replacement keys were mojibake ("โ"
    repeated five times, which silently collapses into one dict key);
    they are reconstructed here as the standard Unicode punctuation
    characters they almost certainly were.
    """
    replacements = {
        "**": "",
        "*": "",
        "#": "",
        "\u2026": "",    # … horizontal ellipsis -> removed
        "\u201c": '"',   # " left double quotation mark
        "\u201d": '"',   # " right double quotation mark
        "\u2018": "'",   # ' left single quotation mark
        "\u2019": "'",   # ' right single quotation mark
    }
    for old, new in replacements.items():
        text = text.replace(old, new)
    # collapse whitespace runs (including tabs/newlines) to single spaces
    text = re.sub(r'\s+', ' ', text).strip()
    return text
# ==================================================
# PROTECT SHORT QUOTES (ATOMIC QUOTE RULE)
# ==================================================
def protect_short_quotes(text, max_chars):
    """
    If a quoted span fits entirely within max_chars,
    protect it so it behaves like a single token.
    """
    def repl(match):
        quote = match.group(0)
        if len(quote) <= max_chars:
            # hide inner spaces so whitespace-based splitters keep the quote whole
            return quote.replace(" ", QUOTE_SPACE)
        return quote
    return re.sub(r'"[^"]+"', repl, text)
def restore_quotes(text):
    """Undo protect_short_quotes(): turn placeholders back into spaces."""
    return text.replace(QUOTE_SPACE, " ")
# ==================================================
# SMART SPLIT FOR LONG SENTENCES (QUOTE AWARE)
# ==================================================
def smart_split_long_sentence(sentence, max_chars=300, lookback=60):
    """Split one over-long sentence into chunks of at most max_chars.

    Accumulates word by word, with two refinements on overflow:
      1. quote awareness — if the overflow happens inside an open double
         quote, flush the buffer and restart at the current word so the
         quoted span is not cut mid-quote;
      2. punctuation rebalance — otherwise look back up to `lookback`
         characters for the last punctuation mark and split just after
         it, so chunks end at natural pauses when possible.

    NOTE: a single word longer than max_chars is never broken apart, so
    a returned chunk can exceed max_chars in that pathological case.
    """
    punct = re.compile(r'[.,;:!?]')  # split-point punctuation
    words = re.findall(r'\S+\s*', sentence)  # words, trailing whitespace kept
    chunks = []
    buffer = ""
    in_quote = False
    for w in words:
        tentative = buffer + w
        quote_count = w.count('"')
        # 1) word still fits: just accumulate
        if len(tentative) <= max_chars:
            buffer = tentative
            if quote_count % 2 != 0:
                in_quote = not in_quote
            continue
        # 2) overflow inside an open quote: flush, restart at this word
        if in_quote:
            if buffer.strip():
                chunks.append(buffer.strip())
            buffer = w
            if quote_count % 2 != 0:
                in_quote = not in_quote
            continue
        # 3) normal overflow: split at the last punctuation within `lookback`
        split_at = None
        # BUGFIX: clamp the lookback offset at 0. The original computed
        # len(buffer) - lookback even when negative, yielding bogus
        # (negative) split positions and empty chunks when the buffer
        # was shorter than `lookback`.
        search_start = max(len(buffer) - lookback, 0)
        matches = list(punct.finditer(buffer[search_start:]))
        if matches:
            split_at = search_start + matches[-1].end()
        if split_at:
            chunks.append(buffer[:split_at].strip())
            buffer = buffer[split_at:].lstrip() + w
        else:
            chunks.append(buffer.strip())
            buffer = w
        if quote_count % 2 != 0:
            in_quote = not in_quote
    if buffer.strip():
        chunks.append(buffer.strip())
    return chunks
# ==================================================
# SENTENCE-FIRST CHUNKER
# ==================================================
def split_into_chunks(text, lang_code="en", max_chars=300):
    """Break text into chunks of at most max_chars, sentence by sentence.

    Whole sentences are packed greedily into each chunk; a sentence
    that alone exceeds the limit is handed to
    smart_split_long_sentence() for a word-level split.
    """
    # short text needs no segmentation at all
    if len(text) <= max_chars:
        return [text]
    chunks = []
    pending = ""
    for raw in segment(lang_code, text):
        sentence = raw.strip()
        # oversized sentence: flush what we have, then hard-split it
        if len(sentence) > max_chars:
            if pending:
                chunks.append(pending.strip())
                pending = ""
            chunks.extend(smart_split_long_sentence(sentence, max_chars))
            continue
        candidate = f"{pending} {sentence}".strip() if pending else sentence
        if len(candidate) <= max_chars:
            pending = candidate
        else:
            chunks.append(pending.strip())
            pending = sentence
    if pending.strip():
        chunks.append(pending.strip())
    return chunks
# ==================================================
# FIX DANGLING QUOTES BETWEEN CHUNKS
# ==================================================
def repair_dangling_quotes(chunks):
    """Drop a chunk's leading double quote when the previous (already
    repaired) chunk ends with one, so a quote split across a chunk
    boundary is not doubled."""
    repaired = []
    for piece in chunks:
        piece = piece.strip()
        if repaired and repaired[-1].endswith('"') and piece.startswith('"'):
            piece = piece[1:].lstrip()
        repaired.append(piece)
    return repaired
# ==================================================
# TTS FILE NAME
# ==================================================
def get_tts_file_name(text, language="en"):
    """Build a unique .wav path under ./ai_tts_voice/ derived from the text.

    The stem is the text reduced to letters and underscores (at most 20
    characters, falling back to "audio"), followed by the lowercased
    language code and a random 8-hex-digit uppercase suffix.
    """
    out_dir = "./ai_tts_voice/"
    os.makedirs(out_dir, exist_ok=True)  # side effect: ensure output folder exists
    letters_only = re.sub(r'[^a-zA-Z\s]', '', text or "")
    stem = letters_only.lower().strip().replace(" ", "_")[:20]
    if not stem:
        stem = "audio"
    suffix = uuid.uuid4().hex[:8].upper()
    lang = language.lower().strip()
    return os.path.join(
        out_dir,
        f"{stem}_{lang}_{suffix}.wav"
    )
# ==================================================
# MAIN FUNCTION
# ==================================================
def text_chunk(text, language="English", char_limit=280):
    """Chunk `text` for TTS and return (chunks, output_wav_path).

    Short quoted spans are made atomic before chunking so they are
    never split mid-quote, then the placeholders are removed from the
    finished chunks.
    """
    # BUGFIX: resolve the caller's language instead of the hard-coded
    # 'English' (the original ignored the `language` parameter entirely).
    # Unknown names still fall back to English ("en").
    lang_code = LANGUAGE_CODE.get(language, "en")
    # text = clean_text(text)  # skipped: Qwen3-TTS can handle that markup itself
    # Atomic quote protection: short quoted spans become single tokens
    text = protect_short_quotes(text, char_limit)
    if len(text) > char_limit:
        # NOTE(review): warning emoji was mojibake in the source; restored as "⚠️"
        print("⚠️ The text is too long. Breaking it into smaller pieces for TTS.")
    chunks = split_into_chunks(text, lang_code, char_limit)
    chunks = repair_dangling_quotes(chunks)
    # Restore the spaces hidden inside protected quotes
    chunks = [restore_quotes(c) for c in chunks]
    tts_file_name = get_tts_file_name(text, lang_code)
    return chunks, tts_file_name
# ==================================================
# TEST
# ==================================================
# Example usage from another module:
# from process_text import text_chunk
# text="Hi, this is a test"
# chunks, tts_filename =text_chunk(text, language="English", char_limit=280)
if __name__ == "__main__":
    # Demo run: the tiny char_limit (20) forces the chunker to split,
    # exercising the quote-protection path on the quoted span.
    text = "He said \"You are a looser\""  # @param {type: "string"}
    language="English"  # @param {type: "string"}
    char_limit = 20  # @param {type: "number"}
    chunks, filename = text_chunk(text, language, char_limit)
    print(filename)
    print(len(chunks))
    for c in chunks:
        print(len(c), c)