# pip install sentencex
from sentencex import segment
import re
import uuid
import os
LANGUAGE_CODE = {
    'Akan': 'aka', 'Albanian': 'sq', 'Amharic': 'am', 'Arabic': 'ar', 'Armenian': 'hy',
    'Assamese': 'as', 'Azerbaijani': 'az', 'Basque': 'eu', 'Bashkir': 'ba', 'Bengali': 'bn',
    'Bosnian': 'bs', 'Bulgarian': 'bg', 'Burmese': 'my', 'Catalan': 'ca', 'Chinese': 'zh',
    'Croatian': 'hr', 'Czech': 'cs', 'Danish': 'da', 'Dutch': 'nl', 'English': 'en',
    'Estonian': 'et', 'Faroese': 'fo', 'Finnish': 'fi', 'French': 'fr', 'Galician': 'gl',
    'Georgian': 'ka', 'German': 'de', 'Greek': 'el', 'Gujarati': 'gu', 'Haitian Creole': 'ht',
    'Hausa': 'ha', 'Hebrew': 'he', 'Hindi': 'hi', 'Hungarian': 'hu', 'Icelandic': 'is',
    'Indonesian': 'id', 'Italian': 'it', 'Japanese': 'ja', 'Kannada': 'kn', 'Kazakh': 'kk',
    'Korean': 'ko', 'Kurdish': 'ckb', 'Kyrgyz': 'ky', 'Lao': 'lo', 'Lithuanian': 'lt',
    'Luxembourgish': 'lb', 'Macedonian': 'mk', 'Malay': 'ms', 'Malayalam': 'ml', 'Maltese': 'mt',
    'Maori': 'mi', 'Marathi': 'mr', 'Mongolian': 'mn', 'Nepali': 'ne', 'Norwegian': 'no',
    'Norwegian Nynorsk': 'nn', 'Pashto': 'ps', 'Persian': 'fa', 'Polish': 'pl', 'Portuguese': 'pt',
    'Punjabi': 'pa', 'Romanian': 'ro', 'Russian': 'ru', 'Serbian': 'sr', 'Sinhala': 'si',
    'Slovak': 'sk', 'Slovenian': 'sl', 'Somali': 'so', 'Spanish': 'es', 'Sundanese': 'su',
    'Swahili': 'sw', 'Swedish': 'sv', 'Tamil': 'ta', 'Telugu': 'te', 'Thai': 'th',
    'Turkish': 'tr', 'Ukrainian': 'uk', 'Urdu': 'ur', 'Uzbek': 'uz', 'Vietnamese': 'vi',
    'Welsh': 'cy', 'Yiddish': 'yi', 'Yoruba': 'yo', 'Zulu': 'zu'
}

# ==================================================
# CONSTANTS
# ==================================================

QUOTE_SPACE = "\uFFFF"  # placeholder for spaces inside protected quotes
PUNCT_RE = re.compile(r'[.,;:!?]')


# ==================================================
# CLEAN TEXT (KEEP PUNCTUATION)
# ==================================================

def clean_text(text):
    replacements = {
        "**": "",
        "*": "",
        "#": "",
        "—": "",
        "“": '"',
        "”": '"',
        "‘": "'",
        "’": "'",
    }
    for old, new in replacements.items():
        text = text.replace(old, new)

    text = re.sub(r'\s+', ' ', text).strip()
    return text
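
# Example: markdown markers are stripped and curly quotes are normalised, e.g.
#   clean_text('**Hello,**  “world”!')  ->  'Hello, "world"!'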


# ==================================================
# PROTECT SHORT QUOTES (ATOMIC QUOTE RULE)
# ==================================================

def protect_short_quotes(text, max_chars):
    """

    If a quoted span fits entirely within max_chars,

    protect it so it behaves like a single token.

    """
    def repl(match):
        quote = match.group(0)
        if len(quote) <= max_chars:
            return quote.replace(" ", QUOTE_SPACE)
        return quote

    return re.sub(r'"[^"]+"', repl, text)


def restore_quotes(text):
    return text.replace(QUOTE_SPACE, " ")
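
# Example: with max_chars=30 the quoted span "stay right here" (17 characters)
# fits, so its internal spaces are replaced with QUOTE_SPACE until
# restore_quotes() is called after chunking:
#   s = protect_short_quotes('He said "stay right here" and left.', 30)
#   restore_quotes(s) == 'He said "stay right here" and left.'   # True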


# ==================================================
# SMART SPLIT FOR LONG SENTENCES (QUOTE AWARE)
# ==================================================

def smart_split_long_sentence(sentence, max_chars=300, lookback=60):
    words = re.findall(r'\S+\s*', sentence)
    chunks = []
    buffer = ""
    in_quote = False

    for w in words:
        tentative = buffer + w
        quote_count = w.count('"')

        # 1️⃣ SAFE ADD
        if len(tentative) <= max_chars:
            buffer = tentative
            if quote_count % 2 != 0:
                in_quote = not in_quote
            continue

        # 2️⃣ OVERFLOW INSIDE QUOTE → MOVE WHOLE QUOTE
        if in_quote:
            if buffer.strip():
                chunks.append(buffer.strip())
            buffer = w
            if quote_count % 2 != 0:
                in_quote = not in_quote
            continue

        # 3️⃣ NORMAL PUNCTUATION-AWARE REBALANCE
        split_at = None
        search_region = buffer[-lookback:]

        matches = list(PUNCT_RE.finditer(search_region))
        if matches:
            last = matches[-1]
            # map the match position in search_region back to an index in buffer
            # (search_region may be shorter than lookback near the start)
            split_at = len(buffer) - len(search_region) + last.end()

        if split_at is not None:
            chunks.append(buffer[:split_at].strip())
            buffer = buffer[split_at:].lstrip() + w
        else:
            chunks.append(buffer.strip())
            buffer = w

        if quote_count % 2 != 0:
            in_quote = not in_quote

    if buffer.strip():
        chunks.append(buffer.strip())

    return chunks
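
# Example: an over-long sentence is rebalanced at the last punctuation mark
# inside the lookback window, e.g.
#   smart_split_long_sentence("alpha beta, gamma delta epsilon",
#                             max_chars=18, lookback=18)
#   -> ['alpha beta,', 'gamma delta', 'epsilon']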


# ==================================================
# SENTENCE-FIRST CHUNKER
# ==================================================

def split_into_chunks(text, lang_code="en", max_chars=300):
    if len(text) <= max_chars:
        return [text]

    sentences = list(segment(lang_code, text))
    chunks = []
    current = ""

    for sen in sentences:
        sen = sen.strip()

        if len(sen) > max_chars:
            if current:
                chunks.append(current.strip())
                current = ""
            chunks.extend(smart_split_long_sentence(sen, max_chars))
            continue

        tentative = f"{current} {sen}".strip() if current else sen

        if len(tentative) <= max_chars:
            current = tentative
        else:
            chunks.append(current.strip())
            current = sen

    if current.strip():
        chunks.append(current.strip())

    return chunks
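
# Example: whole sentences are packed greedily up to max_chars (assuming
# sentencex splits the text at the full stops):
#   split_into_chunks("This is the first sentence. Here is the second one. "
#                     "And a third.", "en", max_chars=60)
#   -> ['This is the first sentence. Here is the second one.', 'And a third.']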


# ==================================================
# FIX DANGLING QUOTES BETWEEN CHUNKS
# ==================================================

def repair_dangling_quotes(chunks):
    fixed = []

    for i, chunk in enumerate(chunks):
        chunk = chunk.strip()

        if i > 0:
            prev = fixed[-1]
            if prev.endswith('"') and chunk.startswith('"'):
                chunk = chunk[1:].lstrip()

        fixed.append(chunk)

    return fixed
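
# Example: a quote character left dangling at the start of a chunk is dropped
# when the previous chunk already ends with a closing quote:
#   repair_dangling_quotes(['She said "stop"', '" Then she ran.'])
#   -> ['She said "stop"', 'Then she ran.']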


# ==================================================
# TTS FILE NAME
# ==================================================

def get_tts_file_name(text, language="en"):
    temp_audio_dir = "./ai_tts_voice/"
    os.makedirs(temp_audio_dir, exist_ok=True)

    clean = re.sub(r'[^a-zA-Z\s]', '', text or "")
    clean = clean.lower().strip().replace(" ", "_")[:20] or "audio"

    uid = uuid.uuid4().hex[:8].upper()
    language = language.lower().strip()

    return os.path.join(
        temp_audio_dir,
        f"{clean}_{language}_{uid}.wav"
    )
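
# Example: the name combines the first words of the text, the language code and
# a random 8-character id, so get_tts_file_name("Hello world!", "en") returns
# something like './ai_tts_voice/hello_world_en_3F2A9C1B.wav'.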


# ==================================================
# MAIN FUNCTION
# ==================================================

def text_chunk(text, language="English", char_limit=280):
    lang_code = LANGUAGE_CODE.get(language, "en")

    # text = clean_text(text)  # skipped: Qwen3-TTS can handle markdown and smart quotes itself

    # 🔒 Atomic quote protection
    text = protect_short_quotes(text, char_limit)

    if len(text) > char_limit:
        print("⚠️ The text is too long. Breaking it into smaller pieces for TTS.")

    chunks = split_into_chunks(text, lang_code, char_limit)
    chunks = repair_dangling_quotes(chunks)

    # 🔓 Restore spaces inside quotes
    chunks = [restore_quotes(c) for c in chunks]

    tts_file_name = get_tts_file_name(text, lang_code)
    return chunks, tts_file_name


# ==================================================
# TEST
# ==================================================

# from process_text import text_chunk
# text="Hi, this is a test"
# chunks, tts_filename = text_chunk(text, language="English", char_limit=280)

if __name__ == "__main__":
    text = "He said \"You are a looser\""  # @param {type: "string"}

    language="English"  # @param {type: "string"}
    char_limit = 20  # @param {type: "number"}

    chunks, filename = text_chunk(text, language, char_limit)

    print(filename)
    print(len(chunks))
    for c in chunks:
        print(len(c), c)