# NOTE: notebook-UI residue ("Spaces: / Running / Running") was captured here
# during extraction; it is not part of the program and is commented out.
| # pip install sentencex | |
| from sentencex import segment | |
| import re | |
| import uuid | |
| import os | |
# Map human-readable language names to the codes passed to the sentence
# segmenter (and embedded in TTS file names). Mostly ISO 639-1 two-letter
# codes; a few entries deviate, e.g. 'aka' (Akan) and 'ckb' (Kurdish/Sorani)
# — presumably what the downstream segmentation library expects; verify.
LANGUAGE_CODE = {
    'Akan': 'aka', 'Albanian': 'sq', 'Amharic': 'am', 'Arabic': 'ar', 'Armenian': 'hy',
    'Assamese': 'as', 'Azerbaijani': 'az', 'Basque': 'eu', 'Bashkir': 'ba', 'Bengali': 'bn',
    'Bosnian': 'bs', 'Bulgarian': 'bg', 'Burmese': 'my', 'Catalan': 'ca', 'Chinese': 'zh',
    'Croatian': 'hr', 'Czech': 'cs', 'Danish': 'da', 'Dutch': 'nl', 'English': 'en',
    'Estonian': 'et', 'Faroese': 'fo', 'Finnish': 'fi', 'French': 'fr', 'Galician': 'gl',
    'Georgian': 'ka', 'German': 'de', 'Greek': 'el', 'Gujarati': 'gu', 'Haitian Creole': 'ht',
    'Hausa': 'ha', 'Hebrew': 'he', 'Hindi': 'hi', 'Hungarian': 'hu', 'Icelandic': 'is',
    'Indonesian': 'id', 'Italian': 'it', 'Japanese': 'ja', 'Kannada': 'kn', 'Kazakh': 'kk',
    'Korean': 'ko', 'Kurdish': 'ckb', 'Kyrgyz': 'ky', 'Lao': 'lo', 'Lithuanian': 'lt',
    'Luxembourgish': 'lb', 'Macedonian': 'mk', 'Malay': 'ms', 'Malayalam': 'ml', 'Maltese': 'mt',
    'Maori': 'mi', 'Marathi': 'mr', 'Mongolian': 'mn', 'Nepali': 'ne', 'Norwegian': 'no',
    'Norwegian Nynorsk': 'nn', 'Pashto': 'ps', 'Persian': 'fa', 'Polish': 'pl', 'Portuguese': 'pt',
    'Punjabi': 'pa', 'Romanian': 'ro', 'Russian': 'ru', 'Serbian': 'sr', 'Sinhala': 'si',
    'Slovak': 'sk', 'Slovenian': 'sl', 'Somali': 'so', 'Spanish': 'es', 'Sundanese': 'su',
    'Swahili': 'sw', 'Swedish': 'sv', 'Tamil': 'ta', 'Telugu': 'te', 'Thai': 'th',
    'Turkish': 'tr', 'Ukrainian': 'uk', 'Urdu': 'ur', 'Uzbek': 'uz', 'Vietnamese': 'vi',
    'Welsh': 'cy', 'Yiddish': 'yi', 'Yoruba': 'yo', 'Zulu': 'zu'
}
# ==================================================
# CONSTANTS
# ==================================================
QUOTE_SPACE = "\uFFFF"  # invisible placeholder for spaces inside protected quotes
PUNCT_RE = re.compile(r'[.,;:!?]')  # punctuation that marks natural split points
# ==================================================
# CLEAN TEXT (KEEP PUNCTUATION)
# ==================================================
def clean_text(text):
    """Strip markdown markers and normalize typographic punctuation.

    Removes bold/italic/heading markers and the ellipsis character,
    converts curly quotes to straight ASCII quotes, and collapses every
    run of whitespace into a single space.

    NOTE(review): the original replacement keys were mojibake ("โ"
    repeated five times, which silently collapses into one dict key);
    they are reconstructed here as the standard Unicode punctuation
    characters they almost certainly were.
    """
    replacements = {
        "**": "",
        "*": "",
        "#": "",
        "\u2026": "",    # … horizontal ellipsis -> removed
        "\u201c": '"',   # " left double quotation mark
        "\u201d": '"',   # " right double quotation mark
        "\u2018": "'",   # ' left single quotation mark
        "\u2019": "'",   # ' right single quotation mark
    }
    for old, new in replacements.items():
        text = text.replace(old, new)
    # collapse whitespace runs (including tabs/newlines) to single spaces
    text = re.sub(r'\s+', ' ', text).strip()
    return text
# ==================================================
# PROTECT SHORT QUOTES (ATOMIC QUOTE RULE)
# ==================================================
def protect_short_quotes(text, max_chars):
    """
    If a quoted span fits entirely within max_chars,
    protect it so it behaves like a single token.
    """
    def repl(match):
        quote = match.group(0)
        if len(quote) <= max_chars:
            # hide inner spaces so whitespace-based splitters keep the quote whole
            return quote.replace(" ", QUOTE_SPACE)
        return quote
    return re.sub(r'"[^"]+"', repl, text)
def restore_quotes(text):
    """Undo protect_short_quotes(): turn placeholders back into spaces."""
    return text.replace(QUOTE_SPACE, " ")
# ==================================================
# SMART SPLIT FOR LONG SENTENCES (QUOTE AWARE)
# ==================================================
def smart_split_long_sentence(sentence, max_chars=300, lookback=60):
    """Split one over-long sentence into chunks of at most max_chars.

    Accumulates word by word, with two refinements on overflow:
      1. quote awareness — if the overflow happens inside an open double
         quote, flush the buffer and restart at the current word so the
         quoted span is not cut mid-quote;
      2. punctuation rebalance — otherwise look back up to `lookback`
         characters for the last punctuation mark and split just after
         it, so chunks end at natural pauses when possible.

    NOTE: a single word longer than max_chars is never broken apart, so
    a returned chunk can exceed max_chars in that pathological case.
    """
    punct = re.compile(r'[.,;:!?]')  # split-point punctuation
    words = re.findall(r'\S+\s*', sentence)  # words, trailing whitespace kept
    chunks = []
    buffer = ""
    in_quote = False
    for w in words:
        tentative = buffer + w
        quote_count = w.count('"')
        # 1) word still fits: just accumulate
        if len(tentative) <= max_chars:
            buffer = tentative
            if quote_count % 2 != 0:
                in_quote = not in_quote
            continue
        # 2) overflow inside an open quote: flush, restart at this word
        if in_quote:
            if buffer.strip():
                chunks.append(buffer.strip())
            buffer = w
            if quote_count % 2 != 0:
                in_quote = not in_quote
            continue
        # 3) normal overflow: split at the last punctuation within `lookback`
        split_at = None
        # BUGFIX: clamp the lookback offset at 0. The original computed
        # len(buffer) - lookback even when negative, yielding bogus
        # (negative) split positions and empty chunks when the buffer
        # was shorter than `lookback`.
        search_start = max(len(buffer) - lookback, 0)
        matches = list(punct.finditer(buffer[search_start:]))
        if matches:
            split_at = search_start + matches[-1].end()
        if split_at:
            chunks.append(buffer[:split_at].strip())
            buffer = buffer[split_at:].lstrip() + w
        else:
            chunks.append(buffer.strip())
            buffer = w
        if quote_count % 2 != 0:
            in_quote = not in_quote
    if buffer.strip():
        chunks.append(buffer.strip())
    return chunks
# ==================================================
# SENTENCE-FIRST CHUNKER
# ==================================================
def split_into_chunks(text, lang_code="en", max_chars=300):
    """Break text into chunks of at most max_chars, sentence by sentence.

    Whole sentences are packed greedily into each chunk; a sentence
    that alone exceeds the limit is handed to
    smart_split_long_sentence() for a word-level split.
    """
    # short text needs no segmentation at all
    if len(text) <= max_chars:
        return [text]
    chunks = []
    pending = ""
    for raw in segment(lang_code, text):
        sentence = raw.strip()
        # oversized sentence: flush what we have, then hard-split it
        if len(sentence) > max_chars:
            if pending:
                chunks.append(pending.strip())
                pending = ""
            chunks.extend(smart_split_long_sentence(sentence, max_chars))
            continue
        candidate = f"{pending} {sentence}".strip() if pending else sentence
        if len(candidate) <= max_chars:
            pending = candidate
        else:
            chunks.append(pending.strip())
            pending = sentence
    if pending.strip():
        chunks.append(pending.strip())
    return chunks
# ==================================================
# FIX DANGLING QUOTES BETWEEN CHUNKS
# ==================================================
def repair_dangling_quotes(chunks):
    """Drop a chunk's leading double quote when the previous (already
    repaired) chunk ends with one, so a quote split across a chunk
    boundary is not doubled."""
    repaired = []
    for piece in chunks:
        piece = piece.strip()
        if repaired and repaired[-1].endswith('"') and piece.startswith('"'):
            piece = piece[1:].lstrip()
        repaired.append(piece)
    return repaired
# ==================================================
# TTS FILE NAME
# ==================================================
def get_tts_file_name(text, language="en"):
    """Build a unique .wav path under ./ai_tts_voice/ derived from the text.

    The stem is the text reduced to letters and underscores (at most 20
    characters, falling back to "audio"), followed by the lowercased
    language code and a random 8-hex-digit uppercase suffix.
    """
    out_dir = "./ai_tts_voice/"
    os.makedirs(out_dir, exist_ok=True)  # side effect: ensure output folder exists
    letters_only = re.sub(r'[^a-zA-Z\s]', '', text or "")
    stem = letters_only.lower().strip().replace(" ", "_")[:20]
    if not stem:
        stem = "audio"
    suffix = uuid.uuid4().hex[:8].upper()
    lang = language.lower().strip()
    return os.path.join(
        out_dir,
        f"{stem}_{lang}_{suffix}.wav"
    )
# ==================================================
# MAIN FUNCTION
# ==================================================
def text_chunk(text, language="English", char_limit=280):
    """Chunk `text` for TTS and return (chunks, output_wav_path).

    Short quoted spans are made atomic before chunking so they are
    never split mid-quote, then the placeholders are removed from the
    finished chunks.
    """
    # BUGFIX: resolve the caller's language instead of the hard-coded
    # 'English' (the original ignored the `language` parameter entirely).
    # Unknown names still fall back to English ("en").
    lang_code = LANGUAGE_CODE.get(language, "en")
    # text = clean_text(text)  # skipped: Qwen3-TTS can handle that markup itself
    # Atomic quote protection: short quoted spans become single tokens
    text = protect_short_quotes(text, char_limit)
    if len(text) > char_limit:
        # NOTE(review): warning emoji was mojibake in the source; restored as "⚠️"
        print("⚠️ The text is too long. Breaking it into smaller pieces for TTS.")
    chunks = split_into_chunks(text, lang_code, char_limit)
    chunks = repair_dangling_quotes(chunks)
    # Restore the spaces hidden inside protected quotes
    chunks = [restore_quotes(c) for c in chunks]
    tts_file_name = get_tts_file_name(text, lang_code)
    return chunks, tts_file_name
# ==================================================
# TEST
# ==================================================
# Example usage from another module:
# from process_text import text_chunk
# text="Hi, this is a test"
# chunks, tts_filename =text_chunk(text, language="English", char_limit=280)
if __name__ == "__main__":
    # Demo run: the tiny char_limit (20) forces the chunker to split,
    # exercising the quote-protection path on the quoted span.
    text = "He said \"You are a looser\""  # @param {type: "string"}
    language="English"  # @param {type: "string"}
    char_limit = 20  # @param {type: "number"}
    chunks, filename = text_chunk(text, language, char_limit)
    print(filename)
    print(len(chunks))
    for c in chunks:
        print(len(c), c)