# VieNeu-TTS-0.3B / utils / core_utils.py
# pnnbao-ump's picture
# Upload 9 files
# 8283b61 verified
import re
import os
from typing import List
# Paragraph boundary: one or more CR/LF characters.
_CHUNK_PARAGRAPH_SPLIT = re.compile(r"[\r\n]+")
# Sentence boundary: whitespace that follows terminal punctuation (. ! ? …).
_CHUNK_SENTENCE_SPLIT = re.compile(r"(?<=[\.\!\?\…])\s+")
# Clause boundary: whitespace that follows minor punctuation (, ; : - – —).
_CHUNK_CLAUSE_SPLIT = re.compile(r"(?<=[\,\;\:\-\–\—])\s+")


def _wrap_words(part: str, max_chars: int, chunks: List[str]) -> str:
    """Word-wrap *part* into *chunks* and return the unflushed trailing text."""
    current = ""
    for word in part.split():
        # A single token without whitespace (URL, unspaced text, ...) can still
        # exceed the limit. As a last resort slice it by character count so
        # that no emitted chunk is ever longer than max_chars.
        while len(word) > max_chars:
            if current:
                chunks.append(current)
                current = ""
            chunks.append(word[:max_chars])
            word = word[max_chars:]

        if current and len(current) + 1 + len(word) > max_chars:
            chunks.append(current)
            current = word
        else:
            current = f"{current} {word}" if current else word
    return current


def split_text_into_chunks(text: str, max_chars: int = 256) -> List[str]:
    """
    Split raw text into chunks no longer than max_chars.

    The text is broken down progressively, using the coarsest boundary that
    keeps each chunk within the limit:

    1. Newlines: every line/paragraph is handled independently, so a chunk
       never spans a line break.
    2. Sentences: whitespace following ``.``, ``!``, ``?`` or ``…``.
       Consecutive sentences are packed into one chunk (joined by a single
       space) while they fit.
    3. Clauses: a sentence longer than max_chars is split on whitespace
       following ``,``, ``;``, ``:``, ``-``, ``–`` or ``—``.
    4. Words: a clause that is still too long is wrapped on whitespace.
    5. Characters: a single word longer than max_chars is sliced into
       max_chars-sized pieces.

    Args:
        text: The raw input text.
        max_chars: Maximum length of each returned chunk; must be >= 1.

    Returns:
        The non-empty, stripped chunks in their original order. Empty or
        whitespace-only input yields an empty list.

    Raises:
        ValueError: If max_chars is smaller than 1.
    """
    if max_chars < 1:
        raise ValueError(f"max_chars must be >= 1, got {max_chars}")

    chunks: List[str] = []

    for para in _CHUNK_PARAGRAPH_SPLIT.split(text.strip()):
        para = para.strip()
        if not para:
            continue

        # Text accumulated for the chunk currently being built.
        buffer = ""

        for sentence in _CHUNK_SENTENCE_SPLIT.split(para):
            sentence = sentence.strip()
            if not sentence:
                continue

            if len(sentence) <= max_chars:
                # Normal sentence: append to the buffer if it fits, otherwise
                # emit the buffer and start a new chunk with this sentence.
                if buffer and len(buffer) + 1 + len(sentence) > max_chars:
                    chunks.append(buffer)
                    buffer = sentence
                else:
                    buffer = f"{buffer} {sentence}" if buffer else sentence
                continue

            # Oversized sentence: flush what we have so the sentence starts
            # on a fresh chunk, then pack it clause by clause.
            if buffer:
                chunks.append(buffer)
                buffer = ""

            for part in _CHUNK_CLAUSE_SPLIT.split(sentence):
                part = part.strip()
                if not part:
                    continue

                candidate = f"{buffer} {part}" if buffer else part
                if len(candidate) <= max_chars:
                    buffer = candidate
                    continue

                if buffer:
                    chunks.append(buffer)
                buffer = part

                # Even a single clause may be too long: fall back to word
                # wrapping. The remainder stays in the buffer so following
                # clauses/sentences can still be packed onto it.
                if len(buffer) > max_chars:
                    buffer = _wrap_words(buffer, max_chars, chunks)

        # End of paragraph: flush whatever is left in the buffer.
        if buffer:
            chunks.append(buffer)

    return [chunk.strip() for chunk in chunks if chunk.strip()]
def env_bool(name: str, default: bool = False) -> bool:
    """
    Read the environment variable *name* as a boolean flag.

    Returns *default* when the variable is not set. Otherwise the value is
    stripped and lower-cased, and the result is True only if it is one of
    "1", "true", "yes", "y" or "on"; any other value gives False.
    """
    raw_value = os.environ.get(name)
    if raw_value is not None:
        normalized = raw_value.strip().lower()
        return normalized in ("1", "true", "yes", "y", "on")
    return default