# pip install sentencex
from sentencex import segment
import re
import uuid
import os
LANGUAGE_CODE = {
    'Akan': 'aka', 'Albanian': 'sq', 'Amharic': 'am', 'Arabic': 'ar', 'Armenian': 'hy',
    'Assamese': 'as', 'Azerbaijani': 'az', 'Basque': 'eu', 'Bashkir': 'ba', 'Bengali': 'bn',
    'Bosnian': 'bs', 'Bulgarian': 'bg', 'Burmese': 'my', 'Catalan': 'ca', 'Chinese': 'zh',
    'Croatian': 'hr', 'Czech': 'cs', 'Danish': 'da', 'Dutch': 'nl', 'English': 'en',
    'Estonian': 'et', 'Faroese': 'fo', 'Finnish': 'fi', 'French': 'fr', 'Galician': 'gl',
    'Georgian': 'ka', 'German': 'de', 'Greek': 'el', 'Gujarati': 'gu', 'Haitian Creole': 'ht',
    'Hausa': 'ha', 'Hebrew': 'he', 'Hindi': 'hi', 'Hungarian': 'hu', 'Icelandic': 'is',
    'Indonesian': 'id', 'Italian': 'it', 'Japanese': 'ja', 'Kannada': 'kn', 'Kazakh': 'kk',
    'Korean': 'ko', 'Kurdish': 'ckb', 'Kyrgyz': 'ky', 'Lao': 'lo', 'Lithuanian': 'lt',
    'Luxembourgish': 'lb', 'Macedonian': 'mk', 'Malay': 'ms', 'Malayalam': 'ml', 'Maltese': 'mt',
    'Maori': 'mi', 'Marathi': 'mr', 'Mongolian': 'mn', 'Nepali': 'ne', 'Norwegian': 'no',
    'Norwegian Nynorsk': 'nn', 'Pashto': 'ps', 'Persian': 'fa', 'Polish': 'pl', 'Portuguese': 'pt',
    'Punjabi': 'pa', 'Romanian': 'ro', 'Russian': 'ru', 'Serbian': 'sr', 'Sinhala': 'si',
    'Slovak': 'sk', 'Slovenian': 'sl', 'Somali': 'so', 'Spanish': 'es', 'Sundanese': 'su',
    'Swahili': 'sw', 'Swedish': 'sv', 'Tamil': 'ta', 'Telugu': 'te', 'Thai': 'th',
    'Turkish': 'tr', 'Ukrainian': 'uk', 'Urdu': 'ur', 'Uzbek': 'uz', 'Vietnamese': 'vi',
    'Welsh': 'cy', 'Yiddish': 'yi', 'Yoruba': 'yo', 'Zulu': 'zu'
}

# ==================================================
# CONSTANTS
# ==================================================

QUOTE_SPACE = "\uFFFF"  # placeholder for spaces inside protected quotes
PUNCT_RE = re.compile(r'[.,;:!?]')


# ==================================================
# CLEAN TEXT (KEEP PUNCTUATION)
# ==================================================

def clean_text(text):
    replacements = {
        "**": "",
        "*": "",
        "#": "",
        "—": "",
        "“": '"',
        "”": '"',
        "‘": "'",
        "’": "'",
    }
    for old, new in replacements.items():
        text = text.replace(old, new)

    text = re.sub(r'\s+', ' ', text).strip()
    return text
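
# Example: markdown markers are stripped and curly quotes are normalised, e.g.
#   clean_text('**Hello,**  “world”!')  ->  'Hello, "world"!'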


# ==================================================
# PROTECT SHORT QUOTES (ATOMIC QUOTE RULE)
# ==================================================

def protect_short_quotes(text, max_chars):
    """

    If a quoted span fits entirely within max_chars,

    protect it so it behaves like a single token.

    """
    def repl(match):
        quote = match.group(0)
        if len(quote) <= max_chars:
            return quote.replace(" ", QUOTE_SPACE)
        return quote

    return re.sub(r'"[^"]+"', repl, text)


def restore_quotes(text):
    return text.replace(QUOTE_SPACE, " ")
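
# Example: with max_chars=30 the quoted span "stay right here" (17 characters)
# fits, so its internal spaces are replaced with QUOTE_SPACE until
# restore_quotes() is called after chunking:
#   s = protect_short_quotes('He said "stay right here" and left.', 30)
#   restore_quotes(s) == 'He said "stay right here" and left.'   # True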


# ==================================================
# SMART SPLIT FOR LONG SENTENCES (QUOTE AWARE)
# ==================================================

def smart_split_long_sentence(sentence, max_chars=300, lookback=60):
    words = re.findall(r'\S+\s*', sentence)
    chunks = []
    buffer = ""
    in_quote = False

    for w in words:
        tentative = buffer + w
        quote_count = w.count('"')

        # 1️⃣ SAFE ADD
        if len(tentative) <= max_chars:
            buffer = tentative
            if quote_count % 2 != 0:
                in_quote = not in_quote
            continue

        # 2️⃣ OVERFLOW INSIDE QUOTE → MOVE WHOLE QUOTE
        if in_quote:
            if buffer.strip():
                chunks.append(buffer.strip())
            buffer = w
            if quote_count % 2 != 0:
                in_quote = not in_quote
            continue

        # 3️⃣ NORMAL PUNCTUATION-AWARE REBALANCE
        split_at = None
        search_region = buffer[-lookback:]

        matches = list(PUNCT_RE.finditer(search_region))
        if matches:
            last = matches[-1]
            # map the match position in search_region back to an index in buffer
            # (search_region may be shorter than lookback near the start)
            split_at = len(buffer) - len(search_region) + last.end()

        if split_at is not None:
            chunks.append(buffer[:split_at].strip())
            buffer = buffer[split_at:].lstrip() + w
        else:
            chunks.append(buffer.strip())
            buffer = w

        if quote_count % 2 != 0:
            in_quote = not in_quote

    if buffer.strip():
        chunks.append(buffer.strip())

    return chunks
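
# Example: an over-long sentence is rebalanced at the last punctuation mark
# inside the lookback window, e.g.
#   smart_split_long_sentence("alpha beta, gamma delta epsilon",
#                             max_chars=18, lookback=18)
#   -> ['alpha beta,', 'gamma delta', 'epsilon']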


# ==================================================
# SENTENCE-FIRST CHUNKER
# ==================================================

def split_into_chunks(text, lang_code="en", max_chars=300):
    if len(text) <= max_chars:
        return [text]

    sentences = list(segment(lang_code, text))
    chunks = []
    current = ""

    for sen in sentences:
        sen = sen.strip()

        if len(sen) > max_chars:
            if current:
                chunks.append(current.strip())
                current = ""
            chunks.extend(smart_split_long_sentence(sen, max_chars))
            continue

        tentative = f"{current} {sen}".strip() if current else sen

        if len(tentative) <= max_chars:
            current = tentative
        else:
            chunks.append(current.strip())
            current = sen

    if current.strip():
        chunks.append(current.strip())

    return chunks
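
# Example: whole sentences are packed greedily up to max_chars (assuming
# sentencex splits the text at the full stops):
#   split_into_chunks("This is the first sentence. Here is the second one. "
#                     "And a third.", "en", max_chars=60)
#   -> ['This is the first sentence. Here is the second one.', 'And a third.']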


# ==================================================
# FIX DANGLING QUOTES BETWEEN CHUNKS
# ==================================================

def repair_dangling_quotes(chunks):
    fixed = []

    for i, chunk in enumerate(chunks):
        chunk = chunk.strip()

        if i > 0:
            prev = fixed[-1]
            if prev.endswith('"') and chunk.startswith('"'):
                chunk = chunk[1:].lstrip()

        fixed.append(chunk)

    return fixed
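
# Example: a quote character left dangling at the start of a chunk is dropped
# when the previous chunk already ends with a closing quote:
#   repair_dangling_quotes(['She said "stop"', '" Then she ran.'])
#   -> ['She said "stop"', 'Then she ran.']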


# ==================================================
# TTS FILE NAME
# ==================================================

def get_tts_file_name(text, language="en"):
    temp_audio_dir = "./ai_tts_voice/"
    os.makedirs(temp_audio_dir, exist_ok=True)

    clean = re.sub(r'[^a-zA-Z\s]', '', text or "")
    clean = clean.lower().strip().replace(" ", "_")[:20] or "audio"

    uid = uuid.uuid4().hex[:8].upper()
    language = language.lower().strip()

    return os.path.join(
        temp_audio_dir,
        f"{clean}_{language}_{uid}.wav"
    )
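
# Example: the name combines the first words of the text, the language code and
# a random 8-character id, so get_tts_file_name("Hello world!", "en") returns
# something like './ai_tts_voice/hello_world_en_3F2A9C1B.wav'.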


# ==================================================
# MAIN FUNCTION
# ==================================================

def text_chunk(text, language="English", char_limit=280):
    lang_code = LANGUAGE_CODE.get(language, "en")

    # text = clean_text(text)  # skipped: Qwen3-TTS can handle markdown and smart quotes itself

    # 🔒 Atomic quote protection
    text = protect_short_quotes(text, char_limit)

    if len(text) > char_limit:
        print("⚠️ The text is too long. Breaking it into smaller pieces for TTS.")

    chunks = split_into_chunks(text, lang_code, char_limit)
    chunks = repair_dangling_quotes(chunks)

    # 🔓 Restore spaces inside quotes
    chunks = [restore_quotes(c) for c in chunks]

    tts_file_name = get_tts_file_name(text, lang_code)
    return chunks, tts_file_name


# ==================================================
# TEST
# ==================================================

# from process_text import text_chunk
# text="Hi, this is a test"
# chunks, tts_filename = text_chunk(text, language="English", char_limit=280)

if __name__ == "__main__":
    text = "He said \"You are a looser\""  # @param {type: "string"}

    language="English"  # @param {type: "string"}
    char_limit = 20  # @param {type: "number"}

    chunks, filename = text_chunk(text, language, char_limit)

    print(filename)
    print(len(chunks))
    for c in chunks:
        print(len(c), c)