notmax123 committed on
Commit
5d79055
·
1 Parent(s): 6f91e56

Align phoneme pipeline with modular reference; default UI lang en.

Browse files

Add BLUE_SYNTH_MAX_CHUNK_LEN, _split_hebrew_prephoneme / _split_oversized_hebrew_clause, IPA chunk_text (.!? + tag boundary fix). Renikud uses renikud_max_clause_chars; espeak-ng subprocess fallback. Unknown segment lang falls back to en.

Made-with: Cursor

Files changed (1) hide show
  1. app.py +199 -155
app.py CHANGED
@@ -5,6 +5,7 @@ Upstream: https://github.com/maxmelichov/BlueTTS
5
  import os
6
  import re
7
  import sys
 
8
  import json
9
  import time
10
  import base64
@@ -128,6 +129,9 @@ def text_to_indices_multilang(text: str, base_lang: str = "en") -> list[int]:
128
  ids.extend(CHAR_TO_ID.get(ch, PAD_ID) for ch in seg)
129
  return ids
130
 
 
 
 
131
  # ============================================================
132
  # Text Processing & Chunking
133
  # ============================================================
@@ -137,6 +141,160 @@ class Style:
137
  ttl: Any
138
  dp: Optional[Any] = None
139
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
140
  class TextProcessor:
141
  _ESPEAK_MAP = {
142
  "en": "en-us", "en-us": "en-us", "de": "de", "ge": "de", "it": "it",
@@ -144,8 +302,14 @@ class TextProcessor:
144
  }
145
  _INLINE_LANG_PAIR = re.compile(r"<(\w+)>(.*?)(?:</\1>|<\1>)", re.DOTALL)
146
 
147
- def __init__(self, renikud_path: Optional[str] = None):
 
 
 
 
 
148
  self.renikud = None
 
149
  self._espeak_backends: Dict[str, Any] = {}
150
  self._espeak_separator: Any = None
151
  self._espeak_ready = False
@@ -208,26 +372,27 @@ class TextProcessor:
208
  return text
209
  if not self._espeak_ready:
210
  self._init_espeak()
211
- if not self._espeak_ready:
212
- print(f"[WARN] espeak-ng not available, returning raw text for lang={lang}")
213
- return text
 
 
 
 
 
 
214
  try:
215
- backend = self._get_espeak_backend(espeak_lang)
216
- raw = backend.phonemize(
217
- [text], separator=self._espeak_separator
218
- )[0]
 
 
 
219
  return normalize_text(raw, lang=lang)
220
  except Exception as e:
221
- print(f"[WARN] Phonemization failed for lang={lang}: {e}")
222
- return text
223
-
224
- def _renikud_phonemize_hebrew(self, text: str) -> str:
225
- """Chunk long Hebrew only for Renikud; join IPA so BlueTTS still chunks at chunk_len."""
226
- g2p_chunks = _renikud_chunk_hebrew(text)
227
- if len(g2p_chunks) <= 1:
228
- return self.renikud.phonemize(text)
229
- parts = [self.renikud.phonemize(c) for c in g2p_chunks]
230
- return _join_renikud_ipa_parts(parts)
231
 
232
  def _phonemize_segment(self, content: str, lang: str) -> str:
233
  content = content.strip()
@@ -240,7 +405,13 @@ class TextProcessor:
240
  if has_hebrew:
241
  if self.renikud is None:
242
  raise self._hebrew_requires_renikud_error()
243
- return normalize_text(self._renikud_phonemize_hebrew(content), lang="he")
 
 
 
 
 
 
244
  if lang == "he":
245
  return normalize_text(content, lang="he")
246
  return self._espeak_phonemize(content, lang)
@@ -271,150 +442,23 @@ class TextProcessor:
271
  return re.sub(r"\s+", " ", " ".join(pieces)).strip()
272
 
273
  def phonemize(self, text: str, lang: str = "en") -> str:
274
- # Clean up repeated punctuation to prevent model hallucinations
275
- text = re.sub(r"\.+", ".", text)
276
- text = re.sub(r"\?+", "?", text)
277
- text = re.sub(r"!+", "!", text)
278
- text = text.replace("…", ".")
279
-
280
  if self._INLINE_LANG_PAIR.search(text):
281
  return self._phonemize_mixed(text, base_lang=lang)
282
- is_hebrew = any('\u0590' <= c <= '\u05ff' for c in text)
283
  if lang == "he" or is_hebrew:
284
  if not is_hebrew:
285
  return normalize_text(text, lang="he")
286
  if self.renikud is not None:
287
- return normalize_text(self._renikud_phonemize_hebrew(text), lang="he")
 
 
 
 
 
 
288
  raise self._hebrew_requires_renikud_error()
289
  return self._espeak_phonemize(text, lang)
290
 
291
- def _hard_split_chunk(s: str, max_len: int) -> List[str]:
292
- s = s.strip()
293
- if not s or max_len <= 0:
294
- return [s] if s else []
295
- if len(s) <= max_len:
296
- return [s]
297
- out: List[str] = []
298
- start = 0
299
- n = len(s)
300
- while start < n:
301
- end = min(start + max_len, n)
302
- if end < n:
303
- window = s[start:end]
304
- cut = window.rfind(" ")
305
- if cut > max(max_len // 4, 8):
306
- end = start + cut
307
- piece = s[start:end].strip()
308
- if piece:
309
- out.append(piece)
310
- start = end
311
- while start < n and s[start] == " ":
312
- start += 1
313
- return out
314
-
315
- def chunk_text(text: str, max_len: int = 300) -> List[str]:
316
- pattern = (
317
- r"(?<!Mr\.)(?<!Mrs\.)(?<!Ms\.)(?<!Dr\.)(?<!Prof\.)(?<!Sr\.)(?<!Jr\.)"
318
- r"(?<!Ph\.D\.)(?<!etc\.)(?<!e\.g\.)(?<!i\.e\.)(?<!vs\.)(?<!Inc\.)"
319
- r"(?<!Ltd\.)(?<!Co\.)(?<!Corp\.)(?<!St\.)(?<!Ave\.)(?<!Blvd\.)"
320
- r"(?<!\b[A-Z]\.)(?<=[.!?:,;])\s+"
321
- )
322
- chunks: List[str] = []
323
- for paragraph in re.split(r"\n\s*\n+", text.strip()):
324
- paragraph = paragraph.strip()
325
- if not paragraph:
326
- continue
327
- current = ""
328
- for sentence in re.split(pattern, paragraph):
329
- if len(current) + len(sentence) + 1 <= max_len:
330
- current += (" " if current else "") + sentence
331
- else:
332
- if current:
333
- chunks.append(current.strip())
334
- if len(sentence) > max_len:
335
- chunks.extend(_hard_split_chunk(sentence, max_len))
336
- current = ""
337
- else:
338
- current = sentence
339
- if current:
340
- chunks.append(current.strip())
341
- base = chunks if chunks else ([text.strip()] if text.strip() else [])
342
- # TensorRT engines cap T_text; long IPA without ".!?" must never stay in one oversized chunk.
343
- out: List[str] = []
344
- for c in base:
345
- out.extend(_hard_split_chunk(c, max_len))
346
-
347
- # Fix language tags that span across chunks
348
- fixed_out = []
349
- active_tag = None
350
- for c in out:
351
- c = c.strip()
352
- if not c:
353
- continue
354
-
355
- if active_tag and not c.startswith(f"<{active_tag}>"):
356
- c = f"<{active_tag}>" + c
357
-
358
- for m in re.finditer(r"<(/)?([a-z]{2,8})>", c):
359
- is_close = bool(m.group(1))
360
- tag = m.group(2)
361
- if is_close:
362
- if active_tag == tag:
363
- active_tag = None
364
- else:
365
- active_tag = tag
366
-
367
- if active_tag and not c.endswith(f"</{active_tag}>"):
368
- c = c + f"</{active_tag}>"
369
-
370
- fixed_out.append(c)
371
-
372
- return fixed_out or ([text.strip()] if text.strip() else [])
373
-
374
- def _join_renikud_ipa_parts(parts: List[str]) -> str:
375
- """Join IPA from multiple Renikud calls; normalize whitespace (no duplicate words from join gaps)."""
376
- merged = " ".join(p.strip() for p in parts if p and p.strip())
377
- return re.sub(r"\s+", " ", merged).strip()
378
-
379
- def _renikud_chunk_hebrew(text: str, max_len: int = 168) -> List[str]:
380
- """Split raw Hebrew for Renikud only.
381
-
382
- Uses sentence breaks (.!?) plus length cap — not the same rules as ``chunk_text`` for IPA
383
- (which splits on , : ;). Fewer G2P segments avoids prosodic 'mini-sentence' artifacts that
384
- can sound like repetition when stitched. BlueTTS still chunks phoneme strings at chunk_len.
385
- """
386
- text = text.strip()
387
- if not text:
388
- return []
389
- if len(text) <= max_len:
390
- return [text]
391
- # Sentence boundaries only; keep commas/colons inside a segment when possible.
392
- sent_pat = r"(?<=[.!?])\s+"
393
- chunks: List[str] = []
394
- for paragraph in re.split(r"\n\s*\n+", text):
395
- paragraph = paragraph.strip()
396
- if not paragraph:
397
- continue
398
- current = ""
399
- for sentence in re.split(sent_pat, paragraph):
400
- if len(current) + len(sentence) + 1 <= max_len:
401
- current += (" " if current else "") + sentence
402
- else:
403
- if current:
404
- chunks.append(current.strip())
405
- if len(sentence) > max_len:
406
- chunks.extend(_hard_split_chunk(sentence, max_len))
407
- current = ""
408
- else:
409
- current = sentence
410
- if current:
411
- chunks.append(current.strip())
412
- base = chunks if chunks else [text]
413
- out: List[str] = []
414
- for c in base:
415
- out.extend(_hard_split_chunk(c, max_len))
416
- return out or [text]
417
-
418
  # ============================================================
419
  # BlueTTS Core
420
  # ============================================================
@@ -430,7 +474,7 @@ class BlueTTS:
430
  speed: float = 1.0,
431
  seed: int = 42,
432
  use_gpu: bool = False,
433
- chunk_len: int = 150,
434
  silence_sec: float = 0.15,
435
  fade_duration: float = 0.02,
436
  renikud_path: Optional[str] = None,
 
5
  import os
6
  import re
7
  import sys
8
+ import subprocess
9
  import json
10
  import time
11
  import base64
 
129
  ids.extend(CHAR_TO_ID.get(ch, PAD_ID) for ch in seg)
130
  return ids
131
 
132
+ # Max IPA characters per synthesis forward pass (ONNX). Independent of Renikud clause splitting.
133
+ BLUE_SYNTH_MAX_CHUNK_LEN = 150
134
+
135
  # ============================================================
136
  # Text Processing & Chunking
137
  # ============================================================
 
141
  ttl: Any
142
  dp: Optional[Any] = None
143
 
144
+
145
+ def _hard_split_chunk(s: str, max_len: int) -> List[str]:
146
+ """Split ``s`` into segments of at most ``max_len`` chars (prefer last space)."""
147
+ s = s.strip()
148
+ if not s or max_len <= 0:
149
+ return [s] if s else []
150
+ if len(s) <= max_len:
151
+ return [s]
152
+ out: List[str] = []
153
+ start = 0
154
+ n = len(s)
155
+ while start < n:
156
+ end = min(start + max_len, n)
157
+ if end < n:
158
+ window = s[start:end]
159
+ cut = window.rfind(" ")
160
+ if cut > max(max_len // 4, 8):
161
+ end = start + cut
162
+ piece = s[start:end].strip()
163
+ if piece:
164
+ out.append(piece)
165
+ start = end
166
+ while start < n and s[start] == " ":
167
+ start += 1
168
+ return out
169
+
170
+
171
+ def _split_oversized_hebrew_clause(part: str, max_clause_chars: int) -> List[str]:
172
+ """Only used when a single sentence is longer than ``max_clause_chars``."""
173
+ p = part.strip()
174
+ if not p:
175
+ return []
176
+ if len(p) <= max_clause_chars:
177
+ return [p]
178
+ if re.search(r":\s", p):
179
+ pieces = [x.strip() for x in re.split(r"(?<=:)\s+", p) if x.strip()]
180
+ if len(pieces) > 1:
181
+ out: List[str] = []
182
+ for x in pieces:
183
+ out.extend(_split_oversized_hebrew_clause(x, max_clause_chars))
184
+ return out
185
+ if re.search(r"[\u0590-\u05ff]-\s+[\u0590-\u05ff]", p):
186
+ pieces = [x.strip() for x in re.split(r"(?<=[\u0590-\u05ff])-\s+", p) if x.strip()]
187
+ if len(pieces) > 1:
188
+ out2: List[str] = []
189
+ for x in pieces:
190
+ out2.extend(_split_oversized_hebrew_clause(x, max_clause_chars))
191
+ return out2
192
+ if re.search(r",\s", p):
193
+ pieces = [x.strip() for x in re.split(r",\s+", p) if x.strip()]
194
+ if len(pieces) > 1:
195
+ out3: List[str] = []
196
+ for x in pieces:
197
+ out3.extend(_split_oversized_hebrew_clause(x, max_clause_chars))
198
+ return out3
199
+ return _hard_split_chunk(p, max_clause_chars)
200
+
201
+
202
+ def _split_hebrew_prephoneme(text: str, max_clause_chars: int = 96) -> List[str]:
203
+ """Split raw Hebrew before Renikud G2P.
204
+
205
+ By default only sentence boundaries (``.?!``); colon / hyphen / comma splits run
206
+ only when one sentence is longer than ``max_clause_chars``.
207
+ """
208
+ t = text.strip()
209
+ if not t:
210
+ return []
211
+ t = re.sub(r"\.+", ".", t)
212
+ t = re.sub(r"\?+", "?", t)
213
+ t = re.sub(r"!+", "!", t)
214
+ t = t.replace("…", ".")
215
+ t = re.sub(r"\s+", " ", t)
216
+
217
+ def refine_one(s: str) -> List[str]:
218
+ s = s.strip()
219
+ if not s:
220
+ return []
221
+ out: List[str] = []
222
+ for sent in re.split(r"(?<=[.!?])\s+", s):
223
+ sent = sent.strip()
224
+ if not sent:
225
+ continue
226
+ out.extend(_split_oversized_hebrew_clause(sent, max_clause_chars))
227
+ return out
228
+
229
+ clauses: List[str] = []
230
+ for block in re.split(r"\n+", t):
231
+ block = block.strip()
232
+ if block:
233
+ clauses.extend(refine_one(block))
234
+ return clauses if clauses else [t]
235
+
236
+
237
def chunk_text(text: str, max_len: int = 300) -> List[str]:
    """Split IPA/text into sentence-boundary chunks of about ``max_len`` chars.

    Steps:

    1. A sentence terminator directly followed by a closing language tag
       (``.</he>`` etc.) and whitespace is turned into a paragraph break.
    2. Each paragraph is split at whitespace following ``.``, ``!`` or ``?``
       (common English abbreviations and single-letter initials are
       excluded) and the sentences are greedily packed into chunks.
    3. Anything still longer than ``max_len`` goes through
       ``_hard_split_chunk``.
    4. Language tags that span several chunks are re-opened at the start
       and closed at the end of every affected chunk. These added tags may
       push a chunk slightly past ``max_len``.

    Args:
        text: Text or IPA string, optionally containing ``<xx>...</xx>`` tags.
        max_len: Target maximum chunk length in characters.

    Returns:
        Non-empty chunks in order; an empty list for blank input.
    """
    text = re.sub(r"([.!?])(</[a-z]{2,8}>)\s+", r"\1\2\n\n", text)
    stripped = text.strip()
    fallback = [stripped] if stripped else []

    sentence_break = (
        r"(?<!Mr\.)(?<!Mrs\.)(?<!Ms\.)(?<!Dr\.)(?<!Prof\.)(?<!Sr\.)(?<!Jr\.)"
        r"(?<!Ph\.D\.)(?<!etc\.)(?<!e\.g\.)(?<!i\.e\.)(?<!vs\.)(?<!Inc\.)"
        r"(?<!Ltd\.)(?<!Co\.)(?<!Corp\.)(?<!St\.)(?<!Ave\.)(?<!Blvd\.)"
        r"(?<!\b[A-Z]\.)(?<=[.!?])\s+"
    )

    # Greedy packing of sentences, paragraph by paragraph.
    grouped: List[str] = []
    for raw_paragraph in re.split(r"\n\s*\n+", stripped):
        paragraph = raw_paragraph.strip()
        if not paragraph:
            continue
        pending = ""
        for sentence in re.split(sentence_break, paragraph):
            if len(pending) + len(sentence) + 1 <= max_len:
                pending = f"{pending} {sentence}" if pending else sentence
                continue
            if pending:
                grouped.append(pending.strip())
            if len(sentence) > max_len:
                grouped.extend(_hard_split_chunk(sentence, max_len))
                pending = ""
            else:
                pending = sentence
        if pending:
            grouped.append(pending.strip())

    # Enforce the length cap on every chunk (also covers the fallback).
    sized: List[str] = [
        piece
        for chunk in (grouped or fallback)
        for piece in _hard_split_chunk(chunk, max_len)
    ]

    # Re-balance language tags that were separated by the chunking above.
    balanced: List[str] = []
    open_tag = None
    for piece in sized:
        piece = piece.strip()
        if not piece:
            continue

        if open_tag:
            opener = f"<{open_tag}>"
            if not piece.startswith(opener):
                piece = opener + piece

        # Track which tag (if any) is still open at the end of this chunk.
        for closing, tag in re.findall(r"<(/)?([a-z]{2,8})>", piece):
            if not closing:
                open_tag = tag
            elif open_tag == tag:
                open_tag = None

        if open_tag:
            closer = f"</{open_tag}>"
            if not piece.endswith(closer):
                piece += closer

        balanced.append(piece)

    return balanced or fallback
296
+
297
+
298
  class TextProcessor:
299
  _ESPEAK_MAP = {
300
  "en": "en-us", "en-us": "en-us", "de": "de", "ge": "de", "it": "it",
 
302
  }
303
  _INLINE_LANG_PAIR = re.compile(r"<(\w+)>(.*?)(?:</\1>|<\1>)", re.DOTALL)
304
 
305
+ def __init__(
306
+ self,
307
+ renikud_path: Optional[str] = None,
308
+ *,
309
+ renikud_max_clause_chars: int = 96,
310
+ ):
311
  self.renikud = None
312
+ self._renikud_max_clause_chars = renikud_max_clause_chars
313
  self._espeak_backends: Dict[str, Any] = {}
314
  self._espeak_separator: Any = None
315
  self._espeak_ready = False
 
372
  return text
373
  if not self._espeak_ready:
374
  self._init_espeak()
375
+ if self._espeak_ready:
376
+ try:
377
+ backend = self._get_espeak_backend(espeak_lang)
378
+ raw = backend.phonemize(
379
+ [text], separator=self._espeak_separator
380
+ )[0]
381
+ return normalize_text(raw, lang=lang)
382
+ except Exception as e:
383
+ print(f"[WARN] Phonemizer backend failed for lang={lang}: {e}")
384
  try:
385
+ result = subprocess.run(
386
+ ["espeak-ng", "-q", "--ipa=1", "-v", espeak_lang, text],
387
+ check=True,
388
+ capture_output=True,
389
+ text=True,
390
+ )
391
+ raw = result.stdout.replace("\n", " ").strip()
392
  return normalize_text(raw, lang=lang)
393
  except Exception as e:
394
+ print(f"[WARN] espeak-ng fallback failed for lang={lang}: {e}")
395
+ return text
 
 
 
 
 
 
 
 
396
 
397
  def _phonemize_segment(self, content: str, lang: str) -> str:
398
  content = content.strip()
 
405
  if has_hebrew:
406
  if self.renikud is None:
407
  raise self._hebrew_requires_renikud_error()
408
+ clauses = _split_hebrew_prephoneme(content, self._renikud_max_clause_chars)
409
+ ipa_parts = [
410
+ normalize_text(self.renikud.phonemize(c), lang="he")
411
+ for c in clauses
412
+ if c.strip()
413
+ ]
414
+ return re.sub(r"\s+", " ", " ".join(ipa_parts)).strip()
415
  if lang == "he":
416
  return normalize_text(content, lang="he")
417
  return self._espeak_phonemize(content, lang)
 
442
  return re.sub(r"\s+", " ", " ".join(pieces)).strip()
443
 
444
  def phonemize(self, text: str, lang: str = "en") -> str:
 
 
 
 
 
 
445
  if self._INLINE_LANG_PAIR.search(text):
446
  return self._phonemize_mixed(text, base_lang=lang)
447
+ is_hebrew = any("\u0590" <= c <= "\u05ff" for c in text)
448
  if lang == "he" or is_hebrew:
449
  if not is_hebrew:
450
  return normalize_text(text, lang="he")
451
  if self.renikud is not None:
452
+ clauses = _split_hebrew_prephoneme(text, self._renikud_max_clause_chars)
453
+ ipa_parts = [
454
+ normalize_text(self.renikud.phonemize(c), lang="he")
455
+ for c in clauses
456
+ if c.strip()
457
+ ]
458
+ return re.sub(r"\s+", " ", " ".join(ipa_parts)).strip()
459
  raise self._hebrew_requires_renikud_error()
460
  return self._espeak_phonemize(text, lang)
461
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
462
  # ============================================================
463
  # BlueTTS Core
464
  # ============================================================
 
474
  speed: float = 1.0,
475
  seed: int = 42,
476
  use_gpu: bool = False,
477
+ chunk_len: int = BLUE_SYNTH_MAX_CHUNK_LEN,
478
  silence_sec: float = 0.15,
479
  fade_duration: float = 0.02,
480
  renikud_path: Optional[str] = None,