ShadowHunter222 committed on
Commit
06dac54
·
verified ·
1 Parent(s): 5bd9d0c

Upload 2 files

Browse files
Files changed (2) hide show
  1. config.py +15 -3
  2. text_processor.py +41 -7
config.py CHANGED
@@ -17,6 +17,12 @@ def _get_bool(name: str, default: bool) -> bool:
17
  return raw.strip().lower() in {"1", "true", "yes", "on"}
18
 
19
 
 
 
 
 
 
 
20
  class Config:
21
  # ── Model ────────────────────────────────────────────────────
22
  MODEL_ID: str = os.getenv("CB_MODEL_ID", "ResembleAI/chatterbox-turbo-ONNX")
@@ -67,6 +73,12 @@ class Config:
67
  # (not a model — just a reference WAV, safe to use from any source).
68
  DEFAULT_VOICE_REPO: str = "onnx-community/chatterbox-ONNX"
69
  DEFAULT_VOICE_FILE: str = "default_voice.wav"
 
 
 
 
 
 
70
  MAX_VOICE_UPLOAD_BYTES: int = 10 * 1024 * 1024 # 10 MB
71
  MIN_REF_DURATION_SEC: float = 1.5
72
  MAX_REF_DURATION_SEC: float = 30.0
@@ -76,15 +88,15 @@ class Config:
76
  # ── Streaming ────────────────────────────────────────────────
77
  # Smaller chunks = faster TTFB (first audio arrives sooner)
78
  # ~200 chars ≈ 1–2 sentences ≈ fastest first-chunk on 2 vCPU
79
- MAX_CHUNK_CHARS: int = int(os.getenv("CB_MAX_CHUNK_CHARS", "100"))
80
  # Additive parallel mode (3-way split: primary + helper1 + helper2).
81
  ENABLE_PARALLEL_MODE: bool = _get_bool("CB_ENABLE_PARALLEL_MODE", True)
82
- HELPER_BASE_URL: str = os.getenv("CB_HELPER_BASE_URL", "").strip()
83
  HELPER1_BASE_URL: str = os.getenv(
84
  "CB_HELPER1_BASE_URL",
85
  HELPER_BASE_URL,
86
  ).strip()
87
- HELPER2_BASE_URL: str = os.getenv("CB_HELPER2_BASE_URL", "").strip()
88
  HELPER_TIMEOUT_SEC: float = float(os.getenv("CB_HELPER_TIMEOUT_SEC", "45"))
89
  HELPER_RETRY_ONCE: bool = _get_bool("CB_HELPER_RETRY_ONCE", True)
90
  # Optional shared secret for internal chunk endpoints.
 
17
  return raw.strip().lower() in {"1", "true", "yes", "on"}
18
 
19
 
20
+ def _get_csv(name: str, default: str) -> tuple[str, ...]:
21
+ raw = os.getenv(name, default)
22
+ items = [x.strip() for x in raw.split(",")]
23
+ return tuple(x for x in items if x)
24
+
25
+
26
  class Config:
27
  # ── Model ────────────────────────────────────────────────────
28
  MODEL_ID: str = os.getenv("CB_MODEL_ID", "ResembleAI/chatterbox-turbo-ONNX")
 
73
  # (not a model — just a reference WAV, safe to use from any source).
74
  DEFAULT_VOICE_REPO: str = "onnx-community/chatterbox-ONNX"
75
  DEFAULT_VOICE_FILE: str = "default_voice.wav"
76
+ DEFAULT_VOICE_REPOS: tuple[str, ...] = _get_csv(
77
+ "CB_DEFAULT_VOICE_REPOS",
78
+ DEFAULT_VOICE_REPO,
79
+ )
80
+ PRELOAD_BUILTIN_VOICES: bool = _get_bool("CB_PRELOAD_BUILTIN_VOICES", True)
81
+ MAX_PRELOAD_BUILTIN_VOICES: int = int(os.getenv("CB_MAX_PRELOAD_BUILTIN_VOICES", "64"))
82
  MAX_VOICE_UPLOAD_BYTES: int = 10 * 1024 * 1024 # 10 MB
83
  MIN_REF_DURATION_SEC: float = 1.5
84
  MAX_REF_DURATION_SEC: float = 30.0
 
88
  # ── Streaming ────────────────────────────────────────────────
89
  # Smaller chunks = faster TTFB (first audio arrives sooner)
90
  # ~200 chars ≈ 1–2 sentences ≈ fastest first-chunk on 2 vCPU
91
+ MAX_CHUNK_CHARS: int = int(os.getenv("CB_MAX_CHUNK_CHARS", "150"))
92
  # Additive parallel mode (3-way split: primary + helper1 + helper2).
93
  ENABLE_PARALLEL_MODE: bool = _get_bool("CB_ENABLE_PARALLEL_MODE", True)
94
+ HELPER_BASE_URL: str = os.getenv("CB_HELPER_BASE_URL", "https://shadowhunter222-chab2.hf.space").strip()
95
  HELPER1_BASE_URL: str = os.getenv(
96
  "CB_HELPER1_BASE_URL",
97
  HELPER_BASE_URL,
98
  ).strip()
99
+ HELPER2_BASE_URL: str = os.getenv("CB_HELPER2_BASE_URL", "https://shadowhunter222-chab3.hf.space").strip()
100
  HELPER_TIMEOUT_SEC: float = float(os.getenv("CB_HELPER_TIMEOUT_SEC", "45"))
101
  HELPER_RETRY_ONCE: bool = _get_bool("CB_HELPER_RETRY_ONCE", True)
102
  # Optional shared secret for internal chunk endpoints.
text_processor.py CHANGED
@@ -231,6 +231,23 @@ def sanitize(text: str) -> str:
231
  for idx, original in tags_found:
232
  text = text.replace(f"Β§TAG{idx}Β§", original)
233
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
234
  return text
235
 
236
 
@@ -286,25 +303,42 @@ def split_for_streaming(text: str, max_chars: int = Config.MAX_CHUNK_CHARS) -> L
286
  # ═══════════════════════════════════════════════════════════════════
287
 
288
  def _break_long_chunk(text: str, max_chars: int) -> List[str]:
289
- """Break a chunk longer than max_chars on commas or word boundaries."""
 
 
 
 
 
 
 
290
  parts: List[str] = []
291
  remaining = text
292
  while len(remaining) > max_chars:
293
  break_pos = -1
294
  include_break_char = False
295
 
296
- # Prefer punctuation/pauses first to keep prosody natural.
 
 
 
 
 
 
 
297
  for marker in (",", ";", ":", "β€”", "-", "!", "?"):
298
  pos = remaining.rfind(marker, 0, max_chars)
299
  if pos > break_pos:
300
  break_pos = pos
301
  include_break_char = True
302
 
303
- # Then prefer nearest space before limit.
304
- space_pos = remaining.rfind(" ", 0, max_chars)
305
- if space_pos > break_pos:
306
- break_pos = space_pos
307
- include_break_char = False
 
 
 
308
 
309
  # If nothing before limit, look slightly ahead to avoid mid-word cuts.
310
  if break_pos == -1:
 
231
  for idx, original in tags_found:
232
  text = text.replace(f"Β§TAG{idx}Β§", original)
233
 
234
+ # 11. Ensure paralinguistic tags have spaces around them.
235
+ # The model needs whitespace boundaries to properly render tags like
236
+ # [clear throat]. Without spaces (e.g. "Jerry.[clear throat]I'm"),
237
+ # the tag gets swallowed or produces silence instead of the sound.
238
+ text = re.sub(
239
+ r"(\w)(\[(?:" + _TAG_NAMES + r")\])",
240
+ r"\1 \2",
241
+ text,
242
+ flags=re.IGNORECASE,
243
+ )
244
+ text = re.sub(
245
+ r"(\[(?:" + _TAG_NAMES + r")\])(\w)",
246
+ r"\1 \2",
247
+ text,
248
+ flags=re.IGNORECASE,
249
+ )
250
+
251
  return text
252
 
253
 
 
303
  # ═══════════════════════════════════════════════════════════════════
304
 
305
  def _break_long_chunk(text: str, max_chars: int) -> List[str]:
306
+ """Break a chunk longer than max_chars on natural pause boundaries.
307
+
308
+ Priority order for break points:
309
+ 1. Ellipsis '...' — strongest natural pause within a long sentence
310
+ 2. Punctuation (comma, semicolon, colon, dash, !, ?)
311
+ 3. Nearest space before limit
312
+ 4. Look ahead slightly to avoid mid-word cuts
313
+ """
314
  parts: List[str] = []
315
  remaining = text
316
  while len(remaining) > max_chars:
317
  break_pos = -1
318
  include_break_char = False
319
 
320
+ # First try: break at ellipsis '...' — the strongest internal pause.
321
+ ellipsis_pos = remaining.rfind("...", 0, max_chars)
322
+ if ellipsis_pos > 0:
323
+ # Include all three dots in the current segment
324
+ break_pos = ellipsis_pos + 3
325
+ include_break_char = False # already moved past the dots
326
+
327
+ # Then try punctuation markers (only upgrade if at a later position).
328
  for marker in (",", ";", ":", "β€”", "-", "!", "?"):
329
  pos = remaining.rfind(marker, 0, max_chars)
330
  if pos > break_pos:
331
  break_pos = pos
332
  include_break_char = True
333
 
334
+ # Space is a FALLBACK only — never override a punctuation/ellipsis break.
335
+ # Cutting at punctuation gives the model proper prosody cues;
336
+ # cutting at a random space creates mid-phrase fragments ("handle the").
337
+ if break_pos <= 0:
338
+ space_pos = remaining.rfind(" ", 0, max_chars)
339
+ if space_pos > 0:
340
+ break_pos = space_pos
341
+ include_break_char = False
342
 
343
  # If nothing before limit, look slightly ahead to avoid mid-word cuts.
344
  if break_pos == -1: