XiaoBai1221 committed on
Commit
313bef8
·
1 Parent(s): 190de1a

Fix TTS language detection for VI/ID & stabilize TTS streaming buffer

Browse files
Files changed (3) hide show
  1. app.py +29 -4
  2. requirements.txt +1 -0
  3. services/tts_service.py +28 -3
app.py CHANGED
@@ -135,15 +135,42 @@ def _normalize_bcp47_language_tag(tag: Optional[str]) -> Optional[str]:
135
 
136
 
137
  def _preferred_language_from_text(text: str) -> Optional[str]:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
138
  script_counts: Dict[str, int] = {}
139
- for ch in str(text or ""):
140
  if ch.isspace():
141
  continue
142
  try:
143
  name = unicodedata.name(ch)
144
  except ValueError:
145
  continue
146
- for script in ("HIRAGANA", "KATAKANA", "HANGUL", "CJK UNIFIED IDEOGRAPH", "LATIN", "CYRILLIC", "THAI"):
147
  if script in name:
148
  script_counts[script] = script_counts.get(script, 0) + 1
149
  break
@@ -156,8 +183,6 @@ def _preferred_language_from_text(text: str) -> Optional[str]:
156
  return "th-TH"
157
  if script_counts.get("CYRILLIC", 0):
158
  return "ru-RU"
159
- if script_counts.get("LATIN", 0) and not script_counts.get("CJK UNIFIED IDEOGRAPH", 0):
160
- return "en-US"
161
  if script_counts.get("CJK UNIFIED IDEOGRAPH", 0):
162
  return "zh-TW"
163
  return None
 
135
 
136
 
137
  def _preferred_language_from_text(text: str) -> Optional[str]:
138
+ text = str(text or "").strip()
139
+ if not text:
140
+ return None
141
+
142
+ try:
143
+ from langdetect import detect
144
+ lang = detect(text)
145
+
146
+ lang_map = {
147
+ "zh-cn": "zh-TW",
148
+ "zh-tw": "zh-TW",
149
+ "en": "en-US",
150
+ "ja": "ja-JP",
151
+ "ko": "ko-KR",
152
+ "th": "th-TH",
153
+ "vi": "vi-VN",
154
+ "id": "id-ID",
155
+ "ru": "ru-RU",
156
+ "es": "es-ES",
157
+ "fr": "fr-FR",
158
+ "de": "de-DE"
159
+ }
160
+ if lang in lang_map:
161
+ return lang_map[lang]
162
+ except Exception:
163
+ pass
164
+
165
  script_counts: Dict[str, int] = {}
166
+ for ch in text:
167
  if ch.isspace():
168
  continue
169
  try:
170
  name = unicodedata.name(ch)
171
  except ValueError:
172
  continue
173
+ for script in ("HIRAGANA", "KATAKANA", "HANGUL", "CJK UNIFIED IDEOGRAPH", "CYRILLIC", "THAI"):
174
  if script in name:
175
  script_counts[script] = script_counts.get(script, 0) + 1
176
  break
 
183
  return "th-TH"
184
  if script_counts.get("CYRILLIC", 0):
185
  return "ru-RU"
 
 
186
  if script_counts.get("CJK UNIFIED IDEOGRAPH", 0):
187
  return "zh-TW"
188
  return None
requirements.txt CHANGED
@@ -26,6 +26,7 @@ jsonschema>=4.17.0
26
 
27
  # Geospatial / directions
28
  geohash2
 
29
 
30
  # Machine Learning dependencies
31
  numpy>=1.24.0,<2.0.0
 
26
 
27
  # Geospatial / directions
28
  geohash2
29
+ langdetect
30
 
31
  # Machine Learning dependencies
32
  numpy>=1.24.0,<2.0.0
services/tts_service.py CHANGED
@@ -472,14 +472,39 @@ class TTSService:
472
  total_chunks = 0
473
  total_bytes = 0
474
 
475
- # 🎯 設定超時時間,避免無限等待
 
 
 
 
476
  response_iter = await client.streaming_synthesize(requests=request_iter(), timeout=20.0)
477
  async for response in response_iter:
478
  chunk = getattr(response, "audio_content", b"")
479
  if chunk:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
480
  total_chunks += 1
481
- total_bytes += len(chunk)
482
- yield bytes(chunk)
483
 
484
  logger.debug("✅ TTS 串流完成: total_chunks=%d, total_bytes=%d", total_chunks, total_bytes)
485
 
 
472
  total_chunks = 0
473
  total_bytes = 0
474
 
475
+ # 🎯 緩衝區設計:避免發送過小的 chunk 導致前端處理效能崩潰或斷音
476
+ # 同時確保每次送出的 PCM 數據長度都是偶數 (16-bit = 2 bytes)
477
+ audio_buffer = bytearray()
478
+ MIN_CHUNK_SIZE = 4096 # 約 85ms 的音訊 @ 24kHz
479
+
480
  response_iter = await client.streaming_synthesize(requests=request_iter(), timeout=20.0)
481
  async for response in response_iter:
482
  chunk = getattr(response, "audio_content", b"")
483
  if chunk:
484
+ audio_buffer.extend(chunk)
485
+ # 當累積超過最低大小時送出,且確保送出長度為偶數
486
+ while len(audio_buffer) >= MIN_CHUNK_SIZE:
487
+ # 計算可送出的最大偶數長度
488
+ send_len = len(audio_buffer) - (len(audio_buffer) % 2)
489
+ if send_len == 0:
490
+ break
491
+
492
+ send_chunk = bytes(audio_buffer[:send_len])
493
+ audio_buffer = audio_buffer[send_len:]
494
+
495
+ total_chunks += 1
496
+ total_bytes += len(send_chunk)
497
+ yield send_chunk
498
+
499
+ # 處理剩餘的尾部資料
500
+ if len(audio_buffer) > 0:
501
+ # 確保長度為偶數
502
+ send_len = len(audio_buffer) - (len(audio_buffer) % 2)
503
+ if send_len > 0:
504
+ send_chunk = bytes(audio_buffer[:send_len])
505
  total_chunks += 1
506
+ total_bytes += len(send_chunk)
507
+ yield send_chunk
508
 
509
  logger.debug("✅ TTS 串流完成: total_chunks=%d, total_bytes=%d", total_chunks, total_bytes)
510