Spaces:
Running
Running
Commit ·
313bef8
1
Parent(s): 190de1a
Fix TTS language detection for VI/ID & stabilize TTS streaming buffer
Browse files- app.py +29 -4
- requirements.txt +1 -0
- services/tts_service.py +28 -3
app.py
CHANGED
|
@@ -135,15 +135,42 @@ def _normalize_bcp47_language_tag(tag: Optional[str]) -> Optional[str]:
|
|
| 135 |
|
| 136 |
|
| 137 |
def _preferred_language_from_text(text: str) -> Optional[str]:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 138 |
script_counts: Dict[str, int] = {}
|
| 139 |
-
for ch in
|
| 140 |
if ch.isspace():
|
| 141 |
continue
|
| 142 |
try:
|
| 143 |
name = unicodedata.name(ch)
|
| 144 |
except ValueError:
|
| 145 |
continue
|
| 146 |
-
for script in ("HIRAGANA", "KATAKANA", "HANGUL", "CJK UNIFIED IDEOGRAPH", "
|
| 147 |
if script in name:
|
| 148 |
script_counts[script] = script_counts.get(script, 0) + 1
|
| 149 |
break
|
|
@@ -156,8 +183,6 @@ def _preferred_language_from_text(text: str) -> Optional[str]:
|
|
| 156 |
return "th-TH"
|
| 157 |
if script_counts.get("CYRILLIC", 0):
|
| 158 |
return "ru-RU"
|
| 159 |
-
if script_counts.get("LATIN", 0) and not script_counts.get("CJK UNIFIED IDEOGRAPH", 0):
|
| 160 |
-
return "en-US"
|
| 161 |
if script_counts.get("CJK UNIFIED IDEOGRAPH", 0):
|
| 162 |
return "zh-TW"
|
| 163 |
return None
|
|
|
|
| 135 |
|
| 136 |
|
| 137 |
def _preferred_language_from_text(text: str) -> Optional[str]:
|
| 138 |
+
text = str(text or "").strip()
|
| 139 |
+
if not text:
|
| 140 |
+
return None
|
| 141 |
+
|
| 142 |
+
try:
|
| 143 |
+
from langdetect import detect
|
| 144 |
+
lang = detect(text)
|
| 145 |
+
|
| 146 |
+
lang_map = {
|
| 147 |
+
"zh-cn": "zh-TW",
|
| 148 |
+
"zh-tw": "zh-TW",
|
| 149 |
+
"en": "en-US",
|
| 150 |
+
"ja": "ja-JP",
|
| 151 |
+
"ko": "ko-KR",
|
| 152 |
+
"th": "th-TH",
|
| 153 |
+
"vi": "vi-VN",
|
| 154 |
+
"id": "id-ID",
|
| 155 |
+
"ru": "ru-RU",
|
| 156 |
+
"es": "es-ES",
|
| 157 |
+
"fr": "fr-FR",
|
| 158 |
+
"de": "de-DE"
|
| 159 |
+
}
|
| 160 |
+
if lang in lang_map:
|
| 161 |
+
return lang_map[lang]
|
| 162 |
+
except Exception:
|
| 163 |
+
pass
|
| 164 |
+
|
| 165 |
script_counts: Dict[str, int] = {}
|
| 166 |
+
for ch in text:
|
| 167 |
if ch.isspace():
|
| 168 |
continue
|
| 169 |
try:
|
| 170 |
name = unicodedata.name(ch)
|
| 171 |
except ValueError:
|
| 172 |
continue
|
| 173 |
+
for script in ("HIRAGANA", "KATAKANA", "HANGUL", "CJK UNIFIED IDEOGRAPH", "CYRILLIC", "THAI"):
|
| 174 |
if script in name:
|
| 175 |
script_counts[script] = script_counts.get(script, 0) + 1
|
| 176 |
break
|
|
|
|
| 183 |
return "th-TH"
|
| 184 |
if script_counts.get("CYRILLIC", 0):
|
| 185 |
return "ru-RU"
|
|
|
|
|
|
|
| 186 |
if script_counts.get("CJK UNIFIED IDEOGRAPH", 0):
|
| 187 |
return "zh-TW"
|
| 188 |
return None
|
requirements.txt
CHANGED
|
@@ -26,6 +26,7 @@ jsonschema>=4.17.0
|
|
| 26 |
|
| 27 |
# Geospatial / directions
|
| 28 |
geohash2
|
|
|
|
| 29 |
|
| 30 |
# Machine Learning dependencies
|
| 31 |
numpy>=1.24.0,<2.0.0
|
|
|
|
| 26 |
|
| 27 |
# Geospatial / directions
|
| 28 |
geohash2
|
| 29 |
+
langdetect
|
| 30 |
|
| 31 |
# Machine Learning dependencies
|
| 32 |
numpy>=1.24.0,<2.0.0
|
services/tts_service.py
CHANGED
|
@@ -472,14 +472,39 @@ class TTSService:
|
|
| 472 |
total_chunks = 0
|
| 473 |
total_bytes = 0
|
| 474 |
|
| 475 |
-
# 🎯 設
|
|
|
|
|
|
|
|
|
|
|
|
|
| 476 |
response_iter = await client.streaming_synthesize(requests=request_iter(), timeout=20.0)
|
| 477 |
async for response in response_iter:
|
| 478 |
chunk = getattr(response, "audio_content", b"")
|
| 479 |
if chunk:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 480 |
total_chunks += 1
|
| 481 |
-
total_bytes += len(
|
| 482 |
-
yield
|
| 483 |
|
| 484 |
logger.debug("✅ TTS 串流完成: total_chunks=%d, total_bytes=%d", total_chunks, total_bytes)
|
| 485 |
|
|
|
|
| 472 |
total_chunks = 0
|
| 473 |
total_bytes = 0
|
| 474 |
|
| 475 |
+
# 🎯 緩衝區設計:避免發送過小的 chunk 導致前端處理效能崩潰或斷音
|
| 476 |
+
# 同時確保每次送出的 PCM 數據長度都是偶數 (16-bit = 2 bytes)
|
| 477 |
+
audio_buffer = bytearray()
|
| 478 |
+
MIN_CHUNK_SIZE = 4096 # 約 85ms 的音訊 @ 24kHz
|
| 479 |
+
|
| 480 |
response_iter = await client.streaming_synthesize(requests=request_iter(), timeout=20.0)
|
| 481 |
async for response in response_iter:
|
| 482 |
chunk = getattr(response, "audio_content", b"")
|
| 483 |
if chunk:
|
| 484 |
+
audio_buffer.extend(chunk)
|
| 485 |
+
# 當累積超過最低大小時送出,且確保送出長度為偶數
|
| 486 |
+
while len(audio_buffer) >= MIN_CHUNK_SIZE:
|
| 487 |
+
# 計算可送出的最大偶數長度
|
| 488 |
+
send_len = len(audio_buffer) - (len(audio_buffer) % 2)
|
| 489 |
+
if send_len == 0:
|
| 490 |
+
break
|
| 491 |
+
|
| 492 |
+
send_chunk = bytes(audio_buffer[:send_len])
|
| 493 |
+
audio_buffer = audio_buffer[send_len:]
|
| 494 |
+
|
| 495 |
+
total_chunks += 1
|
| 496 |
+
total_bytes += len(send_chunk)
|
| 497 |
+
yield send_chunk
|
| 498 |
+
|
| 499 |
+
# 處理剩餘的尾部資料
|
| 500 |
+
if len(audio_buffer) > 0:
|
| 501 |
+
# 確保長度為偶數
|
| 502 |
+
send_len = len(audio_buffer) - (len(audio_buffer) % 2)
|
| 503 |
+
if send_len > 0:
|
| 504 |
+
send_chunk = bytes(audio_buffer[:send_len])
|
| 505 |
total_chunks += 1
|
| 506 |
+
total_bytes += len(send_chunk)
|
| 507 |
+
yield send_chunk
|
| 508 |
|
| 509 |
logger.debug("✅ TTS 串流完成: total_chunks=%d, total_bytes=%d", total_chunks, total_bytes)
|
| 510 |
|