Spaces:

XiaoBai1221
/

Bloom_Ware

Running

App Files Community

XiaoBai1221 committed 19 days ago

Commit

313bef8

1 Parent(s): 190de1a

Fix TTS language detection for VI/ID & stabilize TTS streaming buffer

Browse files

Files changed (3) hide show

app.py +29 -4
requirements.txt +1 -0
services/tts_service.py +28 -3

app.py CHANGED Viewed

@@ -135,15 +135,42 @@ def _normalize_bcp47_language_tag(tag: Optional[str]) -> Optional[str]:
 def _preferred_language_from_text(text: str) -> Optional[str]:
     script_counts: Dict[str, int] = {}
-    for ch in str(text or ""):
         if ch.isspace():
             continue
         try:
             name = unicodedata.name(ch)
         except ValueError:
             continue
-        for script in ("HIRAGANA", "KATAKANA", "HANGUL", "CJK UNIFIED IDEOGRAPH", "LATIN", "CYRILLIC", "THAI"):
             if script in name:
                 script_counts[script] = script_counts.get(script, 0) + 1
                 break
@@ -156,8 +183,6 @@ def _preferred_language_from_text(text: str) -> Optional[str]:
         return "th-TH"
     if script_counts.get("CYRILLIC", 0):
         return "ru-RU"
-    if script_counts.get("LATIN", 0) and not script_counts.get("CJK UNIFIED IDEOGRAPH", 0):
-        return "en-US"
     if script_counts.get("CJK UNIFIED IDEOGRAPH", 0):
         return "zh-TW"
     return None

 def _preferred_language_from_text(text: str) -> Optional[str]:
+    text = str(text or "").strip()
+    if not text:
+        return None
+    try:
+        from langdetect import detect
+        lang = detect(text)
+        lang_map = {
+            "zh-cn": "zh-TW",
+            "zh-tw": "zh-TW",
+            "en": "en-US",
+            "ja": "ja-JP",
+            "ko": "ko-KR",
+            "th": "th-TH",
+            "vi": "vi-VN",
+            "id": "id-ID",
+            "ru": "ru-RU",
+            "es": "es-ES",
+            "fr": "fr-FR",
+            "de": "de-DE"
+        }
+        if lang in lang_map:
+            return lang_map[lang]
+    except Exception:
+        pass
     script_counts: Dict[str, int] = {}
+    for ch in text:
         if ch.isspace():
             continue
         try:
             name = unicodedata.name(ch)
         except ValueError:
             continue
+        for script in ("HIRAGANA", "KATAKANA", "HANGUL", "CJK UNIFIED IDEOGRAPH", "CYRILLIC", "THAI"):
             if script in name:
                 script_counts[script] = script_counts.get(script, 0) + 1
                 break
         return "th-TH"
     if script_counts.get("CYRILLIC", 0):
         return "ru-RU"
     if script_counts.get("CJK UNIFIED IDEOGRAPH", 0):
         return "zh-TW"
     return None

requirements.txt CHANGED Viewed

@@ -26,6 +26,7 @@ jsonschema>=4.17.0
 # Geospatial / directions
 geohash2
 # Machine Learning dependencies
 numpy>=1.24.0,<2.0.0

 # Geospatial / directions
 geohash2
+langdetect
 # Machine Learning dependencies
 numpy>=1.24.0,<2.0.0

services/tts_service.py CHANGED Viewed

@@ -472,14 +472,39 @@ class TTSService:
             total_chunks = 0
             total_bytes = 0
-            # 🎯 設定超時時間，避免無限等待
             response_iter = await client.streaming_synthesize(requests=request_iter(), timeout=20.0)
             async for response in response_iter:
                 chunk = getattr(response, "audio_content", b"")
                 if chunk:
                     total_chunks += 1
-                    total_bytes += len(chunk)
-                    yield bytes(chunk)
             logger.debug("✅ TTS 串流完成: total_chunks=%d, total_bytes=%d", total_chunks, total_bytes)

             total_chunks = 0
             total_bytes = 0
+            # 🎯 緩衝區設計：避免發送過小的 chunk 導致前端處理效能崩潰或斷音
+            # 同時確保每次送出的 PCM 數據長度都是偶數 (16-bit = 2 bytes)
+            audio_buffer = bytearray()
+            MIN_CHUNK_SIZE = 4096  # 約 85ms 的音訊 @ 24kHz
             response_iter = await client.streaming_synthesize(requests=request_iter(), timeout=20.0)
             async for response in response_iter:
                 chunk = getattr(response, "audio_content", b"")
                 if chunk:
+                    audio_buffer.extend(chunk)
+                    # 當累積超過最低大小時送出，且確保送出長度為偶數
+                    while len(audio_buffer) >= MIN_CHUNK_SIZE:
+                        # 計算可送出的最大偶數長度
+                        send_len = len(audio_buffer) - (len(audio_buffer) % 2)
+                        if send_len == 0:
+                            break
+                        send_chunk = bytes(audio_buffer[:send_len])
+                        audio_buffer = audio_buffer[send_len:]
+                        total_chunks += 1
+                        total_bytes += len(send_chunk)
+                        yield send_chunk
+            # 處理剩餘的尾部資料
+            if len(audio_buffer) > 0:
+                # 確保長度為偶數
+                send_len = len(audio_buffer) - (len(audio_buffer) % 2)
+                if send_len > 0:
+                    send_chunk = bytes(audio_buffer[:send_len])
                     total_chunks += 1
+                    total_bytes += len(send_chunk)
+                    yield send_chunk
             logger.debug("✅ TTS 串流完成: total_chunks=%d, total_bytes=%d", total_chunks, total_bytes)