piper

Sleeping

App Files Files Community

Percy3822 committed on Oct 10, 2025

Commit

24e5167

verified ·

1 Parent(s): 7ece229

Update app.py

Browse files

Files changed (1) hide show

app.py +90 -27

app.py CHANGED Viewed

@@ -27,7 +27,7 @@ def pick_writable_dir(candidates):
             probe.unlink(missing_ok=True)
             return p
         except Exception as e:
-            errs.append(f"{p}: {type(e)._name_}({e})")
     raise RuntimeError("No writable dir. Tried:\n  " + "\n  ".join(errs))
 ENV_DIR = os.getenv("TTS_DATA_DIR")
@@ -79,6 +79,10 @@ STREAM_BATCH_MS  = int(os.getenv("STREAM_BATCH_MS", "100"))            # ~100 ms
 DEFAULT_CH = 1  # mono
 # Input clamp (basic DoS protection)
 MAX_TEXT_CHARS = int(os.getenv("MAX_TEXT_CHARS", "800"))
@@ -234,10 +238,13 @@ async def piper_stream_raw(
     length_scale: float,
     noise_scale: float,
     noise_w: float,
 ):
     """
-    Stream RAW PCM frames over WS at steady ~STREAM_BATCH_MS cadence.
-    Send stderr as 'log' events; signal 'done' at completion.
     """
     cmd = build_piper_cmd(text, voice, to_stdout=True,
                           length_scale=length_scale, noise_scale=noise_scale, noise_w=noise_w)
@@ -266,29 +273,72 @@ async def piper_stream_raw(
     stderr_task = asyncio.create_task(pump_stderr())
     total = 0
-    # framing
     bytes_per_ms = max(1, int(sr * channels * 2 / 1000))
-    frame_bytes = max(bytes_per_ms, int(STREAM_BATCH_MS * bytes_per_ms))
     buf = bytearray()
     try:
         while True:
-            chunk = await proc.stdout.read(4096)
-            if not chunk:
-                # flush remainder
-                if buf:
-                    await ws.send_bytes(bytes(buf))
-                    total += len(buf)
-                    buf.clear()
-                break
-            buf.extend(chunk)
-            # send in steady frames
-            while len(buf) >= frame_bytes:
-                await ws.send_bytes(buf[:frame_bytes])
-                total += frame_bytes
-                del buf[:frame_bytes]
         await proc.wait()
         await stderr_task
@@ -333,8 +383,8 @@ def health():
     # optional environment versions
     try:
         import numpy, onnxruntime as ort
-        numpy_version = numpy._version_
-        onnxruntime_version = ort._version_
     except Exception:
         numpy_version = onnxruntime_version = None
@@ -525,6 +575,9 @@ async def ws_tts(ws: WebSocket):
     voice = DEFAULT_VOICE
     length_scale, noise_scale, noise_w = 1.08, 0.35, 0.90
     voice_sr = 22050  # will be set from config on init
     try:
         while True:
@@ -535,7 +588,7 @@ async def ws_tts(ws: WebSocket):
                 continue
             ev = data.get("event")
             if ev == "init":
-                # optional shared-secret over WS: accept via querystring token or in 'token' field
                 token = (data.get("token") or "")
                 if AUTH_SHARED_SECRET and token != AUTH_SHARED_SECRET:
                     await ws.send_text(json.dumps({"event": "error", "detail": "unauthorized"}))
@@ -550,10 +603,17 @@ async def ws_tts(ws: WebSocket):
                 if "length_scale" not in data and "rate_wpm" in data:
                     try:
                         rate_wpm = int(data.get("rate_wpm", 165))
-                        # crude monotonic mapping: faster WPM → smaller length_scale
                         length_scale = max(0.70, min(1.40, 165.0 / max(100, rate_wpm)))
                     except Exception:
                         pass
                 try:
                     info = ensure_voice(voice)
                     voice_sr = int(info.get("sr", 22050))
@@ -571,7 +631,10 @@ async def ws_tts(ws: WebSocket):
                 if len(text) > MAX_TEXT_CHARS:
                     await ws.send_text(json.dumps({"event":"error","detail": f"text too long (>{MAX_TEXT_CHARS})"}))
                     continue
-                await piper_stream_raw(text, voice, ws, voice_sr, DEFAULT_CH, length_scale, noise_scale, noise_w)
             # ignore others
     except WebSocketDisconnect:
         return
@@ -586,4 +649,4 @@ async def ws_tts(ws: WebSocket):
             pass
 if __name__ == "__main__":
-    uvicorn.run(app, host="0.0.0.0", port=int(os.getenv("PORT", "7860")), reload=False)

             probe.unlink(missing_ok=True)
             return p
         except Exception as e:
+            errs.append(f"{p}: {type(e).__name__}({e})")
     raise RuntimeError("No writable dir. Tried:\n  " + "\n  ".join(errs))
 ENV_DIR = os.getenv("TTS_DATA_DIR")
 DEFAULT_CH = 1  # mono
+# NEW: Prebuffer to avoid pauses (synthesize immediately, start streaming later)
+PREBUFFER_MS = int(os.getenv("PREBUFFER_MS", "6000"))                  # ~6s buffer before first bytes
+PREBUFFER_MAX_WAIT_MS = int(os.getenv("PREBUFFER_MAX_WAIT_MS", "15000"))  # fail-safe cap
 # Input clamp (basic DoS protection)
 MAX_TEXT_CHARS = int(os.getenv("MAX_TEXT_CHARS", "800"))
     length_scale: float,
     noise_scale: float,
     noise_w: float,
+    prebuffer_ms: int,
+    prebuffer_max_wait_ms: int,
 ):
     """
+    Synthesize immediately; stream in *batched, clock-paced* frames:
+      - Accumulate audio until `prebuffer_ms` (or `prebuffer_max_wait_ms` elapses).
+      - Then send fixed batches of STREAM_BATCH_MS at a steady cadence.
     """
     cmd = build_piper_cmd(text, voice, to_stdout=True,
                           length_scale=length_scale, noise_scale=noise_scale, noise_w=noise_w)
     stderr_task = asyncio.create_task(pump_stderr())
     total = 0
+    # framing + pacing
     bytes_per_ms = max(1, int(sr * channels * 2 / 1000))
+    batch_bytes = max(bytes_per_ms, int(STREAM_BATCH_MS * bytes_per_ms))
+    target_prebuffer_bytes = max(0, int(prebuffer_ms) * bytes_per_ms)
     buf = bytearray()
+    started_streaming = False
+    first_audio_ts = None
+    pace_start_t = None
+    batches_sent = 0
     try:
         while True:
+            chunk = await proc.stdout.read(8192)
+            if chunk:
+                if first_audio_ts is None:
+                    first_audio_ts = time.time()
+                buf.extend(chunk)
+                # Flip to streaming once prebuffer is satisfied (or the fail-safe wait elapsed)
+                if not started_streaming:
+                    enough = (len(buf) >= target_prebuffer_bytes) if target_prebuffer_bytes > 0 else True
+                    waited = False
+                    if first_audio_ts is not None and prebuffer_max_wait_ms > 0:
+                        waited = ((time.time() - first_audio_ts) * 1000.0) >= prebuffer_max_wait_ms
+                    if enough or waited:
+                        started_streaming = True
+                        pace_start_t = time.time()
+                        batches_sent = 0
+                # If streaming, emit batches on schedule
+                if started_streaming:
+                    while len(buf) >= batch_bytes:
+                        due_t = pace_start_t + (batches_sent * STREAM_BATCH_MS) / 1000.0
+                        sleep_s = due_t - time.time()
+                        if sleep_s > 0:
+                            await asyncio.sleep(sleep_s)
+                        await ws.send_bytes(buf[:batch_bytes])
+                        del buf[:batch_bytes]
+                        total += batch_bytes
+                        batches_sent += 1
+                continue
+            # EOF from Piper
+            if not started_streaming and len(buf) > 0:
+                started_streaming = True
+                pace_start_t = time.time()
+                batches_sent = 0
+            # Flush any remaining bytes in paced batches
+            while len(buf) >= batch_bytes:
+                due_t = pace_start_t + (batches_sent * STREAM_BATCH_MS) / 1000.0
+                sleep_s = due_t - time.time()
+                if sleep_s > 0:
+                    await asyncio.sleep(sleep_s)
+                await ws.send_bytes(buf[:batch_bytes])
+                del buf[:batch_bytes]
+                total += batch_bytes
+                batches_sent += 1
+            # Send any tail (less than a full batch) now
+            if len(buf) > 0:
+                await ws.send_bytes(bytes(buf))
+                total += len(buf)
+                buf.clear()
+            break  # done reading
         await proc.wait()
         await stderr_task
     # optional environment versions
     try:
         import numpy, onnxruntime as ort
+        numpy_version = numpy.__version__
+        onnxruntime_version = ort.__version__
     except Exception:
         numpy_version = onnxruntime_version = None
     voice = DEFAULT_VOICE
     length_scale, noise_scale, noise_w = 1.08, 0.35, 0.90
     voice_sr = 22050  # will be set from config on init
+    # NEW: defaults (client can override in init)
+    prebuffer_ms = PREBUFFER_MS
+    prebuffer_max_wait_ms = PREBUFFER_MAX_WAIT_MS
     try:
         while True:
                 continue
             ev = data.get("event")
             if ev == "init":
+                # optional shared-secret over WS: accept via token field
                 token = (data.get("token") or "")
                 if AUTH_SHARED_SECRET and token != AUTH_SHARED_SECRET:
                     await ws.send_text(json.dumps({"event": "error", "detail": "unauthorized"}))
                 if "length_scale" not in data and "rate_wpm" in data:
                     try:
                         rate_wpm = int(data.get("rate_wpm", 165))
                         length_scale = max(0.70, min(1.40, 165.0 / max(100, rate_wpm)))
                     except Exception:
                         pass
+                # NEW: allow client override for prebuffer knobs
+                if "prebuffer_ms" in data:
+                    try: prebuffer_ms = max(0, int(data["prebuffer_ms"]))
+                    except Exception: pass
+                if "prebuffer_max_wait_ms" in data:
+                    try: prebuffer_max_wait_ms = max(0, int(data["prebuffer_max_wait_ms"]))
+                    except Exception: pass
                 try:
                     info = ensure_voice(voice)
                     voice_sr = int(info.get("sr", 22050))
                 if len(text) > MAX_TEXT_CHARS:
                     await ws.send_text(json.dumps({"event":"error","detail": f"text too long (>{MAX_TEXT_CHARS})"}))
                     continue
+                await piper_stream_raw(
+                    text, voice, ws, voice_sr, DEFAULT_CH, length_scale, noise_scale, noise_w,
+                    prebuffer_ms, prebuffer_max_wait_ms
+                )
             # ignore others
     except WebSocketDisconnect:
         return
             pass
 if __name__ == "__main__":
+    uvicorn.run(app, host="0.0.0.0", port=int(os.getenv("PORT", "7860")), reload=False)