Percy3822 committed on
Commit
a5fe67c
·
verified ·
1 Parent(s): 06dd823

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +197 -203
app.py CHANGED
@@ -1,239 +1,232 @@
1
- import os
2
- import json
3
- import time
4
- import wave
5
- from pathlib import Path
6
- from typing import Optional, Dict, Any
7
-
8
- import uvicorn
9
- import requests
10
- from fastapi import FastAPI, WebSocket, WebSocketDisconnect, Request
11
- from fastapi.responses import JSONResponse
12
- from fastapi.staticfiles import StaticFiles
13
-
14
- # Piper (CPU TTS)
15
- from piper.voice import PiperVoice
16
-
17
- # ------------ Config ------------
18
- BASE_DIR = Path(os.getenv("BASE_DIR", "/tmp/brain_app")).resolve()
19
- FILES_DIR = (BASE_DIR / "files").resolve()
20
- FILES_DIR.mkdir(parents=True, exist_ok=True)
21
 
22
- VOICE_DIR = Path(os.getenv("VOICE_DIR", "/home/user/voices")).resolve()
 
 
 
 
 
 
 
 
 
23
  VOICE_DIR.mkdir(parents=True, exist_ok=True)
24
 
25
- DEFAULT_VOICE = os.getenv("DEFAULT_VOICE", "en_US-lessac-high")
26
- DEFAULT_SR = 22050
27
- DEFAULT_CHANNELS = 1
28
-
29
- # Hugging Face Piper voice (Lessac high quality)
30
- PIPER_HF_BASE = (
31
- "https://huggingface.co/rhasspy/piper-voices/resolve/v1.0.0/"
32
- "en/en_US/lessac/high"
 
 
 
 
 
33
  )
34
 
35
- app = FastAPI(title="ActualTTS (CPU Piper, streaming)")
36
- app.mount("/file", StaticFiles(directory=str(FILES_DIR)), name="file")
37
-
38
- # ------------ Voice Loader ------------
39
-
40
- _loaded_voices: Dict[str, PiperVoice] = {}
41
-
42
- def list_voices() -> list[str]:
43
- return sorted([p.stem for p in VOICE_DIR.glob("*.onnx")])
44
-
45
- def ensure_voice(voice: str) -> tuple[Path, Path]:
46
  """
47
- Ensure .onnx and .onnx.json for 'voice' exist. If not and voice is
48
- en_US-lessac-high, download them automatically.
 
 
49
  """
50
- onnx = VOICE_DIR / f"{voice}.onnx"
51
- cfg = VOICE_DIR / f"{voice}.onnx.json"
52
-
53
  if onnx.exists() and cfg.exists():
54
  return onnx, cfg
55
 
56
- if voice == "en_US-lessac-high":
57
- files = {
58
- onnx: f"{PIPER_HF_BASE}/en_US-lessac-high.onnx",
59
- cfg: f"{PIPER_HF_BASE}/en_US-lessac-high.onnx.json",
60
- }
61
- for path, url in files.items():
62
- r = requests.get(url, timeout=180)
 
63
  r.raise_for_status()
64
- path.write_bytes(r.content)
65
- return onnx, cfg
66
-
67
- raise FileNotFoundError(f"Voice '{voice}' not found in {VOICE_DIR}")
68
-
69
- def get_voice(voice: str) -> PiperVoice:
70
- if voice in _loaded_voices:
71
- return _loaded_voices[voice]
72
- onnx, cfg = ensure_voice(voice)
73
- v = PiperVoice.load(str(onnx), config_path=str(cfg))
74
- _loaded_voices[voice] = v
75
- return v
76
 
77
- # ------------ Helpers ------------
78
-
79
- def prosody(body: Dict[str, Any]) -> dict:
 
80
  """
81
- Extract prosody params with natural defaults.
 
82
  """
83
- return {
84
- "length_scale": float(body.get("length_scale", 1.12)), # a bit slower
85
- "noise_scale": float(body.get("noise_scale", 0.33)), # reduce buzz
86
- "noise_w": float(body.get("noise_w", 0.8)), # stabilize
87
- "sentence_silence": float(body.get("sentence_pause", 0.18)),
88
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
89
 
90
- def write_wav_int16(path: Path, sr: int, pcm: bytes, channels: int = 1):
91
- with wave.open(str(path), "wb") as wf:
92
- wf.setnchannels(channels)
93
- wf.setsampwidth(2) # int16
94
- wf.setframerate(sr)
95
- wf.writeframes(pcm)
96
 
97
- # ------------ Routes ------------
 
 
 
 
 
98
 
99
  @app.get("/health")
100
  def health():
 
 
 
 
 
 
101
  return {
102
  "ok": True,
103
- "engine": "piper-tts (CPU)",
104
- "default_voice": DEFAULT_VOICE,
105
  "voice_dir": str(VOICE_DIR),
106
- "available_voices": list_voices(),
107
- "files_dir": str(FILES_DIR),
108
- "tip": "Use WS /ws/tts for streaming or POST /speak for one-shot",
109
  }
110
 
111
  @app.post("/speak")
112
- async def speak(request: Request):
 
 
 
 
 
 
113
  """
114
- JSON body:
115
- {
116
- "text": "Hello world",
117
- "voice": "en_US-lessac-high",
118
- "rate_wpm": 165,
119
- "length_scale": 1.12, "noise_scale": 0.33, "noise_w": 0.8, "sentence_pause": 0.18
120
- }
121
- Returns: { ok, audio_url, sr, channels }
122
  """
123
- try:
124
- body = await request.json()
125
- except Exception:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
126
  return JSONResponse(
127
- status_code=400,
128
- content={"ok": False, "error": "Invalid JSON body"}
129
  )
130
 
131
- text: str = str(body.get("text", "")).strip()
132
- if not text:
133
- return JSONResponse(status_code=400, content={"ok": False, "error": "Missing text"})
134
-
135
- voice = body.get("voice", DEFAULT_VOICE)
136
- try:
137
- voice_obj = get_voice(voice)
138
- except Exception as e:
139
- return JSONResponse(status_code=400, content={
140
- "ok": False,
141
- "error": f"Voice '{voice}' not found. Available: {list_voices()}"
142
- })
143
-
144
- # Piper ignores WPM internally; we simulate tempo via length_scale default.
145
- p = prosody(body)
146
- sr = int(getattr(voice_obj, "sample_rate_hz", DEFAULT_SR))
147
- ch = DEFAULT_CHANNELS
148
-
149
- # Synthesize to PCM buffer
150
- pcm_chunks: list[bytes] = []
151
- for chunk in voice_obj.synthesize_stream(
152
- text,
153
- length_scale=p["length_scale"],
154
- noise_scale=p["noise_scale"],
155
- noise_w=p["noise_w"],
156
- sentence_silence=p["sentence_silence"],
157
- ):
158
- if isinstance(chunk, (bytes, bytearray)):
159
- pcm_chunks.append(bytes(chunk))
160
-
161
- # small tail-silence to avoid cut offs
162
- tail_ms = int(0.22 * 1000)
163
- tail_frames = int(sr * (tail_ms / 1000.0))
164
- pcm_chunks.append(b"\x00" * (tail_frames * ch * 2))
165
-
166
- pcm = b"".join(pcm_chunks)
167
- fname = f"tts-{int(time.time()*1000)}.wav"
168
- fpath = FILES_DIR / fname
169
- write_wav_int16(fpath, sr, pcm, channels=ch)
170
-
171
- return {
172
- "ok": True,
173
- "audio_url": f"/file/{fname}",
174
- "sr": sr,
175
- "channels": ch,
176
- }
177
 
178
  @app.websocket("/ws/tts")
179
  async def ws_tts(ws: WebSocket):
180
- """
181
- Protocol:
182
- <- {"event":"init","voice":"en_US-lessac-high"}
183
- <- {"event":"speak","text":"Hello there...","length_scale":1.12,...}
184
- -> {"event":"ready","sr":22050,"channels":1}
185
- -> <binary PCM16> ... many frames ...
186
- -> {"event":"done"}
187
- """
188
  await ws.accept()
189
- voice_name = DEFAULT_VOICE
190
- voice_obj: Optional[PiperVoice] = None
191
- sr = DEFAULT_SR
192
- channels = DEFAULT_CHANNELS
193
 
194
  try:
195
- while True:
196
- raw = await ws.receive_text()
197
- msg = json.loads(raw)
198
-
199
- if msg.get("event") == "init":
200
- voice_name = msg.get("voice", DEFAULT_VOICE)
201
- voice_obj = get_voice(voice_name)
202
- sr = int(getattr(voice_obj, "sample_rate_hz", DEFAULT_SR))
203
- await ws.send_text(json.dumps({"event": "ready", "sr": sr, "channels": channels}))
204
-
205
- elif msg.get("event") == "speak":
206
- if not voice_obj:
207
- voice_obj = get_voice(voice_name)
208
- sr = int(getattr(voice_obj, "sample_rate_hz", DEFAULT_SR))
209
- await ws.send_text(json.dumps({"event": "ready", "sr": sr, "channels": channels}))
210
-
211
- text = str(msg.get("text", "")).strip()
212
- if not text:
213
- await ws.send_text(json.dumps({"event": "error", "detail": "Missing text"}))
214
- continue
215
-
216
- p = prosody(msg)
217
-
218
- # Stream PCM16 chunks as binary frames
219
- for chunk in voice_obj.synthesize_stream(
220
- text,
221
- length_scale=p["length_scale"],
222
- noise_scale=p["noise_scale"],
223
- noise_w=p["noise_w"],
224
- sentence_silence=p["sentence_silence"],
225
- ):
226
- if isinstance(chunk, (bytes, bytearray)):
227
- await ws.send_bytes(bytes(chunk))
228
-
229
- # tail-silence 220 ms
230
- tail_frames = int(sr * 0.22)
231
- await ws.send_bytes(b"\x00" * (tail_frames * channels * 2))
232
-
233
- await ws.send_text(json.dumps({"event": "done"}))
234
-
235
- else:
236
- await ws.send_text(json.dumps({"event": "error", "detail": "Unknown event"}))
237
 
238
  except WebSocketDisconnect:
239
  pass
@@ -242,6 +235,7 @@ async def ws_tts(ws: WebSocket):
242
  await ws.send_text(json.dumps({"event": "error", "detail": str(e)}))
243
  except Exception:
244
  pass
245
-
246
- if __name__ == "__main-_":
247
- uvicorn.run("app:app", host="0.0.0.0", port=7860, reload=False)
 
 
1
import asyncio
import json
import os
import pathlib
import re
import shutil
import subprocess
import tempfile
import time
import uuid
from typing import Optional

from fastapi import FastAPI, WebSocket, WebSocketDisconnect, Body
from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import JSONResponse
from fastapi.staticfiles import StaticFiles
7
+
8
# ---------- CONFIG ----------
BASE_DIR = pathlib.Path(os.environ.get("FILES_DIR", "/tmp/tts_app/files"))
VOICE_DIR = pathlib.Path(os.environ.get("VOICE_DIR", "/home/user/voices"))
DEFAULT_VOICE = os.environ.get("DEFAULT_VOICE", "en_US-libritts-high")
PIPER_BIN = os.environ.get("PIPER_BIN", "piper")  # ships with the piper-tts wheel
BASE_DIR.mkdir(parents=True, exist_ok=True)
VOICE_DIR.mkdir(parents=True, exist_ok=True)

# Rhasspy Piper voice hub (static URLs; no API key needed).
# Exactly two files are lazy-downloaded per voice: <stem>.onnx + <stem>.onnx.json
VOICE_INDEX = {
    "en_US-libritts-high": "https://huggingface.co/rhasspy/piper-voices/resolve/v1.0.0/en/en_US/libritts/high/en_US-libritts-high",
    "en_US-lessac-high": "https://huggingface.co/rhasspy/piper-voices/resolve/v1.0.0/en/en_US/lessac/high/en_US-lessac-high",
}

# Safe defaults for naturalness (a bit slower/clearer).
DEFAULT_OPTS = dict(
    sentence_pause=0.18,  # seconds between sentences
    length_scale=1.12,    # pacing
    noise_scale=0.33,     # prosody randomness
    noise_w=0.8,          # breathiness
)
30
 
31
# ---------- UTIL ----------
async def ensure_voice(voice_name: str) -> tuple[pathlib.Path, pathlib.Path]:
    """
    Ensure the required voice files exist locally:
        VOICE_DIR/<name>.onnx
        VOICE_DIR/<name>.onnx.json

    Unknown voice names fall back to DEFAULT_VOICE's download URLs
    (the files are still saved under the requested name).
    Returns (onnx_path, cfg_path).
    Raises requests.HTTPError if a download fails.
    """
    onnx = VOICE_DIR / f"{voice_name}.onnx"
    cfg = VOICE_DIR / f"{voice_name}.onnx.json"
    if onnx.exists() and cfg.exists():
        return onnx, cfg

    # Pick the right base URL from VOICE_INDEX; fall back to the default voice.
    base = VOICE_INDEX.get(voice_name) or VOICE_INDEX[DEFAULT_VOICE]
    import requests
    for ext in (".onnx", ".onnx.json"):
        url = base + ext
        dst = VOICE_DIR / f"{voice_name}{ext}"
        if not dst.exists():
            # requests is blocking; run it in a worker thread so this
            # coroutine does not stall the event loop during the download.
            r = await asyncio.to_thread(requests.get, url, timeout=120)
            r.raise_for_status()
            dst.write_bytes(r.content)
    return onnx, cfg
 
 
 
 
 
 
 
 
 
 
56
 
57
def _piper_cmd(onnx_path: pathlib.Path, cfg_path: pathlib.Path,
               out_wav: Optional[pathlib.Path] = None,
               raw: bool = False,
               opts: Optional[dict] = None) -> list[str]:
    """
    Build the Piper CLI command.

    If raw=True, Piper writes 16-bit PCM to stdout (--output-raw);
    otherwise it writes a WAV file to --output_file (out_wav required).
    Recognized opts keys: length_scale, noise_scale, noise_w — forwarded
    only when explicitly present.

    Raises ValueError if raw=False and out_wav is None.
    """
    cmd = [PIPER_BIN, "--model", str(onnx_path), "--config", str(cfg_path)]
    opts = opts or {}
    # Pacing / quality knobs; only pass what the caller provided.
    if "length_scale" in opts:
        cmd += ["--length_scale", str(opts["length_scale"])]
    if "noise_scale" in opts:
        cmd += ["--noise_scale", str(opts["noise_scale"])]
    if "noise_w" in opts:
        cmd += ["--noise_w", str(opts["noise_w"])]

    if raw:
        cmd += ["--output-raw"]
    else:
        # An assert would be stripped under `python -O`; fail loudly instead.
        if out_wav is None:
            raise ValueError("out_wav is required when raw=False")
        cmd += ["--output_file", str(out_wav)]

    # Piper auto-splits sentences; extra pauses are handled client-side.
    return cmd
80
+
81
async def run_piper_stream(text: str, onnx: pathlib.Path, cfg: pathlib.Path, ws: WebSocket, opts: dict) -> None:
    """
    Stream raw PCM16 from Piper's stdout to the websocket as binary frames.

    Sends a JSON {"event":"ready","sr":...,"channels":1} first, then binary
    PCM chunks, then {"event":"done"}.
    """
    # Read the actual sample rate from the voice's .onnx.json config instead
    # of hard-coding 22050; fall back to 22050 (the common rate for US
    # voices) if the config is missing or malformed.
    try:
        sr = int(json.loads(cfg.read_text())["audio"]["sample_rate"])
    except Exception:
        sr = 22050
    await ws.send_text(json.dumps({"event": "ready", "sr": sr, "channels": 1}))

    # Start Piper; it reads the text from stdin and emits raw PCM on stdout.
    cmd = _piper_cmd(onnx, cfg, raw=True, opts=opts)
    proc = await asyncio.create_subprocess_exec(
        *cmd,
        stdin=asyncio.subprocess.PIPE,
        stdout=asyncio.subprocess.PIPE,
        stderr=asyncio.subprocess.PIPE,
    )

    # Write the text then close stdin so Piper starts synthesizing.
    proc.stdin.write(text.encode("utf-8"))
    await proc.stdin.drain()
    proc.stdin.close()

    # Forward stdout chunks to the client as binary frames.
    try:
        while True:
            chunk = await proc.stdout.read(8192)
            if not chunk:
                break
            await ws.send_bytes(chunk)
    finally:
        # Drain any remaining stderr (optional debugging aid).
        try:
            _ = await asyncio.wait_for(proc.stderr.read(), timeout=0.1)
        except asyncio.TimeoutError:
            pass

    await proc.wait()
    await ws.send_text(json.dumps({"event": "done"}))
 
 
 
 
121
 
122
# ---------- APP ----------
app = FastAPI()
app.add_middleware(
    CORSMiddleware,
    # Wildcards must be "*" — an empty string matches no origin/method at
    # all, which would reject every cross-origin request.
    allow_origins=["*"], allow_credentials=True, allow_methods=["*"], allow_headers=["*"],
)
# /speak returns URLs of the form /file/<name>.wav; serve those files here.
app.mount("/file", StaticFiles(directory=str(BASE_DIR)), name="file")
128
 
129
@app.get("/health")
def health():
    """Report engine status and which known voices are fully downloaded."""
    downloaded = [
        name
        for name in VOICE_INDEX
        if (VOICE_DIR / f"{name}.onnx").exists()
        and (VOICE_DIR / f"{name}.onnx.json").exists()
    ]
    return {
        "ok": True,
        "engine": "piper-tts (CLI, CPU)",
        "default_voice": DEFAULT_VOICE if (VOICE_DIR / f"{DEFAULT_VOICE}.onnx").exists() else None,
        "voice_dir": str(VOICE_DIR),
        "available_voices": downloaded,
        "files_dir": str(BASE_DIR),
    }
145
 
146
@app.post("/speak")
async def speak(
    text: str = Body(...),
    voice: Optional[str] = Body(None),
    length_scale: Optional[float] = Body(None),
    noise_scale: Optional[float] = Body(None),
    noise_w: Optional[float] = Body(None),
):
    """
    One-shot synthesis to a WAV file.

    Returns {"ok": True, "audio_url": "/file/<name>.wav"} on success, or a
    500 JSON error carrying Piper's stderr (truncated) on failure.
    """
    vname = voice or DEFAULT_VOICE
    # BUG FIX: ensure_voice is a coroutine function, so the previous
    # `await asyncio.to_thread(ensure_voice, vname)` produced an un-awaited
    # coroutine object and the tuple unpack raised TypeError.
    onnx, cfg = await ensure_voice(vname)

    # Merge user overrides onto the naturalness defaults.
    opts = dict(DEFAULT_OPTS)
    if length_scale is not None:
        opts["length_scale"] = length_scale
    if noise_scale is not None:
        opts["noise_scale"] = noise_scale
    if noise_w is not None:
        opts["noise_w"] = noise_w

    # Millisecond timestamp keeps file names unique enough for this service.
    BASE_DIR.mkdir(parents=True, exist_ok=True)
    out_wav = BASE_DIR / f"tts-{int(time.time()*1000)}.wav"

    # Run Piper to a file (non-streaming), feeding the text via stdin.
    cmd = _piper_cmd(onnx, cfg, out_wav=out_wav, raw=False, opts=opts)
    proc = await asyncio.create_subprocess_exec(
        *cmd, stdin=asyncio.subprocess.PIPE, stdout=asyncio.subprocess.DEVNULL, stderr=asyncio.subprocess.PIPE
    )
    proc.stdin.write(text.encode("utf-8"))
    await proc.stdin.drain()
    proc.stdin.close()

    stderr_txt = (await proc.stderr.read()).decode("utf-8", "ignore")
    rc = await proc.wait()
    if rc != 0:
        return JSONResponse(
            status_code=500,
            content={"ok": False, "error": "Piper synthesis failed", "detail": stderr_txt[:4000]},
        )

    return {"ok": True, "audio_url": f"/file/{out_wav.name}"}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
188
 
189
@app.websocket("/ws/tts")
async def ws_tts(ws: WebSocket):
    """
    Streaming synthesis over a websocket.

    Protocol (init and speak may be sent back-to-back):
        <- {"event":"init","voice":..., "length_scale":..., "noise_scale":..., "noise_w":...}
        <- {"event":"speak","text":"..."}
        -> {"event":"ready","sr":...,"channels":1}
        -> <binary PCM16 frames>
        -> {"event":"done"}
    Protocol violations close the socket with code 1002; internal errors
    report an "error" event and close with 1011.
    """
    await ws.accept()
    voice = DEFAULT_VOICE
    opts = dict(DEFAULT_OPTS)

    try:
        # 1) First message must be init.
        init = json.loads(await ws.receive_text())
        if init.get("event") != "init":
            await ws.send_text(json.dumps({"event": "error", "detail": "First message must be {'event':'init',...}"}))
            await ws.close(code=1002)
            return

        voice = init.get("voice") or DEFAULT_VOICE
        for k in ("length_scale", "noise_scale", "noise_w"):
            if k in init and init[k] is not None:
                opts[k] = init[k]

        # 2) Second message must be speak.
        sp = json.loads(await ws.receive_text())
        if sp.get("event") != "speak":
            await ws.send_text(json.dumps({"event": "error", "detail": "Expected {'event':'speak','text':...}"}))
            await ws.close(code=1002)
            return

        text = sp.get("text") or ""
        if not text.strip():
            await ws.send_text(json.dumps({"event": "error", "detail": "Empty text"}))
            await ws.close(code=1002)
            return

        # 3) Ensure voice files exist, then stream Piper stdout.
        # BUG FIX: ensure_voice is a coroutine function, so the previous
        # `await asyncio.to_thread(ensure_voice, voice)` produced an
        # un-awaited coroutine object instead of (onnx, cfg).
        onnx, cfg = await ensure_voice(voice)
        await run_piper_stream(text, onnx, cfg, ws, opts)

    except WebSocketDisconnect:
        pass
    except Exception as e:
        # Best-effort error report; the peer may already be gone.
        try:
            await ws.send_text(json.dumps({"event": "error", "detail": str(e)}))
        except Exception:
            pass
        try:
            await ws.close(code=1011)
        except Exception:
            pass